summaryrefslogtreecommitdiff
path: root/tools/perf/scripts/python
diff options
context:
space:
mode:
Diffstat (limited to 'tools/perf/scripts/python')
0 files changed, 0 insertions, 0 deletions
ig.preempt?id2=3f1271b54edcc692da5a3663f2aa2a64781f9bc3'>kernel/Kconfig.preempt68
-rw-r--r--kernel/Makefile98
-rw-r--r--kernel/acct.c263
-rw-r--r--kernel/async.c108
-rw-r--r--kernel/audit.c436
-rw-r--r--kernel/audit.h36
-rw-r--r--kernel/audit_fsnotify.c15
-rw-r--r--kernel/audit_tree.c111
-rw-r--r--kernel/audit_watch.c22
-rw-r--r--kernel/auditfilter.c57
-rw-r--r--kernel/auditsc.c433
-rw-r--r--kernel/backtracetest.c19
-rw-r--r--kernel/bounds.c10
-rw-r--r--kernel/bpf/Kconfig10
-rw-r--r--kernel/bpf/Makefile37
-rw-r--r--kernel/bpf/arena.c665
-rw-r--r--kernel/bpf/arraymap.c399
-rw-r--r--kernel/bpf/bloom_filter.c65
-rw-r--r--kernel/bpf/bpf_cgrp_storage.c238
-rw-r--r--kernel/bpf/bpf_inode_storage.c118
-rw-r--r--kernel/bpf/bpf_insn_array.c304
-rw-r--r--kernel/bpf/bpf_iter.c200
-rw-r--r--kernel/bpf/bpf_local_storage.c584
-rw-r--r--kernel/bpf/bpf_lru_list.c40
-rw-r--r--kernel/bpf/bpf_lru_list.h10
-rw-r--r--kernel/bpf/bpf_lsm.c250
-rw-r--r--kernel/bpf/bpf_struct_ops.c1260
-rw-r--r--kernel/bpf/bpf_struct_ops_types.h12
-rw-r--r--kernel/bpf/bpf_task_storage.c215
-rw-r--r--kernel/bpf/btf.c4211
-rw-r--r--kernel/bpf/btf_iter.c2
-rw-r--r--kernel/bpf/btf_relocate.c2
-rw-r--r--kernel/bpf/cgroup.c1218
-rw-r--r--kernel/bpf/cgroup_iter.c359
-rw-r--r--kernel/bpf/core.c1296
-rw-r--r--kernel/bpf/cpumap.c393
-rw-r--r--kernel/bpf/cpumask.c534
-rw-r--r--kernel/bpf/crypto.c393
-rw-r--r--kernel/bpf/devmap.c170
-rw-r--r--kernel/bpf/disasm.c103
-rw-r--r--kernel/bpf/dispatcher.c52
-rw-r--r--kernel/bpf/dmabuf_iter.c150
-rw-r--r--kernel/bpf/hashtab.c916
-rw-r--r--kernel/bpf/helpers.c3575
-rw-r--r--kernel/bpf/inode.c434
-rw-r--r--kernel/bpf/kmem_cache_iter.c238
-rw-r--r--kernel/bpf/link_iter.c106
-rw-r--r--kernel/bpf/liveness.c753
-rw-r--r--kernel/bpf/local_storage.c48
-rw-r--r--kernel/bpf/log.c865
-rw-r--r--kernel/bpf/lpm_trie.c204
-rw-r--r--kernel/bpf/map_in_map.c78
-rw-r--r--kernel/bpf/map_in_map.h2
-rw-r--r--kernel/bpf/map_iter.c40
-rw-r--r--kernel/bpf/memalloc.c1016
-rw-r--r--kernel/bpf/mprog.c452
-rw-r--r--kernel/bpf/net_namespace.c10
-rw-r--r--kernel/bpf/offload.c456
-rw-r--r--kernel/bpf/percpu_freelist.c154
-rw-r--r--kernel/bpf/percpu_freelist.h4
-rw-r--r--kernel/bpf/preload/Kconfig12
-rw-r--r--kernel/bpf/preload/Makefile41
-rw-r--r--kernel/bpf/preload/bpf_preload.h8
-rw-r--r--kernel/bpf/preload/bpf_preload_kern.c132
-rw-r--r--kernel/bpf/preload/bpf_preload_umd_blob.S7
-rw-r--r--kernel/bpf/preload/iterators/Makefile24
-rw-r--r--kernel/bpf/preload/iterators/README5
-rw-r--r--kernel/bpf/preload/iterators/bpf_preload_common.h13
-rw-r--r--kernel/bpf/preload/iterators/iterators.bpf.c9
-rw-r--r--kernel/bpf/preload/iterators/iterators.c94
-rw-r--r--kernel/bpf/preload/iterators/iterators.lskel-big-endian.h437
-rw-r--r--kernel/bpf/preload/iterators/iterators.lskel-little-endian.h435
-rw-r--r--kernel/bpf/preload/iterators/iterators.skel.h412
-rw-r--r--kernel/bpf/prog_iter.c3
-rw-r--r--kernel/bpf/queue_stack_maps.c68
-rw-r--r--kernel/bpf/range_tree.c261
-rw-r--r--kernel/bpf/range_tree.h21
-rw-r--r--kernel/bpf/relo_core.c2
-rw-r--r--kernel/bpf/reuseport_array.c32
-rw-r--r--kernel/bpf/ringbuf.c534
-rw-r--r--kernel/bpf/rqspinlock.c762
-rw-r--r--kernel/bpf/rqspinlock.h48
-rw-r--r--kernel/bpf/stackmap.c306
-rw-r--r--kernel/bpf/stream.c384
-rw-r--r--kernel/bpf/syscall.c3106
-rw-r--r--kernel/bpf/sysfs_btf.c44
-rw-r--r--kernel/bpf/task_iter.c500
-rw-r--r--kernel/bpf/tcx.c346
-rw-r--r--kernel/bpf/tnum.c81
-rw-r--r--kernel/bpf/token.c261
-rw-r--r--kernel/bpf/trampoline.c936
-rw-r--r--kernel/bpf/verifier.c18869
-rw-r--r--kernel/capability.c143
-rw-r--r--kernel/cfi.c352
-rw-r--r--kernel/cgroup/Makefile2
-rw-r--r--kernel/cgroup/cgroup-internal.h28
-rw-r--r--kernel/cgroup/cgroup-v1.c152
-rw-r--r--kernel/cgroup/cgroup.c1713
-rw-r--r--kernel/cgroup/cpuset-internal.h308
-rw-r--r--kernel/cgroup/cpuset-v1.c607
-rw-r--r--kernel/cgroup/cpuset.c4239
-rw-r--r--kernel/cgroup/debug.c4
-rw-r--r--kernel/cgroup/dmem.c830
-rw-r--r--kernel/cgroup/freezer.c115
-rw-r--r--kernel/cgroup/legacy_freezer.c44
-rw-r--r--kernel/cgroup/misc.c148
-rw-r--r--kernel/cgroup/namespace.c35
-rw-r--r--kernel/cgroup/pids.c176
-rw-r--r--kernel/cgroup/rdma.c2
-rw-r--r--kernel/cgroup/rstat.c726
-rw-r--r--kernel/compat.c2
-rw-r--r--kernel/configs/android-base.config160
-rw-r--r--kernel/configs/android-recommended.config127
-rw-r--r--kernel/configs/debug.config18
-rw-r--r--kernel/configs/hardening.config113
-rw-r--r--kernel/configs/kvm_guest.config1
-rw-r--r--kernel/configs/nopm.config2
-rw-r--r--kernel/configs/rust.config2
-rw-r--r--kernel/configs/tiny-base.config2
-rw-r--r--kernel/configs/tiny.config12
-rw-r--r--kernel/configs/x86_debug.config18
-rw-r--r--kernel/configs/xen.config6
-rw-r--r--kernel/context_tracking.c626
-rw-r--r--kernel/cpu.c1069
-rw-r--r--kernel/cpu_pm.c21
-rw-r--r--kernel/crash_core.c970
-rw-r--r--kernel/crash_core_test.c343
-rw-r--r--kernel/crash_dump_dm_crypt.c464
-rw-r--r--kernel/crash_reserve.c540
-rw-r--r--kernel/cred.c373
-rw-r--r--kernel/debug/debug_core.c55
-rw-r--r--kernel/debug/gdbstub.c31
-rw-r--r--kernel/debug/kdb/kdb_bp.c6
-rw-r--r--kernel/debug/kdb/kdb_bt.c2
-rw-r--r--kernel/debug/kdb/kdb_io.c251
-rw-r--r--kernel/debug/kdb/kdb_keyboard.c39
-rw-r--r--kernel/debug/kdb/kdb_main.c296
-rw-r--r--kernel/debug/kdb/kdb_private.h8
-rw-r--r--kernel/debug/kdb/kdb_support.c62
-rw-r--r--kernel/delayacct.c128
-rw-r--r--kernel/dma/Kconfig83
-rw-r--r--kernel/dma/Makefile6
-rw-r--r--kernel/dma/coherent.c38
-rw-r--r--kernel/dma/contiguous.c134
-rw-r--r--kernel/dma/debug.c385
-rw-r--r--kernel/dma/debug.h57
-rw-r--r--kernel/dma/direct.c277
-rw-r--r--kernel/dma/direct.h67
-rw-r--r--kernel/dma/dummy.c28
-rw-r--r--kernel/dma/map_benchmark.c67
-rw-r--r--kernel/dma/mapping.c429
-rw-r--r--kernel/dma/ops_helpers.c18
-rw-r--r--kernel/dma/pool.c18
-rw-r--r--kernel/dma/remap.c12
-rw-r--r--kernel/dma/swiotlb.c1641
-rw-r--r--kernel/elfcorehdr.c (renamed from kernel/crash_dump.c)0
-rw-r--r--kernel/entry/Makefile8
-rw-r--r--kernel/entry/common.c353
-rw-r--r--kernel/entry/syscall-common.c104
-rw-r--r--kernel/entry/syscall_user_dispatch.c94
-rw-r--r--kernel/entry/virt.c (renamed from kernel/entry/kvm.c)24
-rw-r--r--kernel/events/Makefile1
-rw-r--r--kernel/events/callchain.c135
-rw-r--r--kernel/events/core.c6260
-rw-r--r--kernel/events/hw_breakpoint.c677
-rw-r--r--kernel/events/hw_breakpoint_test.c332
-rw-r--r--kernel/events/internal.h17
-rw-r--r--kernel/events/ring_buffer.c99
-rw-r--r--kernel/events/uprobes.c1750
-rw-r--r--kernel/exit.c641
-rw-r--r--kernel/exit.h30
-rw-r--r--kernel/extable.c28
-rw-r--r--kernel/fail_function.c31
-rw-r--r--kernel/fork.c1435
-rw-r--r--kernel/freezer.c153
-rw-r--r--kernel/futex/core.c1193
-rw-r--r--kernel/futex/futex.h191
-rw-r--r--kernel/futex/pi.c347
-rw-r--r--kernel/futex/requeue.c486
-rw-r--r--kernel/futex/syscalls.c346
-rw-r--r--kernel/futex/waitwake.c342
-rw-r--r--kernel/gcov/Makefile2
-rw-r--r--kernel/gcov/clang.c8
-rw-r--r--kernel/gcov/fs.c2
-rw-r--r--kernel/gcov/gcc_4_7.c33
-rwxr-xr-xkernel/gen_kheaders.sh121
-rw-r--r--kernel/groups.c15
-rw-r--r--kernel/hung_task.c214
-rw-r--r--kernel/iomem.c18
-rw-r--r--kernel/irq/Kconfig42
-rw-r--r--kernel/irq/Makefile4
-rw-r--r--kernel/irq/affinity.c409
-rw-r--r--kernel/irq/autoprobe.c26
-rw-r--r--kernel/irq/chip.c926
-rw-r--r--kernel/irq/cpuhotplug.c56
-rw-r--r--kernel/irq/debugfs.c33
-rw-r--r--kernel/irq/devres.c171
-rw-r--r--kernel/irq/generic-chip.c206
-rw-r--r--kernel/irq/handle.c59
-rw-r--r--kernel/irq/internals.h111
-rw-r--r--kernel/irq/ipi-mux.c206
-rw-r--r--kernel/irq/ipi.c22
-rw-r--r--kernel/irq/irq_sim.c89
-rw-r--r--kernel/irq/irq_test.c236
-rw-r--r--kernel/irq/irqdesc.c552
-rw-r--r--kernel/irq/irqdomain.c991
-rw-r--r--kernel/irq/kexec.c36
-rw-r--r--kernel/irq/manage.c1578
-rw-r--r--kernel/irq/matrix.c36
-rw-r--r--kernel/irq/migration.c24
-rw-r--r--kernel/irq/msi.c1216
-rw-r--r--kernel/irq/pm.c67
-rw-r--r--kernel/irq/proc.c109
-rw-r--r--kernel/irq/resend.c80
-rw-r--r--kernel/irq/settings.h6
-rw-r--r--kernel/irq/spurious.c137
-rw-r--r--kernel/irq/timings.c1
-rw-r--r--kernel/irq_work.c12
-rw-r--r--kernel/jump_label.c269
-rw-r--r--kernel/kallsyms.c415
-rw-r--r--kernel/kallsyms_internal.h19
-rw-r--r--kernel/kallsyms_selftest.c446
-rw-r--r--kernel/kallsyms_selftest.h13
-rw-r--r--kernel/kcmp.c8
-rw-r--r--kernel/kcov.c188
-rw-r--r--kernel/kcsan/.kunitconfig24
-rw-r--r--kernel/kcsan/Makefile3
-rw-r--r--kernel/kcsan/core.c69
-rw-r--r--kernel/kcsan/debugfs.c79
-rw-r--r--kernel/kcsan/kcsan_test.c102
-rw-r--r--kernel/kcsan/report.c3
-rw-r--r--kernel/kcsan/selftest.c23
-rw-r--r--kernel/kexec.c35
-rw-r--r--kernel/kexec_core.c774
-rw-r--r--kernel/kexec_elf.c2
-rw-r--r--kernel/kexec_file.c598
-rw-r--r--kernel/kexec_internal.h36
-rw-r--r--kernel/kheaders.c27
-rw-r--r--kernel/kprobes.c1178
-rw-r--r--kernel/kstack_erase.c177
-rw-r--r--kernel/ksyms_common.c43
-rw-r--r--kernel/ksysfs.c128
-rw-r--r--kernel/kthread.c369
-rw-r--r--kernel/latencytop.c50
-rw-r--r--kernel/livepatch/Kconfig12
-rw-r--r--kernel/livepatch/core.c199
-rw-r--r--kernel/livepatch/patch.c25
-rw-r--r--kernel/livepatch/transition.c227
-rw-r--r--kernel/liveupdate/Kconfig75
-rw-r--r--kernel/liveupdate/Makefile12
-rw-r--r--kernel/liveupdate/kexec_handover.c1594
-rw-r--r--kernel/liveupdate/kexec_handover_debug.c25
-rw-r--r--kernel/liveupdate/kexec_handover_debugfs.c221
-rw-r--r--kernel/liveupdate/kexec_handover_internal.h55
-rw-r--r--kernel/liveupdate/luo_core.c450
-rw-r--r--kernel/liveupdate/luo_file.c889
-rw-r--r--kernel/liveupdate/luo_internal.h110
-rw-r--r--kernel/liveupdate/luo_session.c646
-rw-r--r--kernel/locking/Makefile3
-rw-r--r--kernel/locking/lock_events.c10
-rw-r--r--kernel/locking/lock_events.h8
-rw-r--r--kernel/locking/lock_events_list.h33
-rw-r--r--kernel/locking/lockdep.c705
-rw-r--r--kernel/locking/lockdep_internals.h28
-rw-r--r--kernel/locking/lockdep_proc.c59
-rw-r--r--kernel/locking/locktorture.c580
-rw-r--r--kernel/locking/mcs_spinlock.h10
-rw-r--r--kernel/locking/mutex-debug.c31
-rw-r--r--kernel/locking/mutex.c159
-rw-r--r--kernel/locking/mutex.h35
-rw-r--r--kernel/locking/osq_lock.c40
-rw-r--r--kernel/locking/percpu-rwsem.c36
-rw-r--r--kernel/locking/qrwlock.c21
-rw-r--r--kernel/locking/qspinlock.c209
-rw-r--r--kernel/locking/qspinlock.h201
-rw-r--r--kernel/locking/qspinlock_paravirt.h105
-rw-r--r--kernel/locking/rtmutex.c383
-rw-r--r--kernel/locking/rtmutex_api.c100
-rw-r--r--kernel/locking/rtmutex_common.h63
-rw-r--r--kernel/locking/rwbase_rt.c32
-rw-r--r--kernel/locking/rwsem.c324
-rw-r--r--kernel/locking/semaphore.c97
-rw-r--r--kernel/locking/spinlock.c72
-rw-r--r--kernel/locking/spinlock_debug.c5
-rw-r--r--kernel/locking/spinlock_rt.c25
-rw-r--r--kernel/locking/test-ww_mutex.c68
-rw-r--r--kernel/locking/ww_mutex.h85
-rw-r--r--kernel/locking/ww_rt_mutex.c2
-rw-r--r--kernel/module-internal.h50
-rw-r--r--kernel/module.c4825
-rw-r--r--kernel/module/Kconfig465
-rw-r--r--kernel/module/Makefile25
-rw-r--r--kernel/module/debug_kmemleak.c21
-rw-r--r--kernel/module/decompress.c (renamed from kernel/module_decompress.c)125
-rw-r--r--kernel/module/dups.c247
-rw-r--r--kernel/module/internal.h425
-rw-r--r--kernel/module/kallsyms.c501
-rw-r--r--kernel/module/kdb.c63
-rw-r--r--kernel/module/kmod.c (renamed from kernel/kmod.c)50
-rw-r--r--kernel/module/livepatch.c74
-rw-r--r--kernel/module/main.c3923
-rw-r--r--kernel/module/procfs.c152
-rw-r--r--kernel/module/signing.c125
-rw-r--r--kernel/module/stats.c432
-rw-r--r--kernel/module/strict_rwx.c151
-rw-r--r--kernel/module/sysfs.c440
-rw-r--r--kernel/module/tracking.c127
-rw-r--r--kernel/module/tree_lookup.c112
-rw-r--r--kernel/module/version.c146
-rw-r--r--kernel/module_signing.c45
-rw-r--r--kernel/notifier.c122
-rw-r--r--kernel/nscommon.c311
-rw-r--r--kernel/nsproxy.c126
-rw-r--r--kernel/nstree.c813
-rw-r--r--kernel/padata.c246
-rw-r--r--kernel/panic.c603
-rw-r--r--kernel/params.c193
-rw-r--r--kernel/pid.c387
-rw-r--r--kernel/pid_namespace.c109
-rw-r--r--kernel/pid_sysctl.h53
-rw-r--r--kernel/power/Kconfig82
-rw-r--r--kernel/power/Makefile10
-rw-r--r--kernel/power/autosleep.c1
-rw-r--r--kernel/power/console.c15
-rw-r--r--kernel/power/em_netlink.c308
-rw-r--r--kernel/power/em_netlink.h39
-rw-r--r--kernel/power/em_netlink_autogen.c48
-rw-r--r--kernel/power/em_netlink_autogen.h23
-rw-r--r--kernel/power/energy_model.c786
-rw-r--r--kernel/power/hibernate.c470
-rw-r--r--kernel/power/main.c508
-rw-r--r--kernel/power/power.h45
-rw-r--r--kernel/power/poweroff.c2
-rw-r--r--kernel/power/process.c52
-rw-r--r--kernel/power/qos.c124
-rw-r--r--kernel/power/snapshot.c396
-rw-r--r--kernel/power/suspend.c71
-rw-r--r--kernel/power/suspend_test.c10
-rw-r--r--kernel/power/swap.c691
-rw-r--r--kernel/power/user.c46
-rw-r--r--kernel/power/wakelock.c14
-rw-r--r--kernel/printk/.kunitconfig3
-rw-r--r--kernel/printk/Makefile4
-rw-r--r--kernel/printk/console_cmdline.h1
-rw-r--r--kernel/printk/index.c2
-rw-r--r--kernel/printk/internal.h250
-rw-r--r--kernel/printk/nbcon.c1985
-rw-r--r--kernel/printk/printk.c2518
-rw-r--r--kernel/printk/printk_ringbuffer.c441
-rw-r--r--kernel/printk/printk_ringbuffer.h59
-rw-r--r--kernel/printk/printk_ringbuffer_kunit_test.c327
-rw-r--r--kernel/printk/printk_safe.c60
-rw-r--r--kernel/printk/sysctl.c7
-rw-r--r--kernel/profile.c328
-rw-r--r--kernel/ptrace.c472
-rw-r--r--kernel/rcu/Kconfig213
-rw-r--r--kernel/rcu/Kconfig.debug125
-rw-r--r--kernel/rcu/rcu.h198
-rw-r--r--kernel/rcu/rcu_segcblist.c25
-rw-r--r--kernel/rcu/rcu_segcblist.h18
-rw-r--r--kernel/rcu/rcuscale.c594
-rw-r--r--kernel/rcu/rcutorture.c2139
-rw-r--r--kernel/rcu/refscale.c854
-rw-r--r--kernel/rcu/srcutiny.c70
-rw-r--r--kernel/rcu/srcutree.c1538
-rw-r--r--kernel/rcu/sync.c24
-rw-r--r--kernel/rcu/tasks.h1224
-rw-r--r--kernel/rcu/tiny.c70
-rw-r--r--kernel/rcu/tree.c3350
-rw-r--r--kernel/rcu/tree.h140
-rw-r--r--kernel/rcu/tree_exp.h587
-rw-r--r--kernel/rcu/tree_nocb.h913
-rw-r--r--kernel/rcu/tree_plugin.h383
-rw-r--r--kernel/rcu/tree_stall.h399
-rw-r--r--kernel/rcu/update.c118
-rw-r--r--kernel/reboot.c624
-rw-r--r--kernel/regset.c6
-rw-r--r--kernel/relay.c397
-rw-r--r--kernel/resource.c599
-rw-r--r--kernel/resource_kunit.c154
-rw-r--r--kernel/rseq.c527
-rw-r--r--kernel/scftorture.c74
-rw-r--r--kernel/sched/Makefile33
-rw-r--r--kernel/sched/autogroup.c39
-rw-r--r--kernel/sched/autogroup.h12
-rw-r--r--kernel/sched/build_policy.c66
-rw-r--r--kernel/sched/build_utility.c106
-rw-r--r--kernel/sched/clock.c84
-rw-r--r--kernel/sched/completion.c43
-rw-r--r--kernel/sched/core.c7080
-rw-r--r--kernel/sched/core_sched.c28
-rw-r--r--kernel/sched/cpuacct.c12
-rw-r--r--kernel/sched/cpudeadline.c40
-rw-r--r--kernel/sched/cpudeadline.h8
-rw-r--r--kernel/sched/cpufreq.c2
-rw-r--r--kernel/sched/cpufreq_schedutil.c294
-rw-r--r--kernel/sched/cpupri.c3
-rw-r--r--kernel/sched/cpupri.h5
-rw-r--r--kernel/sched/cputime.c96
-rw-r--r--kernel/sched/deadline.c2080
-rw-r--r--kernel/sched/debug.c420
-rw-r--r--kernel/sched/ext.c7310
-rw-r--r--kernel/sched/ext.h95
-rw-r--r--kernel/sched/ext_idle.c1435
-rw-r--r--kernel/sched/ext_idle.h24
-rw-r--r--kernel/sched/ext_internal.h1101
-rw-r--r--kernel/sched/fair.c6957
-rw-r--r--kernel/sched/features.h62
-rw-r--r--kernel/sched/idle.c201
-rw-r--r--kernel/sched/isolation.c200
-rw-r--r--kernel/sched/loadavg.c13
-rw-r--r--kernel/sched/membarrier.c74
-rw-r--r--kernel/sched/pelt.c59
-rw-r--r--kernel/sched/pelt.h132
-rw-r--r--kernel/sched/psi.c1128
-rw-r--r--kernel/sched/rq-offsets.c12
-rw-r--r--kernel/sched/rt.c834
-rw-r--r--kernel/sched/sched-pelt.h1
-rw-r--r--kernel/sched/sched.h2376
-rw-r--r--kernel/sched/smp.h15
-rw-r--r--kernel/sched/stats.c28
-rw-r--r--kernel/sched/stats.h112
-rw-r--r--kernel/sched/stop_task.c55
-rw-r--r--kernel/sched/swait.c8
-rw-r--r--kernel/sched/syscalls.c1570
-rw-r--r--kernel/sched/topology.c1081
-rw-r--r--kernel/sched/wait.c106
-rw-r--r--kernel/sched/wait_bit.c99
-rw-r--r--kernel/scs.c28
-rw-r--r--kernel/seccomp.c270
-rw-r--r--kernel/signal.c1127
-rw-r--r--kernel/smp.c573
-rw-r--r--kernel/smpboot.c162
-rw-r--r--kernel/softirq.c345
-rw-r--r--kernel/stackleak.c148
-rw-r--r--kernel/stacktrace.c7
-rw-r--r--kernel/static_call.c542
-rw-r--r--kernel/static_call_inline.c566
-rw-r--r--kernel/stop_machine.c46
-rw-r--r--kernel/sys.c682
-rw-r--r--kernel/sys_ni.c143
-rw-r--r--kernel/sysctl-test.c44
-rw-r--r--kernel/sysctl.c2096
-rw-r--r--kernel/task_work.c113
-rw-r--r--kernel/taskstats.c55
-rw-r--r--kernel/time/Kconfig75
-rw-r--r--kernel/time/Makefile13
-rw-r--r--kernel/time/alarmtimer.c176
-rw-r--r--kernel/time/clockevents.c61
-rw-r--r--kernel/time/clocksource-wdtest.c17
-rw-r--r--kernel/time/clocksource.c288
-rw-r--r--kernel/time/hrtimer.c571
-rw-r--r--kernel/time/itimer.c21
-rw-r--r--kernel/time/jiffies.c130
-rw-r--r--kernel/time/namespace.c80
-rw-r--r--kernel/time/ntp.c888
-rw-r--r--kernel/time/ntp_internal.h17
-rw-r--r--kernel/time/posix-clock.c77
-rw-r--r--kernel/time/posix-cpu-timers.c391
-rw-r--r--kernel/time/posix-stubs.c50
-rw-r--r--kernel/time/posix-timers.c1119
-rw-r--r--kernel/time/posix-timers.h10
-rw-r--r--kernel/time/sched_clock.c92
-rw-r--r--kernel/time/sleep_timeout.c377
-rw-r--r--kernel/time/test_udelay.c3
-rw-r--r--kernel/time/tick-broadcast-hrtimer.c32
-rw-r--r--kernel/time/tick-broadcast.c152
-rw-r--r--kernel/time/tick-common.c119
-rw-r--r--kernel/time/tick-internal.h20
-rw-r--r--kernel/time/tick-oneshot.c24
-rw-r--r--kernel/time/tick-sched.c866
-rw-r--r--kernel/time/tick-sched.h97
-rw-r--r--kernel/time/time.c182
-rw-r--r--kernel/time/time_test.c5
-rw-r--r--kernel/time/timecounter.c2
-rw-r--r--kernel/time/timekeeping.c1625
-rw-r--r--kernel/time/timekeeping_debug.c13
-rw-r--r--kernel/time/timekeeping_internal.h36
-rw-r--r--kernel/time/timer.c1488
-rw-r--r--kernel/time/timer_list.c16
-rw-r--r--kernel/time/timer_migration.c2023
-rw-r--r--kernel/time/timer_migration.h146
-rw-r--r--kernel/time/vsyscall.c119
-rw-r--r--kernel/torture.c130
-rw-r--r--kernel/trace/Kconfig294
-rw-r--r--kernel/trace/Makefile29
-rw-r--r--kernel/trace/blktrace.c839
-rw-r--r--kernel/trace/bpf_trace.c1899
-rw-r--r--kernel/trace/bpf_trace.h2
-rw-r--r--kernel/trace/fgraph.c1280
-rw-r--r--kernel/trace/fprobe.c972
-rw-r--r--kernel/trace/ftrace.c3188
-rw-r--r--kernel/trace/ftrace_internal.h23
-rw-r--r--kernel/trace/kprobe_event_gen_test.c119
-rw-r--r--kernel/trace/pid_list.c49
-rw-r--r--kernel/trace/pid_list.h1
-rw-r--r--kernel/trace/power-traces.c1
-rw-r--r--kernel/trace/preemptirq_delay_test.c16
-rw-r--r--kernel/trace/rethook.c337
-rw-r--r--kernel/trace/ring_buffer.c3566
-rw-r--r--kernel/trace/ring_buffer_benchmark.c18
-rw-r--r--kernel/trace/rv/Kconfig95
-rw-r--r--kernel/trace/rv/Makefile23
-rw-r--r--kernel/trace/rv/monitors/nrp/Kconfig16
-rw-r--r--kernel/trace/rv/monitors/nrp/nrp.c138
-rw-r--r--kernel/trace/rv/monitors/nrp/nrp.h75
-rw-r--r--kernel/trace/rv/monitors/nrp/nrp_trace.h15
-rw-r--r--kernel/trace/rv/monitors/opid/Kconfig19
-rw-r--r--kernel/trace/rv/monitors/opid/opid.c168
-rw-r--r--kernel/trace/rv/monitors/opid/opid.h104
-rw-r--r--kernel/trace/rv/monitors/opid/opid_trace.h15
-rw-r--r--kernel/trace/rv/monitors/pagefault/Kconfig21
-rw-r--r--kernel/trace/rv/monitors/pagefault/pagefault.c88
-rw-r--r--kernel/trace/rv/monitors/pagefault/pagefault.h64
-rw-r--r--kernel/trace/rv/monitors/pagefault/pagefault_trace.h14
-rw-r--r--kernel/trace/rv/monitors/rtapp/Kconfig11
-rw-r--r--kernel/trace/rv/monitors/rtapp/rtapp.c33
-rw-r--r--kernel/trace/rv/monitors/rtapp/rtapp.h3
-rw-r--r--kernel/trace/rv/monitors/sched/Kconfig12
-rw-r--r--kernel/trace/rv/monitors/sched/sched.c37
-rw-r--r--kernel/trace/rv/monitors/sched/sched.h3
-rw-r--r--kernel/trace/rv/monitors/sco/Kconfig14
-rw-r--r--kernel/trace/rv/monitors/sco/sco.c87
-rw-r--r--kernel/trace/rv/monitors/sco/sco.h47
-rw-r--r--kernel/trace/rv/monitors/sco/sco_trace.h15
-rw-r--r--kernel/trace/rv/monitors/scpd/Kconfig15
-rw-r--r--kernel/trace/rv/monitors/scpd/scpd.c95
-rw-r--r--kernel/trace/rv/monitors/scpd/scpd.h49
-rw-r--r--kernel/trace/rv/monitors/scpd/scpd_trace.h15
-rw-r--r--kernel/trace/rv/monitors/sleep/Kconfig22
-rw-r--r--kernel/trace/rv/monitors/sleep/sleep.c241
-rw-r--r--kernel/trace/rv/monitors/sleep/sleep.h257
-rw-r--r--kernel/trace/rv/monitors/sleep/sleep_trace.h14
-rw-r--r--kernel/trace/rv/monitors/snep/Kconfig15
-rw-r--r--kernel/trace/rv/monitors/snep/snep.c95
-rw-r--r--kernel/trace/rv/monitors/snep/snep.h59
-rw-r--r--kernel/trace/rv/monitors/snep/snep_trace.h15
-rw-r--r--kernel/trace/rv/monitors/snroc/Kconfig14
-rw-r--r--kernel/trace/rv/monitors/snroc/snroc.c84
-rw-r--r--kernel/trace/rv/monitors/snroc/snroc.h47
-rw-r--r--kernel/trace/rv/monitors/snroc/snroc_trace.h15
-rw-r--r--kernel/trace/rv/monitors/sssw/Kconfig15
-rw-r--r--kernel/trace/rv/monitors/sssw/sssw.c116
-rw-r--r--kernel/trace/rv/monitors/sssw/sssw.h105
-rw-r--r--kernel/trace/rv/monitors/sssw/sssw_trace.h15
-rw-r--r--kernel/trace/rv/monitors/sts/Kconfig19
-rw-r--r--kernel/trace/rv/monitors/sts/sts.c156
-rw-r--r--kernel/trace/rv/monitors/sts/sts.h117
-rw-r--r--kernel/trace/rv/monitors/sts/sts_trace.h15
-rw-r--r--kernel/trace/rv/monitors/wip/Kconfig14
-rw-r--r--kernel/trace/rv/monitors/wip/wip.c87
-rw-r--r--kernel/trace/rv/monitors/wip/wip.h47
-rw-r--r--kernel/trace/rv/monitors/wip/wip_trace.h15
-rw-r--r--kernel/trace/rv/monitors/wwnr/Kconfig13
-rw-r--r--kernel/trace/rv/monitors/wwnr/wwnr.c86
-rw-r--r--kernel/trace/rv/monitors/wwnr/wwnr.h47
-rw-r--r--kernel/trace/rv/monitors/wwnr/wwnr_trace.h16
-rw-r--r--kernel/trace/rv/reactor_panic.c42
-rw-r--r--kernel/trace/rv/reactor_printk.c41
-rw-r--r--kernel/trace/rv/rv.c854
-rw-r--r--kernel/trace/rv/rv.h47
-rw-r--r--kernel/trace/rv/rv_reactors.c481
-rw-r--r--kernel/trace/rv/rv_trace.h212
-rw-r--r--kernel/trace/synth_event_gen_test.c29
-rw-r--r--kernel/trace/trace.c5157
-rw-r--r--kernel/trace/trace.h659
-rw-r--r--kernel/trace/trace_benchmark.c7
-rw-r--r--kernel/trace/trace_benchmark.h8
-rw-r--r--kernel/trace/trace_boot.c12
-rw-r--r--kernel/trace/trace_branch.c14
-rw-r--r--kernel/trace/trace_btf.c122
-rw-r--r--kernel/trace/trace_btf.h11
-rw-r--r--kernel/trace/trace_clock.c2
-rw-r--r--kernel/trace/trace_dynevent.c65
-rw-r--r--kernel/trace/trace_dynevent.h1
-rw-r--r--kernel/trace/trace_entries.h76
-rw-r--r--kernel/trace/trace_eprobe.c550
-rw-r--r--kernel/trace/trace_event_perf.c33
-rw-r--r--kernel/trace/trace_events.c1681
-rw-r--r--kernel/trace/trace_events_filter.c877
-rw-r--r--kernel/trace/trace_events_hist.c1358
-rw-r--r--kernel/trace/trace_events_inject.c9
-rw-r--r--kernel/trace/trace_events_synth.c253
-rw-r--r--kernel/trace/trace_events_trigger.c973
-rw-r--r--kernel/trace/trace_events_user.c2933
-rw-r--r--kernel/trace/trace_export.c12
-rw-r--r--kernel/trace/trace_fprobe.c1589
-rw-r--r--kernel/trace/trace_functions.c124
-rw-r--r--kernel/trace/trace_functions_graph.c738
-rw-r--r--kernel/trace/trace_hwlat.c24
-rw-r--r--kernel/trace/trace_irqsoff.c124
-rw-r--r--kernel/trace/trace_kdb.c32
-rw-r--r--kernel/trace/trace_kprobe.c703
-rw-r--r--kernel/trace/trace_kprobe_selftest.c3
-rw-r--r--kernel/trace/trace_mmiotrace.c20
-rw-r--r--kernel/trace/trace_osnoise.c1030
-rw-r--r--kernel/trace/trace_output.c497
-rw-r--r--kernel/trace/trace_output.h22
-rw-r--r--kernel/trace/trace_preemptirq.c80
-rw-r--r--kernel/trace/trace_probe.c1462
-rw-r--r--kernel/trace/trace_probe.h171
-rw-r--r--kernel/trace/trace_probe_kernel.h119
-rw-r--r--kernel/trace/trace_probe_tmpl.h113
-rw-r--r--kernel/trace/trace_recursion_record.c7
-rw-r--r--kernel/trace/trace_sched_switch.c521
-rw-r--r--kernel/trace/trace_sched_wakeup.c87
-rw-r--r--kernel/trace/trace_selftest.c311
-rw-r--r--kernel/trace/trace_seq.c35
-rw-r--r--kernel/trace/trace_stack.c32
-rw-r--r--kernel/trace/trace_stat.c26
-rw-r--r--kernel/trace/trace_synth.h1
-rw-r--r--kernel/trace/trace_syscalls.c1046
-rw-r--r--kernel/trace/trace_uprobe.c383
-rw-r--r--kernel/trace/tracing_map.c29
-rw-r--r--kernel/trace/tracing_map.h4
-rw-r--r--kernel/tracepoint.c138
-rw-r--r--kernel/tsacct.c15
-rw-r--r--kernel/ucount.c167
-rw-r--r--kernel/umh.c82
-rw-r--r--kernel/unwind/Makefile1
-rw-r--r--kernel/unwind/deferred.c366
-rw-r--r--kernel/unwind/user.c165
-rw-r--r--kernel/up.c2
-rw-r--r--kernel/user.c28
-rw-r--r--kernel/user_namespace.c109
-rw-r--r--kernel/usermode_driver.c191
-rw-r--r--kernel/utsname.c33
-rw-r--r--kernel/utsname_sysctl.c30
-rw-r--r--kernel/vhost_task.c165
-rw-r--r--kernel/vmcore_info.c250
-rw-r--r--kernel/watch_queue.c177
-rw-r--r--kernel/watchdog.c843
-rw-r--r--kernel/watchdog_buddy.c110
-rw-r--r--kernel/watchdog_perf.c (renamed from kernel/watchdog_hld.c)194
-rw-r--r--kernel/workqueue.c4927
-rw-r--r--kernel/workqueue_internal.h26
639 files changed, 184933 insertions, 67113 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index c6b299a6b786..a501bfc80694 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -1,3 +1,5 @@
# SPDX-License-Identifier: GPL-2.0-only
/config_data
/kheaders.md5
+/kheaders-objlist
+/kheaders-srclist
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 38ef6d06888e..ce1435cb08b1 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -30,7 +30,7 @@ choice
250 Hz is a good compromise choice allowing server performance
while also showing good interactive responsiveness even
on SMP and NUMA systems. If you are going to be using NTSC video
- or multimedia, selected 300Hz instead.
+ or multimedia, select 300Hz instead.
config HZ_300
bool "300 HZ"
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
new file mode 100644
index 000000000000..15632358bcf7
--- /dev/null
+++ b/kernel/Kconfig.kexec
@@ -0,0 +1,182 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+menu "Kexec and crash features"
+
+config CRASH_RESERVE
+ bool
+
+config VMCORE_INFO
+ bool
+
+config KEXEC_CORE
+ bool
+
+config KEXEC_ELF
+ bool
+
+config HAVE_IMA_KEXEC
+ bool
+
+config KEXEC
+ bool "Enable kexec system call"
+ depends on ARCH_SUPPORTS_KEXEC
+ select KEXEC_CORE
+ help
+ kexec is a system call that implements the ability to shutdown your
+ current kernel, and to start another kernel. It is like a reboot
+ but it is independent of the system firmware. And like a reboot
+ you can start any kernel with it, not just Linux.
+
+ The name comes from the similarity to the exec system call.
+
+ It is an ongoing process to be certain the hardware in a machine
+ is properly shutdown, so do not be surprised if this code does not
+ initially work for you. As of this writing the exact hardware
+ interface is strongly in flux, so no good recommendation can be
+ made.
+
+config KEXEC_FILE
+ bool "Enable kexec file based system call"
+ depends on ARCH_SUPPORTS_KEXEC_FILE
+ select CRYPTO_LIB_SHA256
+ select KEXEC_CORE
+ help
+ This is new version of kexec system call. This system call is
+ file based and takes file descriptors as system call argument
+ for kernel and initramfs as opposed to list of segments as
+ accepted by kexec system call.
+
+config KEXEC_SIG
+ bool "Verify kernel signature during kexec_file_load() syscall"
+ depends on ARCH_SUPPORTS_KEXEC_SIG
+ depends on KEXEC_FILE
+ help
+ This option makes the kexec_file_load() syscall check for a valid
+ signature of the kernel image. The image can still be loaded without
+ a valid signature unless you also enable KEXEC_SIG_FORCE, though if
+ there's a signature that we can check, then it must be valid.
+
+ In addition to this option, you need to enable signature
+ verification for the corresponding kernel image type being
+ loaded in order for this to work.
+
+config KEXEC_SIG_FORCE
+ bool "Require a valid signature in kexec_file_load() syscall"
+ depends on ARCH_SUPPORTS_KEXEC_SIG_FORCE
+ depends on KEXEC_SIG
+ help
+ This option makes kernel signature verification mandatory for
+ the kexec_file_load() syscall.
+
+config KEXEC_IMAGE_VERIFY_SIG
+ bool "Enable Image signature verification support (ARM)"
+ default ARCH_DEFAULT_KEXEC_IMAGE_VERIFY_SIG
+ depends on ARCH_SUPPORTS_KEXEC_IMAGE_VERIFY_SIG
+ depends on KEXEC_SIG
+ depends on EFI && SIGNED_PE_FILE_VERIFICATION
+ help
+ Enable Image signature verification support.
+
+config KEXEC_BZIMAGE_VERIFY_SIG
+ bool "Enable bzImage signature verification support"
+ depends on ARCH_SUPPORTS_KEXEC_BZIMAGE_VERIFY_SIG
+ depends on KEXEC_SIG
+ depends on SIGNED_PE_FILE_VERIFICATION
+ select SYSTEM_TRUSTED_KEYRING
+ help
+ Enable bzImage signature verification support.
+
+config KEXEC_JUMP
+ bool "kexec jump"
+ depends on ARCH_SUPPORTS_KEXEC_JUMP
+ depends on KEXEC && HIBERNATION
+ help
+ Jump between original kernel and kexeced kernel and invoke
+ code in physical address mode via KEXEC
+
+config CRASH_DUMP
+ bool "kernel crash dumps"
+ default ARCH_DEFAULT_CRASH_DUMP
+ depends on ARCH_SUPPORTS_CRASH_DUMP
+ depends on KEXEC_CORE
+ select VMCORE_INFO
+ select CRASH_RESERVE
+ help
+ Generate crash dump after being started by kexec.
+ This should be normally only set in special crash dump kernels
+ which are loaded in the main kernel with kexec-tools into
+ a specially reserved region and then later executed after
+ a crash by kdump/kexec. The crash dump kernel must be compiled
+ to a memory address not used by the main kernel or BIOS using
+ PHYSICAL_START, or it must be built as a relocatable image
+ (CONFIG_RELOCATABLE=y).
+ For more details see Documentation/admin-guide/kdump/kdump.rst
+
+ For s390, this option also enables zfcpdump.
+ See also <file:Documentation/arch/s390/zfcpdump.rst>
+
+config CRASH_DM_CRYPT
+ bool "Support saving crash dump to dm-crypt encrypted volume"
+ depends on KEXEC_FILE
+ depends on CRASH_DUMP
+ depends on DM_CRYPT
+ depends on KEYS
+ help
+ With this option enabled, user space can intereact with
+ /sys/kernel/config/crash_dm_crypt_keys to make the dm crypt keys
+ persistent for the dump-capture kernel.
+
+config CRASH_DM_CRYPT_CONFIGS
+ def_tristate CRASH_DM_CRYPT
+ select CONFIGFS_FS
+ help
+ CRASH_DM_CRYPT cannot directly select CONFIGFS_FS, because that
+ is required to be built-in.
+
+config CRASH_DUMP_KUNIT_TEST
+ tristate "Unit Tests for kernel crash dumps" if !KUNIT_ALL_TESTS
+ depends on CRASH_DUMP && KUNIT
+ default KUNIT_ALL_TESTS
+ help
+ This option builds KUnit unit tests for kernel crash dumps. The unit
+ tests will be used to verify the correctness of covered functions and
+ also prevent any regression.
+
+ If unsure, say N.
+
+config CRASH_HOTPLUG
+ bool "Update the crash elfcorehdr on system configuration changes"
+ default y
+ depends on CRASH_DUMP && (HOTPLUG_CPU || MEMORY_HOTPLUG)
+ depends on ARCH_SUPPORTS_CRASH_HOTPLUG
+ help
+ Enable direct update to the crash elfcorehdr (which contains
+ the list of CPUs and memory regions to be dumped upon a crash)
+ in response to hot plug/unplug or online/offline of CPUs or
+ memory. This is a much more advanced approach than userspace
+ attempting that.
+
+ If unsure, say Y.
+
+config CRASH_MAX_MEMORY_RANGES
+ int "Specify the maximum number of memory regions for the elfcorehdr"
+ default 8192
+ depends on CRASH_HOTPLUG
+ help
+ For the kexec_file_load() syscall path, specify the maximum number of
+ memory regions that the elfcorehdr buffer/segment can accommodate.
+ These regions are obtained via walk_system_ram_res(); eg. the
+ 'System RAM' entries in /proc/iomem.
+ This value is combined with NR_CPUS_DEFAULT and multiplied by
+ sizeof(Elf64_Phdr) to determine the final elfcorehdr memory buffer/
+ segment size.
+ The value 8192, for example, covers a (sparsely populated) 1TiB system
+ consisting of 128MiB memblocks, while resulting in an elfcorehdr
+ memory buffer/segment size under 1MiB. This represents a sane choice
+ to accommodate both baremetal and virtual machine configurations.
+
+ For the kexec_load() syscall path, CRASH_MAX_MEMORY_RANGES is part of
+ the computation behind the value provided through the
+ /sys/kernel/crash_elfcorehdr_size attribute.
+
+endmenu
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index ce77f0265660..da326800c1c9 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -11,12 +11,16 @@ config PREEMPT_BUILD
select PREEMPTION
select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
+config ARCH_HAS_PREEMPT_LAZY
+ bool
+
choice
prompt "Preemption Model"
default PREEMPT_NONE
config PREEMPT_NONE
bool "No Forced Preemption (Server)"
+ depends on !PREEMPT_RT
select PREEMPT_NONE_BUILD if !PREEMPT_DYNAMIC
help
This is the traditional Linux preemption model, geared towards
@@ -32,6 +36,7 @@ config PREEMPT_NONE
config PREEMPT_VOLUNTARY
bool "Voluntary Kernel Preemption (Desktop)"
depends on !ARCH_NO_PREEMPT
+ depends on !PREEMPT_RT
select PREEMPT_VOLUNTARY_BUILD if !PREEMPT_DYNAMIC
help
This option reduces the latency of the kernel by adding more
@@ -51,7 +56,7 @@ config PREEMPT_VOLUNTARY
config PREEMPT
bool "Preemptible Kernel (Low-Latency Desktop)"
depends on !ARCH_NO_PREEMPT
- select PREEMPT_BUILD
+ select PREEMPT_BUILD if !PREEMPT_DYNAMIC
help
This option reduces the latency of the kernel by making
all kernel code (that is not executing in a critical section)
@@ -67,9 +72,23 @@ config PREEMPT
embedded system with latency requirements in the milliseconds
range.
+config PREEMPT_LAZY
+ bool "Scheduler controlled preemption model"
+ depends on !ARCH_NO_PREEMPT
+ depends on ARCH_HAS_PREEMPT_LAZY
+ select PREEMPT_BUILD if !PREEMPT_DYNAMIC
+ help
+ This option provides a scheduler driven preemption model that
+ is fundamentally similar to full preemption, but is less
+ eager to preempt SCHED_NORMAL tasks in an attempt to
+ reduce lock holder preemption and recover some of the performance
+ gains seen from using Voluntary preemption.
+
+endchoice
+
config PREEMPT_RT
bool "Fully Preemptible Kernel (Real-Time)"
- depends on EXPERT && ARCH_SUPPORTS_RT
+ depends on EXPERT && ARCH_SUPPORTS_RT && !COMPILE_TEST
select PREEMPTION
help
This option turns the kernel into a real-time kernel by replacing
@@ -84,7 +103,18 @@ config PREEMPT_RT
Select this if you are building a kernel for systems which
require real-time guarantees.
-endchoice
+config PREEMPT_RT_NEEDS_BH_LOCK
+ bool "Enforce softirq synchronisation on PREEMPT_RT"
+ depends on PREEMPT_RT
+ help
+ Enforce synchronisation across the softirqs context. On PREEMPT_RT
+ the softirq is preemptible. This enforces the same per-CPU BLK
+ semantic non-PREEMPT_RT builds have. This should not be needed
+ because per-CPU locks were added to avoid the per-CPU BKL.
+
+ This switch provides the old behaviour for testing reasons. Select
+ this if you suspect an error with preemptible softirq and want test
+ the old synchronized behaviour.
config PREEMPT_COUNT
bool
@@ -95,9 +125,10 @@ config PREEMPTION
config PREEMPT_DYNAMIC
bool "Preemption behaviour defined on boot"
- depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT
+ depends on HAVE_PREEMPT_DYNAMIC
+ select JUMP_LABEL if HAVE_PREEMPT_DYNAMIC_KEY
select PREEMPT_BUILD
- default y
+ default y if HAVE_PREEMPT_DYNAMIC_CALL
help
This option allows to define the preemption model on the kernel
command line parameter and thus override the default preemption
@@ -132,4 +163,29 @@ config SCHED_CORE
which is the likely usage by Linux distributions, there should
be no measurable impact on performance.
-
+config SCHED_CLASS_EXT
+ bool "Extensible Scheduling Class"
+ depends on BPF_SYSCALL && BPF_JIT && DEBUG_INFO_BTF
+ select STACKTRACE if STACKTRACE_SUPPORT
+ help
+ This option enables a new scheduler class sched_ext (SCX), which
+ allows scheduling policies to be implemented as BPF programs to
+ achieve the following:
+
+ - Ease of experimentation and exploration: Enabling rapid
+ iteration of new scheduling policies.
+ - Customization: Building application-specific schedulers which
+ implement policies that are not applicable to general-purpose
+ schedulers.
+ - Rapid scheduler deployments: Non-disruptive swap outs of
+ scheduling policies in production environments.
+
+ sched_ext leverages BPF struct_ops feature to define a structure
+ which exports function callbacks and flags to BPF programs that
+ wish to implement scheduling policies. The struct_ops structure
+ exported by sched_ext is struct sched_ext_ops, and is conceptually
+ similar to struct sched_class.
+
+ For more information:
+ Documentation/scheduler/sched-ext.rst
+ https://github.com/sched-ext/scx
diff --git a/kernel/Makefile b/kernel/Makefile
index 56f4ee97f328..e83669841b8c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -8,19 +8,23 @@ obj-y = fork.o exec_domain.o panic.o \
sysctl.o capability.o ptrace.o user.o \
signal.o sys.o umh.o workqueue.o pid.o task_work.o \
extable.o params.o \
- kthread.o sys_ni.o nsproxy.o \
+ kthread.o sys_ni.o nsproxy.o nstree.o nscommon.o \
notifier.o ksysfs.o cred.o reboot.o \
- async.o range.o smpboot.o ucount.o regset.o
+ async.o range.o smpboot.o ucount.o regset.o ksyms_common.o
-obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o
-obj-$(CONFIG_MODULES) += kmod.o
obj-$(CONFIG_MULTIUSER) += groups.o
+obj-$(CONFIG_VHOST_TASK) += vhost_task.o
ifdef CONFIG_FUNCTION_TRACER
# Do not trace internal ftrace files
CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE)
endif
+# Branch profiling isn't noinstr-safe
+ifdef CONFIG_TRACE_BRANCH_PROFILING
+CFLAGS_context_tracking.o += -DDISABLE_BRANCH_PROFILING
+endif
+
# Prevents flicker of uninteresting __do_softirq()/__local_bh_disable_ip()
# in coverage traces.
KCOV_INSTRUMENT_softirq.o := n
@@ -29,7 +33,6 @@ KCOV_INSTRUMENT_softirq.o := n
KCSAN_SANITIZE_softirq.o = n
# These are called from save_stack_trace() on slub debug path,
# and produce insane amounts of uninteresting coverage.
-KCOV_INSTRUMENT_module.o := n
KCOV_INSTRUMENT_extable.o := n
KCOV_INSTRUMENT_stacktrace.o := n
# Don't self-instrument.
@@ -39,11 +42,9 @@ KCOV_INSTRUMENT_kcov.o := n
KASAN_SANITIZE_kcov.o := n
KCSAN_SANITIZE_kcov.o := n
UBSAN_SANITIZE_kcov.o := n
+KMSAN_SANITIZE_kcov.o := n
CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector
-# Don't instrument error handlers
-CFLAGS_REMOVE_cfi.o := $(CC_FLAGS_CFI)
-
obj-y += sched/
obj-y += locking/
obj-y += power/
@@ -51,8 +52,11 @@ obj-y += printk/
obj-y += irq/
obj-y += rcu/
obj-y += livepatch/
+obj-y += liveupdate/
obj-y += dma/
obj-y += entry/
+obj-y += unwind/
+obj-$(CONFIG_MODULES) += module/
obj-$(CONFIG_KCMP) += kcmp.o
obj-$(CONFIG_FREEZER) += freezer.o
@@ -66,14 +70,16 @@ ifneq ($(CONFIG_SMP),y)
obj-y += up.o
endif
obj-$(CONFIG_UID16) += uid16.o
-obj-$(CONFIG_MODULES) += module.o
-obj-$(CONFIG_MODULE_DECOMPRESS) += module_decompress.o
-obj-$(CONFIG_MODULE_SIG) += module_signing.o
obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
+obj-$(CONFIG_KALLSYMS_SELFTEST) += kallsyms_selftest.o
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
-obj-$(CONFIG_CRASH_CORE) += crash_core.o
+obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o elfcorehdr.o
+obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o
obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
+obj-$(CONFIG_CRASH_DUMP) += crash_core.o
+obj-$(CONFIG_CRASH_DM_CRYPT) += crash_dump_dm_crypt.o
+obj-$(CONFIG_CRASH_DUMP_KUNIT_TEST) += crash_core_test.o
obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o
@@ -95,7 +101,8 @@ obj-$(CONFIG_FAIL_FUNCTION) += fail_function.o
obj-$(CONFIG_KGDB) += debug/
obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
-obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_hld.o
+obj-$(CONFIG_HARDLOCKUP_DETECTOR_BUDDY) += watchdog_buddy.o
+obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_perf.o
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
@@ -108,19 +115,20 @@ obj-$(CONFIG_TRACING) += trace/
obj-$(CONFIG_TRACE_CLOCK) += trace/
obj-$(CONFIG_RING_BUFFER) += trace/
obj-$(CONFIG_TRACEPOINTS) += trace/
+obj-$(CONFIG_RETHOOK) += trace/
obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-$(CONFIG_CPU_PM) += cpu_pm.o
obj-$(CONFIG_BPF) += bpf/
obj-$(CONFIG_KCSAN) += kcsan/
obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o
-obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call.o
-obj-$(CONFIG_CFI_CLANG) += cfi.o
+obj-$(CONFIG_HAVE_STATIC_CALL) += static_call.o
+obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call_inline.o
+obj-$(CONFIG_CFI) += cfi.o
obj-$(CONFIG_PERF_EVENTS) += events/
obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
obj-$(CONFIG_PADATA) += padata.o
-obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_JUMP_LABEL) += jump_label.o
obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
obj-$(CONFIG_TORTURE_TEST) += torture.o
@@ -132,11 +140,12 @@ obj-$(CONFIG_WATCH_QUEUE) += watch_queue.o
obj-$(CONFIG_RESOURCE_KUNIT_TEST) += resource_kunit.o
obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o
-CFLAGS_stackleak.o += $(DISABLE_STACKLEAK_PLUGIN)
-obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o
-KASAN_SANITIZE_stackleak.o := n
-KCSAN_SANITIZE_stackleak.o := n
-KCOV_INSTRUMENT_stackleak.o := n
+CFLAGS_kstack_erase.o += $(DISABLE_KSTACK_ERASE)
+CFLAGS_kstack_erase.o += $(call cc-option,-mgeneral-regs-only)
+obj-$(CONFIG_KSTACK_ERASE) += kstack_erase.o
+KASAN_SANITIZE_kstack_erase.o := n
+KCSAN_SANITIZE_kstack_erase.o := n
+KCOV_INSTRUMENT_kstack_erase.o := n
obj-$(CONFIG_SCF_TORTURE_TEST) += scftorture.o
@@ -151,11 +160,48 @@ filechk_cat = cat $<
$(obj)/config_data: $(KCONFIG_CONFIG) FORCE
$(call filechk,cat)
+# kheaders_data.tar.xz
$(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz
-quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz
- cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_kheaders.sh $@
-$(obj)/kheaders_data.tar.xz: FORCE
- $(call cmd,genikh)
+quiet_cmd_kheaders_data = GEN $@
+ cmd_kheaders_data = "$<" "$@" "$(obj)/kheaders-srclist" "$(obj)/kheaders-objlist" "$(KBUILD_BUILD_TIMESTAMP)"
+ cmd_kheaders_data_dep = cat $(depfile) >> $(dot-target).cmd; rm -f $(depfile)
+
+define rule_kheaders_data
+ $(call cmd_and_savecmd,kheaders_data)
+ $(call cmd,kheaders_data_dep)
+endef
+
+targets += kheaders_data.tar.xz
+$(obj)/kheaders_data.tar.xz: $(src)/gen_kheaders.sh $(obj)/kheaders-srclist $(obj)/kheaders-objlist $(obj)/kheaders.md5 FORCE
+ $(call if_changed_rule,kheaders_data)
+
+# generated headers in objtree
+#
+# include/generated/utsversion.h is ignored because it is generated
+# after gen_kheaders.sh is executed. (utsversion.h is unneeded for kheaders)
+filechk_kheaders_objlist = \
+ for d in include "arch/$(SRCARCH)/include"; do \
+ find "$${d}/generated" ! -path "include/generated/utsversion.h" -a -name "*.h" -print; \
+ done
+
+$(obj)/kheaders-objlist: FORCE
+ $(call filechk,kheaders_objlist)
+
+# non-generated headers in srctree
+filechk_kheaders_srclist = \
+ for d in include "arch/$(SRCARCH)/include"; do \
+ find "$(srctree)/$${d}" -path "$(srctree)/$${d}/generated" -prune -o -name "*.h" -print; \
+ done
+
+$(obj)/kheaders-srclist: FORCE
+ $(call filechk,kheaders_srclist)
+
+# Some files are symlinks. If symlinks are changed, kheaders_data.tar.xz should
+# be rebuilt.
+filechk_kheaders_md5sum = xargs -r -a $< stat -c %N | md5sum
+
+$(obj)/kheaders.md5: $(obj)/kheaders-srclist FORCE
+ $(call filechk,kheaders_md5sum)
-clean-files := kheaders_data.tar.xz kheaders.md5
+clean-files := kheaders.md5 kheaders-srclist kheaders-objlist
diff --git a/kernel/acct.c b/kernel/acct.c
index 3df53cf1dcd5..2a2b3c874acd 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -44,19 +44,14 @@
* a struct file opened for write. Fixed. 2/6/2000, AV.
*/
-#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/acct.h>
#include <linux/capability.h>
-#include <linux/file.h>
#include <linux/tty.h>
-#include <linux/security.h>
-#include <linux/vfs.h>
+#include <linux/statfs.h>
#include <linux/jiffies.h>
-#include <linux/times.h>
#include <linux/syscalls.h>
-#include <linux/mount.h>
-#include <linux/uaccess.h>
+#include <linux/namei.h>
#include <linux/sched/cputime.h>
#include <asm/div64.h>
@@ -70,11 +65,30 @@
* Turned into sysctl-controllable parameters. AV, 12/11/98
*/
-int acct_parm[3] = {4, 2, 30};
+static int acct_parm[3] = {4, 2, 30};
#define RESUME (acct_parm[0]) /* >foo% free space - resume */
#define SUSPEND (acct_parm[1]) /* <foo% free space - suspend */
#define ACCT_TIMEOUT (acct_parm[2]) /* foo second timeout between checks */
+#ifdef CONFIG_SYSCTL
+static const struct ctl_table kern_acct_table[] = {
+ {
+ .procname = "acct",
+ .data = &acct_parm,
+ .maxlen = 3*sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+};
+
+static __init int kernel_acct_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_acct_table);
+ return 0;
+}
+late_initcall(kernel_acct_sysctls_init);
+#endif /* CONFIG_SYSCTL */
+
/*
* External references and all of the globals.
*/
@@ -84,48 +98,50 @@ struct bsd_acct_struct {
atomic_long_t count;
struct rcu_head rcu;
struct mutex lock;
- int active;
+ bool active;
+ bool check_space;
unsigned long needcheck;
struct file *file;
struct pid_namespace *ns;
struct work_struct work;
struct completion done;
+ acct_t ac;
};
-static void do_acct_process(struct bsd_acct_struct *acct);
+static void fill_ac(struct bsd_acct_struct *acct);
+static void acct_write_process(struct bsd_acct_struct *acct);
/*
* Check the amount of free space and suspend/resume accordingly.
*/
-static int check_free_space(struct bsd_acct_struct *acct)
+static bool check_free_space(struct bsd_acct_struct *acct)
{
struct kstatfs sbuf;
- if (time_is_after_jiffies(acct->needcheck))
- goto out;
+ if (!acct->check_space)
+ return acct->active;
/* May block */
if (vfs_statfs(&acct->file->f_path, &sbuf))
- goto out;
+ return acct->active;
if (acct->active) {
u64 suspend = sbuf.f_blocks * SUSPEND;
do_div(suspend, 100);
if (sbuf.f_bavail <= suspend) {
- acct->active = 0;
+ acct->active = false;
pr_info("Process accounting paused\n");
}
} else {
u64 resume = sbuf.f_blocks * RESUME;
do_div(resume, 100);
if (sbuf.f_bavail >= resume) {
- acct->active = 1;
+ acct->active = true;
pr_info("Process accounting resumed\n");
}
}
acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
-out:
return acct->active;
}
@@ -170,7 +186,11 @@ static void acct_pin_kill(struct fs_pin *pin)
{
struct bsd_acct_struct *acct = to_acct(pin);
mutex_lock(&acct->lock);
- do_acct_process(acct);
+ /*
+ * Fill the accounting struct with the exiting task's info
+ * before punting to the workqueue.
+ */
+ fill_ac(acct);
schedule_work(&acct->work);
wait_for_completion(&acct->done);
cmpxchg(&acct->ns->bacct, pin, NULL);
@@ -183,76 +203,79 @@ static void close_work(struct work_struct *work)
{
struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
struct file *file = acct->file;
+
+ /* We were fired by acct_pin_kill() which holds acct->lock. */
+ acct_write_process(acct);
if (file->f_op->flush)
file->f_op->flush(file, NULL);
__fput_sync(file);
complete(&acct->done);
}
-static int acct_on(struct filename *pathname)
+DEFINE_FREE(fput_sync, struct file *, if (!IS_ERR_OR_NULL(_T)) __fput_sync(_T))
+static int acct_on(const char __user *name)
{
- struct file *file;
- struct vfsmount *mnt, *internal;
+ /* Difference from BSD - they don't do O_APPEND */
+ const int open_flags = O_WRONLY|O_APPEND|O_LARGEFILE;
struct pid_namespace *ns = task_active_pid_ns(current);
+ struct filename *pathname __free(putname) = getname(name);
+ struct file *original_file __free(fput) = NULL; // in that order
+ struct path internal __free(path_put) = {}; // in that order
+ struct file *file __free(fput_sync) = NULL; // in that order
struct bsd_acct_struct *acct;
+ struct vfsmount *mnt;
struct fs_pin *old;
- int err;
- acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
- if (!acct)
- return -ENOMEM;
+ if (IS_ERR(pathname))
+ return PTR_ERR(pathname);
+ original_file = file_open_name(pathname, open_flags, 0);
+ if (IS_ERR(original_file))
+ return PTR_ERR(original_file);
- /* Difference from BSD - they don't do O_APPEND */
- file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
- if (IS_ERR(file)) {
- kfree(acct);
+ mnt = mnt_clone_internal(&original_file->f_path);
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
+
+ internal.mnt = mnt;
+ internal.dentry = dget(mnt->mnt_root);
+
+ file = dentry_open(&internal, open_flags, current_cred());
+ if (IS_ERR(file))
return PTR_ERR(file);
- }
- if (!S_ISREG(file_inode(file)->i_mode)) {
- kfree(acct);
- filp_close(file, NULL);
+ if (!S_ISREG(file_inode(file)->i_mode))
return -EACCES;
- }
- if (!(file->f_mode & FMODE_CAN_WRITE)) {
- kfree(acct);
- filp_close(file, NULL);
+ /* Exclude kernel kernel internal filesystems. */
+ if (file_inode(file)->i_sb->s_flags & (SB_NOUSER | SB_KERNMOUNT))
+ return -EINVAL;
+
+ /* Exclude procfs and sysfs. */
+ if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE)
+ return -EINVAL;
+
+ if (!(file->f_mode & FMODE_CAN_WRITE))
return -EIO;
- }
- internal = mnt_clone_internal(&file->f_path);
- if (IS_ERR(internal)) {
- kfree(acct);
- filp_close(file, NULL);
- return PTR_ERR(internal);
- }
- err = __mnt_want_write(internal);
- if (err) {
- mntput(internal);
- kfree(acct);
- filp_close(file, NULL);
- return err;
- }
- mnt = file->f_path.mnt;
- file->f_path.mnt = internal;
+
+ acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
+ if (!acct)
+ return -ENOMEM;
atomic_long_set(&acct->count, 1);
init_fs_pin(&acct->pin, acct_pin_kill);
- acct->file = file;
+ acct->file = no_free_ptr(file);
acct->needcheck = jiffies;
acct->ns = ns;
mutex_init(&acct->lock);
INIT_WORK(&acct->work, close_work);
init_completion(&acct->done);
mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */
- pin_insert(&acct->pin, mnt);
+ pin_insert(&acct->pin, original_file->f_path.mnt);
rcu_read_lock();
old = xchg(&ns->bacct, &acct->pin);
mutex_unlock(&acct->lock);
pin_kill(old);
- __mnt_drop_write(mnt);
- mntput(mnt);
return 0;
}
@@ -277,14 +300,9 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
return -EPERM;
if (name) {
- struct filename *tmp = getname(name);
-
- if (IS_ERR(tmp))
- return PTR_ERR(tmp);
mutex_lock(&acct_on_mutex);
- error = acct_on(tmp);
+ error = acct_on(name);
mutex_unlock(&acct_on_mutex);
- putname(tmp);
} else {
rcu_read_lock();
pin_kill(task_active_pid_ns(current)->bacct);
@@ -300,7 +318,7 @@ void acct_exit_ns(struct pid_namespace *ns)
}
/*
- * encode an unsigned long into a comp_t
+ * encode an u64 into a comp_t
*
* This routine has been adopted from the encode_comp_t() function in
* the kern_acct.c file of the FreeBSD operating system. The encoding
@@ -311,7 +329,7 @@ void acct_exit_ns(struct pid_namespace *ns)
#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */
#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */
-static comp_t encode_comp_t(unsigned long value)
+static comp_t encode_comp_t(u64 value)
{
int exp, rnd;
@@ -330,6 +348,8 @@ static comp_t encode_comp_t(unsigned long value)
exp++;
}
+ if (exp > (((comp_t) ~0U) >> MANTSIZE))
+ return (comp_t) ~0U;
/*
* Clean it up and polish it off.
*/
@@ -409,13 +429,27 @@ static u32 encode_float(u64 value)
* do_exit() or when switching to a different output file.
*/
-static void fill_ac(acct_t *ac)
+static void fill_ac(struct bsd_acct_struct *acct)
{
struct pacct_struct *pacct = &current->signal->pacct;
+ struct file *file = acct->file;
+ acct_t *ac = &acct->ac;
u64 elapsed, run_time;
time64_t btime;
struct tty_struct *tty;
+ lockdep_assert_held(&acct->lock);
+
+ if (time_is_after_jiffies(acct->needcheck)) {
+ acct->check_space = false;
+
+ /* Don't fill in @ac if nothing will be written. */
+ if (!acct->active)
+ return;
+ } else {
+ acct->check_space = true;
+ }
+
/*
* Fill the accounting struct with the needed info as recorded
* by the different kernel functions.
@@ -423,7 +457,7 @@ static void fill_ac(acct_t *ac)
memset(ac, 0, sizeof(acct_t));
ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER;
- strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm));
+ strscpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm));
/* calculate run_time in nsec*/
run_time = ktime_get_ns();
@@ -448,7 +482,7 @@ static void fill_ac(acct_t *ac)
do_div(elapsed, AHZ);
btime = ktime_get_real_seconds() - elapsed;
ac->ac_btime = clamp_t(time64_t, btime, 0, U32_MAX);
-#if ACCT_VERSION==2
+#if ACCT_VERSION == 2
ac->ac_ahz = AHZ;
#endif
@@ -463,64 +497,58 @@ static void fill_ac(acct_t *ac)
ac->ac_majflt = encode_comp_t(pacct->ac_majflt);
ac->ac_exitcode = pacct->ac_exitcode;
spin_unlock_irq(&current->sighand->siglock);
-}
-/*
- * do_acct_process does all actual work. Caller holds the reference to file.
- */
-static void do_acct_process(struct bsd_acct_struct *acct)
-{
- acct_t ac;
- unsigned long flim;
- const struct cred *orig_cred;
- struct file *file = acct->file;
- /*
- * Accounting records are not subject to resource limits.
- */
- flim = rlimit(RLIMIT_FSIZE);
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
- /* Perform file operations on behalf of whoever enabled accounting */
- orig_cred = override_creds(file->f_cred);
-
- /*
- * First check to see if there is enough free_space to continue
- * the process accounting system.
- */
- if (!check_free_space(acct))
- goto out;
-
- fill_ac(&ac);
/* we really need to bite the bullet and change layout */
- ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
- ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
+ ac->ac_uid = from_kuid_munged(file->f_cred->user_ns, current_uid());
+ ac->ac_gid = from_kgid_munged(file->f_cred->user_ns, current_gid());
#if ACCT_VERSION == 1 || ACCT_VERSION == 2
/* backward-compatible 16 bit fields */
- ac.ac_uid16 = ac.ac_uid;
- ac.ac_gid16 = ac.ac_gid;
+ ac->ac_uid16 = ac->ac_uid;
+ ac->ac_gid16 = ac->ac_gid;
#elif ACCT_VERSION == 3
{
struct pid_namespace *ns = acct->ns;
- ac.ac_pid = task_tgid_nr_ns(current, ns);
+ ac->ac_pid = task_tgid_nr_ns(current, ns);
rcu_read_lock();
- ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent),
- ns);
+ ac->ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
rcu_read_unlock();
}
#endif
- /*
- * Get freeze protection. If the fs is frozen, just skip the write
- * as we could deadlock the system otherwise.
- */
- if (file_start_write_trylock(file)) {
- /* it's been opened O_APPEND, so position is irrelevant */
- loff_t pos = 0;
- __kernel_write(file, &ac, sizeof(acct_t), &pos);
- file_end_write(file);
+}
+
+static void acct_write_process(struct bsd_acct_struct *acct)
+{
+ struct file *file = acct->file;
+ acct_t *ac = &acct->ac;
+
+ /* Perform file operations on behalf of whoever enabled accounting */
+ scoped_with_creds(file->f_cred) {
+ /*
+ * First check to see if there is enough free_space to continue
+ * the process accounting system. Then get freeze protection. If
+ * the fs is frozen, just skip the write as we could deadlock
+ * the system otherwise.
+ */
+ if (check_free_space(acct) && file_start_write_trylock(file)) {
+ /* it's been opened O_APPEND, so position is irrelevant */
+ loff_t pos = 0;
+ __kernel_write(file, ac, sizeof(acct_t), &pos);
+ file_end_write(file);
+ }
}
-out:
+}
+
+static void do_acct_process(struct bsd_acct_struct *acct)
+{
+ unsigned long flim;
+
+ /* Accounting records are not subject to resource limits. */
+ flim = rlimit(RLIMIT_FSIZE);
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
+ fill_ac(acct);
+ acct_write_process(acct);
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
- revert_creds(orig_cred);
}
/**
@@ -535,15 +563,14 @@ void acct_collect(long exitcode, int group_dead)
unsigned long vsize = 0;
if (group_dead && current->mm) {
+ struct mm_struct *mm = current->mm;
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
- mmap_read_lock(current->mm);
- vma = current->mm->mmap;
- while (vma) {
+ mmap_read_lock(mm);
+ for_each_vma(vmi, vma)
vsize += vma->vm_end - vma->vm_start;
- vma = vma->vm_next;
- }
- mmap_read_unlock(current->mm);
+ mmap_read_unlock(mm);
}
spin_lock_irq(&current->sighand->siglock);
diff --git a/kernel/async.c b/kernel/async.c
index b8d7a663497f..4c3e6a44595f 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -46,11 +46,12 @@ asynchronous and synchronous parts of the kernel.
#include <linux/async.h>
#include <linux/atomic.h>
-#include <linux/ktime.h>
#include <linux/export.h>
-#include <linux/wait.h>
+#include <linux/ktime.h>
+#include <linux/pid.h>
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/wait.h>
#include <linux/workqueue.h>
#include "workqueue_internal.h"
@@ -63,6 +64,7 @@ static async_cookie_t next_cookie = 1;
static LIST_HEAD(async_global_pending); /* pending from all registered doms */
static ASYNC_DOMAIN(async_dfl_domain);
static DEFINE_SPINLOCK(async_lock);
+static struct workqueue_struct *async_wq;
struct async_entry {
struct list_head domain_list;
@@ -145,6 +147,39 @@ static void async_run_entry_fn(struct work_struct *work)
wake_up(&async_done);
}
+static async_cookie_t __async_schedule_node_domain(async_func_t func,
+ void *data, int node,
+ struct async_domain *domain,
+ struct async_entry *entry)
+{
+ async_cookie_t newcookie;
+ unsigned long flags;
+
+ INIT_LIST_HEAD(&entry->domain_list);
+ INIT_LIST_HEAD(&entry->global_list);
+ INIT_WORK(&entry->work, async_run_entry_fn);
+ entry->func = func;
+ entry->data = data;
+ entry->domain = domain;
+
+ spin_lock_irqsave(&async_lock, flags);
+
+ /* allocate cookie and queue */
+ newcookie = entry->cookie = next_cookie++;
+
+ list_add_tail(&entry->domain_list, &domain->pending);
+ if (domain->registered)
+ list_add_tail(&entry->global_list, &async_global_pending);
+
+ atomic_inc(&entry_count);
+ spin_unlock_irqrestore(&async_lock, flags);
+
+ /* schedule for execution */
+ queue_work_node(node, async_wq, &entry->work);
+
+ return newcookie;
+}
+
/**
* async_schedule_node_domain - NUMA specific version of async_schedule_domain
* @func: function to execute asynchronously
@@ -186,32 +221,8 @@ async_cookie_t async_schedule_node_domain(async_func_t func, void *data,
func(data, newcookie);
return newcookie;
}
- INIT_LIST_HEAD(&entry->domain_list);
- INIT_LIST_HEAD(&entry->global_list);
- INIT_WORK(&entry->work, async_run_entry_fn);
- entry->func = func;
- entry->data = data;
- entry->domain = domain;
-
- spin_lock_irqsave(&async_lock, flags);
-
- /* allocate cookie and queue */
- newcookie = entry->cookie = next_cookie++;
-
- list_add_tail(&entry->domain_list, &domain->pending);
- if (domain->registered)
- list_add_tail(&entry->global_list, &async_global_pending);
- atomic_inc(&entry_count);
- spin_unlock_irqrestore(&async_lock, flags);
-
- /* mark that this task has queued an async job, used by module init */
- current->flags |= PF_USED_ASYNC;
-
- /* schedule for execution */
- queue_work_node(node, system_unbound_wq, &entry->work);
-
- return newcookie;
+ return __async_schedule_node_domain(func, data, node, domain, entry);
}
EXPORT_SYMBOL_GPL(async_schedule_node_domain);
@@ -235,6 +246,35 @@ async_cookie_t async_schedule_node(async_func_t func, void *data, int node)
EXPORT_SYMBOL_GPL(async_schedule_node);
/**
+ * async_schedule_dev_nocall - A simplified variant of async_schedule_dev()
+ * @func: function to execute asynchronously
+ * @dev: device argument to be passed to function
+ *
+ * @dev is used as both the argument for the function and to provide NUMA
+ * context for where to run the function.
+ *
+ * If the asynchronous execution of @func is scheduled successfully, return
+ * true. Otherwise, do nothing and return false, unlike async_schedule_dev()
+ * that will run the function synchronously then.
+ */
+bool async_schedule_dev_nocall(async_func_t func, struct device *dev)
+{
+ struct async_entry *entry;
+
+ entry = kzalloc(sizeof(struct async_entry), GFP_KERNEL);
+
+ /* Give up if there is no memory or too much work. */
+ if (!entry || atomic_read(&entry_count) > MAX_WORK) {
+ kfree(entry);
+ return false;
+ }
+
+ __async_schedule_node_domain(func, dev, dev_to_node(dev),
+ &async_dfl_domain, entry);
+ return true;
+}
+
+/**
* async_synchronize_full - synchronize all asynchronous function calls
*
* This function waits until all asynchronous function calls have been done.
@@ -306,3 +346,17 @@ bool current_is_async(void)
return worker && worker->current_func == async_run_entry_fn;
}
EXPORT_SYMBOL_GPL(current_is_async);
+
+void __init async_init(void)
+{
+ /*
+ * Async can schedule a number of interdependent work items. However,
+ * unbound workqueues can handle only upto min_active interdependent
+ * work items. The default min_active of 8 isn't sufficient for async
+ * and can lead to stalls. Let's use a dedicated workqueue with raised
+ * min_active.
+ */
+ async_wq = alloc_workqueue("async", WQ_UNBOUND, 0);
+ BUG_ON(!async_wq);
+ workqueue_set_min_active(async_wq, WQ_DFL_ACTIVE);
+}
diff --git a/kernel/audit.c b/kernel/audit.c
index e4bbe2c70c26..26a332ffb1b8 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -53,9 +53,8 @@
#include <net/sock.h>
#include <net/netlink.h>
#include <linux/skbuff.h>
-#ifdef CONFIG_SECURITY
#include <linux/security.h>
-#endif
+#include <linux/lsm_hooks.h>
#include <linux/freezer.h>
#include <linux/pid_namespace.h>
#include <net/netns/generic.h>
@@ -83,6 +82,13 @@ static u32 audit_failure = AUDIT_FAIL_PRINTK;
/* private audit network namespace index */
static unsigned int audit_net_id;
+/* Number of modules that provide a security context.
+ List of lsms that provide a security context */
+static u32 audit_subj_secctx_cnt;
+static u32 audit_obj_secctx_cnt;
+static const struct lsm_id *audit_subj_lsms[MAX_LSM_COUNT];
+static const struct lsm_id *audit_obj_lsms[MAX_LSM_COUNT];
+
/**
* struct audit_net - audit private network namespace data
* @sk: communication socket
@@ -125,7 +131,7 @@ static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
/* The identity of the user shutting down the audit system. */
static kuid_t audit_sig_uid = INVALID_UID;
static pid_t audit_sig_pid = -1;
-static u32 audit_sig_sid;
+static struct lsm_prop audit_sig_lsm;
/* Records can be lost in several ways:
0) [suppressed in audit_alloc]
@@ -197,8 +203,10 @@ static struct audit_ctl_mutex {
* to place it on a transmit queue. Multiple audit_buffers can be in
* use simultaneously. */
struct audit_buffer {
- struct sk_buff *skb; /* formatted skb ready to send */
+ struct sk_buff *skb; /* the skb for audit_log functions */
+ struct sk_buff_head skb_list; /* formatted skbs, ready to send */
struct audit_context *ctx; /* NULL or associated context */
+ struct audit_stamp stamp; /* audit stamp for these records */
gfp_t gfp_mask;
};
@@ -281,6 +289,33 @@ static pid_t auditd_pid_vnr(void)
}
/**
+ * audit_cfg_lsm - Identify a security module as providing a secctx.
+ * @lsmid: LSM identity
+ * @flags: which contexts are provided
+ *
+ * Description:
+ * Increments the count of the security modules providing a secctx.
+ * If the LSM id is already in the list leave it alone.
+ */
+void audit_cfg_lsm(const struct lsm_id *lsmid, int flags)
+{
+ int i;
+
+ if (flags & AUDIT_CFG_LSM_SECCTX_SUBJECT) {
+ for (i = 0 ; i < audit_subj_secctx_cnt; i++)
+ if (audit_subj_lsms[i] == lsmid)
+ return;
+ audit_subj_lsms[audit_subj_secctx_cnt++] = lsmid;
+ }
+ if (flags & AUDIT_CFG_LSM_SECCTX_OBJECT) {
+ for (i = 0 ; i < audit_obj_secctx_cnt; i++)
+ if (audit_obj_lsms[i] == lsmid)
+ return;
+ audit_obj_lsms[audit_obj_secctx_cnt++] = lsmid;
+ }
+}
+
+/**
* audit_get_sk - Return the audit socket for the given network namespace
* @net: the destination network namespace
*
@@ -321,18 +356,17 @@ static inline int audit_rate_check(void)
static DEFINE_SPINLOCK(lock);
unsigned long flags;
unsigned long now;
- unsigned long elapsed;
int retval = 0;
- if (!audit_rate_limit) return 1;
+ if (!audit_rate_limit)
+ return 1;
spin_lock_irqsave(&lock, flags);
if (++messages < audit_rate_limit) {
retval = 1;
} else {
- now = jiffies;
- elapsed = now - last_check;
- if (elapsed > HZ) {
+ now = jiffies;
+ if (time_after(now, last_check + HZ)) {
last_check = now;
messages = 0;
retval = 1;
@@ -366,7 +400,7 @@ void audit_log_lost(const char *message)
if (!print) {
spin_lock_irqsave(&lock, flags);
now = jiffies;
- if (now - last_msg > HZ) {
+ if (time_after(now, last_msg + HZ)) {
print = 1;
last_msg = now;
}
@@ -490,15 +524,19 @@ static void auditd_conn_free(struct rcu_head *rcu)
* @pid: auditd PID
* @portid: auditd netlink portid
* @net: auditd network namespace pointer
+ * @skb: the netlink command from the audit daemon
+ * @ack: netlink ack flag, cleared if ack'd here
*
* Description:
* This function will obtain and drop network namespace references as
* necessary. Returns zero on success, negative values on failure.
*/
-static int auditd_set(struct pid *pid, u32 portid, struct net *net)
+static int auditd_set(struct pid *pid, u32 portid, struct net *net,
+ struct sk_buff *skb, bool *ack)
{
unsigned long flags;
struct auditd_connection *ac_old, *ac_new;
+ struct nlmsghdr *nlh;
if (!pid || !net)
return -EINVAL;
@@ -510,6 +548,13 @@ static int auditd_set(struct pid *pid, u32 portid, struct net *net)
ac_new->portid = portid;
ac_new->net = get_net(net);
+ /* send the ack now to avoid a race with the queue backlog */
+ if (*ack) {
+ nlh = nlmsg_hdr(skb);
+ netlink_ack(skb, nlh, 0, NULL);
+ *ack = false;
+ }
+
spin_lock_irqsave(&auditd_conn_lock, flags);
ac_old = rcu_dereference_protected(auditd_conn,
lockdep_is_held(&auditd_conn_lock));
@@ -541,20 +586,22 @@ static void kauditd_printk_skb(struct sk_buff *skb)
/**
* kauditd_rehold_skb - Handle a audit record send failure in the hold queue
* @skb: audit record
+ * @error: error code (unused)
*
* Description:
* This should only be used by the kauditd_thread when it fails to flush the
* hold queue.
*/
-static void kauditd_rehold_skb(struct sk_buff *skb)
+static void kauditd_rehold_skb(struct sk_buff *skb, __always_unused int error)
{
- /* put the record back in the queue at the same place */
- skb_queue_head(&audit_hold_queue, skb);
+ /* put the record back in the queue */
+ skb_queue_tail(&audit_hold_queue, skb);
}
/**
* kauditd_hold_skb - Queue an audit record, waiting for auditd
* @skb: audit record
+ * @error: error code
*
* Description:
* Queue the audit record, waiting for an instance of auditd. When this
@@ -564,19 +611,31 @@ static void kauditd_rehold_skb(struct sk_buff *skb)
* and queue it, if we have room. If we want to hold on to the record, but we
* don't have room, record a record lost message.
*/
-static void kauditd_hold_skb(struct sk_buff *skb)
+static void kauditd_hold_skb(struct sk_buff *skb, int error)
{
/* at this point it is uncertain if we will ever send this to auditd so
* try to send the message via printk before we go any further */
kauditd_printk_skb(skb);
/* can we just silently drop the message? */
- if (!audit_default) {
- kfree_skb(skb);
- return;
+ if (!audit_default)
+ goto drop;
+
+ /* the hold queue is only for when the daemon goes away completely,
+ * not -EAGAIN failures; if we are in a -EAGAIN state requeue the
+ * record on the retry queue unless it's full, in which case drop it
+ */
+ if (error == -EAGAIN) {
+ if (!audit_backlog_limit ||
+ skb_queue_len(&audit_retry_queue) < audit_backlog_limit) {
+ skb_queue_tail(&audit_retry_queue, skb);
+ return;
+ }
+ audit_log_lost("kauditd retry queue overflow");
+ goto drop;
}
- /* if we have room, queue the message */
+ /* if we have room in the hold queue, queue the message */
if (!audit_backlog_limit ||
skb_queue_len(&audit_hold_queue) < audit_backlog_limit) {
skb_queue_tail(&audit_hold_queue, skb);
@@ -585,24 +644,32 @@ static void kauditd_hold_skb(struct sk_buff *skb)
/* we have no other options - drop the message */
audit_log_lost("kauditd hold queue overflow");
+drop:
kfree_skb(skb);
}
/**
* kauditd_retry_skb - Queue an audit record, attempt to send again to auditd
* @skb: audit record
+ * @error: error code (unused)
*
* Description:
* Not as serious as kauditd_hold_skb() as we still have a connected auditd,
* but for some reason we are having problems sending it audit records so
* queue the given record and attempt to resend.
*/
-static void kauditd_retry_skb(struct sk_buff *skb)
+static void kauditd_retry_skb(struct sk_buff *skb, __always_unused int error)
{
- /* NOTE: because records should only live in the retry queue for a
- * short period of time, before either being sent or moved to the hold
- * queue, we don't currently enforce a limit on this queue */
- skb_queue_tail(&audit_retry_queue, skb);
+ if (!audit_backlog_limit ||
+ skb_queue_len(&audit_retry_queue) < audit_backlog_limit) {
+ skb_queue_tail(&audit_retry_queue, skb);
+ return;
+ }
+
+ /* we have to drop the record, send it via printk as a last effort */
+ kauditd_printk_skb(skb);
+ audit_log_lost("kauditd retry queue overflow");
+ kfree_skb(skb);
}
/**
@@ -640,7 +707,7 @@ static void auditd_reset(const struct auditd_connection *ac)
/* flush the retry queue to the hold queue, but don't touch the main
* queue since we need to process that normally for multicast */
while ((skb = skb_dequeue(&audit_retry_queue)))
- kauditd_hold_skb(skb);
+ kauditd_hold_skb(skb, -ECONNREFUSED);
}
/**
@@ -714,16 +781,18 @@ static int kauditd_send_queue(struct sock *sk, u32 portid,
struct sk_buff_head *queue,
unsigned int retry_limit,
void (*skb_hook)(struct sk_buff *skb),
- void (*err_hook)(struct sk_buff *skb))
+ void (*err_hook)(struct sk_buff *skb, int error))
{
int rc = 0;
- struct sk_buff *skb;
+ struct sk_buff *skb = NULL;
+ struct sk_buff *skb_tail;
unsigned int failed = 0;
/* NOTE: kauditd_thread takes care of all our locking, we just use
* the netlink info passed to us (e.g. sk and portid) */
- while ((skb = skb_dequeue(queue))) {
+ skb_tail = skb_peek_tail(queue);
+ while ((skb != skb_tail) && (skb = skb_dequeue(queue))) {
/* call the skb_hook for each skb we touch */
if (skb_hook)
(*skb_hook)(skb);
@@ -731,7 +800,7 @@ static int kauditd_send_queue(struct sock *sk, u32 portid,
/* can we send to anyone via unicast? */
if (!sk) {
if (err_hook)
- (*err_hook)(skb);
+ (*err_hook)(skb, -ECONNREFUSED);
continue;
}
@@ -745,7 +814,7 @@ retry:
rc == -ECONNREFUSED || rc == -EPERM) {
sk = NULL;
if (err_hook)
- (*err_hook)(skb);
+ (*err_hook)(skb, rc);
if (rc == -EAGAIN)
rc = 0;
/* continue to drain the queue */
@@ -1076,12 +1145,11 @@ static inline void audit_log_user_recv_msg(struct audit_buffer **ab,
audit_log_common_recv_msg(NULL, ab, msg_type);
}
-int is_audit_feature_set(int i)
+static int is_audit_feature_set(int i)
{
return af.features & AUDIT_FEATURE_TO_MASK(i);
}
-
static int audit_get_feature(struct sk_buff *skb)
{
u32 seq;
@@ -1179,7 +1247,8 @@ static int audit_replace(struct pid *pid)
return auditd_send_unicast_skb(skb);
}
-static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
+ bool *ack)
{
u32 seq;
void *data;
@@ -1188,8 +1257,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
struct audit_buffer *ab;
u16 msg_type = nlh->nlmsg_type;
struct audit_sig_info *sig_data;
- char *ctx = NULL;
- u32 len;
+ struct lsm_context lsmctx = { NULL, 0, 0 };
err = audit_netlink_ok(skb, msg_type);
if (err)
@@ -1272,7 +1340,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
/* register a new auditd connection */
err = auditd_set(req_pid,
NETLINK_CB(skb).portid,
- sock_net(NETLINK_CB(skb).sk));
+ sock_net(NETLINK_CB(skb).sk),
+ skb, ack);
if (audit_enabled != AUDIT_OFF)
audit_log_config_change("audit_pid",
new_pid,
@@ -1366,7 +1435,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
str);
} else {
audit_log_format(ab, " data=");
- if (data_len > 0 && str[data_len - 1] == '\0')
+ if (str[data_len - 1] == '\0')
data_len--;
audit_log_n_untrustedstring(ab, str, data_len);
}
@@ -1438,26 +1507,28 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
break;
}
case AUDIT_SIGNAL_INFO:
- len = 0;
- if (audit_sig_sid) {
- err = security_secid_to_secctx(audit_sig_sid, &ctx, &len);
- if (err)
+ if (lsmprop_is_set(&audit_sig_lsm)) {
+ err = security_lsmprop_to_secctx(&audit_sig_lsm,
+ &lsmctx, LSM_ID_UNDEF);
+ if (err < 0)
return err;
}
- sig_data = kmalloc(struct_size(sig_data, ctx, len), GFP_KERNEL);
+ sig_data = kmalloc(struct_size(sig_data, ctx, lsmctx.len),
+ GFP_KERNEL);
if (!sig_data) {
- if (audit_sig_sid)
- security_release_secctx(ctx, len);
+ if (lsmprop_is_set(&audit_sig_lsm))
+ security_release_secctx(&lsmctx);
return -ENOMEM;
}
sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid);
sig_data->pid = audit_sig_pid;
- if (audit_sig_sid) {
- memcpy(sig_data->ctx, ctx, len);
- security_release_secctx(ctx, len);
+ if (lsmprop_is_set(&audit_sig_lsm)) {
+ memcpy(sig_data->ctx, lsmctx.context, lsmctx.len);
+ security_release_secctx(&lsmctx);
}
audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0,
- sig_data, struct_size(sig_data, ctx, len));
+ sig_data, struct_size(sig_data, ctx,
+ lsmctx.len));
kfree(sig_data);
break;
case AUDIT_TTY_GET: {
@@ -1517,9 +1588,10 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
* Parse the provided skb and deal with any messages that may be present,
* malformed skbs are discarded.
*/
-static void audit_receive(struct sk_buff *skb)
+static void audit_receive(struct sk_buff *skb)
{
struct nlmsghdr *nlh;
+ bool ack;
/*
* len MUST be signed for nlmsg_next to be able to dec it below 0
* if the nlmsg_len was not aligned
@@ -1532,9 +1604,12 @@ static void audit_receive(struct sk_buff *skb)
audit_ctl_lock();
while (nlmsg_ok(nlh, len)) {
- err = audit_receive_msg(skb, nlh);
- /* if err or if this message says it wants a response */
- if (err || (nlh->nlmsg_flags & NLM_F_ACK))
+ ack = nlh->nlmsg_flags & NLM_F_ACK;
+ err = audit_receive_msg(skb, nlh, &ack);
+
+ /* send an ack if the user asked for one and audit_receive_msg
+ * didn't already do it, or if there was an error. */
+ if (ack || err)
netlink_ack(skb, nlh, err, NULL);
nlh = nlmsg_next(nlh, &len);
@@ -1574,7 +1649,7 @@ static void audit_log_multicast(int group, const char *op, int err)
cred = current_cred();
tty = audit_get_tty();
audit_log_format(ab, "pid=%u uid=%u auid=%u tty=%s ses=%u",
- task_pid_nr(current),
+ task_tgid_nr(current),
from_kuid(&init_user_ns, cred->uid),
from_kuid(&init_user_ns, audit_get_loginuid(current)),
tty ? tty_name(tty) : "(none)",
@@ -1655,9 +1730,7 @@ static int __init audit_init(void)
if (audit_initialized == AUDIT_DISABLED)
return 0;
- audit_buffer_cache = kmem_cache_create("audit_buffer",
- sizeof(struct audit_buffer),
- 0, SLAB_PANIC, NULL);
+ audit_buffer_cache = KMEM_CACHE(audit_buffer, SLAB_PANIC);
skb_queue_head_init(&audit_queue);
skb_queue_head_init(&audit_retry_queue);
@@ -1670,7 +1743,7 @@ static int __init audit_init(void)
audit_cmd_mutex.owner = NULL;
pr_info("initializing netlink subsys (%s)\n",
- audit_default ? "enabled" : "disabled");
+ str_enabled_disabled(audit_default));
register_pernet_subsys(&audit_net_ops);
audit_initialized = AUDIT_INITIALIZED;
@@ -1739,10 +1812,13 @@ __setup("audit_backlog_limit=", audit_backlog_limit_set);
static void audit_buffer_free(struct audit_buffer *ab)
{
+ struct sk_buff *skb;
+
if (!ab)
return;
- kfree_skb(ab->skb);
+ while ((skb = skb_dequeue(&ab->skb_list)))
+ kfree_skb(skb);
kmem_cache_free(audit_buffer_cache, ab);
}
@@ -1755,9 +1831,14 @@ static struct audit_buffer *audit_buffer_alloc(struct audit_context *ctx,
if (!ab)
return NULL;
+ skb_queue_head_init(&ab->skb_list);
+
ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
if (!ab->skb)
goto err;
+
+ skb_queue_tail(&ab->skb_list, ab->skb);
+
if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0))
goto err;
@@ -1796,11 +1877,11 @@ unsigned int audit_serial(void)
}
static inline void audit_get_stamp(struct audit_context *ctx,
- struct timespec64 *t, unsigned int *serial)
+ struct audit_stamp *stamp)
{
- if (!ctx || !auditsc_get_stamp(ctx, t, serial)) {
- ktime_get_coarse_real_ts64(t);
- *serial = audit_serial();
+ if (!ctx || !auditsc_get_stamp(ctx, stamp)) {
+ ktime_get_coarse_real_ts64(&stamp->ctime);
+ stamp->serial = audit_serial();
}
}
@@ -1823,8 +1904,6 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
int type)
{
struct audit_buffer *ab;
- struct timespec64 t;
- unsigned int serial;
if (audit_initialized != AUDIT_INITIALIZED)
return NULL;
@@ -1879,12 +1958,14 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
return NULL;
}
- audit_get_stamp(ab->ctx, &t, &serial);
+ audit_get_stamp(ab->ctx, &ab->stamp);
/* cancel dummy context to enable supporting records */
if (ctx)
ctx->dummy = 0;
audit_log_format(ab, "audit(%llu.%03lu:%u): ",
- (unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial);
+ (unsigned long long)ab->stamp.ctime.tv_sec,
+ ab->stamp.ctime.tv_nsec/1000000,
+ ab->stamp.serial);
return ab;
}
@@ -1919,8 +2000,8 @@ static inline int audit_expand(struct audit_buffer *ab, int extra)
* will be called a second time. Currently, we assume that a printk
* can't format message larger than 1024 bytes, so we don't either.
*/
-static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
- va_list args)
+static __printf(2, 0)
+void audit_log_vformat(struct audit_buffer *ab, const char *fmt, va_list args)
{
int len, avail;
struct sk_buff *skb;
@@ -2066,8 +2147,8 @@ bool audit_string_contains_control(const char *string, size_t len)
/**
* audit_log_n_untrustedstring - log a string that may contain random characters
* @ab: audit_buffer
- * @len: length of string (not including trailing null)
* @string: string to be logged
+ * @len: length of string (not including trailing null)
*
* This code will escape a string that is passed to it if the string
* contains a control character, unprintable character, double quote mark,
@@ -2140,34 +2221,179 @@ void audit_log_key(struct audit_buffer *ab, char *key)
audit_log_format(ab, "(null)");
}
-int audit_log_task_context(struct audit_buffer *ab)
+/**
+ * audit_buffer_aux_new - Add an aux record buffer to the skb list
+ * @ab: audit_buffer
+ * @type: message type
+ *
+ * Aux records are allocated and added to the skb list of
+ * the "main" record. The ab->skb is reset to point to the
+ * aux record on its creation. When the aux record in complete
+ * ab->skb has to be reset to point to the "main" record.
+ * This allows the audit_log_ functions to be ignorant of
+ * which kind of record it is logging to. It also avoids adding
+ * special data for aux records.
+ *
+ * On success ab->skb will point to the new aux record.
+ * Returns 0 on success, -ENOMEM should allocation fail.
+ */
+static int audit_buffer_aux_new(struct audit_buffer *ab, int type)
+{
+ WARN_ON(ab->skb != skb_peek(&ab->skb_list));
+
+ ab->skb = nlmsg_new(AUDIT_BUFSIZ, ab->gfp_mask);
+ if (!ab->skb)
+ goto err;
+ if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0))
+ goto err;
+ skb_queue_tail(&ab->skb_list, ab->skb);
+
+ audit_log_format(ab, "audit(%llu.%03lu:%u): ",
+ (unsigned long long)ab->stamp.ctime.tv_sec,
+ ab->stamp.ctime.tv_nsec/1000000,
+ ab->stamp.serial);
+
+ return 0;
+
+err:
+ kfree_skb(ab->skb);
+ ab->skb = skb_peek(&ab->skb_list);
+ return -ENOMEM;
+}
+
+/**
+ * audit_buffer_aux_end - Switch back to the "main" record from an aux record
+ * @ab: audit_buffer
+ *
+ * Restores the "main" audit record to ab->skb.
+ */
+static void audit_buffer_aux_end(struct audit_buffer *ab)
{
- char *ctx = NULL;
- unsigned len;
+ ab->skb = skb_peek(&ab->skb_list);
+}
+
+/**
+ * audit_log_subj_ctx - Add LSM subject information
+ * @ab: audit_buffer
+ * @prop: LSM subject properties.
+ *
+ * Add a subj= field and, if necessary, a AUDIT_MAC_TASK_CONTEXTS record.
+ */
+int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop)
+{
+ struct lsm_context ctx;
+ char *space = "";
int error;
- u32 sid;
+ int i;
- security_current_getsecid_subj(&sid);
- if (!sid)
+ security_current_getlsmprop_subj(prop);
+ if (!lsmprop_is_set(prop))
return 0;
- error = security_secid_to_secctx(sid, &ctx, &len);
- if (error) {
- if (error != -EINVAL)
- goto error_path;
+ if (audit_subj_secctx_cnt < 2) {
+ error = security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF);
+ if (error < 0) {
+ if (error != -EINVAL)
+ goto error_path;
+ return 0;
+ }
+ audit_log_format(ab, " subj=%s", ctx.context);
+ security_release_secctx(&ctx);
return 0;
}
-
- audit_log_format(ab, " subj=%s", ctx);
- security_release_secctx(ctx, len);
+ /* Multiple LSMs provide contexts. Include an aux record. */
+ audit_log_format(ab, " subj=?");
+ error = audit_buffer_aux_new(ab, AUDIT_MAC_TASK_CONTEXTS);
+ if (error)
+ goto error_path;
+
+ for (i = 0; i < audit_subj_secctx_cnt; i++) {
+ error = security_lsmprop_to_secctx(prop, &ctx,
+ audit_subj_lsms[i]->id);
+ if (error < 0) {
+ /*
+ * Don't print anything. An LSM like BPF could
+ * claim to support contexts, but only do so under
+ * certain conditions.
+ */
+ if (error == -EOPNOTSUPP)
+ continue;
+ if (error != -EINVAL)
+ audit_panic("error in audit_log_subj_ctx");
+ } else {
+ audit_log_format(ab, "%ssubj_%s=%s", space,
+ audit_subj_lsms[i]->name, ctx.context);
+ space = " ";
+ security_release_secctx(&ctx);
+ }
+ }
+ audit_buffer_aux_end(ab);
return 0;
error_path:
- audit_panic("error in audit_log_task_context");
+ audit_panic("error in audit_log_subj_ctx");
return error;
}
+EXPORT_SYMBOL(audit_log_subj_ctx);
+
+int audit_log_task_context(struct audit_buffer *ab)
+{
+ struct lsm_prop prop;
+
+ security_current_getlsmprop_subj(&prop);
+ return audit_log_subj_ctx(ab, &prop);
+}
EXPORT_SYMBOL(audit_log_task_context);
+int audit_log_obj_ctx(struct audit_buffer *ab, struct lsm_prop *prop)
+{
+ int i;
+ int rc;
+ int error = 0;
+ char *space = "";
+ struct lsm_context ctx;
+
+ if (audit_obj_secctx_cnt < 2) {
+ error = security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF);
+ if (error < 0) {
+ if (error != -EINVAL)
+ goto error_path;
+ return error;
+ }
+ audit_log_format(ab, " obj=%s", ctx.context);
+ security_release_secctx(&ctx);
+ return 0;
+ }
+ audit_log_format(ab, " obj=?");
+ error = audit_buffer_aux_new(ab, AUDIT_MAC_OBJ_CONTEXTS);
+ if (error)
+ goto error_path;
+
+ for (i = 0; i < audit_obj_secctx_cnt; i++) {
+ rc = security_lsmprop_to_secctx(prop, &ctx,
+ audit_obj_lsms[i]->id);
+ if (rc < 0) {
+ audit_log_format(ab, "%sobj_%s=?", space,
+ audit_obj_lsms[i]->name);
+ if (rc != -EINVAL)
+ audit_panic("error in audit_log_obj_ctx");
+ error = rc;
+ } else {
+ audit_log_format(ab, "%sobj_%s=%s", space,
+ audit_obj_lsms[i]->name, ctx.context);
+ security_release_secctx(&ctx);
+ }
+ space = " ";
+ }
+
+ audit_buffer_aux_end(ab);
+ return error;
+
+error_path:
+ audit_panic("error in audit_log_obj_ctx");
+ return error;
+}
+
void audit_log_d_path_exe(struct audit_buffer *ab,
struct mm_struct *mm)
{
@@ -2249,7 +2475,7 @@ void audit_log_path_denied(int type, const char *operation)
{
struct audit_buffer *ab;
- if (!audit_enabled || audit_dummy_context())
+ if (!audit_enabled)
return;
/* Generate log with subject, operation, outcome. */
@@ -2368,13 +2594,35 @@ int audit_signal_info(int sig, struct task_struct *t)
audit_sig_uid = auid;
else
audit_sig_uid = uid;
- security_current_getsecid_subj(&audit_sig_sid);
+ security_current_getlsmprop_subj(&audit_sig_lsm);
}
return audit_signal_info_syscall(t);
}
/**
+ * __audit_log_end - enqueue one audit record
+ * @skb: the buffer to send
+ */
+static void __audit_log_end(struct sk_buff *skb)
+{
+ struct nlmsghdr *nlh;
+
+ if (audit_rate_check()) {
+ /* setup the netlink header, see the comments in
+ * kauditd_send_multicast_skb() for length quirks */
+ nlh = nlmsg_hdr(skb);
+ nlh->nlmsg_len = skb->len - NLMSG_HDRLEN;
+
+ /* queue the netlink packet */
+ skb_queue_tail(&audit_queue, skb);
+ } else {
+ audit_log_lost("rate limit exceeded");
+ kfree_skb(skb);
+ }
+}
+
+/**
* audit_log_end - end one audit record
* @ab: the audit_buffer
*
@@ -2386,25 +2634,15 @@ int audit_signal_info(int sig, struct task_struct *t)
void audit_log_end(struct audit_buffer *ab)
{
struct sk_buff *skb;
- struct nlmsghdr *nlh;
if (!ab)
return;
- if (audit_rate_check()) {
- skb = ab->skb;
- ab->skb = NULL;
+ while ((skb = skb_dequeue(&ab->skb_list)))
+ __audit_log_end(skb);
- /* setup the netlink header, see the comments in
- * kauditd_send_multicast_skb() for length quirks */
- nlh = nlmsg_hdr(skb);
- nlh->nlmsg_len = skb->len - NLMSG_HDRLEN;
-
- /* queue the netlink packet and poke the kauditd thread */
- skb_queue_tail(&audit_queue, skb);
- wake_up_interruptible(&kauditd_wait);
- } else
- audit_log_lost("rate limit exceeded");
+ /* poke the kauditd thread */
+ wake_up_interruptible(&kauditd_wait);
audit_buffer_free(ab);
}
diff --git a/kernel/audit.h b/kernel/audit.h
index c4498090a5bd..7c401729e21b 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -11,6 +11,7 @@
#include <linux/fs.h>
#include <linux/audit.h>
+#include <linux/security.h>
#include <linux/skbuff.h>
#include <uapi/linux/mqueue.h>
#include <linux/tty.h>
@@ -81,7 +82,7 @@ struct audit_names {
kuid_t uid;
kgid_t gid;
dev_t rdev;
- u32 osid;
+ struct lsm_prop oprop;
struct audit_cap_data fcap;
unsigned int fcap_ver;
unsigned char type; /* record type */
@@ -98,6 +99,12 @@ struct audit_proctitle {
char *value; /* the cmdline field */
};
+/* A timestamp/serial pair to identify an event */
+struct audit_stamp {
+ struct timespec64 ctime; /* time of syscall entry */
+ unsigned int serial; /* serial number for record */
+};
+
/* The per-task audit context. */
struct audit_context {
int dummy; /* must be the first element */
@@ -107,10 +114,9 @@ struct audit_context {
AUDIT_CTX_URING, /* in use by io_uring */
} context;
enum audit_state state, current_state;
- unsigned int serial; /* serial number for record */
+ struct audit_stamp stamp; /* event identifier */
int major; /* syscall number */
int uring_op; /* uring operation */
- struct timespec64 ctime; /* time of syscall entry */
unsigned long argv[4]; /* syscall arguments */
long return_code;/* syscall return code */
u64 prio;
@@ -132,8 +138,8 @@ struct audit_context {
struct audit_aux_data *aux_pids;
struct sockaddr_storage *sockaddr;
size_t sockaddr_len;
- /* Save things to print about task_struct */
- pid_t pid, ppid;
+ /* Save things to print about task_struct */
+ pid_t ppid;
kuid_t uid, euid, suid, fsuid;
kgid_t gid, egid, sgid, fsgid;
unsigned long personality;
@@ -143,7 +149,7 @@ struct audit_context {
kuid_t target_auid;
kuid_t target_uid;
unsigned int target_sessionid;
- u32 target_sid;
+ struct lsm_prop target_ref;
char target_comm[TASK_COMM_LEN];
struct audit_tree_refs *trees, *first_trees;
@@ -160,7 +166,7 @@ struct audit_context {
kuid_t uid;
kgid_t gid;
umode_t mode;
- u32 osid;
+ struct lsm_prop oprop;
int has_perm;
uid_t perm_uid;
gid_t perm_gid;
@@ -199,8 +205,12 @@ struct audit_context {
int argc;
} execve;
struct {
- char *name;
+ const char *name;
} module;
+ struct {
+ struct audit_ntp_data ntp_data;
+ struct timespec64 tk_injoffset;
+ } time;
};
int fds[2];
struct audit_proctitle proctitle;
@@ -241,8 +251,6 @@ struct audit_netlink_list {
int audit_send_list_thread(void *_dest);
-extern int selinux_audit_rule_update(void);
-
extern struct mutex audit_filter_mutex;
extern int audit_del_rule(struct audit_entry *entry);
extern void audit_free_rule_rcu(struct rcu_head *head);
@@ -257,10 +265,10 @@ extern struct tty_struct *audit_get_tty(void);
extern void audit_put_tty(struct tty_struct *tty);
/* audit watch/mark/tree functions */
-#ifdef CONFIG_AUDITSYSCALL
extern unsigned int audit_serial(void);
+#ifdef CONFIG_AUDITSYSCALL
extern int auditsc_get_stamp(struct audit_context *ctx,
- struct timespec64 *t, unsigned int *serial);
+ struct audit_stamp *stamp);
extern void audit_put_watch(struct audit_watch *watch);
extern void audit_get_watch(struct audit_watch *watch);
@@ -301,7 +309,7 @@ extern void audit_filter_inodes(struct task_struct *tsk,
struct audit_context *ctx);
extern struct list_head *audit_killed_trees(void);
#else /* CONFIG_AUDITSYSCALL */
-#define auditsc_get_stamp(c, t, s) 0
+#define auditsc_get_stamp(c, s) 0
#define audit_put_watch(w) do { } while (0)
#define audit_get_watch(w) do { } while (0)
#define audit_to_watch(k, p, l, o) (-EINVAL)
@@ -332,7 +340,7 @@ static inline int audit_signal_info_syscall(struct task_struct *t)
return 0;
}
-#define audit_filter_inodes(t, c) AUDIT_STATE_DISABLED
+#define audit_filter_inodes(t, c) do { } while (0)
#endif /* CONFIG_AUDITSYSCALL */
extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len);
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index 02348b48447c..b92805b317a2 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -76,17 +76,18 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa
struct audit_fsnotify_mark *audit_mark;
struct path path;
struct dentry *dentry;
- struct inode *inode;
int ret;
if (pathname[0] != '/' || pathname[len-1] == '/')
return ERR_PTR(-EINVAL);
- dentry = kern_path_locked(pathname, &path);
+ dentry = kern_path_parent(pathname, &path);
if (IS_ERR(dentry))
return ERR_CAST(dentry); /* returning an error */
- inode = path.dentry->d_inode;
- inode_unlock(inode);
+ if (d_really_is_negative(dentry)) {
+ audit_mark = ERR_PTR(-ENOENT);
+ goto out;
+ }
audit_mark = kzalloc(sizeof(*audit_mark), GFP_KERNEL);
if (unlikely(!audit_mark)) {
@@ -100,8 +101,9 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa
audit_update_mark(audit_mark, dentry->d_inode);
audit_mark->rule = krule;
- ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, true);
+ ret = fsnotify_add_inode_mark(&audit_mark->mark, path.dentry->d_inode, 0);
if (ret < 0) {
+ audit_mark->path = NULL;
fsnotify_put_mark(&audit_mark->mark);
audit_mark = ERR_PTR(ret);
}
@@ -181,7 +183,8 @@ static const struct fsnotify_ops audit_mark_fsnotify_ops = {
static int __init audit_fsnotify_init(void)
{
- audit_fsnotify_group = fsnotify_alloc_group(&audit_mark_fsnotify_ops);
+ audit_fsnotify_group = fsnotify_alloc_group(&audit_mark_fsnotify_ops,
+ FSNOTIFY_GROUP_DUPS);
if (IS_ERR(audit_fsnotify_group)) {
audit_fsnotify_group = NULL;
audit_panic("cannot create audit fsnotify group");
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index e7315d487163..fda6beb041e0 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -34,7 +34,7 @@ struct audit_chunk {
struct list_head list;
struct audit_tree *owner;
unsigned index; /* index; upper bit indicates 'will prune' */
- } owners[];
+ } owners[] __counted_by(count);
};
struct audit_tree_mark {
@@ -87,14 +87,16 @@ static struct task_struct *prune_thread;
* that makes a difference. Some.
*/
-static struct fsnotify_group *audit_tree_group;
-static struct kmem_cache *audit_tree_mark_cachep __read_mostly;
+static struct fsnotify_group *audit_tree_group __ro_after_init;
+static struct kmem_cache *audit_tree_mark_cachep __ro_after_init;
static struct audit_tree *alloc_tree(const char *s)
{
struct audit_tree *tree;
+ size_t sz;
- tree = kmalloc(struct_size(tree, pathname, strlen(s) + 1), GFP_KERNEL);
+ sz = strlen(s) + 1;
+ tree = kmalloc(struct_size(tree, pathname, sz), GFP_KERNEL);
if (tree) {
refcount_set(&tree->count, 1);
tree->goner = 0;
@@ -103,7 +105,7 @@ static struct audit_tree *alloc_tree(const char *s)
INIT_LIST_HEAD(&tree->list);
INIT_LIST_HEAD(&tree->same_root);
tree->root = NULL;
- strcpy(tree->pathname, s);
+ strscpy(tree->pathname, s, sz);
}
return tree;
}
@@ -351,7 +353,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *mark)
struct audit_chunk *new;
int size;
- mutex_lock(&audit_tree_group->mark_mutex);
+ fsnotify_group_lock(audit_tree_group);
/*
* mark_mutex stabilizes chunk attached to the mark so we can check
* whether it didn't change while we've dropped hash_lock.
@@ -368,7 +370,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *mark)
replace_mark_chunk(mark, NULL);
spin_unlock(&hash_lock);
fsnotify_detach_mark(mark);
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
audit_mark_put_chunk(chunk);
fsnotify_free_mark(mark);
return;
@@ -385,12 +387,12 @@ static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *mark)
*/
replace_chunk(new, chunk);
spin_unlock(&hash_lock);
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
audit_mark_put_chunk(chunk);
return;
out_mutex:
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
}
/* Call with group->mark_mutex held, releases it */
@@ -400,19 +402,19 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
struct audit_chunk *chunk = alloc_chunk(1);
if (!chunk) {
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
return -ENOMEM;
}
mark = alloc_mark();
if (!mark) {
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
kfree(chunk);
return -ENOMEM;
}
if (fsnotify_add_inode_mark_locked(mark, inode, 0)) {
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
fsnotify_put_mark(mark);
kfree(chunk);
return -ENOSPC;
@@ -422,7 +424,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
if (tree->goner) {
spin_unlock(&hash_lock);
fsnotify_detach_mark(mark);
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
fsnotify_free_mark(mark);
fsnotify_put_mark(mark);
kfree(chunk);
@@ -444,7 +446,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
*/
insert_hash(chunk);
spin_unlock(&hash_lock);
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
/*
* Drop our initial reference. When mark we point to is getting freed,
* we get notification through ->freeing_mark callback and cleanup
@@ -462,8 +464,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
struct audit_node *p;
int n;
- mutex_lock(&audit_tree_group->mark_mutex);
- mark = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_tree_group);
+ fsnotify_group_lock(audit_tree_group);
+ mark = fsnotify_find_inode_mark(inode, audit_tree_group);
if (!mark)
return create_chunk(inode, tree);
@@ -478,7 +480,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
for (n = 0; n < old->count; n++) {
if (old->owners[n].owner == tree) {
spin_unlock(&hash_lock);
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
fsnotify_put_mark(mark);
return 0;
}
@@ -487,7 +489,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
chunk = alloc_chunk(old->count + 1);
if (!chunk) {
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
fsnotify_put_mark(mark);
return -ENOMEM;
}
@@ -495,7 +497,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
spin_lock(&hash_lock);
if (tree->goner) {
spin_unlock(&hash_lock);
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
fsnotify_put_mark(mark);
kfree(chunk);
return 0;
@@ -515,7 +517,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
*/
replace_chunk(chunk, old);
spin_unlock(&hash_lock);
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
fsnotify_put_mark(mark); /* pair to fsnotify_find_mark */
audit_mark_put_chunk(old);
@@ -668,12 +670,6 @@ int audit_remove_tree_rule(struct audit_krule *rule)
return 0;
}
-static int compare_root(struct vfsmount *mnt, void *arg)
-{
- return inode_to_key(d_backing_inode(mnt->mnt_root)) ==
- (unsigned long)arg;
-}
-
void audit_trim_trees(void)
{
struct list_head cursor;
@@ -683,8 +679,9 @@ void audit_trim_trees(void)
while (cursor.next != &tree_list) {
struct audit_tree *tree;
struct path path;
- struct vfsmount *root_mnt;
struct audit_node *node;
+ const struct path *paths;
+ struct path array[16];
int err;
tree = container_of(cursor.next, struct audit_tree, list);
@@ -696,9 +693,9 @@ void audit_trim_trees(void)
if (err)
goto skip_it;
- root_mnt = collect_mounts(&path);
+ paths = collect_paths(&path, array, 16);
path_put(&path);
- if (IS_ERR(root_mnt))
+ if (IS_ERR(paths))
goto skip_it;
spin_lock(&hash_lock);
@@ -706,14 +703,17 @@ void audit_trim_trees(void)
struct audit_chunk *chunk = find_chunk(node);
/* this could be NULL if the watch is dying else where... */
node->index |= 1U<<31;
- if (iterate_mounts(compare_root,
- (void *)(chunk->key),
- root_mnt))
- node->index &= ~(1U<<31);
+ for (const struct path *p = paths; p->dentry; p++) {
+ struct inode *inode = p->dentry->d_inode;
+ if (inode_to_key(inode) == chunk->key) {
+ node->index &= ~(1U<<31);
+ break;
+ }
+ }
}
spin_unlock(&hash_lock);
trim_marked(tree);
- drop_collected_mounts(root_mnt);
+ drop_collected_paths(paths, array);
skip_it:
put_tree(tree);
mutex_lock(&audit_filter_mutex);
@@ -742,9 +742,14 @@ void audit_put_tree(struct audit_tree *tree)
put_tree(tree);
}
-static int tag_mount(struct vfsmount *mnt, void *arg)
+static int tag_mounts(const struct path *paths, struct audit_tree *tree)
{
- return tag_chunk(d_backing_inode(mnt->mnt_root), arg);
+ for (const struct path *p = paths; p->dentry; p++) {
+ int err = tag_chunk(p->dentry->d_inode, tree);
+ if (err)
+ return err;
+ }
+ return 0;
}
/*
@@ -801,7 +806,8 @@ int audit_add_tree_rule(struct audit_krule *rule)
{
struct audit_tree *seed = rule->tree, *tree;
struct path path;
- struct vfsmount *mnt;
+ struct path array[16];
+ const struct path *paths;
int err;
rule->tree = NULL;
@@ -828,16 +834,16 @@ int audit_add_tree_rule(struct audit_krule *rule)
err = kern_path(tree->pathname, 0, &path);
if (err)
goto Err;
- mnt = collect_mounts(&path);
+ paths = collect_paths(&path, array, 16);
path_put(&path);
- if (IS_ERR(mnt)) {
- err = PTR_ERR(mnt);
+ if (IS_ERR(paths)) {
+ err = PTR_ERR(paths);
goto Err;
}
get_tree(tree);
- err = iterate_mounts(tag_mount, tree, mnt);
- drop_collected_mounts(mnt);
+ err = tag_mounts(paths, tree);
+ drop_collected_paths(paths, array);
if (!err) {
struct audit_node *node;
@@ -872,20 +878,21 @@ int audit_tag_tree(char *old, char *new)
struct list_head cursor, barrier;
int failed = 0;
struct path path1, path2;
- struct vfsmount *tagged;
+ struct path array[16];
+ const struct path *paths;
int err;
err = kern_path(new, 0, &path2);
if (err)
return err;
- tagged = collect_mounts(&path2);
+ paths = collect_paths(&path2, array, 16);
path_put(&path2);
- if (IS_ERR(tagged))
- return PTR_ERR(tagged);
+ if (IS_ERR(paths))
+ return PTR_ERR(paths);
err = kern_path(old, 0, &path1);
if (err) {
- drop_collected_mounts(tagged);
+ drop_collected_paths(paths, array);
return err;
}
@@ -914,7 +921,7 @@ int audit_tag_tree(char *old, char *new)
continue;
}
- failed = iterate_mounts(tag_mount, tree, tagged);
+ failed = tag_mounts(paths, tree);
if (failed) {
put_tree(tree);
mutex_lock(&audit_filter_mutex);
@@ -955,7 +962,7 @@ int audit_tag_tree(char *old, char *new)
list_del(&cursor);
mutex_unlock(&audit_filter_mutex);
path_put(&path1);
- drop_collected_mounts(tagged);
+ drop_collected_paths(paths, array);
return failed;
}
@@ -1044,12 +1051,12 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *mark,
{
struct audit_chunk *chunk;
- mutex_lock(&mark->group->mark_mutex);
+ fsnotify_group_lock(mark->group);
spin_lock(&hash_lock);
chunk = mark_chunk(mark);
replace_mark_chunk(mark, NULL);
spin_unlock(&hash_lock);
- mutex_unlock(&mark->group->mark_mutex);
+ fsnotify_group_unlock(mark->group);
if (chunk) {
evict_chunk(chunk);
audit_mark_put_chunk(chunk);
@@ -1074,7 +1081,7 @@ static int __init audit_tree_init(void)
audit_tree_mark_cachep = KMEM_CACHE(audit_tree_mark, SLAB_PANIC);
- audit_tree_group = fsnotify_alloc_group(&audit_tree_ops);
+ audit_tree_group = fsnotify_alloc_group(&audit_tree_ops, 0);
if (IS_ERR(audit_tree_group))
audit_panic("cannot initialize fsnotify group for rectree watches");
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 713b256be944..a700e3c8925f 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -90,7 +90,7 @@ static inline struct audit_parent *audit_find_parent(struct inode *inode)
struct audit_parent *parent = NULL;
struct fsnotify_mark *entry;
- entry = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_watch_group);
+ entry = fsnotify_find_inode_mark(inode, audit_watch_group);
if (entry)
parent = container_of(entry, struct audit_parent, mark);
@@ -133,7 +133,7 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
}
/* Initialize a parent watch entry. */
-static struct audit_parent *audit_init_parent(struct path *path)
+static struct audit_parent *audit_init_parent(const struct path *path)
{
struct inode *inode = d_backing_inode(path->dentry);
struct audit_parent *parent;
@@ -347,15 +347,18 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
/* Get path information necessary for adding watches. */
static int audit_get_nd(struct audit_watch *watch, struct path *parent)
{
- struct dentry *d = kern_path_locked(watch->path, parent);
+ struct dentry *d;
+
+ d = kern_path_parent(watch->path, parent);
if (IS_ERR(d))
return PTR_ERR(d);
+
if (d_is_positive(d)) {
/* update watch filter fields */
watch->dev = d->d_sb->s_dev;
watch->ino = d_backing_inode(d)->i_ino;
}
- inode_unlock(d_backing_inode(parent->dentry));
+
dput(d);
return 0;
}
@@ -493,7 +496,7 @@ static const struct fsnotify_ops audit_watch_fsnotify_ops = {
static int __init audit_watch_init(void)
{
- audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops);
+ audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops, 0);
if (IS_ERR(audit_watch_group)) {
audit_watch_group = NULL;
audit_panic("cannot create audit fsnotify group");
@@ -527,11 +530,18 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark)
unsigned long ino;
dev_t dev;
- exe_file = get_task_exe_file(tsk);
+ /* only do exe filtering if we are recording @current events/records */
+ if (tsk != current)
+ return 0;
+
+ if (!current->mm)
+ return 0;
+ exe_file = get_mm_exe_file(current->mm);
if (!exe_file)
return 0;
ino = file_inode(exe_file)->i_ino;
dev = file_inode(exe_file)->i_sb->s_dev;
fput(exe_file);
+
return audit_mark_compare(mark, ino, dev);
}
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 42d99896e7a6..6a86c0683b67 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -221,7 +221,7 @@ static int audit_match_signal(struct audit_entry *entry)
entry->rule.mask));
}
- switch(audit_classify_arch(arch->val)) {
+ switch (audit_classify_arch(arch->val)) {
case 0: /* native */
return (audit_match_class_bits(AUDIT_CLASS_SIGNAL,
entry->rule.mask));
@@ -243,7 +243,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *
err = -EINVAL;
listnr = rule->flags & ~AUDIT_FILTER_PREPEND;
- switch(listnr) {
+ switch (listnr) {
default:
goto exit_err;
#ifdef CONFIG_AUDITSYSCALL
@@ -344,7 +344,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
switch (entry->rule.listnr) {
case AUDIT_FILTER_FS:
- switch(f->type) {
+ switch (f->type) {
case AUDIT_FSTYPE:
case AUDIT_FILTERKEY:
break;
@@ -529,7 +529,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
entry->rule.buflen += f_val;
f->lsm_str = str;
err = security_audit_rule_init(f->type, f->op, str,
- (void **)&f->lsm_rule);
+ (void **)&f->lsm_rule,
+ GFP_KERNEL);
/* Keep currently invalid fields around in case they
* become valid after a policy reload. */
if (err == -EINVAL) {
@@ -637,10 +638,9 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
void *bufp;
int i;
- data = kmalloc(struct_size(data, buf, krule->buflen), GFP_KERNEL);
+ data = kzalloc(struct_size(data, buf, krule->buflen), GFP_KERNEL);
if (unlikely(!data))
return NULL;
- memset(data, 0, sizeof(*data));
data->flags = krule->flags | krule->listnr;
data->action = krule->action;
@@ -651,7 +651,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
data->fields[i] = f->type;
data->fieldflags[i] = audit_ops[f->op];
- switch(f->type) {
+ switch (f->type) {
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
case AUDIT_SUBJ_TYPE:
@@ -694,7 +694,8 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
data->values[i] = f->val;
}
}
- for (i = 0; i < AUDIT_BITMASK_SIZE; i++) data->mask[i] = krule->mask[i];
+ for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
+ data->mask[i] = krule->mask[i];
return data;
}
@@ -717,7 +718,7 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
a->fields[i].op != b->fields[i].op)
return 1;
- switch(a->fields[i].type) {
+ switch (a->fields[i].type) {
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
case AUDIT_SUBJ_TYPE:
@@ -787,7 +788,7 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
static inline int audit_dupe_lsm_field(struct audit_field *df,
struct audit_field *sf)
{
- int ret = 0;
+ int ret;
char *lsm_str;
/* our own copy of lsm_str */
@@ -798,7 +799,7 @@ static inline int audit_dupe_lsm_field(struct audit_field *df,
/* our own (refreshed) copy of lsm_rule */
ret = security_audit_rule_init(df->type, df->op, df->lsm_str,
- (void **)&df->lsm_rule);
+ (void **)&df->lsm_rule, GFP_KERNEL);
/* Keep currently invalid fields around in case they
* become valid after a policy reload. */
if (ret == -EINVAL) {
@@ -946,7 +947,7 @@ static inline int audit_add_rule(struct audit_entry *entry)
int dont_count = 0;
/* If any of these, don't count towards total */
- switch(entry->rule.listnr) {
+ switch (entry->rule.listnr) {
case AUDIT_FILTER_USER:
case AUDIT_FILTER_EXCLUDE:
case AUDIT_FILTER_FS:
@@ -1029,7 +1030,7 @@ int audit_del_rule(struct audit_entry *entry)
int dont_count = 0;
/* If any of these, don't count towards total */
- switch(entry->rule.listnr) {
+ switch (entry->rule.listnr) {
case AUDIT_FILTER_USER:
case AUDIT_FILTER_EXCLUDE:
case AUDIT_FILTER_FS:
@@ -1083,7 +1084,7 @@ static void audit_list_rules(int seq, struct sk_buff_head *q)
/* This is a blocking read, so use audit_filter_mutex instead of rcu
* iterator to sync with list writers. */
- for (i=0; i<AUDIT_NR_FILTERS; i++) {
+ for (i = 0; i < AUDIT_NR_FILTERS; i++) {
list_for_each_entry(r, &audit_rules_list[i], list) {
struct audit_rule_data *data;
@@ -1317,13 +1318,20 @@ int audit_compare_dname_path(const struct qstr *dname, const char *path, int par
if (pathlen < dlen)
return 1;
- parentlen = parentlen == AUDIT_NAME_FULL ? parent_len(path) : parentlen;
- if (pathlen - parentlen != dlen)
- return 1;
+ if (parentlen == AUDIT_NAME_FULL)
+ parentlen = parent_len(path);
p = path + parentlen;
- return strncmp(p, dname->name, dlen);
+ /* handle trailing slashes */
+ pathlen -= parentlen;
+ while (pathlen > 0 && p[pathlen - 1] == '/')
+ pathlen--;
+
+ if (pathlen != dlen)
+ return 1;
+
+ return memcmp(p, dname->name, dlen);
}
int audit_filter(int msgtype, unsigned int listtype)
@@ -1337,12 +1345,12 @@ int audit_filter(int msgtype, unsigned int listtype)
for (i = 0; i < e->rule.field_count; i++) {
struct audit_field *f = &e->rule.fields[i];
+ struct lsm_prop prop = { };
pid_t pid;
- u32 sid;
switch (f->type) {
case AUDIT_PID:
- pid = task_pid_nr(current);
+ pid = task_tgid_nr(current);
result = audit_comparator(pid, f->op, f->val);
break;
case AUDIT_UID:
@@ -1368,9 +1376,10 @@ int audit_filter(int msgtype, unsigned int listtype)
case AUDIT_SUBJ_SEN:
case AUDIT_SUBJ_CLR:
if (f->lsm_rule) {
- security_current_getsecid_subj(&sid);
- result = security_audit_rule_match(sid,
- f->type, f->op, f->lsm_rule);
+ security_current_getlsmprop_subj(&prop);
+ result = security_audit_rule_match(
+ &prop, f->type, f->op,
+ f->lsm_rule);
}
break;
case AUDIT_EXE:
@@ -1430,7 +1439,7 @@ static int update_lsm_rule(struct audit_krule *r)
}
/* This function will re-initialize the lsm_rule field of all applicable rules.
- * It will traverse the filter lists serarching for rules that contain LSM
+ * It will traverse the filter lists searching for rules that contain LSM
* specific filter fields. When such a rule is found, it is copied, the
* LSM field is re-initialized, and the old rule is replaced with the
* updated rule. */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index fce5d43a933f..dd0563a8e0be 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -64,6 +64,7 @@
#include <uapi/linux/limits.h>
#include <uapi/linux/netfilter/nf_tables.h>
#include <uapi/linux/openat2.h> // struct open_how
+#include <uapi/linux/fanotify.h>
#include "audit.h"
@@ -99,7 +100,7 @@ struct audit_aux_data_pids {
kuid_t target_auid[AUDIT_AUX_PIDS];
kuid_t target_uid[AUDIT_AUX_PIDS];
unsigned int target_sessionid[AUDIT_AUX_PIDS];
- u32 target_sid[AUDIT_AUX_PIDS];
+ struct lsm_prop target_ref[AUDIT_AUX_PIDS];
char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN];
int pid_count;
};
@@ -142,6 +143,8 @@ static const struct audit_nfcfgop_tab audit_nfcfgs[] = {
{ AUDIT_NFT_OP_OBJ_RESET, "nft_reset_obj" },
{ AUDIT_NFT_OP_FLOWTABLE_REGISTER, "nft_register_flowtable" },
{ AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, "nft_unregister_flowtable" },
+ { AUDIT_NFT_OP_SETELEM_RESET, "nft_reset_setelem" },
+ { AUDIT_NFT_OP_RULE_RESET, "nft_reset_rule" },
{ AUDIT_NFT_OP_INVALID, "nft_invalid" },
};
@@ -185,7 +188,7 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
case AUDITSC_EXECVE:
return mask & AUDIT_PERM_EXEC;
case AUDITSC_OPENAT2:
- return mask & ACC_MODE((u32)((struct open_how *)ctx->argv[2])->flags);
+ return mask & ACC_MODE((u32)ctx->openat2.flags);
default:
return 0;
}
@@ -467,7 +470,7 @@ static int audit_filter_rules(struct task_struct *tsk,
{
const struct cred *cred;
int i, need_sid = 1;
- u32 sid;
+ struct lsm_prop prop = { };
unsigned int sessionid;
if (ctx && rule->prio <= ctx->prio)
@@ -671,14 +674,16 @@ static int audit_filter_rules(struct task_struct *tsk,
* fork()/copy_process() in which case
* the new @tsk creds are still a dup
* of @current's creds so we can still
- * use security_current_getsecid_subj()
+ * use
+ * security_current_getlsmprop_subj()
* here even though it always refs
* @current's creds
*/
- security_current_getsecid_subj(&sid);
+ security_current_getlsmprop_subj(&prop);
need_sid = 0;
}
- result = security_audit_rule_match(sid, f->type,
+ result = security_audit_rule_match(&prop,
+ f->type,
f->op,
f->lsm_rule);
}
@@ -694,14 +699,14 @@ static int audit_filter_rules(struct task_struct *tsk,
/* Find files that match */
if (name) {
result = security_audit_rule_match(
- name->osid,
+ &name->oprop,
f->type,
f->op,
f->lsm_rule);
} else if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
if (security_audit_rule_match(
- n->osid,
+ &n->oprop,
f->type,
f->op,
f->lsm_rule)) {
@@ -713,7 +718,7 @@ static int audit_filter_rules(struct task_struct *tsk,
/* Find ipc objects that match */
if (!ctx || ctx->type != AUDIT_IPC)
break;
- if (security_audit_rule_match(ctx->ipc.osid,
+ if (security_audit_rule_match(&ctx->ipc.oprop,
f->type, f->op,
f->lsm_rule))
++result;
@@ -806,30 +811,53 @@ static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
}
/**
- * audit_filter_uring - apply filters to an io_uring operation
+ * __audit_filter_op - common filter helper for operations (syscall/uring/etc)
* @tsk: associated task
* @ctx: audit context
+ * @list: audit filter list
+ * @name: audit_name (can be NULL)
+ * @op: current syscall/uring_op
+ *
+ * Run the udit filters specified in @list against @tsk using @ctx,
+ * @name, and @op, as necessary; the caller is responsible for ensuring
+ * that the call is made while the RCU read lock is held. The @name
+ * parameter can be NULL, but all others must be specified.
+ * Returns 1/true if the filter finds a match, 0/false if none are found.
*/
-static void audit_filter_uring(struct task_struct *tsk,
- struct audit_context *ctx)
+static int __audit_filter_op(struct task_struct *tsk,
+ struct audit_context *ctx,
+ struct list_head *list,
+ struct audit_names *name,
+ unsigned long op)
{
struct audit_entry *e;
enum audit_state state;
+ list_for_each_entry_rcu(e, list, list) {
+ if (audit_in_mask(&e->rule, op) &&
+ audit_filter_rules(tsk, &e->rule, ctx, name,
+ &state, false)) {
+ ctx->current_state = state;
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/**
+ * audit_filter_uring - apply filters to an io_uring operation
+ * @tsk: associated task
+ * @ctx: audit context
+ */
+static void audit_filter_uring(struct task_struct *tsk,
+ struct audit_context *ctx)
+{
if (auditd_test_task(tsk))
return;
rcu_read_lock();
- list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_URING_EXIT],
- list) {
- if (audit_in_mask(&e->rule, ctx->uring_op) &&
- audit_filter_rules(tsk, &e->rule, ctx, NULL, &state,
- false)) {
- rcu_read_unlock();
- ctx->current_state = state;
- return;
- }
- }
+ __audit_filter_op(tsk, ctx, &audit_filter_list[AUDIT_FILTER_URING_EXIT],
+ NULL, ctx->uring_op);
rcu_read_unlock();
}
@@ -841,24 +869,13 @@ static void audit_filter_uring(struct task_struct *tsk,
static void audit_filter_syscall(struct task_struct *tsk,
struct audit_context *ctx)
{
- struct audit_entry *e;
- enum audit_state state;
-
if (auditd_test_task(tsk))
return;
rcu_read_lock();
- list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_EXIT], list) {
- if (audit_in_mask(&e->rule, ctx->major) &&
- audit_filter_rules(tsk, &e->rule, ctx, NULL,
- &state, false)) {
- rcu_read_unlock();
- ctx->current_state = state;
- return;
- }
- }
+ __audit_filter_op(tsk, ctx, &audit_filter_list[AUDIT_FILTER_EXIT],
+ NULL, ctx->major);
rcu_read_unlock();
- return;
}
/*
@@ -867,20 +884,12 @@ static void audit_filter_syscall(struct task_struct *tsk,
*/
static int audit_filter_inode_name(struct task_struct *tsk,
struct audit_names *n,
- struct audit_context *ctx) {
+ struct audit_context *ctx)
+{
int h = audit_hash_ino((u32)n->ino);
struct list_head *list = &audit_inode_hash[h];
- struct audit_entry *e;
- enum audit_state state;
- list_for_each_entry_rcu(e, list, list) {
- if (audit_in_mask(&e->rule, ctx->major) &&
- audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) {
- ctx->current_state = state;
- return 1;
- }
- }
- return 0;
+ return __audit_filter_op(tsk, ctx, list, n, ctx->major);
}
/* At syscall exit time, this filter is called if any audit_names have been
@@ -965,7 +974,7 @@ static void audit_reset_context(struct audit_context *ctx)
if (!ctx)
return;
- /* if ctx is non-null, reset the "ctx->state" regardless */
+ /* if ctx is non-null, reset the "ctx->context" regardless */
ctx->context = AUDIT_CTX_UNUSED;
if (ctx->dummy)
return;
@@ -985,10 +994,10 @@ static void audit_reset_context(struct audit_context *ctx)
*/
ctx->current_state = ctx->state;
- ctx->serial = 0;
+ ctx->stamp.serial = 0;
+ ctx->stamp.ctime = (struct timespec64){ .tv_sec = 0, .tv_nsec = 0 };
ctx->major = 0;
ctx->uring_op = 0;
- ctx->ctime = (struct timespec64){ .tv_sec = 0, .tv_nsec = 0 };
memset(ctx->argv, 0, sizeof(ctx->argv));
ctx->return_code = 0;
ctx->prio = (ctx->state == AUDIT_STATE_RECORD ? ~0ULL : 0);
@@ -1002,7 +1011,7 @@ static void audit_reset_context(struct audit_context *ctx)
kfree(ctx->sockaddr);
ctx->sockaddr = NULL;
ctx->sockaddr_len = 0;
- ctx->pid = ctx->ppid = 0;
+ ctx->ppid = 0;
ctx->uid = ctx->euid = ctx->suid = ctx->fsuid = KUIDT_INIT(0);
ctx->gid = ctx->egid = ctx->sgid = ctx->fsgid = KGIDT_INIT(0);
ctx->personality = 0;
@@ -1010,14 +1019,13 @@ static void audit_reset_context(struct audit_context *ctx)
ctx->target_pid = 0;
ctx->target_auid = ctx->target_uid = KUIDT_INIT(0);
ctx->target_sessionid = 0;
- ctx->target_sid = 0;
+ lsmprop_init(&ctx->target_ref);
ctx->target_comm[0] = '\0';
unroll_tree_refs(ctx, NULL, 0);
WARN_ON(!list_empty(&ctx->killed_trees));
- ctx->type = 0;
audit_free_module(ctx);
ctx->fds[0] = -1;
- audit_proctitle_free(ctx);
+ ctx->type = 0; /* reset last for audit_free_*() */
}
static inline struct audit_context *audit_alloc_context(enum audit_state state)
@@ -1061,7 +1069,8 @@ int audit_alloc(struct task_struct *tsk)
return 0;
}
- if (!(context = audit_alloc_context(state))) {
+ context = audit_alloc_context(state);
+ if (!context) {
kfree(key);
audit_log_lost("out of memory in audit_alloc");
return -ENOMEM;
@@ -1073,47 +1082,22 @@ int audit_alloc(struct task_struct *tsk)
return 0;
}
-/**
- * audit_alloc_kernel - allocate an audit_context for a kernel task
- * @tsk: the kernel task
- *
- * Similar to the audit_alloc() function, but intended for kernel private
- * threads. Returns zero on success, negative values on failure.
- */
-int audit_alloc_kernel(struct task_struct *tsk)
-{
- /*
- * At the moment we are just going to call into audit_alloc() to
- * simplify the code, but there two things to keep in mind with this
- * approach:
- *
- * 1. Filtering internal kernel tasks is a bit laughable in almost all
- * cases, but there is at least one case where there is a benefit:
- * the '-a task,never' case allows the admin to effectively disable
- * task auditing at runtime.
- *
- * 2. The {set,clear}_task_syscall_work() ops likely have zero effect
- * on these internal kernel tasks, but they probably don't hurt either.
- */
- return audit_alloc(tsk);
-}
-
static inline void audit_free_context(struct audit_context *context)
{
/* resetting is extra work, but it is likely just noise */
audit_reset_context(context);
+ audit_proctitle_free(context);
free_tree_refs(context);
kfree(context->filterkey);
kfree(context);
}
static int audit_log_pid_context(struct audit_context *context, pid_t pid,
- kuid_t auid, kuid_t uid, unsigned int sessionid,
- u32 sid, char *comm)
+ kuid_t auid, kuid_t uid,
+ unsigned int sessionid, struct lsm_prop *prop,
+ char *comm)
{
struct audit_buffer *ab;
- char *ctx = NULL;
- u32 len;
int rc = 0;
ab = audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID);
@@ -1123,15 +1107,9 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid,
from_kuid(&init_user_ns, auid),
from_kuid(&init_user_ns, uid), sessionid);
- if (sid) {
- if (security_secid_to_secctx(sid, &ctx, &len)) {
- audit_log_format(ab, " obj=(none)");
- rc = 1;
- } else {
- audit_log_format(ab, " obj=%s", ctx);
- security_release_secctx(ctx, len);
- }
- }
+ if (lsmprop_is_set(prop) && audit_log_obj_ctx(ab, prop))
+ rc = 1;
+
audit_log_format(ab, " ocomm=");
audit_log_untrustedstring(ab, comm);
audit_log_end(ab);
@@ -1316,15 +1294,11 @@ out:
static void audit_log_cap(struct audit_buffer *ab, char *prefix,
kernel_cap_t *cap)
{
- int i;
-
if (cap_isclear(*cap)) {
audit_log_format(ab, " %s=0", prefix);
return;
}
- audit_log_format(ab, " %s=", prefix);
- CAP_FOR_EACH_U32(i)
- audit_log_format(ab, "%08x", cap->cap[CAP_LAST_U32 - i]);
+ audit_log_format(ab, " %s=%016llx", prefix, cap->val);
}
static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
@@ -1340,6 +1314,53 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
from_kuid(&init_user_ns, name->fcap.rootid));
}
+static void audit_log_time(struct audit_context *context, struct audit_buffer **ab)
+{
+ const struct audit_ntp_data *ntp = &context->time.ntp_data;
+ const struct timespec64 *tk = &context->time.tk_injoffset;
+ static const char * const ntp_name[] = {
+ "offset",
+ "freq",
+ "status",
+ "tai",
+ "tick",
+ "adjust",
+ };
+ int type;
+
+ if (context->type == AUDIT_TIME_ADJNTPVAL) {
+ for (type = 0; type < AUDIT_NTP_NVALS; type++) {
+ if (ntp->vals[type].newval != ntp->vals[type].oldval) {
+ if (!*ab) {
+ *ab = audit_log_start(context,
+ GFP_KERNEL,
+ AUDIT_TIME_ADJNTPVAL);
+ if (!*ab)
+ return;
+ }
+ audit_log_format(*ab, "op=%s old=%lli new=%lli",
+ ntp_name[type],
+ ntp->vals[type].oldval,
+ ntp->vals[type].newval);
+ audit_log_end(*ab);
+ *ab = NULL;
+ }
+ }
+ }
+ if (tk->tv_sec != 0 || tk->tv_nsec != 0) {
+ if (!*ab) {
+ *ab = audit_log_start(context, GFP_KERNEL,
+ AUDIT_TIME_INJOFFSET);
+ if (!*ab)
+ return;
+ }
+ audit_log_format(*ab, "sec=%lli nsec=%li",
+ (long long)tk->tv_sec, tk->tv_nsec);
+ audit_log_end(*ab);
+ *ab = NULL;
+ }
+}
+
static void show_special(struct audit_context *context, int *call_panic)
{
struct audit_buffer *ab;
@@ -1358,24 +1379,14 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_format(ab, " a%d=%lx", i,
context->socketcall.args[i]);
break; }
- case AUDIT_IPC: {
- u32 osid = context->ipc.osid;
-
+ case AUDIT_IPC:
audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho",
from_kuid(&init_user_ns, context->ipc.uid),
from_kgid(&init_user_ns, context->ipc.gid),
context->ipc.mode);
- if (osid) {
- char *ctx = NULL;
- u32 len;
-
- if (security_secid_to_secctx(osid, &ctx, &len)) {
- audit_log_format(ab, " osid=%u", osid);
+ if (lsmprop_is_set(&context->ipc.oprop)) {
+ if (audit_log_obj_ctx(ab, &context->ipc.oprop))
*call_panic = 1;
- } else {
- audit_log_format(ab, " obj=%s", ctx);
- security_release_secctx(ctx, len);
- }
}
if (context->ipc.has_perm) {
audit_log_end(ab);
@@ -1390,7 +1401,7 @@ static void show_special(struct audit_context *context, int *call_panic)
context->ipc.perm_gid,
context->ipc.perm_mode);
}
- break; }
+ break;
case AUDIT_MQ_OPEN:
audit_log_format(ab,
"oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld "
@@ -1454,6 +1465,11 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_format(ab, "(null)");
break;
+ case AUDIT_TIME_ADJNTPVAL:
+ case AUDIT_TIME_INJOFFSET:
+ /* this call deviates from the rest, eating the buffer */
+ audit_log_time(context, &ab);
+ break;
}
audit_log_end(ab);
}
@@ -1527,20 +1543,9 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,
from_kgid(&init_user_ns, n->gid),
MAJOR(n->rdev),
MINOR(n->rdev));
- if (n->osid != 0) {
- char *ctx = NULL;
- u32 len;
-
- if (security_secid_to_secctx(
- n->osid, &ctx, &len)) {
- audit_log_format(ab, " osid=%u", n->osid);
- if (call_panic)
- *call_panic = 2;
- } else {
- audit_log_format(ab, " obj=%s", ctx);
- security_release_secctx(ctx, len);
- }
- }
+ if (lsmprop_is_set(&n->oprop) &&
+ audit_log_obj_ctx(ab, &n->oprop))
+ *call_panic = 2;
/* log the audit_names record type */
switch (n->type) {
@@ -1622,8 +1627,8 @@ static void audit_log_uring(struct audit_context *ctx)
audit_log_format(ab, "uring_op=%d", ctx->uring_op);
if (ctx->return_valid != AUDITSC_INVALID)
audit_log_format(ab, " success=%s exit=%ld",
- (ctx->return_valid == AUDITSC_SUCCESS ?
- "yes" : "no"),
+ str_yes_no(ctx->return_valid ==
+ AUDITSC_SUCCESS),
ctx->return_code);
audit_log_format(ab,
" items=%d"
@@ -1665,8 +1670,8 @@ static void audit_log_exit(void)
audit_log_format(ab, " per=%lx", context->personality);
if (context->return_valid != AUDITSC_INVALID)
audit_log_format(ab, " success=%s exit=%ld",
- (context->return_valid == AUDITSC_SUCCESS ?
- "yes" : "no"),
+ str_yes_no(context->return_valid ==
+ AUDITSC_SUCCESS),
context->return_code);
audit_log_format(ab,
" a0=%lx a1=%lx a2=%lx a3=%lx items=%d",
@@ -1749,7 +1754,7 @@ static void audit_log_exit(void)
axs->target_auid[i],
axs->target_uid[i],
axs->target_sessionid[i],
- axs->target_sid[i],
+ &axs->target_ref[i],
axs->target_comm[i]))
call_panic = 1;
}
@@ -1758,8 +1763,9 @@ static void audit_log_exit(void)
audit_log_pid_context(context, context->target_pid,
context->target_auid, context->target_uid,
context->target_sessionid,
- context->target_sid, context->target_comm))
- call_panic = 1;
+ &context->target_ref,
+ context->target_comm))
+ call_panic = 1;
if (context->pwd.dentry && context->pwd.mnt) {
ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
@@ -1806,7 +1812,7 @@ void __audit_free(struct task_struct *tsk)
/* We are called either by do_exit() or the fork() error handling code;
* in the former case tsk == current and in the latter tsk is a
- * random task_struct that doesn't doesn't have any meaningful data we
+ * random task_struct that doesn't have any meaningful data we
* need to log via audit_log_exit().
*/
if (tsk == current && !context->dummy) {
@@ -1890,7 +1896,7 @@ void __audit_uring_entry(u8 op)
ctx->context = AUDIT_CTX_URING;
ctx->current_state = ctx->state;
- ktime_get_coarse_real_ts64(&ctx->ctime);
+ ktime_get_coarse_real_ts64(&ctx->stamp.ctime);
}
/**
@@ -1907,6 +1913,13 @@ void __audit_uring_exit(int success, long code)
{
struct audit_context *ctx = audit_context();
+ if (ctx->dummy) {
+ if (ctx->context != AUDIT_CTX_URING)
+ return;
+ goto out;
+ }
+
+ audit_return_fixup(ctx, success, code);
if (ctx->context == AUDIT_CTX_SYSCALL) {
/*
* NOTE: See the note in __audit_uring_entry() about the case
@@ -1948,7 +1961,6 @@ void __audit_uring_exit(int success, long code)
audit_filter_inodes(current, ctx);
if (ctx->current_state != AUDIT_STATE_RECORD)
goto out;
- audit_return_fixup(ctx, success, code);
audit_log_exit();
out:
@@ -2006,7 +2018,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
context->argv[3] = a4;
context->context = AUDIT_CTX_SYSCALL;
context->current_state = state;
- ktime_get_coarse_real_ts64(&context->ctime);
+ ktime_get_coarse_real_ts64(&context->stamp.ctime);
}
/**
@@ -2032,13 +2044,13 @@ void __audit_syscall_exit(int success, long return_code)
if (!list_empty(&context->killed_trees))
audit_kill_trees(context);
+ audit_return_fixup(context, success, return_code);
/* run through both filters to ensure we set the filterkey properly */
audit_filter_syscall(current, context);
audit_filter_inodes(current, context);
- if (context->current_state < AUDIT_STATE_RECORD)
+ if (context->current_state != AUDIT_STATE_RECORD)
goto out;
- audit_return_fixup(context, success, return_code);
audit_log_exit();
out:
@@ -2091,7 +2103,7 @@ retry:
d = dentry;
rcu_read_lock();
seq = read_seqbegin(&rename_lock);
- for(;;) {
+ for (;;) {
struct inode *inode = d_backing_inode(d);
if (inode && unlikely(inode->i_fsnotify_marks)) {
@@ -2174,10 +2186,8 @@ __audit_reusename(const __user char *uptr)
list_for_each_entry(n, &context->names_list, list) {
if (!n->name)
continue;
- if (n->name->uptr == uptr) {
- n->name->refcnt++;
- return n->name;
- }
+ if (n->name->uptr == uptr)
+ return refname(n->name);
}
return NULL;
}
@@ -2204,7 +2214,7 @@ void __audit_getname(struct filename *name)
n->name = name;
n->name_len = AUDIT_NAME_FULL;
name->aname = n;
- name->refcnt++;
+ refname(name);
}
static inline int audit_copy_fcaps(struct audit_names *name,
@@ -2216,7 +2226,7 @@ static inline int audit_copy_fcaps(struct audit_names *name,
if (!dentry)
return 0;
- rc = get_vfs_caps_from_disk(&init_user_ns, dentry, &caps);
+ rc = get_vfs_caps_from_disk(&nop_mnt_idmap, dentry, &caps);
if (rc)
return rc;
@@ -2241,7 +2251,7 @@ static void audit_copy_inode(struct audit_names *name,
name->uid = inode->i_uid;
name->gid = inode->i_gid;
name->rdev = inode->i_rdev;
- security_inode_getsecid(inode, &name->osid);
+ security_inode_getlsmprop(inode, &name->oprop);
if (flags & AUDIT_INODE_NOEVAL) {
name->fcap_ver = -1;
return;
@@ -2336,7 +2346,7 @@ out_alloc:
return;
if (name) {
n->name = name;
- name->refcnt++;
+ refname(name);
}
out:
@@ -2406,39 +2416,36 @@ void __audit_inode_child(struct inode *parent,
if (inode)
handle_one(inode);
- /* look for a parent entry first */
list_for_each_entry(n, &context->names_list, list) {
- if (!n->name ||
- (n->type != AUDIT_TYPE_PARENT &&
- n->type != AUDIT_TYPE_UNKNOWN))
+ /* can only match entries that have a name */
+ if (!n->name)
continue;
- if (n->ino == parent->i_ino && n->dev == parent->i_sb->s_dev &&
- !audit_compare_dname_path(dname,
- n->name->name, n->name_len)) {
- if (n->type == AUDIT_TYPE_UNKNOWN)
- n->type = AUDIT_TYPE_PARENT;
+ /* look for a parent entry first */
+ if (!found_parent &&
+ (n->type == AUDIT_TYPE_PARENT || n->type == AUDIT_TYPE_UNKNOWN) &&
+ (n->ino == parent->i_ino && n->dev == parent->i_sb->s_dev &&
+ !audit_compare_dname_path(dname, n->name->name, n->name_len))) {
+ n->type = AUDIT_TYPE_PARENT;
found_parent = n;
- break;
- }
- }
-
- /* is there a matching child entry? */
- list_for_each_entry(n, &context->names_list, list) {
- /* can only match entries that have a name */
- if (!n->name ||
- (n->type != type && n->type != AUDIT_TYPE_UNKNOWN))
+ if (found_child)
+ break;
continue;
+ }
- if (!strcmp(dname->name, n->name->name) ||
- !audit_compare_dname_path(dname, n->name->name,
+ /* is there a matching child entry? */
+ if (!found_child &&
+ (n->type == type || n->type == AUDIT_TYPE_UNKNOWN) &&
+ (!strcmp(dname->name, n->name->name) ||
+ !audit_compare_dname_path(dname, n->name->name,
found_parent ?
found_parent->name_len :
- AUDIT_NAME_FULL)) {
+ AUDIT_NAME_FULL))) {
if (n->type == AUDIT_TYPE_UNKNOWN)
n->type = type;
found_child = n;
- break;
+ if (found_parent)
+ break;
}
}
@@ -2461,7 +2468,7 @@ void __audit_inode_child(struct inode *parent,
if (found_parent) {
found_child->name = found_parent->name;
found_child->name_len = AUDIT_NAME_FULL;
- found_child->name->refcnt++;
+ refname(found_child->name);
}
}
@@ -2475,21 +2482,17 @@ EXPORT_SYMBOL_GPL(__audit_inode_child);
/**
* auditsc_get_stamp - get local copies of audit_context values
* @ctx: audit_context for the task
- * @t: timespec64 to store time recorded in the audit_context
- * @serial: serial value that is recorded in the audit_context
+ * @stamp: timestamp to record
*
* Also sets the context as auditable.
*/
-int auditsc_get_stamp(struct audit_context *ctx,
- struct timespec64 *t, unsigned int *serial)
+int auditsc_get_stamp(struct audit_context *ctx, struct audit_stamp *stamp)
{
if (ctx->context == AUDIT_CTX_UNUSED)
return 0;
- if (!ctx->serial)
- ctx->serial = audit_serial();
- t->tv_sec = ctx->ctime.tv_sec;
- t->tv_nsec = ctx->ctime.tv_nsec;
- *serial = ctx->serial;
+ if (!ctx->stamp.serial)
+ ctx->stamp.serial = audit_serial();
+ *stamp = ctx->stamp;
if (!ctx->prio) {
ctx->prio = 1;
ctx->current_state = AUDIT_STATE_RECORD;
@@ -2593,7 +2596,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
context->ipc.gid = ipcp->gid;
context->ipc.mode = ipcp->mode;
context->ipc.has_perm = 0;
- security_ipc_getsecid(ipcp, &context->ipc.osid);
+ security_ipc_getlsmprop(ipcp, &context->ipc.oprop);
context->type = AUDIT_IPC;
}
@@ -2690,8 +2693,8 @@ void __audit_ptrace(struct task_struct *t)
context->target_auid = audit_get_loginuid(t);
context->target_uid = task_uid(t);
context->target_sessionid = audit_get_sessionid(t);
- security_task_getsecid_obj(t, &context->target_sid);
- memcpy(context->target_comm, t->comm, TASK_COMM_LEN);
+ strscpy(context->target_comm, t->comm);
+ security_task_getlsmprop_obj(t, &context->target_ref);
}
/**
@@ -2717,8 +2720,8 @@ int audit_signal_info_syscall(struct task_struct *t)
ctx->target_auid = audit_get_loginuid(t);
ctx->target_uid = t_uid;
ctx->target_sessionid = audit_get_sessionid(t);
- security_task_getsecid_obj(t, &ctx->target_sid);
- memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN);
+ strscpy(ctx->target_comm, t->comm);
+ security_task_getlsmprop_obj(t, &ctx->target_ref);
return 0;
}
@@ -2738,8 +2741,8 @@ int audit_signal_info_syscall(struct task_struct *t)
axp->target_auid[axp->pid_count] = audit_get_loginuid(t);
axp->target_uid[axp->pid_count] = t_uid;
axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
- security_task_getsecid_obj(t, &axp->target_sid[axp->pid_count]);
- memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN);
+ security_task_getlsmprop_obj(t, &axp->target_ref[axp->pid_count]);
+ strscpy(axp->target_comm[axp->pid_count], t->comm);
axp->pid_count++;
return 0;
@@ -2771,7 +2774,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
ax->d.next = context->aux;
context->aux = (void *)ax;
- get_vfs_caps_from_disk(&init_user_ns,
+ get_vfs_caps_from_disk(&nop_mnt_idmap,
bprm->file->f_path.dentry, &vcaps);
ax->fcap.permitted = vcaps.permitted;
@@ -2831,7 +2834,7 @@ void __audit_openat2_how(struct open_how *how)
context->type = AUDIT_OPENAT2;
}
-void __audit_log_kern_module(char *name)
+void __audit_log_kern_module(const char *name)
{
struct audit_context *context = audit_context();
@@ -2841,39 +2844,45 @@ void __audit_log_kern_module(char *name)
context->type = AUDIT_KERN_MODULE;
}
-void __audit_fanotify(unsigned int response)
+void __audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar)
{
- audit_log(audit_context(), GFP_KERNEL,
- AUDIT_FANOTIFY, "resp=%u", response);
+ /* {subj,obj}_trust values are {0,1,2}: no,yes,unknown */
+ switch (friar->hdr.type) {
+ case FAN_RESPONSE_INFO_NONE:
+ audit_log(audit_context(), GFP_KERNEL, AUDIT_FANOTIFY,
+ "resp=%u fan_type=%u fan_info=0 subj_trust=2 obj_trust=2",
+ response, FAN_RESPONSE_INFO_NONE);
+ break;
+ case FAN_RESPONSE_INFO_AUDIT_RULE:
+ audit_log(audit_context(), GFP_KERNEL, AUDIT_FANOTIFY,
+ "resp=%u fan_type=%u fan_info=%X subj_trust=%u obj_trust=%u",
+ response, friar->hdr.type, friar->rule_number,
+ friar->subj_trust, friar->obj_trust);
+ }
}
void __audit_tk_injoffset(struct timespec64 offset)
{
- audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_INJOFFSET,
- "sec=%lli nsec=%li",
- (long long)offset.tv_sec, offset.tv_nsec);
-}
-
-static void audit_log_ntp_val(const struct audit_ntp_data *ad,
- const char *op, enum audit_ntp_type type)
-{
- const struct audit_ntp_val *val = &ad->vals[type];
-
- if (val->newval == val->oldval)
- return;
+ struct audit_context *context = audit_context();
- audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_ADJNTPVAL,
- "op=%s old=%lli new=%lli", op, val->oldval, val->newval);
+ /* only set type if not already set by NTP */
+ if (!context->type)
+ context->type = AUDIT_TIME_INJOFFSET;
+ memcpy(&context->time.tk_injoffset, &offset, sizeof(offset));
}
void __audit_ntp_log(const struct audit_ntp_data *ad)
{
- audit_log_ntp_val(ad, "offset", AUDIT_NTP_OFFSET);
- audit_log_ntp_val(ad, "freq", AUDIT_NTP_FREQ);
- audit_log_ntp_val(ad, "status", AUDIT_NTP_STATUS);
- audit_log_ntp_val(ad, "tai", AUDIT_NTP_TAI);
- audit_log_ntp_val(ad, "tick", AUDIT_NTP_TICK);
- audit_log_ntp_val(ad, "adjust", AUDIT_NTP_ADJUST);
+ struct audit_context *context = audit_context();
+ int type;
+
+ for (type = 0; type < AUDIT_NTP_NVALS; type++)
+ if (ad->vals[type].newval != ad->vals[type].oldval) {
+ /* unconditionally set type, overwriting TK */
+ context->type = AUDIT_TIME_ADJNTPVAL;
+ memcpy(&context->time.ntp_data, ad, sizeof(*ad));
+ break;
+ }
}
void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries,
@@ -2888,7 +2897,7 @@ void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries,
audit_log_format(ab, "table=%s family=%u entries=%u op=%s",
name, af, nentries, audit_nfcfgs[op].s);
- audit_log_format(ab, " pid=%u", task_pid_nr(current));
+ audit_log_format(ab, " pid=%u", task_tgid_nr(current));
audit_log_task_context(ab); /* subj= */
audit_log_format(ab, " comm=");
audit_log_untrustedstring(ab, get_task_comm(comm, current));
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index 370217dd7e39..2dfe66b9ed76 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -21,24 +21,20 @@ static void backtrace_test_normal(void)
dump_stack();
}
-static DECLARE_COMPLETION(backtrace_work);
-
-static void backtrace_test_irq_callback(unsigned long data)
+static void backtrace_test_bh_workfn(struct work_struct *work)
{
dump_stack();
- complete(&backtrace_work);
}
-static DECLARE_TASKLET_OLD(backtrace_tasklet, &backtrace_test_irq_callback);
+static DECLARE_WORK(backtrace_bh_work, &backtrace_test_bh_workfn);
-static void backtrace_test_irq(void)
+static void backtrace_test_bh(void)
{
- pr_info("Testing a backtrace from irq context.\n");
+ pr_info("Testing a backtrace from BH context.\n");
pr_info("The following trace is a kernel self test and not a bug!\n");
- init_completion(&backtrace_work);
- tasklet_schedule(&backtrace_tasklet);
- wait_for_completion(&backtrace_work);
+ queue_work(system_bh_wq, &backtrace_bh_work);
+ flush_work(&backtrace_bh_work);
}
#ifdef CONFIG_STACKTRACE
@@ -65,7 +61,7 @@ static int backtrace_regression_test(void)
pr_info("====[ backtrace testing ]===========\n");
backtrace_test_normal();
- backtrace_test_irq();
+ backtrace_test_bh();
backtrace_test_saved();
pr_info("====[ end of backtrace testing ]====\n");
@@ -78,5 +74,6 @@ static void exitf(void)
module_init(backtrace_regression_test);
module_exit(exitf);
+MODULE_DESCRIPTION("Simple stack backtrace regression test module");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 9795d75b09b2..02b619eb6106 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -6,6 +6,7 @@
*/
#define __GENERATING_BOUNDS_H
+#define COMPILE_OFFSETS
/* Include headers that define the enum constants of interest */
#include <linux/page-flags.h>
#include <linux/mmzone.h>
@@ -19,9 +20,16 @@ int main(void)
DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
#ifdef CONFIG_SMP
- DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
+ DEFINE(NR_CPUS_BITS, order_base_2(CONFIG_NR_CPUS));
#endif
DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
+#ifdef CONFIG_LRU_GEN
+ DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1));
+ DEFINE(__LRU_REFS_WIDTH, MAX_NR_TIERS - 2);
+#else
+ DEFINE(LRU_GEN_WIDTH, 0);
+ DEFINE(__LRU_REFS_WIDTH, 0);
+#endif
/* End of constants */
return 0;
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index d24d518ddd63..eb3de35734f0 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -3,6 +3,7 @@
# BPF interpreter that, for example, classic socket filters depend on.
config BPF
bool
+ select CRYPTO_LIB_SHA256
# Used by archs to tell that they support BPF JIT compiler plus which
# flavour. Only one of the two can be selected for a specific arch since
@@ -27,9 +28,12 @@ config BPF_SYSCALL
bool "Enable bpf() system call"
select BPF
select IRQ_WORK
+ select NEED_TASKS_RCU
select TASKS_TRACE_RCU
select BINARY_PRINTF
select NET_SOCK_MSG if NET
+ select NET_XGRESS if NET
+ select PAGE_POOL if NET
default n
help
Enable the bpf() system call that allows to manipulate BPF programs
@@ -39,7 +43,7 @@ config BPF_JIT
bool "Enable BPF Just In Time compiler"
depends on BPF
depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT
- depends on MODULES
+ select EXECMEM
help
BPF programs are normally handled by a BPF interpreter. This option
allows the kernel to generate native code when a program is loaded
@@ -58,6 +62,10 @@ config BPF_JIT_ALWAYS_ON
Enables BPF JIT and removes BPF interpreter to avoid speculative
execution of BPF instructions by the interpreter.
+ When CONFIG_BPF_JIT_ALWAYS_ON is enabled, /proc/sys/net/core/bpf_jit_enable
+ is permanently set to 1 and setting any other value than that will
+ return failure.
+
config BPF_JIT_DEFAULT_ON
def_bool ARCH_WANT_DEFAULT_BPF_JIT || BPF_JIT_ALWAYS_ON
depends on HAVE_EBPF_JIT && BPF_JIT
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index c1a9be6a4b9f..232cbc97434d 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -4,26 +4,34 @@ ifneq ($(CONFIG_BPF_JIT_ALWAYS_ON),y)
# ___bpf_prog_run() needs GCSE disabled on x86; see 3193c0836f203 for details
cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
endif
-CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
+CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy)
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o
+obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
-obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
+obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o bpf_insn_array.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
-obj-$(CONFIG_BPF_SYSCALL) += disasm.o
+obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o
obj-$(CONFIG_BPF_JIT) += trampoline.o
-obj-$(CONFIG_BPF_SYSCALL) += btf.o
+obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o rqspinlock.o stream.o
+ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy)
+obj-$(CONFIG_BPF_SYSCALL) += arena.o range_tree.o
+endif
obj-$(CONFIG_BPF_JIT) += dispatcher.o
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
obj-$(CONFIG_BPF_SYSCALL) += offload.o
obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o
+obj-$(CONFIG_BPF_SYSCALL) += tcx.o
endif
ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
endif
+ifeq ($(CONFIG_CGROUPS),y)
+obj-$(CONFIG_BPF_SYSCALL) += cgroup_iter.o bpf_cgrp_storage.o
+endif
obj-$(CONFIG_CGROUP_BPF) += cgroup.o
ifeq ($(CONFIG_INET),y)
obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o
@@ -33,10 +41,25 @@ obj-$(CONFIG_DEBUG_INFO_BTF) += sysfs_btf.o
endif
ifeq ($(CONFIG_BPF_JIT),y)
obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o
+obj-$(CONFIG_BPF_SYSCALL) += cpumask.o
obj-${CONFIG_BPF_LSM} += bpf_lsm.o
endif
+ifneq ($(CONFIG_CRYPTO),)
+obj-$(CONFIG_BPF_SYSCALL) += crypto.o
+endif
obj-$(CONFIG_BPF_PRELOAD) += preload/
obj-$(CONFIG_BPF_SYSCALL) += relo_core.o
-$(obj)/relo_core.o: $(srctree)/tools/lib/bpf/relo_core.c FORCE
- $(call if_changed_rule,cc_o_c)
+obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o
+obj-$(CONFIG_BPF_SYSCALL) += kmem_cache_iter.o
+ifeq ($(CONFIG_DMA_SHARED_BUFFER),y)
+obj-$(CONFIG_BPF_SYSCALL) += dmabuf_iter.o
+endif
+
+CFLAGS_REMOVE_percpu_freelist.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_bpf_lru_list.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_queue_stack_maps.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_lpm_trie.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_ringbuf.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_rqspinlock.o = $(CC_FLAGS_FTRACE)
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
new file mode 100644
index 000000000000..872dc0e41c65
--- /dev/null
+++ b/kernel/bpf/arena.c
@@ -0,0 +1,665 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/err.h>
+#include "linux/filter.h"
+#include <linux/btf_ids.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include "range_tree.h"
+
+/*
+ * bpf_arena is a sparsely populated shared memory region between bpf program and
+ * user space process.
+ *
+ * For example on x86-64 the values could be:
+ * user_vm_start 7f7d26200000 // picked by mmap()
+ * kern_vm_start ffffc90001e69000 // picked by get_vm_area()
+ * For user space all pointers within the arena are normal 8-byte addresses.
+ * In this example 7f7d26200000 is the address of the first page (pgoff=0).
+ * The bpf program will access it as: kern_vm_start + lower_32bit_of_user_ptr
+ * (u32)7f7d26200000 -> 26200000
+ * hence
+ * ffffc90001e69000 + 26200000 == ffffc90028069000 is "pgoff=0" within 4Gb
+ * kernel memory region.
+ *
+ * BPF JITs generate the following code to access arena:
+ * mov eax, eax // eax has lower 32-bit of user pointer
+ * mov word ptr [rax + r12 + off], bx
+ * where r12 == kern_vm_start and off is s16.
+ * Hence allocate 4Gb + GUARD_SZ/2 on each side.
+ *
+ * Initially kernel vm_area and user vma are not populated.
+ * User space can fault-in any address which will insert the page
+ * into kernel and user vma.
+ * bpf program can allocate a page via bpf_arena_alloc_pages() kfunc
+ * which will insert it into kernel vm_area.
+ * The later fault-in from user space will populate that page into user vma.
+ */
+
+/* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */
+#define GUARD_SZ round_up(1ull << sizeof_field(struct bpf_insn, off) * 8, PAGE_SIZE << 1)
+#define KERN_VM_SZ (SZ_4G + GUARD_SZ)
+
+struct bpf_arena {
+ struct bpf_map map;
+ u64 user_vm_start;
+ u64 user_vm_end;
+ struct vm_struct *kern_vm;
+ struct range_tree rt;
+ struct list_head vma_list;
+ struct mutex lock;
+};
+
+u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
+{
+ return arena ? (u64) (long) arena->kern_vm->addr + GUARD_SZ / 2 : 0;
+}
+
+u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
+{
+ return arena ? arena->user_vm_start : 0;
+}
+
+static long arena_map_peek_elem(struct bpf_map *map, void *value)
+{
+ return -EOPNOTSUPP;
+}
+
+static long arena_map_push_elem(struct bpf_map *map, void *value, u64 flags)
+{
+ return -EOPNOTSUPP;
+}
+
+static long arena_map_pop_elem(struct bpf_map *map, void *value)
+{
+ return -EOPNOTSUPP;
+}
+
+static long arena_map_delete_elem(struct bpf_map *map, void *value)
+{
+ return -EOPNOTSUPP;
+}
+
+static int arena_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ return -EOPNOTSUPP;
+}
+
+static long compute_pgoff(struct bpf_arena *arena, long uaddr)
+{
+ return (u32)(uaddr - (u32)arena->user_vm_start) >> PAGE_SHIFT;
+}
+
+static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
+{
+ struct vm_struct *kern_vm;
+ int numa_node = bpf_map_attr_numa_node(attr);
+ struct bpf_arena *arena;
+ u64 vm_range;
+ int err = -ENOMEM;
+
+ if (!bpf_jit_supports_arena())
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (attr->key_size || attr->value_size || attr->max_entries == 0 ||
+ /* BPF_F_MMAPABLE must be set */
+ !(attr->map_flags & BPF_F_MMAPABLE) ||
+ /* No unsupported flags present */
+ (attr->map_flags & ~(BPF_F_SEGV_ON_FAULT | BPF_F_MMAPABLE | BPF_F_NO_USER_CONV)))
+ return ERR_PTR(-EINVAL);
+
+ if (attr->map_extra & ~PAGE_MASK)
+ /* If non-zero the map_extra is an expected user VMA start address */
+ return ERR_PTR(-EINVAL);
+
+ vm_range = (u64)attr->max_entries * PAGE_SIZE;
+ if (vm_range > SZ_4G)
+ return ERR_PTR(-E2BIG);
+
+ if ((attr->map_extra >> 32) != ((attr->map_extra + vm_range - 1) >> 32))
+ /* user vma must not cross 32-bit boundary */
+ return ERR_PTR(-ERANGE);
+
+ kern_vm = get_vm_area(KERN_VM_SZ, VM_SPARSE | VM_USERMAP);
+ if (!kern_vm)
+ return ERR_PTR(-ENOMEM);
+
+ arena = bpf_map_area_alloc(sizeof(*arena), numa_node);
+ if (!arena)
+ goto err;
+
+ arena->kern_vm = kern_vm;
+ arena->user_vm_start = attr->map_extra;
+ if (arena->user_vm_start)
+ arena->user_vm_end = arena->user_vm_start + vm_range;
+
+ INIT_LIST_HEAD(&arena->vma_list);
+ bpf_map_init_from_attr(&arena->map, attr);
+ range_tree_init(&arena->rt);
+ err = range_tree_set(&arena->rt, 0, attr->max_entries);
+ if (err) {
+ bpf_map_area_free(arena);
+ goto err;
+ }
+ mutex_init(&arena->lock);
+
+ return &arena->map;
+err:
+ free_vm_area(kern_vm);
+ return ERR_PTR(err);
+}
+
+static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
+{
+ struct page *page;
+ pte_t pte;
+
+ pte = ptep_get(ptep);
+ if (!pte_present(pte)) /* sanity check */
+ return 0;
+ page = pte_page(pte);
+ /*
+ * We do not update pte here:
+ * 1. Nobody should be accessing bpf_arena's range outside of a kernel bug
+ * 2. TLB flushing is batched or deferred. Even if we clear pte,
+ * the TLB entries can stick around and continue to permit access to
+ * the freed page. So it all relies on 1.
+ */
+ __free_page(page);
+ return 0;
+}
+
+static void arena_map_free(struct bpf_map *map)
+{
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+ /*
+ * Check that user vma-s are not around when bpf map is freed.
+ * mmap() holds vm_file which holds bpf_map refcnt.
+ * munmap() must have happened on vma followed by arena_vm_close()
+ * which would clear arena->vma_list.
+ */
+ if (WARN_ON_ONCE(!list_empty(&arena->vma_list)))
+ return;
+
+ /*
+ * free_vm_area() calls remove_vm_area() that calls free_unmap_vmap_area().
+ * It unmaps everything from vmalloc area and clears pgtables.
+ * Call apply_to_existing_page_range() first to find populated ptes and
+ * free those pages.
+ */
+ apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
+ KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL);
+ free_vm_area(arena->kern_vm);
+ range_tree_destroy(&arena->rt);
+ bpf_map_area_free(arena);
+}
+
+static void *arena_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ return ERR_PTR(-EINVAL);
+}
+
+static long arena_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ return -EOPNOTSUPP;
+}
+
+static int arena_map_check_btf(const struct bpf_map *map, const struct btf *btf,
+ const struct btf_type *key_type, const struct btf_type *value_type)
+{
+ return 0;
+}
+
+static u64 arena_map_mem_usage(const struct bpf_map *map)
+{
+ return 0;
+}
+
+struct vma_list {
+ struct vm_area_struct *vma;
+ struct list_head head;
+ refcount_t mmap_count;
+};
+
+static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma)
+{
+ struct vma_list *vml;
+
+ vml = kmalloc(sizeof(*vml), GFP_KERNEL);
+ if (!vml)
+ return -ENOMEM;
+ refcount_set(&vml->mmap_count, 1);
+ vma->vm_private_data = vml;
+ vml->vma = vma;
+ list_add(&vml->head, &arena->vma_list);
+ return 0;
+}
+
+static void arena_vm_open(struct vm_area_struct *vma)
+{
+ struct vma_list *vml = vma->vm_private_data;
+
+ refcount_inc(&vml->mmap_count);
+}
+
+static void arena_vm_close(struct vm_area_struct *vma)
+{
+ struct bpf_map *map = vma->vm_file->private_data;
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+ struct vma_list *vml = vma->vm_private_data;
+
+ if (!refcount_dec_and_test(&vml->mmap_count))
+ return;
+ guard(mutex)(&arena->lock);
+ /* update link list under lock */
+ list_del(&vml->head);
+ vma->vm_private_data = NULL;
+ kfree(vml);
+}
+
+static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
+{
+ struct bpf_map *map = vmf->vma->vm_file->private_data;
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+ struct page *page;
+ long kbase, kaddr;
+ int ret;
+
+ kbase = bpf_arena_get_kern_vm_start(arena);
+ kaddr = kbase + (u32)(vmf->address);
+
+ guard(mutex)(&arena->lock);
+ page = vmalloc_to_page((void *)kaddr);
+ if (page)
+ /* already have a page vmap-ed */
+ goto out;
+
+ if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
+ /* User space requested to segfault when page is not allocated by bpf prog */
+ return VM_FAULT_SIGSEGV;
+
+ ret = range_tree_clear(&arena->rt, vmf->pgoff, 1);
+ if (ret)
+ return VM_FAULT_SIGSEGV;
+
+ /* Account into memcg of the process that created bpf_arena */
+ ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
+ if (ret) {
+ range_tree_set(&arena->rt, vmf->pgoff, 1);
+ return VM_FAULT_SIGSEGV;
+ }
+
+ ret = vm_area_map_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE, &page);
+ if (ret) {
+ range_tree_set(&arena->rt, vmf->pgoff, 1);
+ __free_page(page);
+ return VM_FAULT_SIGSEGV;
+ }
+out:
+ page_ref_add(page, 1);
+ vmf->page = page;
+ return 0;
+}
+
+static const struct vm_operations_struct arena_vm_ops = {
+ .open = arena_vm_open,
+ .close = arena_vm_close,
+ .fault = arena_vm_fault,
+};
+
+static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ struct bpf_map *map = filp->private_data;
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+ long ret;
+
+ if (pgoff)
+ return -EINVAL;
+ if (len > SZ_4G)
+ return -E2BIG;
+
+ /* if user_vm_start was specified at arena creation time */
+ if (arena->user_vm_start) {
+ if (len > arena->user_vm_end - arena->user_vm_start)
+ return -E2BIG;
+ if (len != arena->user_vm_end - arena->user_vm_start)
+ return -EINVAL;
+ if (addr != arena->user_vm_start)
+ return -EINVAL;
+ }
+
+ ret = mm_get_unmapped_area(filp, addr, len * 2, 0, flags);
+ if (IS_ERR_VALUE(ret))
+ return ret;
+ if ((ret >> 32) == ((ret + len - 1) >> 32))
+ return ret;
+ if (WARN_ON_ONCE(arena->user_vm_start))
+ /* checks at map creation time should prevent this */
+ return -EFAULT;
+ return round_up(ret, SZ_4G);
+}
+
+static int arena_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
+{
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+ guard(mutex)(&arena->lock);
+ if (arena->user_vm_start && arena->user_vm_start != vma->vm_start)
+ /*
+ * If map_extra was not specified at arena creation time then
+ * 1st user process can do mmap(NULL, ...) to pick user_vm_start
+ * 2nd user process must pass the same addr to mmap(addr, MAP_FIXED..);
+ * or
+ * specify addr in map_extra and
+ * use the same addr later with mmap(addr, MAP_FIXED..);
+ */
+ return -EBUSY;
+
+ if (arena->user_vm_end && arena->user_vm_end != vma->vm_end)
+ /* all user processes must have the same size of mmap-ed region */
+ return -EBUSY;
+
+ /* Earlier checks should prevent this */
+ if (WARN_ON_ONCE(vma->vm_end - vma->vm_start > SZ_4G || vma->vm_pgoff))
+ return -EFAULT;
+
+ if (remember_vma(arena, vma))
+ return -ENOMEM;
+
+ arena->user_vm_start = vma->vm_start;
+ arena->user_vm_end = vma->vm_end;
+ /*
+ * bpf_map_mmap() checks that it's being mmaped as VM_SHARED and
+ * clears VM_MAYEXEC. Set VM_DONTEXPAND as well to avoid
+ * potential change of user_vm_start.
+ */
+ vm_flags_set(vma, VM_DONTEXPAND);
+ vma->vm_ops = &arena_vm_ops;
+ return 0;
+}
+
+static int arena_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off)
+{
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+ if ((u64)off > arena->user_vm_end - arena->user_vm_start)
+ return -ERANGE;
+ *imm = (unsigned long)arena->user_vm_start;
+ return 0;
+}
+
+BTF_ID_LIST_SINGLE(bpf_arena_map_btf_ids, struct, bpf_arena)
+const struct bpf_map_ops arena_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc = arena_map_alloc,
+ .map_free = arena_map_free,
+ .map_direct_value_addr = arena_map_direct_value_addr,
+ .map_mmap = arena_map_mmap,
+ .map_get_unmapped_area = arena_get_unmapped_area,
+ .map_get_next_key = arena_map_get_next_key,
+ .map_push_elem = arena_map_push_elem,
+ .map_peek_elem = arena_map_peek_elem,
+ .map_pop_elem = arena_map_pop_elem,
+ .map_lookup_elem = arena_map_lookup_elem,
+ .map_update_elem = arena_map_update_elem,
+ .map_delete_elem = arena_map_delete_elem,
+ .map_check_btf = arena_map_check_btf,
+ .map_mem_usage = arena_map_mem_usage,
+ .map_btf_id = &bpf_arena_map_btf_ids[0],
+};
+
+static u64 clear_lo32(u64 val)
+{
+ return val & ~(u64)~0U;
+}
+
+/*
+ * Allocate pages and vmap them into kernel vmalloc area.
+ * Later the pages will be mmaped into user space vma.
+ */
+static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id)
+{
+ /* user_vm_end/start are fixed before bpf prog runs */
+ long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
+ u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena);
+ struct page **pages;
+ long pgoff = 0;
+ u32 uaddr32;
+ int ret, i;
+
+ if (page_cnt > page_cnt_max)
+ return 0;
+
+ if (uaddr) {
+ if (uaddr & ~PAGE_MASK)
+ return 0;
+ pgoff = compute_pgoff(arena, uaddr);
+ if (pgoff > page_cnt_max - page_cnt)
+ /* requested address will be outside of user VMA */
+ return 0;
+ }
+
+ /* zeroing is needed, since alloc_pages_bulk() only fills in non-zero entries */
+ pages = kvcalloc(page_cnt, sizeof(struct page *), GFP_KERNEL);
+ if (!pages)
+ return 0;
+
+ guard(mutex)(&arena->lock);
+
+ if (uaddr) {
+ ret = is_range_tree_set(&arena->rt, pgoff, page_cnt);
+ if (ret)
+ goto out_free_pages;
+ ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
+ } else {
+ ret = pgoff = range_tree_find(&arena->rt, page_cnt);
+ if (pgoff >= 0)
+ ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
+ }
+ if (ret)
+ goto out_free_pages;
+
+ ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages);
+ if (ret)
+ goto out;
+
+ uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE);
+ /* Earlier checks made sure that uaddr32 + page_cnt * PAGE_SIZE - 1
+ * will not overflow 32-bit. Lower 32-bit need to represent
+ * contiguous user address range.
+ * Map these pages at kern_vm_start base.
+ * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow
+ * lower 32-bit and it's ok.
+ */
+ ret = vm_area_map_pages(arena->kern_vm, kern_vm_start + uaddr32,
+ kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE, pages);
+ if (ret) {
+ for (i = 0; i < page_cnt; i++)
+ __free_page(pages[i]);
+ goto out;
+ }
+ kvfree(pages);
+ return clear_lo32(arena->user_vm_start) + uaddr32;
+out:
+ range_tree_set(&arena->rt, pgoff, page_cnt);
+out_free_pages:
+ kvfree(pages);
+ return 0;
+}
+
+/*
+ * If page is present in vmalloc area, unmap it from vmalloc area,
+ * unmap it from all user space vma-s,
+ * and free it.
+ */
+static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
+{
+ struct vma_list *vml;
+
+ list_for_each_entry(vml, &arena->vma_list, head)
+ zap_page_range_single(vml->vma, uaddr,
+ PAGE_SIZE * page_cnt, NULL);
+}
+
+static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
+{
+ u64 full_uaddr, uaddr_end;
+ long kaddr, pgoff, i;
+ struct page *page;
+
+ /* only aligned lower 32-bit are relevant */
+ uaddr = (u32)uaddr;
+ uaddr &= PAGE_MASK;
+ full_uaddr = clear_lo32(arena->user_vm_start) + uaddr;
+ uaddr_end = min(arena->user_vm_end, full_uaddr + (page_cnt << PAGE_SHIFT));
+ if (full_uaddr >= uaddr_end)
+ return;
+
+ page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT;
+
+ guard(mutex)(&arena->lock);
+
+ pgoff = compute_pgoff(arena, uaddr);
+ /* clear range */
+ range_tree_set(&arena->rt, pgoff, page_cnt);
+
+ if (page_cnt > 1)
+ /* bulk zap if multiple pages being freed */
+ zap_pages(arena, full_uaddr, page_cnt);
+
+ kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr;
+ for (i = 0; i < page_cnt; i++, kaddr += PAGE_SIZE, full_uaddr += PAGE_SIZE) {
+ page = vmalloc_to_page((void *)kaddr);
+ if (!page)
+ continue;
+ if (page_cnt == 1 && page_mapped(page)) /* mapped by some user process */
+ /* Optimization for the common case of page_cnt==1:
+ * If page wasn't mapped into some user vma there
+ * is no need to call zap_pages which is slow. When
+ * page_cnt is big it's faster to do the batched zap.
+ */
+ zap_pages(arena, full_uaddr, 1);
+ vm_area_unmap_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE);
+ __free_page(page);
+ }
+}
+
+/*
+ * Reserve an arena virtual address range without populating it. This call stops
+ * bpf_arena_alloc_pages from adding pages to this range.
+ */
+static int arena_reserve_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt)
+{
+ long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
+ long pgoff;
+ int ret;
+
+ if (uaddr & ~PAGE_MASK)
+ return 0;
+
+ pgoff = compute_pgoff(arena, uaddr);
+ if (pgoff + page_cnt > page_cnt_max)
+ return -EINVAL;
+
+ guard(mutex)(&arena->lock);
+
+ /* Cannot guard already allocated pages. */
+ ret = is_range_tree_set(&arena->rt, pgoff, page_cnt);
+ if (ret)
+ return -EBUSY;
+
+ /* "Allocate" the region to prevent it from being allocated. */
+ return range_tree_clear(&arena->rt, pgoff, page_cnt);
+}
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_cnt,
+ int node_id, u64 flags)
+{
+ struct bpf_map *map = p__map;
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+ if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
+ return NULL;
+
+ return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id);
+}
+
+__bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt)
+{
+ struct bpf_map *map = p__map;
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+ if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign)
+ return;
+ arena_free_pages(arena, (long)ptr__ign, page_cnt);
+}
+
+__bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_cnt)
+{
+ struct bpf_map *map = p__map;
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+ if (map->map_type != BPF_MAP_TYPE_ARENA)
+ return -EINVAL;
+
+ if (!page_cnt)
+ return 0;
+
+ return arena_reserve_pages(arena, (long)ptr__ign, page_cnt);
+}
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(arena_kfuncs)
+BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_RET | KF_ARENA_ARG2)
+BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2)
+BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2)
+BTF_KFUNCS_END(arena_kfuncs)
+
+static const struct btf_kfunc_id_set common_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &arena_kfuncs,
+};
+
+static int __init kfunc_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set);
+}
+late_initcall(kfunc_init);
+
+void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip)
+{
+ struct bpf_stream_stage ss;
+ struct bpf_prog *prog;
+ u64 user_vm_start;
+
+ /*
+ * The RCU read lock is held to safely traverse the latch tree, but we
+ * don't need its protection when accessing the prog, since it will not
+ * disappear while we are handling the fault.
+ */
+ rcu_read_lock();
+ prog = bpf_prog_ksym_find(fault_ip);
+ rcu_read_unlock();
+ if (!prog)
+ return;
+
+ /* Use main prog for stream access */
+ prog = prog->aux->main_prog_aux->prog;
+
+ user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena);
+ addr += clear_lo32(user_vm_start);
+
+ bpf_stream_stage(ss, prog, BPF_STDERR, ({
+ bpf_stream_printk(ss, "ERROR: Arena %s access at unmapped address 0x%lx\n",
+ write ? "WRITE" : "READ", addr);
+ bpf_stream_dump_stack(ss);
+ }));
+}
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index c7a5be3bf8be..1eeb31c5b317 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -11,6 +11,8 @@
#include <linux/perf_event.h>
#include <uapi/linux/btf.h>
#include <linux/rcupdate_trace.h>
+#include <linux/btf_ids.h>
+#include <crypto/sha2.h>
#include "map_in_map.h"
@@ -69,10 +71,11 @@ int array_map_alloc_check(union bpf_attr *attr)
attr->map_flags & BPF_F_PRESERVE_ELEMS)
return -EINVAL;
- if (attr->value_size > KMALLOC_MAX_SIZE)
- /* if value_size is bigger, the user space won't be able to
- * access the elements.
- */
+ /* avoid overflow on round_up(map->value_size) */
+ if (attr->value_size > INT_MAX)
+ return -E2BIG;
+ /* percpu map value size is bound by PCPU_MIN_UNIT_SIZE */
+ if (percpu && round_up(attr->value_size, 8) > PCPU_MIN_UNIT_SIZE)
return -E2BIG;
return 0;
@@ -83,7 +86,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
int numa_node = bpf_map_attr_numa_node(attr);
u32 elem_size, index_mask, max_entries;
- bool bypass_spec_v1 = bpf_bypass_spec_v1();
+ bool bypass_spec_v1 = bpf_bypass_spec_v1(NULL);
u64 array_size, mask64;
struct bpf_array *array;
@@ -155,6 +158,11 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
return &array->map;
}
+static void *array_map_elem_ptr(struct bpf_array* array, u32 index)
+{
+ return array->value + (u64)array->elem_size * index;
+}
+
/* Called from syscall or from eBPF program */
static void *array_map_lookup_elem(struct bpf_map *map, void *key)
{
@@ -164,7 +172,18 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
if (unlikely(index >= array->map.max_entries))
return NULL;
- return array->value + array->elem_size * (index & array->index_mask);
+ return array->value + (u64)array->elem_size * (index & array->index_mask);
+}
+
+static int array_map_get_hash(struct bpf_map *map, u32 hash_buf_size,
+ void *hash_buf)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+ sha256(array->value, (u64)array->elem_size * array->map.max_entries,
+ hash_buf);
+ memcpy(array->map.sha, hash_buf, sizeof(array->map.sha));
+ return 0;
}
static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm,
@@ -202,7 +221,7 @@ static int array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_insn *insn = insn_buf;
- u32 elem_size = round_up(map->value_size, 8);
+ u32 elem_size = array->elem_size;
const int ret = BPF_REG_0;
const int map_ptr = BPF_REG_1;
const int index = BPF_REG_2;
@@ -242,6 +261,52 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
return this_cpu_ptr(array->pptrs[index & array->index_mask]);
}
+/* emit BPF instructions equivalent to C code of percpu_array_map_lookup_elem() */
+static int percpu_array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct bpf_insn *insn = insn_buf;
+
+ if (!bpf_jit_supports_percpu_insn())
+ return -EOPNOTSUPP;
+
+ if (map->map_flags & BPF_F_INNER_MAP)
+ return -EOPNOTSUPP;
+
+ BUILD_BUG_ON(offsetof(struct bpf_array, map) != 0);
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct bpf_array, pptrs));
+
+ *insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_2, 0);
+ if (!map->bypass_spec_v1) {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, BPF_REG_0, map->max_entries, 6);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_0, array->index_mask);
+ } else {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, BPF_REG_0, map->max_entries, 5);
+ }
+
+ *insn++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3);
+ *insn++ = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1);
+ *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0);
+ *insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
+ *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *insn++ = BPF_MOV64_IMM(BPF_REG_0, 0);
+ return insn - insn_buf;
+}
+
+static void *percpu_array_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+
+ if (cpu >= nr_cpu_ids)
+ return NULL;
+
+ if (unlikely(index >= array->map.max_entries))
+ return NULL;
+
+ return per_cpu_ptr(array->pptrs[index & array->index_mask], cpu);
+}
+
int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
@@ -257,11 +322,12 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
* access 'value_size' of them, so copying rounded areas
* will not leak any kernel data
*/
- size = round_up(map->value_size, 8);
+ size = array->elem_size;
rcu_read_lock();
pptr = array->pptrs[index & array->index_mask];
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
+ copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(map, value + off);
off += size;
}
rcu_read_unlock();
@@ -269,33 +335,26 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
}
/* Called from syscall */
-static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+int bpf_array_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
- struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = key ? *(u32 *)key : U32_MAX;
u32 *next = (u32 *)next_key;
- if (index >= array->map.max_entries) {
+ if (index >= map->max_entries) {
*next = 0;
return 0;
}
- if (index == array->map.max_entries - 1)
+ if (index == map->max_entries - 1)
return -ENOENT;
*next = index + 1;
return 0;
}
-static void check_and_free_timer_in_array(struct bpf_array *arr, void *val)
-{
- if (unlikely(map_value_has_timer(&arr->map)))
- bpf_timer_cancel_and_free(val + arr->map.timer_off);
-}
-
/* Called from syscall or from eBPF program */
-static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long array_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = *(u32 *)key;
@@ -314,20 +373,21 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
return -EEXIST;
if (unlikely((map_flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)))
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
return -EINVAL;
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
- memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
- value, map->value_size);
+ val = this_cpu_ptr(array->pptrs[index & array->index_mask]);
+ copy_map_value(map, val, value);
+ bpf_obj_free_fields(array->map.record, val);
} else {
val = array->value +
- array->elem_size * (index & array->index_mask);
+ (u64)array->elem_size * (index & array->index_mask);
if (map_flags & BPF_F_LOCK)
copy_map_value_locked(map, val, value, false);
else
copy_map_value(map, val, value);
- check_and_free_timer_in_array(array, val);
+ bpf_obj_free_fields(array->map.record, val);
}
return 0;
}
@@ -359,11 +419,12 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
* returned or zeros which were zero-filled by percpu_alloc,
* so no kernel data leaks possible
*/
- size = round_up(map->value_size, 8);
+ size = array->elem_size;
rcu_read_lock();
pptr = array->pptrs[index & array->index_mask];
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
+ copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value + off);
+ bpf_obj_free_fields(array->map.record, per_cpu_ptr(pptr, cpu));
off += size;
}
rcu_read_unlock();
@@ -371,7 +432,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
}
/* Called from syscall or from eBPF program */
-static int array_map_delete_elem(struct bpf_map *map, void *key)
+static long array_map_delete_elem(struct bpf_map *map, void *key)
{
return -EINVAL;
}
@@ -381,23 +442,41 @@ static void *array_map_vmalloc_addr(struct bpf_array *array)
return (void *)round_down((unsigned long)array, PAGE_SIZE);
}
-static void array_map_free_timers(struct bpf_map *map)
+static void array_map_free_internal_structs(struct bpf_map *map)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
- if (likely(!map_value_has_timer(map)))
+ /* We only free internal structs on uref dropping to zero */
+ if (!bpf_map_has_internal_structs(map))
return;
for (i = 0; i < array->map.max_entries; i++)
- bpf_timer_cancel_and_free(array->value + array->elem_size * i +
- map->timer_off);
+ bpf_map_free_internal_structs(map, array_map_elem_ptr(array, i));
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void array_map_free(struct bpf_map *map)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
+ int i;
+
+ if (!IS_ERR_OR_NULL(map->record)) {
+ if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+ for (i = 0; i < array->map.max_entries; i++) {
+ void __percpu *pptr = array->pptrs[i & array->index_mask];
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ bpf_obj_free_fields(map->record, per_cpu_ptr(pptr, cpu));
+ cond_resched();
+ }
+ }
+ } else {
+ for (i = 0; i < array->map.max_entries; i++)
+ bpf_obj_free_fields(map->record, array_map_elem_ptr(array, i));
+ }
+ }
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
bpf_array_free_percpu(array);
@@ -424,7 +503,7 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key,
if (map->btf_key_type_id)
seq_printf(m, "%u: ", *(u32 *)key);
btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
rcu_read_unlock();
}
@@ -445,7 +524,7 @@ static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key,
seq_printf(m, "\tcpu%d: ", cpu);
btf_type_seq_show(map->btf, map->btf_value_type_id,
per_cpu_ptr(pptr, cpu), m);
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
}
seq_puts(m, "}\n");
@@ -457,8 +536,6 @@ static int array_map_check_btf(const struct bpf_map *map,
const struct btf_type *key_type,
const struct btf_type *value_type)
{
- u32 int_data;
-
/* One exception for keyless BTF: .bss/.data/.rodata map */
if (btf_type_is_void(key_type)) {
if (map->map_type != BPF_MAP_TYPE_ARRAY ||
@@ -471,14 +548,11 @@ static int array_map_check_btf(const struct bpf_map *map,
return 0;
}
- if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
- return -EINVAL;
-
- int_data = *(u32 *)(key_type + 1);
- /* bpf array can only take a u32 key. This check makes sure
+ /*
+ * Bpf array can only take a u32 key. This check makes sure
* that the btf matches the attr used during map_create.
*/
- if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data))
+ if (!btf_type_is_i32(key_type))
return -EINVAL;
return 0;
@@ -530,8 +604,8 @@ static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos)
array = container_of(map, struct bpf_array, map);
index = info->index & array->index_mask;
if (info->percpu_value_buf)
- return array->pptrs[index];
- return array->value + array->elem_size * index;
+ return (void *)(uintptr_t)array->pptrs[index];
+ return array_map_elem_ptr(array, index);
}
static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
@@ -549,8 +623,8 @@ static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
array = container_of(map, struct bpf_array, map);
index = info->index & array->index_mask;
if (info->percpu_value_buf)
- return array->pptrs[index];
- return array->value + array->elem_size * index;
+ return (void *)(uintptr_t)array->pptrs[index];
+ return array_map_elem_ptr(array, index);
}
static int __bpf_array_map_seq_show(struct seq_file *seq, void *v)
@@ -558,10 +632,11 @@ static int __bpf_array_map_seq_show(struct seq_file *seq, void *v)
struct bpf_iter_seq_array_map_info *info = seq->private;
struct bpf_iter__bpf_map_elem ctx = {};
struct bpf_map *map = info->map;
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_iter_meta meta;
struct bpf_prog *prog;
int off = 0, cpu = 0;
- void __percpu **pptr;
+ void __percpu *pptr;
u32 size;
meta.seq = seq;
@@ -577,12 +652,12 @@ static int __bpf_array_map_seq_show(struct seq_file *seq, void *v)
if (!info->percpu_value_buf) {
ctx.value = v;
} else {
- pptr = v;
- size = round_up(map->value_size, 8);
+ pptr = (void __percpu *)(uintptr_t)v;
+ size = array->elem_size;
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(info->percpu_value_buf + off,
- per_cpu_ptr(pptr, cpu),
- size);
+ copy_map_value_long(map, info->percpu_value_buf + off,
+ per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(map, info->percpu_value_buf + off);
off += size;
}
ctx.value = info->percpu_value_buf;
@@ -608,11 +683,12 @@ static int bpf_iter_init_array_map(void *priv_data,
{
struct bpf_iter_seq_array_map_info *seq_info = priv_data;
struct bpf_map *map = aux->map;
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
void *value_buf;
u32 buf_size;
if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
- buf_size = round_up(map->value_size, 8) * num_possible_cpus();
+ buf_size = array->elem_size * num_possible_cpus();
value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
if (!value_buf)
return -ENOMEM;
@@ -620,6 +696,11 @@ static int bpf_iter_init_array_map(void *priv_data,
seq_info->percpu_value_buf = value_buf;
}
+ /* bpf_iter_attach_map() acquires a map uref, and the uref may be
+ * released before or in the middle of iterating map elements, so
+ * acquire an extra map uref for iterator.
+ */
+ bpf_map_inc_with_uref(map);
seq_info->map = map;
return 0;
}
@@ -628,6 +709,7 @@ static void bpf_iter_fini_array_map(void *priv_data)
{
struct bpf_iter_seq_array_map_info *seq_info = priv_data;
+ bpf_map_put_with_uref(seq_info->map);
kfree(seq_info->percpu_value_buf);
}
@@ -645,8 +727,8 @@ static const struct bpf_iter_seq_info iter_seq_info = {
.seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info),
};
-static int bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn,
- void *callback_ctx, u64 flags)
+static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn,
+ void *callback_ctx, u64 flags)
{
u32 i, key, num_elems = 0;
struct bpf_array *array;
@@ -654,18 +736,18 @@ static int bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_
u64 ret = 0;
void *val;
+ cant_migrate();
+
if (flags != 0)
return -EINVAL;
is_percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
array = container_of(map, struct bpf_array, map);
- if (is_percpu)
- migrate_disable();
for (i = 0; i < map->max_entries; i++) {
if (is_percpu)
val = this_cpu_ptr(array->pptrs[i]);
else
- val = array->value + array->elem_size * i;
+ val = array_map_elem_ptr(array, i);
num_elems++;
key = i;
ret = callback_fn((u64)(long)map, (u64)(long)&key,
@@ -675,19 +757,39 @@ static int bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_
break;
}
- if (is_percpu)
- migrate_enable();
return num_elems;
}
-static int array_map_btf_id;
+static u64 array_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ bool percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
+ u32 elem_size = array->elem_size;
+ u64 entries = map->max_entries;
+ u64 usage = sizeof(*array);
+
+ if (percpu) {
+ usage += entries * sizeof(void *);
+ usage += entries * elem_size * num_possible_cpus();
+ } else {
+ if (map->map_flags & BPF_F_MMAPABLE) {
+ usage = PAGE_ALIGN(usage);
+ usage += PAGE_ALIGN(entries * elem_size);
+ } else {
+ usage += entries * elem_size;
+ }
+ }
+ return usage;
+}
+
+BTF_ID_LIST_SINGLE(array_map_btf_ids, struct, bpf_array)
const struct bpf_map_ops array_map_ops = {
.map_meta_equal = array_map_meta_equal,
.map_alloc_check = array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = array_map_free,
- .map_get_next_key = array_map_get_next_key,
- .map_release_uref = array_map_free_timers,
+ .map_get_next_key = bpf_array_get_next_key,
+ .map_release_uref = array_map_free_internal_structs,
.map_lookup_elem = array_map_lookup_elem,
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
@@ -701,29 +803,31 @@ const struct bpf_map_ops array_map_ops = {
.map_update_batch = generic_map_update_batch,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_array_elem,
- .map_btf_name = "bpf_array",
- .map_btf_id = &array_map_btf_id,
+ .map_mem_usage = array_map_mem_usage,
+ .map_btf_id = &array_map_btf_ids[0],
.iter_seq_info = &iter_seq_info,
+ .map_get_hash = &array_map_get_hash,
};
-static int percpu_array_map_btf_id;
const struct bpf_map_ops percpu_array_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = array_map_free,
- .map_get_next_key = array_map_get_next_key,
+ .map_get_next_key = bpf_array_get_next_key,
.map_lookup_elem = percpu_array_map_lookup_elem,
+ .map_gen_lookup = percpu_array_map_gen_lookup,
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
+ .map_lookup_percpu_elem = percpu_array_map_lookup_percpu_elem,
.map_seq_show_elem = percpu_array_map_seq_show_elem,
.map_check_btf = array_map_check_btf,
.map_lookup_batch = generic_map_lookup_batch,
.map_update_batch = generic_map_update_batch,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_array_elem,
- .map_btf_name = "bpf_array",
- .map_btf_id = &percpu_array_map_btf_id,
+ .map_mem_usage = array_map_mem_usage,
+ .map_btf_id = &array_map_btf_ids[0],
.iter_seq_info = &iter_seq_info,
};
@@ -804,11 +908,11 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
}
if (old_ptr)
- map->ops->map_fd_put_ptr(old_ptr);
+ map->ops->map_fd_put_ptr(map, old_ptr, true);
return 0;
}
-static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
+static long __fd_array_map_delete_elem(struct bpf_map *map, void *key, bool need_defer)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
void *old_ptr;
@@ -827,33 +931,60 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
}
if (old_ptr) {
- map->ops->map_fd_put_ptr(old_ptr);
+ map->ops->map_fd_put_ptr(map, old_ptr, need_defer);
return 0;
} else {
return -ENOENT;
}
}
+static long fd_array_map_delete_elem(struct bpf_map *map, void *key)
+{
+ return __fd_array_map_delete_elem(map, key, true);
+}
+
static void *prog_fd_array_get_ptr(struct bpf_map *map,
struct file *map_file, int fd)
{
- struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_prog *prog = bpf_prog_get(fd);
+ bool is_extended;
if (IS_ERR(prog))
return prog;
- if (!bpf_prog_array_compatible(array, prog)) {
+ if (prog->type == BPF_PROG_TYPE_EXT ||
+ !bpf_prog_map_compatible(map, prog)) {
bpf_prog_put(prog);
return ERR_PTR(-EINVAL);
}
+ mutex_lock(&prog->aux->ext_mutex);
+ is_extended = prog->aux->is_extended;
+ if (!is_extended)
+ prog->aux->prog_array_member_cnt++;
+ mutex_unlock(&prog->aux->ext_mutex);
+ if (is_extended) {
+ /* Extended prog can not be tail callee. It's to prevent a
+ * potential infinite loop like:
+ * tail callee prog entry -> tail callee prog subprog ->
+ * freplace prog entry --tailcall-> tail callee prog entry.
+ */
+ bpf_prog_put(prog);
+ return ERR_PTR(-EBUSY);
+ }
+
return prog;
}
-static void prog_fd_array_put_ptr(void *ptr)
+static void prog_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
{
- bpf_prog_put(ptr);
+ struct bpf_prog *prog = ptr;
+
+ mutex_lock(&prog->aux->ext_mutex);
+ prog->aux->prog_array_member_cnt--;
+ mutex_unlock(&prog->aux->ext_mutex);
+ /* bpf_prog is freed after one RCU or tasks trace grace period */
+ bpf_prog_put(prog);
}
static u32 prog_fd_array_sys_lookup_elem(void *ptr)
@@ -862,13 +993,13 @@ static u32 prog_fd_array_sys_lookup_elem(void *ptr)
}
/* decrement refcnt of all bpf_progs that are stored in this map */
-static void bpf_fd_array_map_clear(struct bpf_map *map)
+static void bpf_fd_array_map_clear(struct bpf_map *map, bool need_defer)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
for (i = 0; i < array->map.max_entries; i++)
- fd_array_map_delete_elem(map, &i);
+ __fd_array_map_delete_elem(map, &i, need_defer);
}
static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
@@ -887,7 +1018,7 @@ static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
prog_id = prog_fd_array_sys_lookup_elem(ptr);
btf_type_seq_show(map->btf, map->btf_value_type_id,
&prog_id, m);
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
}
}
@@ -950,11 +1081,16 @@ static void prog_array_map_poke_untrack(struct bpf_map *map,
mutex_unlock(&aux->poke_mutex);
}
+void __weak bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
+ struct bpf_prog *new, struct bpf_prog *old)
+{
+ WARN_ON_ONCE(1);
+}
+
static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
struct bpf_prog *old,
struct bpf_prog *new)
{
- u8 *old_addr, *new_addr, *old_bypass_addr;
struct prog_poke_elem *elem;
struct bpf_array_aux *aux;
@@ -963,7 +1099,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
list_for_each_entry(elem, &aux->poke_progs, list) {
struct bpf_jit_poke_descriptor *poke;
- int i, ret;
+ int i;
for (i = 0; i < elem->aux->size_poke_tab; i++) {
poke = &elem->aux->poke_tab[i];
@@ -982,21 +1118,10 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
* activated, so tail call updates can arrive from here
* while JIT is still finishing its final fixup for
* non-activated poke entries.
- * 3) On program teardown, the program's kallsym entry gets
- * removed out of RCU callback, but we can only untrack
- * from sleepable context, therefore bpf_arch_text_poke()
- * might not see that this is in BPF text section and
- * bails out with -EINVAL. As these are unreachable since
- * RCU grace period already passed, we simply skip them.
- * 4) Also programs reaching refcount of zero while patching
+ * 3) Also programs reaching refcount of zero while patching
* is in progress is okay since we're protected under
* poke_mutex and untrack the programs before the JIT
- * buffer is freed. When we're still in the middle of
- * patching and suddenly kallsyms entry of the program
- * gets evicted, we just skip the rest which is fine due
- * to point 3).
- * 5) Any other error happening below from bpf_arch_text_poke()
- * is a unexpected bug.
+ * buffer is freed.
*/
if (!READ_ONCE(poke->tailcall_target_stable))
continue;
@@ -1006,39 +1131,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
poke->tail_call.key != key)
continue;
- old_bypass_addr = old ? NULL : poke->bypass_addr;
- old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL;
- new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL;
-
- if (new) {
- ret = bpf_arch_text_poke(poke->tailcall_target,
- BPF_MOD_JUMP,
- old_addr, new_addr);
- BUG_ON(ret < 0 && ret != -EINVAL);
- if (!old) {
- ret = bpf_arch_text_poke(poke->tailcall_bypass,
- BPF_MOD_JUMP,
- poke->bypass_addr,
- NULL);
- BUG_ON(ret < 0 && ret != -EINVAL);
- }
- } else {
- ret = bpf_arch_text_poke(poke->tailcall_bypass,
- BPF_MOD_JUMP,
- old_bypass_addr,
- poke->bypass_addr);
- BUG_ON(ret < 0 && ret != -EINVAL);
- /* let other CPUs finish the execution of program
- * so that it will not possible to expose them
- * to invalid nop, stack unwind, nop state
- */
- if (!ret)
- synchronize_rcu();
- ret = bpf_arch_text_poke(poke->tailcall_target,
- BPF_MOD_JUMP,
- old_addr, NULL);
- BUG_ON(ret < 0 && ret != -EINVAL);
- }
+ bpf_arch_poke_desc_update(poke, new, old);
}
}
}
@@ -1047,7 +1140,7 @@ static void prog_array_map_clear_deferred(struct work_struct *work)
{
struct bpf_map *map = container_of(work, struct bpf_array_aux,
work)->map;
- bpf_fd_array_map_clear(map);
+ bpf_fd_array_map_clear(map, true);
bpf_map_put(map);
}
@@ -1071,7 +1164,6 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
INIT_WORK(&aux->work, prog_array_map_clear_deferred);
INIT_LIST_HEAD(&aux->poke_progs);
mutex_init(&aux->poke_mutex);
- spin_lock_init(&aux->owner.lock);
map = array_map_alloc(attr);
if (IS_ERR(map)) {
@@ -1104,7 +1196,6 @@ static void prog_array_map_free(struct bpf_map *map)
* Thus, prog_array_map cannot be used as an inner_map
* and map_meta_equal is not implemented.
*/
-static int prog_array_map_btf_id;
const struct bpf_map_ops prog_array_map_ops = {
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = prog_array_map_alloc,
@@ -1112,7 +1203,7 @@ const struct bpf_map_ops prog_array_map_ops = {
.map_poke_track = prog_array_map_poke_track,
.map_poke_untrack = prog_array_map_poke_untrack,
.map_poke_run = prog_array_map_poke_run,
- .map_get_next_key = array_map_get_next_key,
+ .map_get_next_key = bpf_array_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = prog_fd_array_get_ptr,
@@ -1120,8 +1211,8 @@ const struct bpf_map_ops prog_array_map_ops = {
.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
.map_release_uref = prog_array_map_clear,
.map_seq_show_elem = prog_array_map_seq_show_elem,
- .map_btf_name = "bpf_array",
- .map_btf_id = &prog_array_map_btf_id,
+ .map_mem_usage = array_map_mem_usage,
+ .map_btf_id = &array_map_btf_ids[0],
};
static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
@@ -1129,7 +1220,7 @@ static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
{
struct bpf_event_entry *ee;
- ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
+ ee = kzalloc(sizeof(*ee), GFP_KERNEL);
if (ee) {
ee->event = perf_file->private_data;
ee->perf_file = perf_file;
@@ -1179,8 +1270,9 @@ err_out:
return ee;
}
-static void perf_event_fd_array_put_ptr(void *ptr)
+static void perf_event_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
{
+ /* bpf_perf_event is freed after one RCU grace period */
bpf_event_entry_free_rcu(ptr);
}
@@ -1198,7 +1290,7 @@ static void perf_event_fd_array_release(struct bpf_map *map,
for (i = 0; i < array->map.max_entries; i++) {
ee = READ_ONCE(array->ptrs[i]);
if (ee && ee->map_file == map_file)
- fd_array_map_delete_elem(map, &i);
+ __fd_array_map_delete_elem(map, &i, true);
}
rcu_read_unlock();
}
@@ -1206,25 +1298,24 @@ static void perf_event_fd_array_release(struct bpf_map *map,
static void perf_event_fd_array_map_free(struct bpf_map *map)
{
if (map->map_flags & BPF_F_PRESERVE_ELEMS)
- bpf_fd_array_map_clear(map);
+ bpf_fd_array_map_clear(map, false);
fd_array_map_free(map);
}
-static int perf_event_array_map_btf_id;
const struct bpf_map_ops perf_event_array_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = perf_event_fd_array_map_free,
- .map_get_next_key = array_map_get_next_key,
+ .map_get_next_key = bpf_array_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = perf_event_fd_array_get_ptr,
.map_fd_put_ptr = perf_event_fd_array_put_ptr,
.map_release = perf_event_fd_array_release,
.map_check_btf = map_check_no_btf,
- .map_btf_name = "bpf_array",
- .map_btf_id = &perf_event_array_map_btf_id,
+ .map_mem_usage = array_map_mem_usage,
+ .map_btf_id = &array_map_btf_ids[0],
};
#ifdef CONFIG_CGROUPS
@@ -1235,7 +1326,7 @@ static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
return cgroup_get_from_fd(fd);
}
-static void cgroup_fd_array_put_ptr(void *ptr)
+static void cgroup_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
{
/* cgroup_put free cgrp after a rcu grace period */
cgroup_put(ptr);
@@ -1243,24 +1334,23 @@ static void cgroup_fd_array_put_ptr(void *ptr)
static void cgroup_fd_array_free(struct bpf_map *map)
{
- bpf_fd_array_map_clear(map);
+ bpf_fd_array_map_clear(map, false);
fd_array_map_free(map);
}
-static int cgroup_array_map_btf_id;
const struct bpf_map_ops cgroup_array_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = cgroup_fd_array_free,
- .map_get_next_key = array_map_get_next_key,
+ .map_get_next_key = bpf_array_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = cgroup_fd_array_get_ptr,
.map_fd_put_ptr = cgroup_fd_array_put_ptr,
.map_check_btf = map_check_no_btf,
- .map_btf_name = "bpf_array",
- .map_btf_id = &cgroup_array_map_btf_id,
+ .map_mem_usage = array_map_mem_usage,
+ .map_btf_id = &array_map_btf_ids[0],
};
#endif
@@ -1289,7 +1379,7 @@ static void array_of_map_free(struct bpf_map *map)
* is protected by fdget/fdput.
*/
bpf_map_meta_free(map->inner_map_meta);
- bpf_fd_array_map_clear(map);
+ bpf_fd_array_map_clear(map, false);
fd_array_map_free(map);
}
@@ -1307,7 +1397,7 @@ static int array_of_map_gen_lookup(struct bpf_map *map,
struct bpf_insn *insn_buf)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
- u32 elem_size = round_up(map->value_size, 8);
+ u32 elem_size = array->elem_size;
struct bpf_insn *insn = insn_buf;
const int ret = BPF_REG_0;
const int map_ptr = BPF_REG_1;
@@ -1334,19 +1424,20 @@ static int array_of_map_gen_lookup(struct bpf_map *map,
return insn - insn_buf;
}
-static int array_of_maps_map_btf_id;
const struct bpf_map_ops array_of_maps_map_ops = {
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = array_of_map_alloc,
.map_free = array_of_map_free,
- .map_get_next_key = array_map_get_next_key,
+ .map_get_next_key = bpf_array_get_next_key,
.map_lookup_elem = array_of_map_lookup_elem,
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = bpf_map_fd_get_ptr,
.map_fd_put_ptr = bpf_map_fd_put_ptr,
.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
.map_gen_lookup = array_of_map_gen_lookup,
+ .map_lookup_batch = generic_map_lookup_batch,
+ .map_update_batch = generic_map_update_batch,
.map_check_btf = map_check_no_btf,
- .map_btf_name = "bpf_array",
- .map_btf_id = &array_of_maps_map_btf_id,
+ .map_mem_usage = array_map_mem_usage,
+ .map_btf_id = &array_map_btf_ids[0],
};
diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c
index b141a1346f72..35e1ddca74d2 100644
--- a/kernel/bpf/bloom_filter.c
+++ b/kernel/bpf/bloom_filter.c
@@ -7,6 +7,7 @@
#include <linux/err.h>
#include <linux/jhash.h>
#include <linux/random.h>
+#include <linux/btf_ids.h>
#define BLOOM_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_ZERO_SEED | BPF_F_ACCESS_MASK)
@@ -15,13 +16,6 @@ struct bpf_bloom_filter {
struct bpf_map map;
u32 bitset_mask;
u32 hash_seed;
- /* If the size of the values in the bloom filter is u32 aligned,
- * then it is more performant to use jhash2 as the underlying hash
- * function, else we use jhash. This tracks the number of u32s
- * in an u32-aligned value size. If the value size is not u32 aligned,
- * this will be 0.
- */
- u32 aligned_u32_count;
u32 nr_hash_funcs;
unsigned long bitset[];
};
@@ -31,16 +25,15 @@ static u32 hash(struct bpf_bloom_filter *bloom, void *value,
{
u32 h;
- if (bloom->aligned_u32_count)
- h = jhash2(value, bloom->aligned_u32_count,
- bloom->hash_seed + index);
+ if (likely(value_size % 4 == 0))
+ h = jhash2(value, value_size / 4, bloom->hash_seed + index);
else
h = jhash(value, value_size, bloom->hash_seed + index);
return h & bloom->bitset_mask;
}
-static int bloom_map_peek_elem(struct bpf_map *map, void *value)
+static long bloom_map_peek_elem(struct bpf_map *map, void *value)
{
struct bpf_bloom_filter *bloom =
container_of(map, struct bpf_bloom_filter, map);
@@ -55,7 +48,7 @@ static int bloom_map_peek_elem(struct bpf_map *map, void *value)
return 0;
}
-static int bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags)
+static long bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags)
{
struct bpf_bloom_filter *bloom =
container_of(map, struct bpf_bloom_filter, map);
@@ -72,12 +65,12 @@ static int bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags)
return 0;
}
-static int bloom_map_pop_elem(struct bpf_map *map, void *value)
+static long bloom_map_pop_elem(struct bpf_map *map, void *value)
{
return -EOPNOTSUPP;
}
-static int bloom_map_delete_elem(struct bpf_map *map, void *value)
+static long bloom_map_delete_elem(struct bpf_map *map, void *value)
{
return -EOPNOTSUPP;
}
@@ -87,15 +80,24 @@ static int bloom_map_get_next_key(struct bpf_map *map, void *key, void *next_key
return -EOPNOTSUPP;
}
+/* Called from syscall */
+static int bloom_map_alloc_check(union bpf_attr *attr)
+{
+ if (attr->value_size > KMALLOC_MAX_SIZE)
+ /* if value_size is bigger, the user space won't be able to
+ * access the elements.
+ */
+ return -E2BIG;
+
+ return 0;
+}
+
static struct bpf_map *bloom_map_alloc(union bpf_attr *attr)
{
u32 bitset_bytes, bitset_mask, nr_hash_funcs, nr_bits;
int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_bloom_filter *bloom;
- if (!bpf_capable())
- return ERR_PTR(-EPERM);
-
if (attr->key_size != 0 || attr->value_size == 0 ||
attr->max_entries == 0 ||
attr->map_flags & ~BLOOM_CREATE_FLAG_MASK ||
@@ -151,13 +153,8 @@ static struct bpf_map *bloom_map_alloc(union bpf_attr *attr)
bloom->nr_hash_funcs = nr_hash_funcs;
bloom->bitset_mask = bitset_mask;
- /* Check whether the value size is u32-aligned */
- if ((attr->value_size & (sizeof(u32) - 1)) == 0)
- bloom->aligned_u32_count =
- attr->value_size / sizeof(u32);
-
if (!(attr->map_flags & BPF_F_ZERO_SEED))
- bloom->hash_seed = get_random_int();
+ bloom->hash_seed = get_random_u32();
return &bloom->map;
}
@@ -176,8 +173,8 @@ static void *bloom_map_lookup_elem(struct bpf_map *map, void *key)
return ERR_PTR(-EINVAL);
}
-static int bloom_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 flags)
+static long bloom_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
{
/* The eBPF program should use map_push_elem instead */
return -EINVAL;
@@ -192,9 +189,21 @@ static int bloom_map_check_btf(const struct bpf_map *map,
return btf_type_is_void(key_type) ? 0 : -EINVAL;
}
-static int bpf_bloom_map_btf_id;
+static u64 bloom_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_bloom_filter *bloom;
+ u64 bitset_bytes;
+
+ bloom = container_of(map, struct bpf_bloom_filter, map);
+ bitset_bytes = BITS_TO_BYTES((u64)bloom->bitset_mask + 1);
+ bitset_bytes = roundup(bitset_bytes, sizeof(unsigned long));
+ return sizeof(*bloom) + bitset_bytes;
+}
+
+BTF_ID_LIST_SINGLE(bpf_bloom_map_btf_ids, struct, bpf_bloom_filter)
const struct bpf_map_ops bloom_filter_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = bloom_map_alloc_check,
.map_alloc = bloom_map_alloc,
.map_free = bloom_map_free,
.map_get_next_key = bloom_map_get_next_key,
@@ -205,6 +214,6 @@ const struct bpf_map_ops bloom_filter_map_ops = {
.map_update_elem = bloom_map_update_elem,
.map_delete_elem = bloom_map_delete_elem,
.map_check_btf = bloom_map_check_btf,
- .map_btf_name = "bpf_bloom_filter",
- .map_btf_id = &bpf_bloom_map_btf_id,
+ .map_mem_usage = bloom_map_mem_usage,
+ .map_btf_id = &bpf_bloom_map_btf_ids[0],
};
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
new file mode 100644
index 000000000000..0687a760974a
--- /dev/null
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -0,0 +1,238 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ */
+
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <linux/bpf_local_storage.h>
+#include <uapi/linux/btf.h>
+#include <linux/btf_ids.h>
+
+DEFINE_BPF_STORAGE_CACHE(cgroup_cache);
+
+static DEFINE_PER_CPU(int, bpf_cgrp_storage_busy);
+
+static void bpf_cgrp_storage_lock(void)
+{
+ cant_migrate();
+ this_cpu_inc(bpf_cgrp_storage_busy);
+}
+
+static void bpf_cgrp_storage_unlock(void)
+{
+ this_cpu_dec(bpf_cgrp_storage_busy);
+}
+
+static bool bpf_cgrp_storage_trylock(void)
+{
+ cant_migrate();
+ if (unlikely(this_cpu_inc_return(bpf_cgrp_storage_busy) != 1)) {
+ this_cpu_dec(bpf_cgrp_storage_busy);
+ return false;
+ }
+ return true;
+}
+
+static struct bpf_local_storage __rcu **cgroup_storage_ptr(void *owner)
+{
+ struct cgroup *cg = owner;
+
+ return &cg->bpf_cgrp_storage;
+}
+
+void bpf_cgrp_storage_free(struct cgroup *cgroup)
+{
+ struct bpf_local_storage *local_storage;
+
+ rcu_read_lock_dont_migrate();
+ local_storage = rcu_dereference(cgroup->bpf_cgrp_storage);
+ if (!local_storage)
+ goto out;
+
+ bpf_cgrp_storage_lock();
+ bpf_local_storage_destroy(local_storage);
+ bpf_cgrp_storage_unlock();
+out:
+ rcu_read_unlock_migrate();
+}
+
+static struct bpf_local_storage_data *
+cgroup_storage_lookup(struct cgroup *cgroup, struct bpf_map *map, bool cacheit_lockit)
+{
+ struct bpf_local_storage *cgroup_storage;
+ struct bpf_local_storage_map *smap;
+
+ cgroup_storage = rcu_dereference_check(cgroup->bpf_cgrp_storage,
+ bpf_rcu_lock_held());
+ if (!cgroup_storage)
+ return NULL;
+
+ smap = (struct bpf_local_storage_map *)map;
+ return bpf_local_storage_lookup(cgroup_storage, smap, cacheit_lockit);
+}
+
+static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_local_storage_data *sdata;
+ struct cgroup *cgroup;
+ int fd;
+
+ fd = *(int *)key;
+ cgroup = cgroup_v1v2_get_from_fd(fd);
+ if (IS_ERR(cgroup))
+ return ERR_CAST(cgroup);
+
+ bpf_cgrp_storage_lock();
+ sdata = cgroup_storage_lookup(cgroup, map, true);
+ bpf_cgrp_storage_unlock();
+ cgroup_put(cgroup);
+ return sdata ? sdata->data : NULL;
+}
+
+static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ struct bpf_local_storage_data *sdata;
+ struct cgroup *cgroup;
+ int fd;
+
+ fd = *(int *)key;
+ cgroup = cgroup_v1v2_get_from_fd(fd);
+ if (IS_ERR(cgroup))
+ return PTR_ERR(cgroup);
+
+ bpf_cgrp_storage_lock();
+ sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
+ value, map_flags, false, GFP_ATOMIC);
+ bpf_cgrp_storage_unlock();
+ cgroup_put(cgroup);
+ return PTR_ERR_OR_ZERO(sdata);
+}
+
+static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map)
+{
+ struct bpf_local_storage_data *sdata;
+
+ sdata = cgroup_storage_lookup(cgroup, map, false);
+ if (!sdata)
+ return -ENOENT;
+
+ bpf_selem_unlink(SELEM(sdata), false);
+ return 0;
+}
+
+static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key)
+{
+ struct cgroup *cgroup;
+ int err, fd;
+
+ fd = *(int *)key;
+ cgroup = cgroup_v1v2_get_from_fd(fd);
+ if (IS_ERR(cgroup))
+ return PTR_ERR(cgroup);
+
+ bpf_cgrp_storage_lock();
+ err = cgroup_storage_delete(cgroup, map);
+ bpf_cgrp_storage_unlock();
+ cgroup_put(cgroup);
+ return err;
+}
+
+static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ return -ENOTSUPP;
+}
+
+static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
+{
+ return bpf_local_storage_map_alloc(attr, &cgroup_cache, true);
+}
+
+static void cgroup_storage_map_free(struct bpf_map *map)
+{
+ bpf_local_storage_map_free(map, &cgroup_cache, &bpf_cgrp_storage_busy);
+}
+
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
+ void *, value, u64, flags, gfp_t, gfp_flags)
+{
+ struct bpf_local_storage_data *sdata;
+ bool nobusy;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
+ return (unsigned long)NULL;
+
+ if (!cgroup)
+ return (unsigned long)NULL;
+
+ nobusy = bpf_cgrp_storage_trylock();
+
+ sdata = cgroup_storage_lookup(cgroup, map, nobusy);
+ if (sdata)
+ goto unlock;
+
+ /* only allocate new storage, when the cgroup is refcounted */
+ if (!percpu_ref_is_dying(&cgroup->self.refcnt) &&
+ (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy)
+ sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
+ value, BPF_NOEXIST, false, gfp_flags);
+
+unlock:
+ if (nobusy)
+ bpf_cgrp_storage_unlock();
+ return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data;
+}
+
+BPF_CALL_2(bpf_cgrp_storage_delete, struct bpf_map *, map, struct cgroup *, cgroup)
+{
+ int ret;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (!cgroup)
+ return -EINVAL;
+
+ if (!bpf_cgrp_storage_trylock())
+ return -EBUSY;
+
+ ret = cgroup_storage_delete(cgroup, map);
+ bpf_cgrp_storage_unlock();
+ return ret;
+}
+
+const struct bpf_map_ops cgrp_storage_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = bpf_local_storage_map_alloc_check,
+ .map_alloc = cgroup_storage_map_alloc,
+ .map_free = cgroup_storage_map_free,
+ .map_get_next_key = notsupp_get_next_key,
+ .map_lookup_elem = bpf_cgrp_storage_lookup_elem,
+ .map_update_elem = bpf_cgrp_storage_update_elem,
+ .map_delete_elem = bpf_cgrp_storage_delete_elem,
+ .map_check_btf = bpf_local_storage_map_check_btf,
+ .map_mem_usage = bpf_local_storage_map_mem_usage,
+ .map_btf_id = &bpf_local_storage_map_btf_id[0],
+ .map_owner_storage_ptr = cgroup_storage_ptr,
+};
+
+const struct bpf_func_proto bpf_cgrp_storage_get_proto = {
+ .func = bpf_cgrp_storage_get,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
+ .arg2_btf_id = &bpf_cgroup_btf_id[0],
+ .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
+
+const struct bpf_func_proto bpf_cgrp_storage_delete_proto = {
+ .func = bpf_cgrp_storage_delete,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
+ .arg2_btf_id = &bpf_cgroup_btf_id[0],
+};
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index e29d9e3d853e..e54cce2b9175 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -16,7 +16,6 @@
#include <uapi/linux/btf.h>
#include <linux/bpf_lsm.h>
#include <linux/btf_ids.h>
-#include <linux/fdtable.h>
#include <linux/rcupdate_trace.h>
DEFINE_BPF_STORAGE_CACHE(inode_cache);
@@ -56,88 +55,50 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
void bpf_inode_storage_free(struct inode *inode)
{
- struct bpf_local_storage_elem *selem;
struct bpf_local_storage *local_storage;
- bool free_inode_storage = false;
struct bpf_storage_blob *bsb;
- struct hlist_node *n;
bsb = bpf_inode(inode);
if (!bsb)
return;
- rcu_read_lock();
+ rcu_read_lock_dont_migrate();
local_storage = rcu_dereference(bsb->storage);
- if (!local_storage) {
- rcu_read_unlock();
- return;
- }
+ if (!local_storage)
+ goto out;
- /* Neither the bpf_prog nor the bpf-map's syscall
- * could be modifying the local_storage->list now.
- * Thus, no elem can be added-to or deleted-from the
- * local_storage->list by the bpf_prog or by the bpf-map's syscall.
- *
- * It is racing with bpf_local_storage_map_free() alone
- * when unlinking elem from the local_storage->list and
- * the map's bucket->list.
- */
- raw_spin_lock_bh(&local_storage->lock);
- hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
- /* Always unlink from map before unlinking from
- * local_storage.
- */
- bpf_selem_unlink_map(selem);
- free_inode_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, false);
- }
- raw_spin_unlock_bh(&local_storage->lock);
- rcu_read_unlock();
-
- /* free_inoode_storage should always be true as long as
- * local_storage->list was non-empty.
- */
- if (free_inode_storage)
- kfree_rcu(local_storage, rcu);
+ bpf_local_storage_destroy(local_storage);
+out:
+ rcu_read_unlock_migrate();
}
static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_local_storage_data *sdata;
- struct file *f;
- int fd;
+ CLASS(fd_raw, f)(*(int *)key);
- fd = *(int *)key;
- f = fget_raw(fd);
- if (!f)
+ if (fd_empty(f))
return ERR_PTR(-EBADF);
- sdata = inode_storage_lookup(f->f_inode, map, true);
- fput(f);
+ sdata = inode_storage_lookup(file_inode(fd_file(f)), map, true);
return sdata ? sdata->data : NULL;
}
-static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
{
struct bpf_local_storage_data *sdata;
- struct file *f;
- int fd;
+ CLASS(fd_raw, f)(*(int *)key);
- fd = *(int *)key;
- f = fget_raw(fd);
- if (!f)
+ if (fd_empty(f))
return -EBADF;
- if (!inode_storage_ptr(f->f_inode)) {
- fput(f);
+ if (!inode_storage_ptr(file_inode(fd_file(f))))
return -EBADF;
- }
- sdata = bpf_local_storage_update(f->f_inode,
+ sdata = bpf_local_storage_update(file_inode(fd_file(f)),
(struct bpf_local_storage_map *)map,
- value, map_flags);
- fput(f);
+ value, map_flags, false, GFP_ATOMIC);
return PTR_ERR_OR_ZERO(sdata);
}
@@ -149,28 +110,23 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
if (!sdata)
return -ENOENT;
- bpf_selem_unlink(SELEM(sdata));
+ bpf_selem_unlink(SELEM(sdata), false);
return 0;
}
-static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
+static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
{
- struct file *f;
- int fd, err;
+ CLASS(fd_raw, f)(*(int *)key);
- fd = *(int *)key;
- f = fget_raw(fd);
- if (!f)
+ if (fd_empty(f))
return -EBADF;
-
- err = inode_storage_delete(f->f_inode, map);
- fput(f);
- return err;
+ return inode_storage_delete(file_inode(fd_file(f)), map);
}
-BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
- void *, value, u64, flags)
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
+ void *, value, u64, flags, gfp_t, gfp_flags)
{
struct bpf_local_storage_data *sdata;
@@ -196,7 +152,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
sdata = bpf_local_storage_update(
inode, (struct bpf_local_storage_map *)map, value,
- BPF_NOEXIST);
+ BPF_NOEXIST, false, gfp_flags);
return IS_ERR(sdata) ? (unsigned long)NULL :
(unsigned long)sdata->data;
}
@@ -225,26 +181,14 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key,
static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr)
{
- struct bpf_local_storage_map *smap;
-
- smap = bpf_local_storage_map_alloc(attr);
- if (IS_ERR(smap))
- return ERR_CAST(smap);
-
- smap->cache_idx = bpf_local_storage_cache_idx_get(&inode_cache);
- return &smap->map;
+ return bpf_local_storage_map_alloc(attr, &inode_cache, false);
}
static void inode_storage_map_free(struct bpf_map *map)
{
- struct bpf_local_storage_map *smap;
-
- smap = (struct bpf_local_storage_map *)map;
- bpf_local_storage_cache_idx_free(&inode_cache, smap->cache_idx);
- bpf_local_storage_map_free(smap, NULL);
+ bpf_local_storage_map_free(map, &inode_cache, NULL);
}
-static int inode_storage_map_btf_id;
const struct bpf_map_ops inode_storage_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = bpf_local_storage_map_alloc_check,
@@ -255,8 +199,8 @@ const struct bpf_map_ops inode_storage_map_ops = {
.map_update_elem = bpf_fd_inode_storage_update_elem,
.map_delete_elem = bpf_fd_inode_storage_delete_elem,
.map_check_btf = bpf_local_storage_map_check_btf,
- .map_btf_name = "bpf_local_storage_map",
- .map_btf_id = &inode_storage_map_btf_id,
+ .map_mem_usage = bpf_local_storage_map_mem_usage,
+ .map_btf_id = &bpf_local_storage_map_btf_id[0],
.map_owner_storage_ptr = inode_storage_ptr,
};
@@ -267,7 +211,7 @@ const struct bpf_func_proto bpf_inode_storage_get_proto = {
.gpl_only = false,
.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
.arg2_btf_id = &bpf_inode_storage_btf_ids[0],
.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
.arg4_type = ARG_ANYTHING,
@@ -278,6 +222,6 @@ const struct bpf_func_proto bpf_inode_storage_delete_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
.arg2_btf_id = &bpf_inode_storage_btf_ids[0],
};
diff --git a/kernel/bpf/bpf_insn_array.c b/kernel/bpf/bpf_insn_array.c
new file mode 100644
index 000000000000..c96630cb75bf
--- /dev/null
+++ b/kernel/bpf/bpf_insn_array.c
@@ -0,0 +1,304 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025 Isovalent */
+
+#include <linux/bpf.h>
+
+struct bpf_insn_array {
+ struct bpf_map map;
+ atomic_t used;
+ long *ips;
+ DECLARE_FLEX_ARRAY(struct bpf_insn_array_value, values);
+};
+
+#define cast_insn_array(MAP_PTR) \
+ container_of((MAP_PTR), struct bpf_insn_array, map)
+
+#define INSN_DELETED ((u32)-1)
+
+static inline u64 insn_array_alloc_size(u32 max_entries)
+{
+ const u64 base_size = sizeof(struct bpf_insn_array);
+ const u64 entry_size = sizeof(struct bpf_insn_array_value);
+
+ return base_size + max_entries * (entry_size + sizeof(long));
+}
+
+static int insn_array_alloc_check(union bpf_attr *attr)
+{
+ u32 value_size = sizeof(struct bpf_insn_array_value);
+
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ attr->value_size != value_size || attr->map_flags != 0)
+ return -EINVAL;
+
+ return 0;
+}
+
+static void insn_array_free(struct bpf_map *map)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+
+ bpf_map_area_free(insn_array);
+}
+
+static struct bpf_map *insn_array_alloc(union bpf_attr *attr)
+{
+ u64 size = insn_array_alloc_size(attr->max_entries);
+ struct bpf_insn_array *insn_array;
+
+ insn_array = bpf_map_area_alloc(size, NUMA_NO_NODE);
+ if (!insn_array)
+ return ERR_PTR(-ENOMEM);
+
+ /* ips are allocated right after the insn_array->values[] array */
+ insn_array->ips = (void *)&insn_array->values[attr->max_entries];
+
+ bpf_map_init_from_attr(&insn_array->map, attr);
+
+ /* BPF programs aren't allowed to write to the map */
+ insn_array->map.map_flags |= BPF_F_RDONLY_PROG;
+
+ return &insn_array->map;
+}
+
+static void *insn_array_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+ u32 index = *(u32 *)key;
+
+ if (unlikely(index >= insn_array->map.max_entries))
+ return NULL;
+
+ return &insn_array->values[index];
+}
+
+static long insn_array_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+ u32 index = *(u32 *)key;
+ struct bpf_insn_array_value val = {};
+
+ if (unlikely(index >= insn_array->map.max_entries))
+ return -E2BIG;
+
+ if (unlikely(map_flags & BPF_NOEXIST))
+ return -EEXIST;
+
+ copy_map_value(map, &val, value);
+ if (val.jitted_off || val.xlated_off)
+ return -EINVAL;
+
+ insn_array->values[index].orig_off = val.orig_off;
+
+ return 0;
+}
+
+static long insn_array_delete_elem(struct bpf_map *map, void *key)
+{
+ return -EINVAL;
+}
+
+static int insn_array_check_btf(const struct bpf_map *map,
+ const struct btf *btf,
+ const struct btf_type *key_type,
+ const struct btf_type *value_type)
+{
+ if (!btf_type_is_i32(key_type))
+ return -EINVAL;
+
+ if (!btf_type_is_i64(value_type))
+ return -EINVAL;
+
+ return 0;
+}
+
+static u64 insn_array_mem_usage(const struct bpf_map *map)
+{
+ return insn_array_alloc_size(map->max_entries);
+}
+
+static int insn_array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+
+ if ((off % sizeof(long)) != 0 ||
+ (off / sizeof(long)) >= map->max_entries)
+ return -EINVAL;
+
+ /* from BPF's point of view, this map is a jump table */
+ *imm = (unsigned long)insn_array->ips + off;
+
+ return 0;
+}
+
+BTF_ID_LIST_SINGLE(insn_array_btf_ids, struct, bpf_insn_array)
+
+const struct bpf_map_ops insn_array_map_ops = {
+ .map_alloc_check = insn_array_alloc_check,
+ .map_alloc = insn_array_alloc,
+ .map_free = insn_array_free,
+ .map_get_next_key = bpf_array_get_next_key,
+ .map_lookup_elem = insn_array_lookup_elem,
+ .map_update_elem = insn_array_update_elem,
+ .map_delete_elem = insn_array_delete_elem,
+ .map_check_btf = insn_array_check_btf,
+ .map_mem_usage = insn_array_mem_usage,
+ .map_direct_value_addr = insn_array_map_direct_value_addr,
+ .map_btf_id = &insn_array_btf_ids[0],
+};
+
+static inline bool is_frozen(struct bpf_map *map)
+{
+ guard(mutex)(&map->freeze_mutex);
+
+ return map->frozen;
+}
+
+static bool is_insn_array(const struct bpf_map *map)
+{
+ return map->map_type == BPF_MAP_TYPE_INSN_ARRAY;
+}
+
+static inline bool valid_offsets(const struct bpf_insn_array *insn_array,
+ const struct bpf_prog *prog)
+{
+ u32 off;
+ int i;
+
+ for (i = 0; i < insn_array->map.max_entries; i++) {
+ off = insn_array->values[i].orig_off;
+
+ if (off >= prog->len)
+ return false;
+
+ if (off > 0) {
+ if (prog->insnsi[off-1].code == (BPF_LD | BPF_DW | BPF_IMM))
+ return false;
+ }
+ }
+
+ return true;
+}
+
+int bpf_insn_array_init(struct bpf_map *map, const struct bpf_prog *prog)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+ struct bpf_insn_array_value *values = insn_array->values;
+ int i;
+
+ if (!is_frozen(map))
+ return -EINVAL;
+
+ if (!valid_offsets(insn_array, prog))
+ return -EINVAL;
+
+ /*
+ * There can be only one program using the map
+ */
+ if (atomic_xchg(&insn_array->used, 1))
+ return -EBUSY;
+
+ /*
+ * Reset all the map indexes to the original values. This is needed,
+ * e.g., when a replay of verification with different log level should
+ * be performed.
+ */
+ for (i = 0; i < map->max_entries; i++)
+ values[i].xlated_off = values[i].orig_off;
+
+ return 0;
+}
+
+int bpf_insn_array_ready(struct bpf_map *map)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+ int i;
+
+ for (i = 0; i < map->max_entries; i++) {
+ if (insn_array->values[i].xlated_off == INSN_DELETED)
+ continue;
+ if (!insn_array->ips[i])
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+void bpf_insn_array_release(struct bpf_map *map)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+
+ atomic_set(&insn_array->used, 0);
+}
+
+void bpf_insn_array_adjust(struct bpf_map *map, u32 off, u32 len)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+ int i;
+
+ if (len <= 1)
+ return;
+
+ for (i = 0; i < map->max_entries; i++) {
+ if (insn_array->values[i].xlated_off <= off)
+ continue;
+ if (insn_array->values[i].xlated_off == INSN_DELETED)
+ continue;
+ insn_array->values[i].xlated_off += len - 1;
+ }
+}
+
+void bpf_insn_array_adjust_after_remove(struct bpf_map *map, u32 off, u32 len)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+ int i;
+
+ for (i = 0; i < map->max_entries; i++) {
+ if (insn_array->values[i].xlated_off < off)
+ continue;
+ if (insn_array->values[i].xlated_off == INSN_DELETED)
+ continue;
+ if (insn_array->values[i].xlated_off < off + len)
+ insn_array->values[i].xlated_off = INSN_DELETED;
+ else
+ insn_array->values[i].xlated_off -= len;
+ }
+}
+
+/*
+ * This function is called by JITs. The image is the real program
+ * image, the offsets array set up the xlated -> jitted mapping.
+ * The offsets[xlated] offset should point to the beginning of
+ * the jitted instruction.
+ */
+void bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image)
+{
+ struct bpf_insn_array *insn_array;
+ struct bpf_map *map;
+ u32 xlated_off;
+ int i, j;
+
+ if (!offsets || !image)
+ return;
+
+ for (i = 0; i < prog->aux->used_map_cnt; i++) {
+ map = prog->aux->used_maps[i];
+ if (!is_insn_array(map))
+ continue;
+
+ insn_array = cast_insn_array(map);
+ for (j = 0; j < map->max_entries; j++) {
+ xlated_off = insn_array->values[j].xlated_off;
+ if (xlated_off == INSN_DELETED)
+ continue;
+ if (xlated_off < prog->aux->subprog_start)
+ continue;
+ xlated_off -= prog->aux->subprog_start;
+ if (xlated_off >= prog->len)
+ continue;
+
+ insn_array->values[j].jitted_off = offsets[xlated_off];
+ insn_array->ips[j] = (long)(image + offsets[xlated_off]);
+ }
+ }
+}
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index b7aef5b3416d..eec60b57bd3d 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -5,6 +5,7 @@
#include <linux/anon_inodes.h>
#include <linux/filter.h>
#include <linux/bpf.h>
+#include <linux/rcupdate_trace.h>
struct bpf_iter_target_info {
struct list_head list;
@@ -37,8 +38,7 @@ static DEFINE_MUTEX(link_mutex);
/* incremented on every opened seq_file */
static atomic64_t session_id;
-static int prepare_seq_file(struct file *file, struct bpf_iter_link *link,
- const struct bpf_iter_seq_info *seq_info);
+static int prepare_seq_file(struct file *file, struct bpf_iter_link *link);
static void bpf_iter_inc_seq_num(struct seq_file *seq)
{
@@ -67,23 +67,27 @@ static void bpf_iter_done_stop(struct seq_file *seq)
iter_priv->done_stop = true;
}
+static inline bool bpf_iter_target_support_resched(const struct bpf_iter_target_info *tinfo)
+{
+ return tinfo->reg_info->feature & BPF_ITER_RESCHED;
+}
+
static bool bpf_iter_support_resched(struct seq_file *seq)
{
struct bpf_iter_priv_data *iter_priv;
iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
target_private);
- return iter_priv->tinfo->reg_info->feature & BPF_ITER_RESCHED;
+ return bpf_iter_target_support_resched(iter_priv->tinfo);
}
/* maximum visited objects before bailing out */
#define MAX_ITER_OBJECTS 1000000
/* bpf_seq_read, a customized and simpler version for bpf iterator.
- * no_llseek is assumed for this file.
* The following are differences from seq_read():
* . fixed buffer size (PAGE_SIZE)
- * . assuming no_llseek
+ * . assuming NULL ->llseek()
* . stop() may call bpf program, handling potential overflow there
*/
static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
@@ -197,6 +201,11 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
}
stop:
offs = seq->count;
+ if (IS_ERR(p)) {
+ seq->op->stop(seq, NULL);
+ err = PTR_ERR(p);
+ goto done;
+ }
/* bpf program called if !p */
seq->op->stop(seq, p);
if (!p) {
@@ -247,7 +256,7 @@ static int iter_open(struct inode *inode, struct file *file)
{
struct bpf_iter_link *link = inode->i_private;
- return prepare_seq_file(file, link, __get_seq_info(link));
+ return prepare_seq_file(file, link);
}
static int iter_release(struct inode *inode, struct file *file)
@@ -273,7 +282,6 @@ static int iter_release(struct inode *inode, struct file *file)
const struct file_operations bpf_iter_fops = {
.open = iter_open,
- .llseek = no_llseek,
.read = bpf_seq_read,
.release = iter_release,
};
@@ -326,38 +334,36 @@ static void cache_btf_id(struct bpf_iter_target_info *tinfo,
tinfo->btf_id = prog->aux->attach_btf_id;
}
-bool bpf_iter_prog_supported(struct bpf_prog *prog)
+int bpf_iter_prog_supported(struct bpf_prog *prog)
{
const char *attach_fname = prog->aux->attach_func_name;
+ struct bpf_iter_target_info *tinfo = NULL, *iter;
u32 prog_btf_id = prog->aux->attach_btf_id;
const char *prefix = BPF_ITER_FUNC_PREFIX;
- struct bpf_iter_target_info *tinfo;
int prefix_len = strlen(prefix);
- bool supported = false;
if (strncmp(attach_fname, prefix, prefix_len))
- return false;
+ return -EINVAL;
mutex_lock(&targets_mutex);
- list_for_each_entry(tinfo, &targets, list) {
- if (tinfo->btf_id && tinfo->btf_id == prog_btf_id) {
- supported = true;
+ list_for_each_entry(iter, &targets, list) {
+ if (iter->btf_id && iter->btf_id == prog_btf_id) {
+ tinfo = iter;
break;
}
- if (!strcmp(attach_fname + prefix_len, tinfo->reg_info->target)) {
- cache_btf_id(tinfo, prog);
- supported = true;
+ if (!strcmp(attach_fname + prefix_len, iter->reg_info->target)) {
+ cache_btf_id(iter, prog);
+ tinfo = iter;
break;
}
}
mutex_unlock(&targets_mutex);
- if (supported) {
- prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size;
- prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info;
- }
+ if (!tinfo)
+ return -EINVAL;
- return supported;
+ return bpf_prog_ctx_arg_info_init(prog, tinfo->reg_info->ctx_arg_info,
+ tinfo->reg_info->ctx_arg_info_size);
}
const struct bpf_func_proto *
@@ -498,12 +504,11 @@ bool bpf_link_is_iter(struct bpf_link *link)
int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
struct bpf_prog *prog)
{
+ struct bpf_iter_target_info *tinfo = NULL, *iter;
struct bpf_link_primer link_primer;
- struct bpf_iter_target_info *tinfo;
union bpf_iter_link_info linfo;
struct bpf_iter_link *link;
u32 prog_btf_id, linfo_len;
- bool existed = false;
bpfptr_t ulinfo;
int err;
@@ -529,24 +534,29 @@ int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
prog_btf_id = prog->aux->attach_btf_id;
mutex_lock(&targets_mutex);
- list_for_each_entry(tinfo, &targets, list) {
- if (tinfo->btf_id == prog_btf_id) {
- existed = true;
+ list_for_each_entry(iter, &targets, list) {
+ if (iter->btf_id == prog_btf_id) {
+ tinfo = iter;
break;
}
}
mutex_unlock(&targets_mutex);
- if (!existed)
+ if (!tinfo)
return -ENOENT;
+ /* Only allow sleepable program for resched-able iterator */
+ if (prog->sleepable && !bpf_iter_target_support_resched(tinfo))
+ return -EINVAL;
+
link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
if (!link)
return -ENOMEM;
- bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog);
+ bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog,
+ attr->link_create.attach_type);
link->tinfo = tinfo;
- err = bpf_link_prime(&link->link, &link_primer);
+ err = bpf_link_prime(&link->link, &link_primer);
if (err) {
kfree(link);
return err;
@@ -576,9 +586,9 @@ static void init_seq_meta(struct bpf_iter_priv_data *priv_data,
priv_data->done_stop = false;
}
-static int prepare_seq_file(struct file *file, struct bpf_iter_link *link,
- const struct bpf_iter_seq_info *seq_info)
+static int prepare_seq_file(struct file *file, struct bpf_iter_link *link)
{
+ const struct bpf_iter_seq_info *seq_info = __get_seq_info(link);
struct bpf_iter_priv_data *priv_data;
struct bpf_iter_target_info *tinfo;
struct bpf_prog *prog;
@@ -624,37 +634,24 @@ release_prog:
int bpf_iter_new_fd(struct bpf_link *link)
{
struct bpf_iter_link *iter_link;
- struct file *file;
unsigned int flags;
- int err, fd;
+ int err;
if (link->ops != &bpf_iter_link_lops)
return -EINVAL;
flags = O_RDONLY | O_CLOEXEC;
- fd = get_unused_fd_flags(flags);
- if (fd < 0)
- return fd;
-
- file = anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags);
- if (IS_ERR(file)) {
- err = PTR_ERR(file);
- goto free_fd;
- }
+
+ FD_PREPARE(fdf, flags, anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags));
+ if (fdf.err)
+ return fdf.err;
iter_link = container_of(link, struct bpf_iter_link, link);
- err = prepare_seq_file(file, iter_link, __get_seq_info(iter_link));
+ err = prepare_seq_file(fd_prepare_file(fdf), iter_link);
if (err)
- goto free_file;
-
- fd_install(fd, file);
- return fd;
+ return err; /* Automatic cleanup handles fput */
-free_file:
- fput(file);
-free_fd:
- put_unused_fd(fd);
- return err;
+ return fd_publish(fdf);
}
struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop)
@@ -682,13 +679,25 @@ struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop)
int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
{
+ struct bpf_run_ctx run_ctx, *old_run_ctx;
int ret;
- rcu_read_lock();
- migrate_disable();
- ret = bpf_prog_run(prog, ctx);
- migrate_enable();
- rcu_read_unlock();
+ if (prog->sleepable) {
+ rcu_read_lock_trace();
+ migrate_disable();
+ might_fault();
+ old_run_ctx = bpf_set_run_ctx(&run_ctx);
+ ret = bpf_prog_run(prog, ctx);
+ bpf_reset_run_ctx(old_run_ctx);
+ migrate_enable();
+ rcu_read_unlock_trace();
+ } else {
+ rcu_read_lock_dont_migrate();
+ old_run_ctx = bpf_set_run_ctx(&run_ctx);
+ ret = bpf_prog_run(prog, ctx);
+ bpf_reset_run_ctx(old_run_ctx);
+ rcu_read_unlock_migrate();
+ }
/* bpf program can only return 0 or 1:
* 0 : okay
@@ -715,9 +724,6 @@ const struct bpf_func_proto bpf_for_each_map_elem_proto = {
.arg4_type = ARG_ANYTHING,
};
-/* maximum number of loops */
-#define MAX_LOOPS BIT(23)
-
BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
u64, flags)
{
@@ -725,9 +731,13 @@ BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
u64 ret;
u32 i;
+ /* Note: these safety checks are also verified when bpf_loop
+ * is inlined, be careful to modify this code in sync. See
+ * function verifier.c:inline_bpf_loop.
+ */
if (flags)
return -EINVAL;
- if (nr_loops > MAX_LOOPS)
+ if (nr_loops > BPF_MAX_LOOPS)
return -E2BIG;
for (i = 0; i < nr_loops; i++) {
@@ -749,3 +759,69 @@ const struct bpf_func_proto bpf_loop_proto = {
.arg3_type = ARG_PTR_TO_STACK_OR_NULL,
.arg4_type = ARG_ANYTHING,
};
+
+struct bpf_iter_num_kern {
+ int cur; /* current value, inclusive */
+ int end; /* final value, exclusive */
+} __aligned(8);
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end)
+{
+ struct bpf_iter_num_kern *s = (void *)it;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_num_kern) != sizeof(struct bpf_iter_num));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_num_kern) != __alignof__(struct bpf_iter_num));
+
+ /* start == end is legit, it's an empty range and we'll just get NULL
+ * on first (and any subsequent) bpf_iter_num_next() call
+ */
+ if (start > end) {
+ s->cur = s->end = 0;
+ return -EINVAL;
+ }
+
+ /* avoid overflows, e.g., if start == INT_MIN and end == INT_MAX */
+ if ((s64)end - (s64)start > BPF_MAX_LOOPS) {
+ s->cur = s->end = 0;
+ return -E2BIG;
+ }
+
+ /* user will call bpf_iter_num_next() first,
+ * which will set s->cur to exactly start value;
+ * underflow shouldn't matter
+ */
+ s->cur = start - 1;
+ s->end = end;
+
+ return 0;
+}
+
+__bpf_kfunc int *bpf_iter_num_next(struct bpf_iter_num* it)
+{
+ struct bpf_iter_num_kern *s = (void *)it;
+
+ /* check failed initialization or if we are done (same behavior);
+ * need to be careful about overflow, so convert to s64 for checks,
+ * e.g., if s->cur == s->end == INT_MAX, we can't just do
+ * s->cur + 1 >= s->end
+ */
+ if ((s64)(s->cur + 1) >= s->end) {
+ s->cur = s->end = 0;
+ return NULL;
+ }
+
+ s->cur++;
+
+ return &s->cur;
+}
+
+__bpf_kfunc void bpf_iter_num_destroy(struct bpf_iter_num *it)
+{
+ struct bpf_iter_num_kern *s = (void *)it;
+
+ s->cur = s->end = 0;
+}
+
+__bpf_kfunc_end_defs();
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 71de2a89869c..e2fe6c32822b 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -51,11 +51,21 @@ owner_storage(struct bpf_local_storage_map *smap, void *owner)
return map->ops->map_owner_storage_ptr(owner);
}
+static bool selem_linked_to_storage_lockless(const struct bpf_local_storage_elem *selem)
+{
+ return !hlist_unhashed_lockless(&selem->snode);
+}
+
static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem)
{
return !hlist_unhashed(&selem->snode);
}
+static bool selem_linked_to_map_lockless(const struct bpf_local_storage_elem *selem)
+{
+ return !hlist_unhashed_lockless(&selem->map_node);
+}
+
static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
{
return !hlist_unhashed(&selem->map_node);
@@ -63,50 +73,196 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
struct bpf_local_storage_elem *
bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
- void *value, bool charge_mem)
+ void *value, bool swap_uptrs, gfp_t gfp_flags)
{
struct bpf_local_storage_elem *selem;
- if (charge_mem && mem_charge(smap, owner, smap->elem_size))
+ if (mem_charge(smap, owner, smap->elem_size))
return NULL;
- selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
- GFP_ATOMIC | __GFP_NOWARN);
+ if (smap->use_kmalloc_nolock) {
+ selem = bpf_map_kmalloc_nolock(&smap->map, smap->elem_size,
+ __GFP_ZERO, NUMA_NO_NODE);
+ } else {
+ selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
+ gfp_flags | __GFP_NOWARN);
+ }
+
if (selem) {
- if (value)
- memcpy(SDATA(selem)->data, value, smap->map.value_size);
+ RCU_INIT_POINTER(SDATA(selem)->smap, smap);
+
+ if (value) {
+ /* No need to call check_and_init_map_value as memory is zero init */
+ copy_map_value(&smap->map, SDATA(selem)->data, value);
+ if (swap_uptrs)
+ bpf_obj_swap_uptrs(smap->map.record, SDATA(selem)->data, value);
+ }
return selem;
}
- if (charge_mem)
- mem_uncharge(smap, owner, smap->elem_size);
+ mem_uncharge(smap, owner, smap->elem_size);
return NULL;
}
-void bpf_local_storage_free_rcu(struct rcu_head *rcu)
+/* rcu tasks trace callback for use_kmalloc_nolock == false */
+static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
+{
+ struct bpf_local_storage *local_storage;
+
+ /* If RCU Tasks Trace grace period implies RCU grace period, do
+ * kfree(), else do kfree_rcu().
+ */
+ local_storage = container_of(rcu, struct bpf_local_storage, rcu);
+ if (rcu_trace_implies_rcu_gp())
+ kfree(local_storage);
+ else
+ kfree_rcu(local_storage, rcu);
+}
+
+/* Handle use_kmalloc_nolock == false */
+static void __bpf_local_storage_free(struct bpf_local_storage *local_storage,
+ bool vanilla_rcu)
+{
+ if (vanilla_rcu)
+ kfree_rcu(local_storage, rcu);
+ else
+ call_rcu_tasks_trace(&local_storage->rcu,
+ __bpf_local_storage_free_trace_rcu);
+}
+
+static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage *local_storage;
local_storage = container_of(rcu, struct bpf_local_storage, rcu);
- kfree_rcu(local_storage, rcu);
+ kfree_nolock(local_storage);
+}
+
+static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
+{
+ if (rcu_trace_implies_rcu_gp())
+ bpf_local_storage_free_rcu(rcu);
+ else
+ call_rcu(rcu, bpf_local_storage_free_rcu);
+}
+
+static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
+ bool reuse_now)
+{
+ if (!local_storage)
+ return;
+
+ if (!local_storage->use_kmalloc_nolock) {
+ __bpf_local_storage_free(local_storage, reuse_now);
+ return;
+ }
+
+ if (reuse_now) {
+ call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
+ return;
+ }
+
+ call_rcu_tasks_trace(&local_storage->rcu,
+ bpf_local_storage_free_trace_rcu);
+}
+
+/* rcu tasks trace callback for use_kmalloc_nolock == false */
+static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
+{
+ struct bpf_local_storage_elem *selem;
+
+ selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
+ if (rcu_trace_implies_rcu_gp())
+ kfree(selem);
+ else
+ kfree_rcu(selem, rcu);
+}
+
+/* Handle use_kmalloc_nolock == false */
+static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
+ bool vanilla_rcu)
+{
+ if (vanilla_rcu)
+ kfree_rcu(selem, rcu);
+ else
+ call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu);
}
static void bpf_selem_free_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage_map *smap;
selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
- kfree_rcu(selem, rcu);
+ /* The bpf_local_storage_map_free will wait for rcu_barrier */
+ smap = rcu_dereference_check(SDATA(selem)->smap, 1);
+
+ migrate_disable();
+ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+ migrate_enable();
+ kfree_nolock(selem);
+}
+
+static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
+{
+ if (rcu_trace_implies_rcu_gp())
+ bpf_selem_free_rcu(rcu);
+ else
+ call_rcu(rcu, bpf_selem_free_rcu);
+}
+
+void bpf_selem_free(struct bpf_local_storage_elem *selem,
+ bool reuse_now)
+{
+ struct bpf_local_storage_map *smap;
+
+ smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
+
+ if (!smap->use_kmalloc_nolock) {
+ /*
+ * No uptr will be unpin even when reuse_now == false since uptr
+ * is only supported in task local storage, where
+ * smap->use_kmalloc_nolock == true.
+ */
+ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+ __bpf_selem_free(selem, reuse_now);
+ return;
+ }
+
+ if (reuse_now) {
+ /*
+ * While it is okay to call bpf_obj_free_fields() that unpins uptr when
+ * reuse_now == true, keep it in bpf_selem_free_rcu() for simplicity.
+ */
+ call_rcu(&selem->rcu, bpf_selem_free_rcu);
+ return;
+ }
+
+ call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
+}
+
+static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now)
+{
+ struct bpf_local_storage_elem *selem;
+ struct hlist_node *n;
+
+ /* The "_safe" iteration is needed.
+ * The loop is not removing the selem from the list
+ * but bpf_selem_free will use the selem->rcu_head
+ * which is union-ized with the selem->free_node.
+ */
+ hlist_for_each_entry_safe(selem, n, list, free_node)
+ bpf_selem_free(selem, reuse_now);
}
/* local_storage->lock must be held and selem->local_storage == local_storage.
* The caller must ensure selem->smap is still valid to be
* dereferenced for its smap->elem_size and smap->cache_idx.
*/
-bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
- struct bpf_local_storage_elem *selem,
- bool uncharge_mem)
+static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_elem *selem,
+ struct hlist_head *free_selem_list)
{
struct bpf_local_storage_map *smap;
bool free_local_storage;
@@ -119,8 +275,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
* The owner may be freed once the last selem is unlinked
* from local_storage.
*/
- if (uncharge_mem)
- mem_uncharge(smap, owner, smap->elem_size);
+ mem_uncharge(smap, owner, smap->elem_size);
free_local_storage = hlist_is_singular_node(&selem->snode,
&local_storage->list);
@@ -136,7 +291,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
* will be done by the caller.
*
* Although the unlock will be done under
- * rcu_read_lock(), it is more intutivie to
+ * rcu_read_lock(), it is more intuitive to
* read if the freeing of the storage is done
* after the raw_spin_unlock_bh(&local_storage->lock).
*
@@ -150,31 +305,39 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
SDATA(selem))
RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
- call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
+ hlist_add_head(&selem->free_node, free_selem_list);
+
+ if (rcu_access_pointer(local_storage->smap) == smap)
+ RCU_INIT_POINTER(local_storage->smap, NULL);
+
return free_local_storage;
}
-static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
+static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
+ bool reuse_now)
{
struct bpf_local_storage *local_storage;
bool free_local_storage = false;
+ HLIST_HEAD(selem_free_list);
unsigned long flags;
- if (unlikely(!selem_linked_to_storage(selem)))
+ if (unlikely(!selem_linked_to_storage_lockless(selem)))
/* selem has already been unlinked from sk */
return;
local_storage = rcu_dereference_check(selem->local_storage,
bpf_rcu_lock_held());
+
raw_spin_lock_irqsave(&local_storage->lock, flags);
if (likely(selem_linked_to_storage(selem)))
free_local_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, true);
+ local_storage, selem, &selem_free_list);
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ bpf_selem_free_list(&selem_free_list, reuse_now);
+
if (free_local_storage)
- call_rcu_tasks_trace(&local_storage->rcu,
- bpf_local_storage_free_rcu);
+ bpf_local_storage_free(local_storage, reuse_now);
}
void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
@@ -184,13 +347,13 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
hlist_add_head_rcu(&selem->snode, &local_storage->list);
}
-void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
+static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
{
struct bpf_local_storage_map *smap;
struct bpf_local_storage_map_bucket *b;
unsigned long flags;
- if (unlikely(!selem_linked_to_map(selem)))
+ if (unlikely(!selem_linked_to_map_lockless(selem)))
/* selem has already be unlinked from smap */
return;
@@ -209,61 +372,35 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap,
unsigned long flags;
raw_spin_lock_irqsave(&b->lock, flags);
- RCU_INIT_POINTER(SDATA(selem)->smap, smap);
hlist_add_head_rcu(&selem->map_node, &b->list);
raw_spin_unlock_irqrestore(&b->lock, flags);
}
-void bpf_selem_unlink(struct bpf_local_storage_elem *selem)
+void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now)
{
/* Always unlink from map before unlinking from local_storage
* because selem will be freed after successfully unlinked from
* the local_storage.
*/
bpf_selem_unlink_map(selem);
- __bpf_selem_unlink_storage(selem);
+ bpf_selem_unlink_storage(selem, reuse_now);
}
-struct bpf_local_storage_data *
-bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
- struct bpf_local_storage_map *smap,
- bool cacheit_lockit)
+void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_map *smap,
+ struct bpf_local_storage_elem *selem)
{
- struct bpf_local_storage_data *sdata;
- struct bpf_local_storage_elem *selem;
-
- /* Fast path (cache hit) */
- sdata = rcu_dereference_check(local_storage->cache[smap->cache_idx],
- bpf_rcu_lock_held());
- if (sdata && rcu_access_pointer(sdata->smap) == smap)
- return sdata;
-
- /* Slow path (cache miss) */
- hlist_for_each_entry_rcu(selem, &local_storage->list, snode,
- rcu_read_lock_trace_held())
- if (rcu_access_pointer(SDATA(selem)->smap) == smap)
- break;
-
- if (!selem)
- return NULL;
-
- sdata = SDATA(selem);
- if (cacheit_lockit) {
- unsigned long flags;
-
- /* spinlock is needed to avoid racing with the
- * parallel delete. Otherwise, publishing an already
- * deleted sdata to the cache will become a use-after-free
- * problem in the next bpf_local_storage_lookup().
- */
- raw_spin_lock_irqsave(&local_storage->lock, flags);
- if (selem_linked_to_storage(selem))
- rcu_assign_pointer(local_storage->cache[smap->cache_idx],
- sdata);
- raw_spin_unlock_irqrestore(&local_storage->lock, flags);
- }
+ unsigned long flags;
- return sdata;
+ /* spinlock is needed to avoid racing with the
+ * parallel delete. Otherwise, publishing an already
+ * deleted sdata to the cache will become a use-after-free
+ * problem in the next bpf_local_storage_lookup().
+ */
+ raw_spin_lock_irqsave(&local_storage->lock, flags);
+ if (selem_linked_to_storage(selem))
+ rcu_assign_pointer(local_storage->cache[smap->cache_idx], SDATA(selem));
+ raw_spin_unlock_irqrestore(&local_storage->lock, flags);
}
static int check_flags(const struct bpf_local_storage_data *old_sdata,
@@ -282,7 +419,8 @@ static int check_flags(const struct bpf_local_storage_data *old_sdata,
int bpf_local_storage_alloc(void *owner,
struct bpf_local_storage_map *smap,
- struct bpf_local_storage_elem *first_selem)
+ struct bpf_local_storage_elem *first_selem,
+ gfp_t gfp_flags)
{
struct bpf_local_storage *prev_storage, *storage;
struct bpf_local_storage **owner_storage_ptr;
@@ -292,16 +430,22 @@ int bpf_local_storage_alloc(void *owner,
if (err)
return err;
- storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
- GFP_ATOMIC | __GFP_NOWARN);
+ if (smap->use_kmalloc_nolock)
+ storage = bpf_map_kmalloc_nolock(&smap->map, sizeof(*storage),
+ __GFP_ZERO, NUMA_NO_NODE);
+ else
+ storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
+ gfp_flags | __GFP_NOWARN);
if (!storage) {
err = -ENOMEM;
goto uncharge;
}
+ RCU_INIT_POINTER(storage->smap, smap);
INIT_HLIST_HEAD(&storage->list);
raw_spin_lock_init(&storage->lock);
storage->owner = owner;
+ storage->use_kmalloc_nolock = smap->use_kmalloc_nolock;
bpf_selem_link_storage_nolock(storage, first_selem);
bpf_selem_link_map(smap, first_selem);
@@ -323,22 +467,12 @@ int bpf_local_storage_alloc(void *owner,
bpf_selem_unlink_map(first_selem);
err = -EAGAIN;
goto uncharge;
-
- /* Note that even first_selem was linked to smap's
- * bucket->list, first_selem can be freed immediately
- * (instead of kfree_rcu) because
- * bpf_local_storage_map_free() does a
- * synchronize_rcu_mult (waiting for both sleepable and
- * normal programs) before walking the bucket->list.
- * Hence, no one is accessing selem from the
- * bucket->list under rcu_read_lock().
- */
}
return 0;
uncharge:
- kfree(storage);
+ bpf_local_storage_free(storage, true);
mem_uncharge(smap, owner, sizeof(*storage));
return err;
}
@@ -350,11 +484,12 @@ uncharge:
*/
struct bpf_local_storage_data *
bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
- void *value, u64 map_flags)
+ void *value, u64 map_flags, bool swap_uptrs, gfp_t gfp_flags)
{
struct bpf_local_storage_data *old_sdata = NULL;
- struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage_elem *alloc_selem, *selem = NULL;
struct bpf_local_storage *local_storage;
+ HLIST_HEAD(old_selem_free_list);
unsigned long flags;
int err;
@@ -362,7 +497,10 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) ||
/* BPF_F_LOCK can only be used in a value with spin_lock */
unlikely((map_flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(&smap->map)))
+ !btf_record_has_field(smap->map.record, BPF_SPIN_LOCK)))
+ return ERR_PTR(-EINVAL);
+
+ if (gfp_flags == GFP_KERNEL && (map_flags & ~BPF_F_LOCK) != BPF_NOEXIST)
return ERR_PTR(-EINVAL);
local_storage = rcu_dereference_check(*owner_storage(smap, owner),
@@ -373,13 +511,13 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (err)
return ERR_PTR(err);
- selem = bpf_selem_alloc(smap, owner, value, true);
+ selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags);
if (!selem)
return ERR_PTR(-ENOMEM);
- err = bpf_local_storage_alloc(owner, smap, selem);
+ err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags);
if (err) {
- kfree(selem);
+ bpf_selem_free(selem, true);
mem_uncharge(smap, owner, smap->elem_size);
return ERR_PTR(err);
}
@@ -397,13 +535,20 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
err = check_flags(old_sdata, map_flags);
if (err)
return ERR_PTR(err);
- if (old_sdata && selem_linked_to_storage(SELEM(old_sdata))) {
+ if (old_sdata && selem_linked_to_storage_lockless(SELEM(old_sdata))) {
copy_map_value_locked(&smap->map, old_sdata->data,
value, false);
return old_sdata;
}
}
+ /* A lookup has just been done before and concluded a new selem is
+ * needed. The chance of an unnecessary alloc is unlikely.
+ */
+ alloc_selem = selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags);
+ if (!alloc_selem)
+ return ERR_PTR(-ENOMEM);
+
raw_spin_lock_irqsave(&local_storage->lock, flags);
/* Recheck local_storage->list under local_storage->lock */
@@ -414,13 +559,13 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
* simple.
*/
err = -EAGAIN;
- goto unlock_err;
+ goto unlock;
}
old_sdata = bpf_local_storage_lookup(local_storage, smap, false);
err = check_flags(old_sdata, map_flags);
if (err)
- goto unlock_err;
+ goto unlock;
if (old_sdata && (map_flags & BPF_F_LOCK)) {
copy_map_value_locked(&smap->map, old_sdata->data, value,
@@ -429,21 +574,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
goto unlock;
}
- /* local_storage->lock is held. Hence, we are sure
- * we can unlink and uncharge the old_sdata successfully
- * later. Hence, instead of charging the new selem now
- * and then uncharge the old selem later (which may cause
- * a potential but unnecessary charge failure), avoid taking
- * a charge at all here (the "!old_sdata" check) and the
- * old_sdata will not be uncharged later during
- * bpf_selem_unlink_storage_nolock().
- */
- selem = bpf_selem_alloc(smap, owner, value, !old_sdata);
- if (!selem) {
- err = -ENOMEM;
- goto unlock_err;
- }
-
+ alloc_selem = NULL;
/* First, link the new selem to the map */
bpf_selem_link_map(smap, selem);
@@ -454,19 +585,20 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (old_sdata) {
bpf_selem_unlink_map(SELEM(old_sdata));
bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
- false);
+ &old_selem_free_list);
}
unlock:
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
- return SDATA(selem);
-
-unlock_err:
- raw_spin_unlock_irqrestore(&local_storage->lock, flags);
- return ERR_PTR(err);
+ bpf_selem_free_list(&old_selem_free_list, false);
+ if (alloc_selem) {
+ mem_uncharge(smap, owner, smap->elem_size);
+ bpf_selem_free(alloc_selem, true);
+ }
+ return err ? ERR_PTR(err) : SDATA(selem);
}
-u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
+static u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
{
u64 min_usage = U64_MAX;
u16 i, res = 0;
@@ -490,21 +622,153 @@ u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
return res;
}
-void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
- u16 idx)
+static void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
+ u16 idx)
{
spin_lock(&cache->idx_lock);
cache->idx_usage_counts[idx]--;
spin_unlock(&cache->idx_lock);
}
-void bpf_local_storage_map_free(struct bpf_local_storage_map *smap,
- int __percpu *busy_counter)
+int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
+{
+ if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK ||
+ !(attr->map_flags & BPF_F_NO_PREALLOC) ||
+ attr->max_entries ||
+ attr->key_size != sizeof(int) || !attr->value_size ||
+ /* Enforce BTF for userspace sk dumping */
+ !attr->btf_key_type_id || !attr->btf_value_type_id)
+ return -EINVAL;
+
+ if (attr->value_size > BPF_LOCAL_STORAGE_MAX_VALUE_SIZE)
+ return -E2BIG;
+
+ return 0;
+}
+
+int bpf_local_storage_map_check_btf(const struct bpf_map *map,
+ const struct btf *btf,
+ const struct btf_type *key_type,
+ const struct btf_type *value_type)
+{
+ if (!btf_type_is_i32(key_type))
+ return -EINVAL;
+
+ return 0;
+}
+
+void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
{
struct bpf_local_storage_elem *selem;
+ bool free_storage = false;
+ HLIST_HEAD(free_selem_list);
+ struct hlist_node *n;
+ unsigned long flags;
+
+ /* Neither the bpf_prog nor the bpf_map's syscall
+ * could be modifying the local_storage->list now.
+ * Thus, no elem can be added to or deleted from the
+ * local_storage->list by the bpf_prog or by the bpf_map's syscall.
+ *
+ * It is racing with bpf_local_storage_map_free() alone
+ * when unlinking elem from the local_storage->list and
+ * the map's bucket->list.
+ */
+ raw_spin_lock_irqsave(&local_storage->lock, flags);
+ hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
+ /* Always unlink from map before unlinking from
+ * local_storage.
+ */
+ bpf_selem_unlink_map(selem);
+ /* If local_storage list has only one element, the
+ * bpf_selem_unlink_storage_nolock() will return true.
+ * Otherwise, it will return false. The current loop iteration
+ * intends to remove all local storage. So the last iteration
+ * of the loop will set the free_cgroup_storage to true.
+ */
+ free_storage = bpf_selem_unlink_storage_nolock(
+ local_storage, selem, &free_selem_list);
+ }
+ raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+
+ bpf_selem_free_list(&free_selem_list, true);
+
+ if (free_storage)
+ bpf_local_storage_free(local_storage, true);
+}
+
+u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_local_storage_map *smap = (struct bpf_local_storage_map *)map;
+ u64 usage = sizeof(*smap);
+
+ /* The dynamically callocated selems are not counted currently. */
+ usage += sizeof(*smap->buckets) * (1ULL << smap->bucket_log);
+ return usage;
+}
+
+struct bpf_map *
+bpf_local_storage_map_alloc(union bpf_attr *attr,
+ struct bpf_local_storage_cache *cache,
+ bool use_kmalloc_nolock)
+{
+ struct bpf_local_storage_map *smap;
+ unsigned int i;
+ u32 nbuckets;
+ int err;
+
+ smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE);
+ if (!smap)
+ return ERR_PTR(-ENOMEM);
+ bpf_map_init_from_attr(&smap->map, attr);
+
+ nbuckets = roundup_pow_of_two(num_possible_cpus());
+ /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
+ nbuckets = max_t(u32, 2, nbuckets);
+ smap->bucket_log = ilog2(nbuckets);
+
+ smap->buckets = bpf_map_kvcalloc(&smap->map, nbuckets,
+ sizeof(*smap->buckets), GFP_USER | __GFP_NOWARN);
+ if (!smap->buckets) {
+ err = -ENOMEM;
+ goto free_smap;
+ }
+
+ for (i = 0; i < nbuckets; i++) {
+ INIT_HLIST_HEAD(&smap->buckets[i].list);
+ raw_spin_lock_init(&smap->buckets[i].lock);
+ }
+
+ smap->elem_size = offsetof(struct bpf_local_storage_elem,
+ sdata.data[attr->value_size]);
+
+ /* In PREEMPT_RT, kmalloc(GFP_ATOMIC) is still not safe in non
+ * preemptible context. Thus, enforce all storages to use
+ * kmalloc_nolock() when CONFIG_PREEMPT_RT is enabled.
+ */
+ smap->use_kmalloc_nolock = IS_ENABLED(CONFIG_PREEMPT_RT) ? true : use_kmalloc_nolock;
+
+ smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
+ return &smap->map;
+
+free_smap:
+ kvfree(smap->buckets);
+ bpf_map_area_free(smap);
+ return ERR_PTR(err);
+}
+
+void bpf_local_storage_map_free(struct bpf_map *map,
+ struct bpf_local_storage_cache *cache,
+ int __percpu *busy_counter)
+{
struct bpf_local_storage_map_bucket *b;
+ struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage_map *smap;
unsigned int i;
+ smap = (struct bpf_local_storage_map *)map;
+ bpf_local_storage_cache_idx_free(cache, smap->cache_idx);
+
/* Note that this map might be concurrently cloned from
* bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
* RCU read section to finish before proceeding. New RCU
@@ -528,15 +792,11 @@ void bpf_local_storage_map_free(struct bpf_local_storage_map *smap,
while ((selem = hlist_entry_safe(
rcu_dereference_raw(hlist_first_rcu(&b->list)),
struct bpf_local_storage_elem, map_node))) {
- if (busy_counter) {
- migrate_disable();
- __this_cpu_inc(*busy_counter);
- }
- bpf_selem_unlink(selem);
- if (busy_counter) {
- __this_cpu_dec(*busy_counter);
- migrate_enable();
- }
+ if (busy_counter)
+ this_cpu_inc(*busy_counter);
+ bpf_selem_unlink(selem, true);
+ if (busy_counter)
+ this_cpu_dec(*busy_counter);
cond_resched_rcu();
}
rcu_read_unlock();
@@ -556,76 +816,10 @@ void bpf_local_storage_map_free(struct bpf_local_storage_map *smap,
*/
synchronize_rcu();
- kvfree(smap->buckets);
- kfree(smap);
-}
-
-int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
-{
- if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK ||
- !(attr->map_flags & BPF_F_NO_PREALLOC) ||
- attr->max_entries ||
- attr->key_size != sizeof(int) || !attr->value_size ||
- /* Enforce BTF for userspace sk dumping */
- !attr->btf_key_type_id || !attr->btf_value_type_id)
- return -EINVAL;
-
- if (!bpf_capable())
- return -EPERM;
-
- if (attr->value_size > BPF_LOCAL_STORAGE_MAX_VALUE_SIZE)
- return -E2BIG;
-
- return 0;
-}
-
-struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr)
-{
- struct bpf_local_storage_map *smap;
- unsigned int i;
- u32 nbuckets;
-
- smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
- if (!smap)
- return ERR_PTR(-ENOMEM);
- bpf_map_init_from_attr(&smap->map, attr);
-
- nbuckets = roundup_pow_of_two(num_possible_cpus());
- /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
- nbuckets = max_t(u32, 2, nbuckets);
- smap->bucket_log = ilog2(nbuckets);
-
- smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets,
- GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
- if (!smap->buckets) {
- kfree(smap);
- return ERR_PTR(-ENOMEM);
- }
-
- for (i = 0; i < nbuckets; i++) {
- INIT_HLIST_HEAD(&smap->buckets[i].list);
- raw_spin_lock_init(&smap->buckets[i].lock);
+ if (smap->use_kmalloc_nolock) {
+ rcu_barrier_tasks_trace();
+ rcu_barrier();
}
-
- smap->elem_size =
- sizeof(struct bpf_local_storage_elem) + attr->value_size;
-
- return smap;
-}
-
-int bpf_local_storage_map_check_btf(const struct bpf_map *map,
- const struct btf *btf,
- const struct btf_type *key_type,
- const struct btf_type *value_type)
-{
- u32 int_data;
-
- if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
- return -EINVAL;
-
- int_data = *(u32 *)(key_type + 1);
- if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data))
- return -EINVAL;
-
- return 0;
+ kvfree(smap->buckets);
+ bpf_map_area_free(smap);
}
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index d99e89f113c4..e7a2fc60523f 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -19,14 +19,6 @@
#define LOCAL_PENDING_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING)
#define IS_LOCAL_LIST_TYPE(t) ((t) >= BPF_LOCAL_LIST_T_OFFSET)
-static int get_next_cpu(int cpu)
-{
- cpu = cpumask_next(cpu, cpu_possible_mask);
- if (cpu >= nr_cpu_ids)
- cpu = cpumask_first(cpu_possible_mask);
- return cpu;
-}
-
/* Local list helpers */
static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l)
{
@@ -41,7 +33,12 @@ static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l)
/* bpf_lru_node helpers */
static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node)
{
- return node->ref;
+ return READ_ONCE(node->ref);
+}
+
+static void bpf_lru_node_clear_ref(struct bpf_lru_node *node)
+{
+ WRITE_ONCE(node->ref, 0);
}
static void bpf_lru_list_count_inc(struct bpf_lru_list *l,
@@ -89,7 +86,7 @@ static void __bpf_lru_node_move_in(struct bpf_lru_list *l,
bpf_lru_list_count_inc(l, tgt_type);
node->type = tgt_type;
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
list_move(&node->list, &l->lists[tgt_type]);
}
@@ -110,7 +107,7 @@ static void __bpf_lru_node_move(struct bpf_lru_list *l,
bpf_lru_list_count_inc(l, tgt_type);
node->type = tgt_type;
}
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
/* If the moving node is the next_inactive_rotation candidate,
* move the next_inactive_rotation pointer also.
@@ -332,12 +329,12 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
list) {
__bpf_lru_node_move_to_free(l, node, local_free_list(loc_l),
BPF_LRU_LOCAL_LIST_T_FREE);
- if (++nfree == LOCAL_FREE_TARGET)
+ if (++nfree == lru->target_free)
break;
}
- if (nfree < LOCAL_FREE_TARGET)
- __bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - nfree,
+ if (nfree < lru->target_free)
+ __bpf_lru_list_shrink(lru, l, lru->target_free - nfree,
local_free_list(loc_l),
BPF_LRU_LOCAL_LIST_T_FREE);
@@ -353,7 +350,7 @@ static void __local_list_add_pending(struct bpf_lru *lru,
*(u32 *)((void *)node + lru->hash_offset) = hash;
node->cpu = cpu;
node->type = BPF_LRU_LOCAL_LIST_T_PENDING;
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
list_add(&node->list, local_pending_list(loc_l));
}
@@ -419,7 +416,7 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
if (!list_empty(free_list)) {
node = list_first_entry(free_list, struct bpf_lru_node, list);
*(u32 *)((void *)node + lru->hash_offset) = hash;
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE);
}
@@ -477,7 +474,7 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags);
- steal = get_next_cpu(steal);
+ steal = cpumask_next_wrap(steal, cpu_possible_mask);
} while (!node && steal != first_steal);
loc_l->next_steal = steal;
@@ -522,7 +519,7 @@ static void bpf_common_lru_push_free(struct bpf_lru *lru,
}
node->type = BPF_LRU_LOCAL_LIST_T_FREE;
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
list_move(&node->list, local_free_list(loc_l));
raw_spin_unlock_irqrestore(&loc_l->lock, flags);
@@ -568,10 +565,13 @@ static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf,
node = (struct bpf_lru_node *)(buf + node_offset);
node->type = BPF_LRU_LIST_T_FREE;
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
buf += elem_size;
}
+
+ lru->target_free = clamp((nr_elems / num_possible_cpus()) / 2,
+ 1, LOCAL_FREE_TARGET);
}
static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf,
@@ -594,7 +594,7 @@ again:
node = (struct bpf_lru_node *)(buf + node_offset);
node->cpu = cpu;
node->type = BPF_LRU_LIST_T_FREE;
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
i++;
buf += elem_size;
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index 6b12f06ee18c..fe2661a58ea9 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -4,6 +4,7 @@
#ifndef __BPF_LRU_LIST_H_
#define __BPF_LRU_LIST_H_
+#include <linux/cache.h>
#include <linux/list.h>
#include <linux/spinlock_types.h>
@@ -57,17 +58,15 @@ struct bpf_lru {
del_from_htab_func del_from_htab;
void *del_arg;
unsigned int hash_offset;
+ unsigned int target_free;
unsigned int nr_scans;
bool percpu;
};
static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node)
{
- /* ref is an approximation on access frequency. It does not
- * have to be very accurate. Hence, no protection is used.
- */
- if (!node->ref)
- node->ref = 1;
+ if (!READ_ONCE(node->ref))
+ WRITE_ONCE(node->ref, 1);
}
int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
@@ -77,6 +76,5 @@ void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
void bpf_lru_destroy(struct bpf_lru *lru);
struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash);
void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node);
-void bpf_lru_promote(struct bpf_lru *lru, struct bpf_lru_node *node);
#endif
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 06062370c3b8..7cb6e8d4282c 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -11,11 +11,11 @@
#include <linux/lsm_hooks.h>
#include <linux/bpf_lsm.h>
#include <linux/kallsyms.h>
-#include <linux/bpf_verifier.h>
#include <net/bpf_sk_storage.h>
#include <linux/bpf_local_storage.h>
#include <linux/btf_ids.h>
#include <linux/ima.h>
+#include <linux/bpf-cgroup.h>
/* For every LSM hook that allows attachment of BPF programs, declare a nop
* function where a BPF program can be attached.
@@ -35,18 +35,104 @@ BTF_SET_START(bpf_lsm_hooks)
#undef LSM_HOOK
BTF_SET_END(bpf_lsm_hooks)
+BTF_SET_START(bpf_lsm_disabled_hooks)
+BTF_ID(func, bpf_lsm_vm_enough_memory)
+BTF_ID(func, bpf_lsm_inode_need_killpriv)
+BTF_ID(func, bpf_lsm_inode_getsecurity)
+BTF_ID(func, bpf_lsm_inode_listsecurity)
+BTF_ID(func, bpf_lsm_inode_copy_up_xattr)
+BTF_ID(func, bpf_lsm_getselfattr)
+BTF_ID(func, bpf_lsm_getprocattr)
+BTF_ID(func, bpf_lsm_setprocattr)
+#ifdef CONFIG_KEYS
+BTF_ID(func, bpf_lsm_key_getsecurity)
+#endif
+#ifdef CONFIG_AUDIT
+BTF_ID(func, bpf_lsm_audit_rule_match)
+#endif
+BTF_ID(func, bpf_lsm_ismaclabel)
+BTF_ID(func, bpf_lsm_file_alloc_security)
+BTF_SET_END(bpf_lsm_disabled_hooks)
+
+/* List of LSM hooks that should operate on 'current' cgroup regardless
+ * of function signature.
+ */
+BTF_SET_START(bpf_lsm_current_hooks)
+/* operate on freshly allocated sk without any cgroup association */
+#ifdef CONFIG_SECURITY_NETWORK
+BTF_ID(func, bpf_lsm_sk_alloc_security)
+BTF_ID(func, bpf_lsm_sk_free_security)
+#endif
+BTF_SET_END(bpf_lsm_current_hooks)
+
+/* List of LSM hooks that trigger while the socket is properly locked.
+ */
+BTF_SET_START(bpf_lsm_locked_sockopt_hooks)
+#ifdef CONFIG_SECURITY_NETWORK
+BTF_ID(func, bpf_lsm_sock_graft)
+BTF_ID(func, bpf_lsm_inet_csk_clone)
+BTF_ID(func, bpf_lsm_inet_conn_established)
+#endif
+BTF_SET_END(bpf_lsm_locked_sockopt_hooks)
+
+/* List of LSM hooks that trigger while the socket is _not_ locked,
+ * but it's ok to call bpf_{g,s}etsockopt because the socket is still
+ * in the early init phase.
+ */
+BTF_SET_START(bpf_lsm_unlocked_sockopt_hooks)
+#ifdef CONFIG_SECURITY_NETWORK
+BTF_ID(func, bpf_lsm_socket_post_create)
+BTF_ID(func, bpf_lsm_socket_socketpair)
+#endif
+BTF_SET_END(bpf_lsm_unlocked_sockopt_hooks)
+
+#ifdef CONFIG_CGROUP_BPF
+void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog,
+ bpf_func_t *bpf_func)
+{
+ const struct btf_param *args __maybe_unused;
+
+ if (btf_type_vlen(prog->aux->attach_func_proto) < 1 ||
+ btf_id_set_contains(&bpf_lsm_current_hooks,
+ prog->aux->attach_btf_id)) {
+ *bpf_func = __cgroup_bpf_run_lsm_current;
+ return;
+ }
+
+#ifdef CONFIG_NET
+ args = btf_params(prog->aux->attach_func_proto);
+
+ if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCKET])
+ *bpf_func = __cgroup_bpf_run_lsm_socket;
+ else if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCK])
+ *bpf_func = __cgroup_bpf_run_lsm_sock;
+ else
+#endif
+ *bpf_func = __cgroup_bpf_run_lsm_current;
+}
+#endif
+
int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
const struct bpf_prog *prog)
{
+ u32 btf_id = prog->aux->attach_btf_id;
+ const char *func_name = prog->aux->attach_func_name;
+
if (!prog->gpl_compatible) {
bpf_log(vlog,
"LSM programs must have a GPL compatible license\n");
return -EINVAL;
}
- if (!btf_id_set_contains(&bpf_lsm_hooks, prog->aux->attach_btf_id)) {
+ if (btf_id_set_contains(&bpf_lsm_disabled_hooks, btf_id)) {
+ bpf_log(vlog, "attach_btf_id %u points to disabled hook %s\n",
+ btf_id, func_name);
+ return -EINVAL;
+ }
+
+ if (!btf_id_set_contains(&bpf_lsm_hooks, btf_id)) {
bpf_log(vlog, "attach_btf_id %u points to wrong type name %s\n",
- prog->aux->attach_btf_id, prog->aux->attach_func_name);
+ btf_id, func_name);
return -EINVAL;
}
@@ -91,6 +177,7 @@ BTF_ID_LIST_SINGLE(bpf_ima_inode_hash_btf_ids, struct, inode)
static const struct bpf_func_proto bpf_ima_inode_hash_proto = {
.func = bpf_ima_inode_hash,
.gpl_only = false,
+ .might_sleep = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
.arg1_btf_id = &bpf_ima_inode_hash_btf_ids[0],
@@ -99,9 +186,51 @@ static const struct bpf_func_proto bpf_ima_inode_hash_proto = {
.allowed = bpf_ima_inode_hash_allowed,
};
+BPF_CALL_3(bpf_ima_file_hash, struct file *, file, void *, dst, u32, size)
+{
+ return ima_file_hash(file, dst, size);
+}
+
+BTF_ID_LIST_SINGLE(bpf_ima_file_hash_btf_ids, struct, file)
+
+static const struct bpf_func_proto bpf_ima_file_hash_proto = {
+ .func = bpf_ima_file_hash,
+ .gpl_only = false,
+ .might_sleep = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &bpf_ima_file_hash_btf_ids[0],
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .allowed = bpf_ima_inode_hash_allowed,
+};
+
+BPF_CALL_1(bpf_get_attach_cookie, void *, ctx)
+{
+ struct bpf_trace_run_ctx *run_ctx;
+
+ run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx);
+ return run_ctx->bpf_cookie;
+}
+
+static const struct bpf_func_proto bpf_get_attach_cookie_proto = {
+ .func = bpf_get_attach_cookie,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
static const struct bpf_func_proto *
bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ if (prog->expected_attach_type == BPF_LSM_CGROUP) {
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+ }
+
switch (func_id) {
case BPF_FUNC_inode_storage_get:
return &bpf_inode_storage_get_proto;
@@ -120,7 +249,33 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_bprm_opts_set:
return &bpf_bprm_opts_set_proto;
case BPF_FUNC_ima_inode_hash:
- return prog->aux->sleepable ? &bpf_ima_inode_hash_proto : NULL;
+ return &bpf_ima_inode_hash_proto;
+ case BPF_FUNC_ima_file_hash:
+ return &bpf_ima_file_hash_proto;
+ case BPF_FUNC_get_attach_cookie:
+ return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL;
+#ifdef CONFIG_NET
+ case BPF_FUNC_setsockopt:
+ if (prog->expected_attach_type != BPF_LSM_CGROUP)
+ return NULL;
+ if (btf_id_set_contains(&bpf_lsm_locked_sockopt_hooks,
+ prog->aux->attach_btf_id))
+ return &bpf_sk_setsockopt_proto;
+ if (btf_id_set_contains(&bpf_lsm_unlocked_sockopt_hooks,
+ prog->aux->attach_btf_id))
+ return &bpf_unlocked_sk_setsockopt_proto;
+ return NULL;
+ case BPF_FUNC_getsockopt:
+ if (prog->expected_attach_type != BPF_LSM_CGROUP)
+ return NULL;
+ if (btf_id_set_contains(&bpf_lsm_locked_sockopt_hooks,
+ prog->aux->attach_btf_id))
+ return &bpf_sk_getsockopt_proto;
+ if (btf_id_set_contains(&bpf_lsm_unlocked_sockopt_hooks,
+ prog->aux->attach_btf_id))
+ return &bpf_unlocked_sk_getsockopt_proto;
+ return NULL;
+#endif
default:
return tracing_prog_func_proto(func_id, prog);
}
@@ -132,9 +287,15 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
BTF_SET_START(sleepable_lsm_hooks)
BTF_ID(func, bpf_lsm_bpf)
BTF_ID(func, bpf_lsm_bpf_map)
-BTF_ID(func, bpf_lsm_bpf_map_alloc_security)
-BTF_ID(func, bpf_lsm_bpf_map_free_security)
+BTF_ID(func, bpf_lsm_bpf_map_create)
+BTF_ID(func, bpf_lsm_bpf_map_free)
BTF_ID(func, bpf_lsm_bpf_prog)
+BTF_ID(func, bpf_lsm_bpf_prog_load)
+BTF_ID(func, bpf_lsm_bpf_prog_free)
+BTF_ID(func, bpf_lsm_bpf_token_create)
+BTF_ID(func, bpf_lsm_bpf_token_free)
+BTF_ID(func, bpf_lsm_bpf_token_cmd)
+BTF_ID(func, bpf_lsm_bpf_token_capable)
BTF_ID(func, bpf_lsm_bprm_check_security)
BTF_ID(func, bpf_lsm_bprm_committed_creds)
BTF_ID(func, bpf_lsm_bprm_committing_creds)
@@ -146,12 +307,9 @@ BTF_ID(func, bpf_lsm_cred_prepare)
BTF_ID(func, bpf_lsm_file_ioctl)
BTF_ID(func, bpf_lsm_file_lock)
BTF_ID(func, bpf_lsm_file_open)
+BTF_ID(func, bpf_lsm_file_post_open)
BTF_ID(func, bpf_lsm_file_receive)
-#ifdef CONFIG_SECURITY_NETWORK
-BTF_ID(func, bpf_lsm_inet_conn_established)
-#endif /* CONFIG_SECURITY_NETWORK */
-
BTF_ID(func, bpf_lsm_inode_create)
BTF_ID(func, bpf_lsm_inode_free_security)
BTF_ID(func, bpf_lsm_inode_getattr)
@@ -159,7 +317,9 @@ BTF_ID(func, bpf_lsm_inode_getxattr)
BTF_ID(func, bpf_lsm_inode_mknod)
BTF_ID(func, bpf_lsm_inode_need_killpriv)
BTF_ID(func, bpf_lsm_inode_post_setxattr)
+BTF_ID(func, bpf_lsm_inode_post_removexattr)
BTF_ID(func, bpf_lsm_inode_readlink)
+BTF_ID(func, bpf_lsm_inode_removexattr)
BTF_ID(func, bpf_lsm_inode_rename)
BTF_ID(func, bpf_lsm_inode_rmdir)
BTF_ID(func, bpf_lsm_inode_setattr)
@@ -167,11 +327,20 @@ BTF_ID(func, bpf_lsm_inode_setxattr)
BTF_ID(func, bpf_lsm_inode_symlink)
BTF_ID(func, bpf_lsm_inode_unlink)
BTF_ID(func, bpf_lsm_kernel_module_request)
+BTF_ID(func, bpf_lsm_kernel_read_file)
BTF_ID(func, bpf_lsm_kernfs_init_security)
-#ifdef CONFIG_KEYS
-BTF_ID(func, bpf_lsm_key_free)
-#endif /* CONFIG_KEYS */
+#ifdef CONFIG_SECURITY_PATH
+BTF_ID(func, bpf_lsm_path_unlink)
+BTF_ID(func, bpf_lsm_path_mkdir)
+BTF_ID(func, bpf_lsm_path_rmdir)
+BTF_ID(func, bpf_lsm_path_truncate)
+BTF_ID(func, bpf_lsm_path_symlink)
+BTF_ID(func, bpf_lsm_path_link)
+BTF_ID(func, bpf_lsm_path_rename)
+BTF_ID(func, bpf_lsm_path_chmod)
+BTF_ID(func, bpf_lsm_path_chown)
+#endif /* CONFIG_SECURITY_PATH */
BTF_ID(func, bpf_lsm_mmap_file)
BTF_ID(func, bpf_lsm_netlink_send)
@@ -189,6 +358,8 @@ BTF_ID(func, bpf_lsm_sb_umount)
BTF_ID(func, bpf_lsm_settime)
#ifdef CONFIG_SECURITY_NETWORK
+BTF_ID(func, bpf_lsm_inet_conn_established)
+
BTF_ID(func, bpf_lsm_socket_accept)
BTF_ID(func, bpf_lsm_socket_bind)
BTF_ID(func, bpf_lsm_socket_connect)
@@ -207,18 +378,34 @@ BTF_ID(func, bpf_lsm_socket_socketpair)
BTF_ID(func, bpf_lsm_syslog)
BTF_ID(func, bpf_lsm_task_alloc)
-BTF_ID(func, bpf_lsm_task_getsecid_subj)
-BTF_ID(func, bpf_lsm_task_getsecid_obj)
BTF_ID(func, bpf_lsm_task_prctl)
BTF_ID(func, bpf_lsm_task_setscheduler)
BTF_ID(func, bpf_lsm_task_to_inode)
+BTF_ID(func, bpf_lsm_userns_create)
BTF_SET_END(sleepable_lsm_hooks)
+BTF_SET_START(untrusted_lsm_hooks)
+BTF_ID(func, bpf_lsm_bpf_map_free)
+BTF_ID(func, bpf_lsm_bpf_prog_free)
+BTF_ID(func, bpf_lsm_file_alloc_security)
+BTF_ID(func, bpf_lsm_file_free_security)
+#ifdef CONFIG_SECURITY_NETWORK
+BTF_ID(func, bpf_lsm_sk_alloc_security)
+BTF_ID(func, bpf_lsm_sk_free_security)
+#endif /* CONFIG_SECURITY_NETWORK */
+BTF_ID(func, bpf_lsm_task_free)
+BTF_SET_END(untrusted_lsm_hooks)
+
bool bpf_lsm_is_sleepable_hook(u32 btf_id)
{
return btf_id_set_contains(&sleepable_lsm_hooks, btf_id);
}
+bool bpf_lsm_is_trusted(const struct bpf_prog *prog)
+{
+ return !btf_id_set_contains(&untrusted_lsm_hooks, prog->aux->attach_btf_id);
+}
+
const struct bpf_prog_ops lsm_prog_ops = {
};
@@ -226,3 +413,36 @@ const struct bpf_verifier_ops lsm_verifier_ops = {
.get_func_proto = bpf_lsm_func_proto,
.is_valid_access = btf_ctx_access,
};
+
+/* hooks return 0 or 1 */
+BTF_SET_START(bool_lsm_hooks)
+#ifdef CONFIG_SECURITY_NETWORK_XFRM
+BTF_ID(func, bpf_lsm_xfrm_state_pol_flow_match)
+#endif
+#ifdef CONFIG_AUDIT
+BTF_ID(func, bpf_lsm_audit_rule_known)
+#endif
+BTF_ID(func, bpf_lsm_inode_xattr_skipcap)
+BTF_SET_END(bool_lsm_hooks)
+
+int bpf_lsm_get_retval_range(const struct bpf_prog *prog,
+ struct bpf_retval_range *retval_range)
+{
+ /* no return value range for void hooks */
+ if (!prog->aux->attach_func_proto->type)
+ return -EINVAL;
+
+ if (btf_id_set_contains(&bool_lsm_hooks, prog->aux->attach_btf_id)) {
+ retval_range->minval = 0;
+ retval_range->maxval = 1;
+ } else {
+ /* All other available LSM hooks, except task_prctl, return 0
+ * on success and negative error code on failure.
+ * To keep things simple, we only allow bpf progs to return 0
+ * or negative errno for task_prctl too.
+ */
+ retval_range->minval = -MAX_ERRNO;
+ retval_range->maxval = 0;
+ }
+ return 0;
+}
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 21069dbe9138..278490683d28 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -10,39 +10,37 @@
#include <linux/seq_file.h>
#include <linux/refcount.h>
#include <linux/mutex.h>
-
-enum bpf_struct_ops_state {
- BPF_STRUCT_OPS_STATE_INIT,
- BPF_STRUCT_OPS_STATE_INUSE,
- BPF_STRUCT_OPS_STATE_TOBEFREE,
-};
-
-#define BPF_STRUCT_OPS_COMMON_VALUE \
- refcount_t refcnt; \
- enum bpf_struct_ops_state state
+#include <linux/btf_ids.h>
+#include <linux/rcupdate_wait.h>
+#include <linux/poll.h>
struct bpf_struct_ops_value {
- BPF_STRUCT_OPS_COMMON_VALUE;
+ struct bpf_struct_ops_common_value common;
char data[] ____cacheline_aligned_in_smp;
};
+#define MAX_TRAMP_IMAGE_PAGES 8
+
struct bpf_struct_ops_map {
struct bpf_map map;
- struct rcu_head rcu;
- const struct bpf_struct_ops *st_ops;
+ const struct bpf_struct_ops_desc *st_ops_desc;
/* protect map_update */
struct mutex lock;
- /* progs has all the bpf_prog that is populated
+ /* link has all the bpf_links that is populated
* to the func ptr of the kernel's struct
* (in kvalue.data).
*/
- struct bpf_prog **progs;
- /* image is a page that has all the trampolines
+ struct bpf_link **links;
+ /* ksyms for bpf trampolines */
+ struct bpf_ksym **ksyms;
+ u32 funcs_cnt;
+ u32 image_pages_cnt;
+ /* image_pages is an array of pages that has all the trampolines
* that stores the func args before calling the bpf_prog.
- * A PAGE_SIZE "image" is enough to store all trampoline for
- * "progs[]".
*/
- void *image;
+ void *image_pages[MAX_TRAMP_IMAGE_PAGES];
+ /* The owner moduler's btf. */
+ struct btf *btf;
/* uvalue->data stores the kernel struct
* (e.g. tcp_congestion_ops) that is more useful
* to userspace than the kvalue. For example,
@@ -57,37 +55,16 @@ struct bpf_struct_ops_map {
struct bpf_struct_ops_value kvalue;
};
-#define VALUE_PREFIX "bpf_struct_ops_"
-#define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)
-
-/* bpf_struct_ops_##_name (e.g. bpf_struct_ops_tcp_congestion_ops) is
- * the map's value exposed to the userspace and its btf-type-id is
- * stored at the map->btf_vmlinux_value_type_id.
- *
- */
-#define BPF_STRUCT_OPS_TYPE(_name) \
-extern struct bpf_struct_ops bpf_##_name; \
- \
-struct bpf_struct_ops_##_name { \
- BPF_STRUCT_OPS_COMMON_VALUE; \
- struct _name data ____cacheline_aligned_in_smp; \
+struct bpf_struct_ops_link {
+ struct bpf_link link;
+ struct bpf_map __rcu *map;
+ wait_queue_head_t wait_hup;
};
-#include "bpf_struct_ops_types.h"
-#undef BPF_STRUCT_OPS_TYPE
-enum {
-#define BPF_STRUCT_OPS_TYPE(_name) BPF_STRUCT_OPS_TYPE_##_name,
-#include "bpf_struct_ops_types.h"
-#undef BPF_STRUCT_OPS_TYPE
- __NR_BPF_STRUCT_OPS_TYPE,
-};
+static DEFINE_MUTEX(update_mutex);
-static struct bpf_struct_ops * const bpf_struct_ops[] = {
-#define BPF_STRUCT_OPS_TYPE(_name) \
- [BPF_STRUCT_OPS_TYPE_##_name] = &bpf_##_name,
-#include "bpf_struct_ops_types.h"
-#undef BPF_STRUCT_OPS_TYPE
-};
+#define VALUE_PREFIX "bpf_struct_ops_"
+#define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)
const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = {
};
@@ -98,138 +75,398 @@ const struct bpf_prog_ops bpf_struct_ops_prog_ops = {
#endif
};
-static const struct btf_type *module_type;
+BTF_ID_LIST(st_ops_ids)
+BTF_ID(struct, module)
+BTF_ID(struct, bpf_struct_ops_common_value)
-void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log)
+enum {
+ IDX_MODULE_ID,
+ IDX_ST_OPS_COMMON_VALUE_ID,
+};
+
+extern struct btf *btf_vmlinux;
+
+static bool is_valid_value_type(struct btf *btf, s32 value_id,
+ const struct btf_type *type,
+ const char *value_name)
{
- s32 type_id, value_id, module_id;
+ const struct btf_type *common_value_type;
const struct btf_member *member;
- struct bpf_struct_ops *st_ops;
- const struct btf_type *t;
- char value_name[128];
- const char *mname;
- u32 i, j;
+ const struct btf_type *vt, *mt;
- /* Ensure BTF type is emitted for "struct bpf_struct_ops_##_name" */
-#define BPF_STRUCT_OPS_TYPE(_name) BTF_TYPE_EMIT(struct bpf_struct_ops_##_name);
-#include "bpf_struct_ops_types.h"
-#undef BPF_STRUCT_OPS_TYPE
+ vt = btf_type_by_id(btf, value_id);
+ if (btf_vlen(vt) != 2) {
+ pr_warn("The number of %s's members should be 2, but we get %d\n",
+ value_name, btf_vlen(vt));
+ return false;
+ }
+ member = btf_type_member(vt);
+ mt = btf_type_by_id(btf, member->type);
+ common_value_type = btf_type_by_id(btf_vmlinux,
+ st_ops_ids[IDX_ST_OPS_COMMON_VALUE_ID]);
+ if (mt != common_value_type) {
+ pr_warn("The first member of %s should be bpf_struct_ops_common_value\n",
+ value_name);
+ return false;
+ }
+ member++;
+ mt = btf_type_by_id(btf, member->type);
+ if (mt != type) {
+ pr_warn("The second member of %s should be %s\n",
+ value_name, btf_name_by_offset(btf, type->name_off));
+ return false;
+ }
- module_id = btf_find_by_name_kind(btf, "module", BTF_KIND_STRUCT);
- if (module_id < 0) {
- pr_warn("Cannot find struct module in btf_vmlinux\n");
- return;
+ return true;
+}
+
+static void *bpf_struct_ops_image_alloc(void)
+{
+ void *image;
+ int err;
+
+ err = bpf_jit_charge_modmem(PAGE_SIZE);
+ if (err)
+ return ERR_PTR(err);
+ image = arch_alloc_bpf_trampoline(PAGE_SIZE);
+ if (!image) {
+ bpf_jit_uncharge_modmem(PAGE_SIZE);
+ return ERR_PTR(-ENOMEM);
}
- module_type = btf_type_by_id(btf, module_id);
- for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
- st_ops = bpf_struct_ops[i];
+ return image;
+}
- if (strlen(st_ops->name) + VALUE_PREFIX_LEN >=
- sizeof(value_name)) {
- pr_warn("struct_ops name %s is too long\n",
- st_ops->name);
+void bpf_struct_ops_image_free(void *image)
+{
+ if (image) {
+ arch_free_bpf_trampoline(image, PAGE_SIZE);
+ bpf_jit_uncharge_modmem(PAGE_SIZE);
+ }
+}
+
+#define MAYBE_NULL_SUFFIX "__nullable"
+#define REFCOUNTED_SUFFIX "__ref"
+
+/* Prepare argument info for every nullable argument of a member of a
+ * struct_ops type.
+ *
+ * Initialize a struct bpf_struct_ops_arg_info according to type info of
+ * the arguments of a stub function. (Check kCFI for more information about
+ * stub functions.)
+ *
+ * Each member in the struct_ops type has a struct bpf_struct_ops_arg_info
+ * to provide an array of struct bpf_ctx_arg_aux, which in turn provides
+ * the information that used by the verifier to check the arguments of the
+ * BPF struct_ops program assigned to the member. Here, we only care about
+ * the arguments that are marked as __nullable.
+ *
+ * The array of struct bpf_ctx_arg_aux is eventually assigned to
+ * prog->aux->ctx_arg_info of BPF struct_ops programs and passed to the
+ * verifier. (See check_struct_ops_btf_id())
+ *
+ * arg_info->info will be the list of struct bpf_ctx_arg_aux if success. If
+ * fails, it will be kept untouched.
+ */
+static int prepare_arg_info(struct btf *btf,
+ const char *st_ops_name,
+ const char *member_name,
+ const struct btf_type *func_proto, void *stub_func_addr,
+ struct bpf_struct_ops_arg_info *arg_info)
+{
+ const struct btf_type *stub_func_proto, *pointed_type;
+ bool is_nullable = false, is_refcounted = false;
+ const struct btf_param *stub_args, *args;
+ struct bpf_ctx_arg_aux *info, *info_buf;
+ u32 nargs, arg_no, info_cnt = 0;
+ char ksym[KSYM_SYMBOL_LEN];
+ const char *stub_fname;
+ const char *suffix;
+ s32 stub_func_id;
+ u32 arg_btf_id;
+ int offset;
+
+ stub_fname = kallsyms_lookup((unsigned long)stub_func_addr, NULL, NULL, NULL, ksym);
+ if (!stub_fname) {
+ pr_warn("Cannot find the stub function name for the %s in struct %s\n",
+ member_name, st_ops_name);
+ return -ENOENT;
+ }
+
+ stub_func_id = btf_find_by_name_kind(btf, stub_fname, BTF_KIND_FUNC);
+ if (stub_func_id < 0) {
+ pr_warn("Cannot find the stub function %s in btf\n", stub_fname);
+ return -ENOENT;
+ }
+
+ stub_func_proto = btf_type_by_id(btf, stub_func_id);
+ stub_func_proto = btf_type_by_id(btf, stub_func_proto->type);
+
+ /* Check if the number of arguments of the stub function is the same
+ * as the number of arguments of the function pointer.
+ */
+ nargs = btf_type_vlen(func_proto);
+ if (nargs != btf_type_vlen(stub_func_proto)) {
+ pr_warn("the number of arguments of the stub function %s does not match the number of arguments of the member %s of struct %s\n",
+ stub_fname, member_name, st_ops_name);
+ return -EINVAL;
+ }
+
+ if (!nargs)
+ return 0;
+
+ args = btf_params(func_proto);
+ stub_args = btf_params(stub_func_proto);
+
+ info_buf = kcalloc(nargs, sizeof(*info_buf), GFP_KERNEL);
+ if (!info_buf)
+ return -ENOMEM;
+
+ /* Prepare info for every nullable argument */
+ info = info_buf;
+ for (arg_no = 0; arg_no < nargs; arg_no++) {
+ /* Skip arguments that is not suffixed with
+ * "__nullable or __ref".
+ */
+ is_nullable = btf_param_match_suffix(btf, &stub_args[arg_no],
+ MAYBE_NULL_SUFFIX);
+ is_refcounted = btf_param_match_suffix(btf, &stub_args[arg_no],
+ REFCOUNTED_SUFFIX);
+
+ if (is_nullable)
+ suffix = MAYBE_NULL_SUFFIX;
+ else if (is_refcounted)
+ suffix = REFCOUNTED_SUFFIX;
+ else
continue;
+
+ /* Should be a pointer to struct */
+ pointed_type = btf_type_resolve_ptr(btf,
+ args[arg_no].type,
+ &arg_btf_id);
+ if (!pointed_type ||
+ !btf_type_is_struct(pointed_type)) {
+ pr_warn("stub function %s has %s tagging to an unsupported type\n",
+ stub_fname, suffix);
+ goto err_out;
}
- sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name);
- value_id = btf_find_by_name_kind(btf, value_name,
- BTF_KIND_STRUCT);
- if (value_id < 0) {
- pr_warn("Cannot find struct %s in btf_vmlinux\n",
- value_name);
- continue;
+ offset = btf_ctx_arg_offset(btf, func_proto, arg_no);
+ if (offset < 0) {
+ pr_warn("stub function %s has an invalid trampoline ctx offset for arg#%u\n",
+ stub_fname, arg_no);
+ goto err_out;
}
- type_id = btf_find_by_name_kind(btf, st_ops->name,
- BTF_KIND_STRUCT);
- if (type_id < 0) {
- pr_warn("Cannot find struct %s in btf_vmlinux\n",
- st_ops->name);
- continue;
+ if (args[arg_no].type != stub_args[arg_no].type) {
+ pr_warn("arg#%u type in stub function %s does not match with its original func_proto\n",
+ arg_no, stub_fname);
+ goto err_out;
}
- t = btf_type_by_id(btf, type_id);
- if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) {
- pr_warn("Cannot support #%u members in struct %s\n",
- btf_type_vlen(t), st_ops->name);
- continue;
+
+ /* Fill the information of the new argument */
+ info->btf_id = arg_btf_id;
+ info->btf = btf;
+ info->offset = offset;
+ if (is_nullable) {
+ info->reg_type = PTR_TRUSTED | PTR_TO_BTF_ID | PTR_MAYBE_NULL;
+ } else if (is_refcounted) {
+ info->reg_type = PTR_TRUSTED | PTR_TO_BTF_ID;
+ info->refcounted = true;
}
- for_each_member(j, t, member) {
- const struct btf_type *func_proto;
+ info++;
+ info_cnt++;
+ }
- mname = btf_name_by_offset(btf, member->name_off);
- if (!*mname) {
- pr_warn("anon member in struct %s is not supported\n",
- st_ops->name);
- break;
- }
+ if (info_cnt) {
+ arg_info->info = info_buf;
+ arg_info->cnt = info_cnt;
+ } else {
+ kfree(info_buf);
+ }
- if (__btf_member_bitfield_size(t, member)) {
- pr_warn("bit field member %s in struct %s is not supported\n",
- mname, st_ops->name);
- break;
- }
+ return 0;
- func_proto = btf_type_resolve_func_ptr(btf,
- member->type,
- NULL);
- if (func_proto &&
- btf_distill_func_proto(log, btf,
- func_proto, mname,
- &st_ops->func_models[j])) {
- pr_warn("Error in parsing func ptr %s in struct %s\n",
- mname, st_ops->name);
- break;
- }
- }
+err_out:
+ kfree(info_buf);
- if (j == btf_type_vlen(t)) {
- if (st_ops->init(btf)) {
- pr_warn("Error in init bpf_struct_ops %s\n",
- st_ops->name);
- } else {
- st_ops->type_id = type_id;
- st_ops->type = t;
- st_ops->value_id = value_id;
- st_ops->value_type = btf_type_by_id(btf,
- value_id);
- }
- }
- }
+ return -EINVAL;
}
-extern struct btf *btf_vmlinux;
+/* Clean up the arg_info in a struct bpf_struct_ops_desc. */
+void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_ops_desc)
+{
+ struct bpf_struct_ops_arg_info *arg_info;
+ int i;
+
+ arg_info = st_ops_desc->arg_info;
+ for (i = 0; i < btf_type_vlen(st_ops_desc->type); i++)
+ kfree(arg_info[i].info);
+
+ kfree(arg_info);
+}
-static const struct bpf_struct_ops *
-bpf_struct_ops_find_value(u32 value_id)
+static bool is_module_member(const struct btf *btf, u32 id)
{
- unsigned int i;
+ const struct btf_type *t;
- if (!value_id || !btf_vmlinux)
- return NULL;
+ t = btf_type_resolve_ptr(btf, id, NULL);
+ if (!t)
+ return false;
- for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
- if (bpf_struct_ops[i]->value_id == value_id)
- return bpf_struct_ops[i];
- }
+ if (!__btf_type_is_struct(t) && !btf_type_is_fwd(t))
+ return false;
- return NULL;
+ return !strcmp(btf_name_by_offset(btf, t->name_off), "module");
}
-const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id)
+int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff)
{
- unsigned int i;
+ void *func_ptr = *(void **)(st_ops->cfi_stubs + moff);
+
+ return func_ptr ? 0 : -ENOTSUPP;
+}
+
+int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
+ struct btf *btf,
+ struct bpf_verifier_log *log)
+{
+ struct bpf_struct_ops *st_ops = st_ops_desc->st_ops;
+ struct bpf_struct_ops_arg_info *arg_info;
+ const struct btf_member *member;
+ const struct btf_type *t;
+ s32 type_id, value_id;
+ char value_name[128];
+ const char *mname;
+ int i, err;
- if (!type_id || !btf_vmlinux)
- return NULL;
+ if (strlen(st_ops->name) + VALUE_PREFIX_LEN >=
+ sizeof(value_name)) {
+ pr_warn("struct_ops name %s is too long\n",
+ st_ops->name);
+ return -EINVAL;
+ }
+ sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name);
+
+ if (!st_ops->cfi_stubs) {
+ pr_warn("struct_ops for %s has no cfi_stubs\n", st_ops->name);
+ return -EINVAL;
+ }
- for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
- if (bpf_struct_ops[i]->type_id == type_id)
- return bpf_struct_ops[i];
+ type_id = btf_find_by_name_kind(btf, st_ops->name,
+ BTF_KIND_STRUCT);
+ if (type_id < 0) {
+ pr_warn("Cannot find struct %s in %s\n",
+ st_ops->name, btf_get_name(btf));
+ return -EINVAL;
+ }
+ t = btf_type_by_id(btf, type_id);
+ if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) {
+ pr_warn("Cannot support #%u members in struct %s\n",
+ btf_type_vlen(t), st_ops->name);
+ return -EINVAL;
+ }
+
+ value_id = btf_find_by_name_kind(btf, value_name,
+ BTF_KIND_STRUCT);
+ if (value_id < 0) {
+ pr_warn("Cannot find struct %s in %s\n",
+ value_name, btf_get_name(btf));
+ return -EINVAL;
}
+ if (!is_valid_value_type(btf, value_id, t, value_name))
+ return -EINVAL;
+
+ arg_info = kcalloc(btf_type_vlen(t), sizeof(*arg_info),
+ GFP_KERNEL);
+ if (!arg_info)
+ return -ENOMEM;
+
+ st_ops_desc->arg_info = arg_info;
+ st_ops_desc->type = t;
+ st_ops_desc->type_id = type_id;
+ st_ops_desc->value_id = value_id;
+ st_ops_desc->value_type = btf_type_by_id(btf, value_id);
+
+ for_each_member(i, t, member) {
+ const struct btf_type *func_proto, *ret_type;
+ void **stub_func_addr;
+ u32 moff;
+
+ moff = __btf_member_bit_offset(t, member) / 8;
+ mname = btf_name_by_offset(btf, member->name_off);
+ if (!*mname) {
+ pr_warn("anon member in struct %s is not supported\n",
+ st_ops->name);
+ err = -EOPNOTSUPP;
+ goto errout;
+ }
- return NULL;
+ if (__btf_member_bitfield_size(t, member)) {
+ pr_warn("bit field member %s in struct %s is not supported\n",
+ mname, st_ops->name);
+ err = -EOPNOTSUPP;
+ goto errout;
+ }
+
+ if (!st_ops_ids[IDX_MODULE_ID] && is_module_member(btf, member->type)) {
+ pr_warn("'struct module' btf id not found. Is CONFIG_MODULES enabled? bpf_struct_ops '%s' needs module support.\n",
+ st_ops->name);
+ err = -EOPNOTSUPP;
+ goto errout;
+ }
+
+ func_proto = btf_type_resolve_func_ptr(btf,
+ member->type,
+ NULL);
+
+ /* The member is not a function pointer or
+ * the function pointer is not supported.
+ */
+ if (!func_proto || bpf_struct_ops_supported(st_ops, moff))
+ continue;
+
+ if (func_proto->type) {
+ ret_type = btf_type_resolve_ptr(btf, func_proto->type, NULL);
+ if (ret_type && !__btf_type_is_struct(ret_type)) {
+ pr_warn("func ptr %s in struct %s returns non-struct pointer, which is not supported\n",
+ mname, st_ops->name);
+ err = -EOPNOTSUPP;
+ goto errout;
+ }
+ }
+
+ if (btf_distill_func_proto(log, btf,
+ func_proto, mname,
+ &st_ops->func_models[i])) {
+ pr_warn("Error in parsing func ptr %s in struct %s\n",
+ mname, st_ops->name);
+ err = -EINVAL;
+ goto errout;
+ }
+
+ stub_func_addr = *(void **)(st_ops->cfi_stubs + moff);
+ err = prepare_arg_info(btf, st_ops->name, mname,
+ func_proto, stub_func_addr,
+ arg_info + i);
+ if (err)
+ goto errout;
+ }
+
+ if (st_ops->init(btf)) {
+ pr_warn("Error in init bpf_struct_ops %s\n",
+ st_ops->name);
+ err = -EINVAL;
+ goto errout;
+ }
+
+ return 0;
+
+errout:
+ bpf_struct_ops_desc_release(st_ops_desc);
+
+ return err;
}
static int bpf_struct_ops_map_get_next_key(struct bpf_map *map, void *key,
@@ -248,13 +485,14 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
struct bpf_struct_ops_value *uvalue, *kvalue;
enum bpf_struct_ops_state state;
+ s64 refcnt;
if (unlikely(*(u32 *)key != 0))
return -ENOENT;
kvalue = &st_map->kvalue;
/* Pair with smp_store_release() during map_update */
- state = smp_load_acquire(&kvalue->state);
+ state = smp_load_acquire(&kvalue->common.state);
if (state == BPF_STRUCT_OPS_STATE_INIT) {
memset(value, 0, map->value_size);
return 0;
@@ -263,10 +501,17 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
/* No lock is needed. state and refcnt do not need
* to be updated together under atomic context.
*/
- uvalue = (struct bpf_struct_ops_value *)value;
+ uvalue = value;
memcpy(uvalue, st_map->uvalue, map->value_size);
- uvalue->state = state;
- refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt));
+ uvalue->common.state = state;
+
+ /* This value offers the user space a general estimate of how
+ * many sockets are still utilizing this struct_ops for TCP
+ * congestion control. The number might not be exact, but it
+ * should sufficiently meet our present goals.
+ */
+ refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt);
+ refcount_set(&uvalue->common.refcnt, max_t(s64, refcnt, 0));
return 0;
}
@@ -278,18 +523,26 @@ static void *bpf_struct_ops_map_lookup_elem(struct bpf_map *map, void *key)
static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
{
- const struct btf_type *t = st_map->st_ops->type;
u32 i;
- for (i = 0; i < btf_type_vlen(t); i++) {
- if (st_map->progs[i]) {
- bpf_prog_put(st_map->progs[i]);
- st_map->progs[i] = NULL;
- }
+ for (i = 0; i < st_map->funcs_cnt; i++) {
+ if (!st_map->links[i])
+ break;
+ bpf_link_put(st_map->links[i]);
+ st_map->links[i] = NULL;
}
}
-static int check_zero_holes(const struct btf_type *t, void *data)
+static void bpf_struct_ops_map_free_image(struct bpf_struct_ops_map *st_map)
+{
+ int i;
+
+ for (i = 0; i < st_map->image_pages_cnt; i++)
+ bpf_struct_ops_image_free(st_map->image_pages[i]);
+ st_map->image_pages_cnt = 0;
+}
+
+static int check_zero_holes(const struct btf *btf, const struct btf_type *t, void *data)
{
const struct btf_member *member;
u32 i, moff, msize, prev_mend = 0;
@@ -301,8 +554,8 @@ static int check_zero_holes(const struct btf_type *t, void *data)
memchr_inv(data + prev_mend, 0, moff - prev_mend))
return -EINVAL;
- mtype = btf_type_by_id(btf_vmlinux, member->type);
- mtype = btf_resolve_size(btf_vmlinux, mtype, &msize);
+ mtype = btf_type_by_id(btf, member->type);
+ mtype = btf_resolve_size(btf, mtype, &msize);
if (IS_ERR(mtype))
return PTR_ERR(mtype);
prev_mend = moff + msize;
@@ -315,33 +568,129 @@ static int check_zero_holes(const struct btf_type *t, void *data)
return 0;
}
-int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_progs *tprogs,
- struct bpf_prog *prog,
+static void bpf_struct_ops_link_release(struct bpf_link *link)
+{
+}
+
+static void bpf_struct_ops_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_tramp_link *tlink = container_of(link, struct bpf_tramp_link, link);
+
+ kfree(tlink);
+}
+
+const struct bpf_link_ops bpf_struct_ops_link_lops = {
+ .release = bpf_struct_ops_link_release,
+ .dealloc = bpf_struct_ops_link_dealloc,
+};
+
+int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
+ struct bpf_tramp_link *link,
const struct btf_func_model *model,
- void *image, void *image_end)
+ void *stub_func,
+ void **_image, u32 *_image_off,
+ bool allow_alloc)
+{
+ u32 image_off = *_image_off, flags = BPF_TRAMP_F_INDIRECT;
+ void *image = *_image;
+ int size;
+
+ tlinks[BPF_TRAMP_FENTRY].links[0] = link;
+ tlinks[BPF_TRAMP_FENTRY].nr_links = 1;
+
+ if (model->ret_size > 0)
+ flags |= BPF_TRAMP_F_RET_FENTRY_RET;
+
+ size = arch_bpf_trampoline_size(model, flags, tlinks, stub_func);
+ if (size <= 0)
+ return size ? : -EFAULT;
+
+ /* Allocate image buffer if necessary */
+ if (!image || size > PAGE_SIZE - image_off) {
+ if (!allow_alloc)
+ return -E2BIG;
+
+ image = bpf_struct_ops_image_alloc();
+ if (IS_ERR(image))
+ return PTR_ERR(image);
+ image_off = 0;
+ }
+
+ size = arch_prepare_bpf_trampoline(NULL, image + image_off,
+ image + image_off + size,
+ model, flags, tlinks, stub_func);
+ if (size <= 0) {
+ if (image != *_image)
+ bpf_struct_ops_image_free(image);
+ return size ? : -EFAULT;
+ }
+
+ *_image = image;
+ *_image_off = image_off + size;
+ return 0;
+}
+
+static void bpf_struct_ops_ksym_init(const char *tname, const char *mname,
+ void *image, unsigned int size,
+ struct bpf_ksym *ksym)
+{
+ snprintf(ksym->name, KSYM_NAME_LEN, "bpf__%s_%s", tname, mname);
+ INIT_LIST_HEAD_RCU(&ksym->lnode);
+ bpf_image_ksym_init(image, size, ksym);
+}
+
+static void bpf_struct_ops_map_add_ksyms(struct bpf_struct_ops_map *st_map)
+{
+ u32 i;
+
+ for (i = 0; i < st_map->funcs_cnt; i++) {
+ if (!st_map->ksyms[i])
+ break;
+ bpf_image_ksym_add(st_map->ksyms[i]);
+ }
+}
+
+static void bpf_struct_ops_map_del_ksyms(struct bpf_struct_ops_map *st_map)
+{
+ u32 i;
+
+ for (i = 0; i < st_map->funcs_cnt; i++) {
+ if (!st_map->ksyms[i])
+ break;
+ bpf_image_ksym_del(st_map->ksyms[i]);
+ }
+}
+
+static void bpf_struct_ops_map_free_ksyms(struct bpf_struct_ops_map *st_map)
{
- u32 flags;
+ u32 i;
- tprogs[BPF_TRAMP_FENTRY].progs[0] = prog;
- tprogs[BPF_TRAMP_FENTRY].nr_progs = 1;
- flags = model->ret_size > 0 ? BPF_TRAMP_F_RET_FENTRY_RET : 0;
- return arch_prepare_bpf_trampoline(NULL, image, image_end,
- model, flags, tprogs, NULL);
+ for (i = 0; i < st_map->funcs_cnt; i++) {
+ if (!st_map->ksyms[i])
+ break;
+ kfree(st_map->ksyms[i]);
+ st_map->ksyms[i] = NULL;
+ }
}
-static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 flags)
+static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
- const struct bpf_struct_ops *st_ops = st_map->st_ops;
+ const struct bpf_struct_ops_desc *st_ops_desc = st_map->st_ops_desc;
+ const struct bpf_struct_ops *st_ops = st_ops_desc->st_ops;
struct bpf_struct_ops_value *uvalue, *kvalue;
+ const struct btf_type *module_type;
const struct btf_member *member;
- const struct btf_type *t = st_ops->type;
- struct bpf_tramp_progs *tprogs = NULL;
+ const struct btf_type *t = st_ops_desc->type;
+ struct bpf_tramp_links *tlinks;
void *udata, *kdata;
- int prog_fd, err = 0;
- void *image, *image_end;
- u32 i;
+ int prog_fd, err;
+ u32 i, trampoline_start, image_off = 0;
+ void *cur_image = NULL, *image = NULL;
+ struct bpf_link **plink;
+ struct bpf_ksym **pksym;
+ const char *tname, *mname;
if (flags)
return -EINVAL;
@@ -349,20 +698,20 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
if (*(u32 *)key != 0)
return -E2BIG;
- err = check_zero_holes(st_ops->value_type, value);
+ err = check_zero_holes(st_map->btf, st_ops_desc->value_type, value);
if (err)
return err;
- uvalue = (struct bpf_struct_ops_value *)value;
- err = check_zero_holes(t, uvalue->data);
+ uvalue = value;
+ err = check_zero_holes(st_map->btf, t, uvalue->data);
if (err)
return err;
- if (uvalue->state || refcount_read(&uvalue->refcnt))
+ if (uvalue->common.state || refcount_read(&uvalue->common.refcnt))
return -EINVAL;
- tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL);
- if (!tprogs)
+ tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL);
+ if (!tlinks)
return -ENOMEM;
uvalue = (struct bpf_struct_ops_value *)st_map->uvalue;
@@ -370,7 +719,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
mutex_lock(&st_map->lock);
- if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT) {
+ if (kvalue->common.state != BPF_STRUCT_OPS_STATE_INIT) {
err = -EBUSY;
goto unlock;
}
@@ -379,16 +728,21 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
udata = &uvalue->data;
kdata = &kvalue->data;
- image = st_map->image;
- image_end = st_map->image + PAGE_SIZE;
+ plink = st_map->links;
+ pksym = st_map->ksyms;
+ tname = btf_name_by_offset(st_map->btf, t->name_off);
+ module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]);
for_each_member(i, t, member) {
const struct btf_type *mtype, *ptype;
struct bpf_prog *prog;
+ struct bpf_tramp_link *link;
+ struct bpf_ksym *ksym;
u32 moff;
moff = __btf_member_bit_offset(t, member) / 8;
- ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL);
+ mname = btf_name_by_offset(st_map->btf, member->name_off);
+ ptype = btf_type_resolve_ptr(st_map->btf, member->type, NULL);
if (ptype == module_type) {
if (*(void **)(udata + moff))
goto reset_unlock;
@@ -413,8 +767,8 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
if (!ptype || !btf_type_is_func_proto(ptype)) {
u32 msize;
- mtype = btf_type_by_id(btf_vmlinux, member->type);
- mtype = btf_resolve_size(btf_vmlinux, mtype, &msize);
+ mtype = btf_type_by_id(st_map->btf, member->type);
+ mtype = btf_resolve_size(st_map->btf, mtype, &msize);
if (IS_ERR(mtype)) {
err = PTR_ERR(mtype);
goto reset_unlock;
@@ -438,77 +792,134 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
err = PTR_ERR(prog);
goto reset_unlock;
}
- st_map->progs[i] = prog;
if (prog->type != BPF_PROG_TYPE_STRUCT_OPS ||
- prog->aux->attach_btf_id != st_ops->type_id ||
+ prog->aux->attach_btf_id != st_ops_desc->type_id ||
prog->expected_attach_type != i) {
+ bpf_prog_put(prog);
err = -EINVAL;
goto reset_unlock;
}
- err = bpf_struct_ops_prepare_trampoline(tprogs, prog,
- &st_ops->func_models[i],
- image, image_end);
- if (err < 0)
+ link = kzalloc(sizeof(*link), GFP_USER);
+ if (!link) {
+ bpf_prog_put(prog);
+ err = -ENOMEM;
+ goto reset_unlock;
+ }
+ bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS,
+ &bpf_struct_ops_link_lops, prog, prog->expected_attach_type);
+ *plink++ = &link->link;
+
+ ksym = kzalloc(sizeof(*ksym), GFP_USER);
+ if (!ksym) {
+ err = -ENOMEM;
+ goto reset_unlock;
+ }
+ *pksym++ = ksym;
+
+ trampoline_start = image_off;
+ err = bpf_struct_ops_prepare_trampoline(tlinks, link,
+ &st_ops->func_models[i],
+ *(void **)(st_ops->cfi_stubs + moff),
+ &image, &image_off,
+ st_map->image_pages_cnt < MAX_TRAMP_IMAGE_PAGES);
+ if (err)
goto reset_unlock;
- *(void **)(kdata + moff) = image;
- image += err;
+ if (cur_image != image) {
+ st_map->image_pages[st_map->image_pages_cnt++] = image;
+ cur_image = image;
+ trampoline_start = 0;
+ }
+
+ *(void **)(kdata + moff) = image + trampoline_start + cfi_get_offset();
/* put prog_id to udata */
*(unsigned long *)(udata + moff) = prog->aux->id;
+
+ /* init ksym for this trampoline */
+ bpf_struct_ops_ksym_init(tname, mname,
+ image + trampoline_start,
+ image_off - trampoline_start,
+ ksym);
}
- refcount_set(&kvalue->refcnt, 1);
- bpf_map_inc(map);
+ if (st_ops->validate) {
+ err = st_ops->validate(kdata);
+ if (err)
+ goto reset_unlock;
+ }
+ for (i = 0; i < st_map->image_pages_cnt; i++) {
+ err = arch_protect_bpf_trampoline(st_map->image_pages[i],
+ PAGE_SIZE);
+ if (err)
+ goto reset_unlock;
+ }
+
+ if (st_map->map.map_flags & BPF_F_LINK) {
+ err = 0;
+ /* Let bpf_link handle registration & unregistration.
+ *
+ * Pair with smp_load_acquire() during lookup_elem().
+ */
+ smp_store_release(&kvalue->common.state, BPF_STRUCT_OPS_STATE_READY);
+ goto unlock;
+ }
- set_memory_ro((long)st_map->image, 1);
- set_memory_x((long)st_map->image, 1);
- err = st_ops->reg(kdata);
+ err = st_ops->reg(kdata, NULL);
if (likely(!err)) {
+ /* This refcnt increment on the map here after
+ * 'st_ops->reg()' is secure since the state of the
+ * map must be set to INIT at this moment, and thus
+ * bpf_struct_ops_map_delete_elem() can't unregister
+ * or transition it to TOBEFREE concurrently.
+ */
+ bpf_map_inc(map);
/* Pair with smp_load_acquire() during lookup_elem().
* It ensures the above udata updates (e.g. prog->aux->id)
* can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
*/
- smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_INUSE);
+ smp_store_release(&kvalue->common.state, BPF_STRUCT_OPS_STATE_INUSE);
goto unlock;
}
- /* Error during st_ops->reg(). It is very unlikely since
- * the above init_member() should have caught it earlier
- * before reg(). The only possibility is if there was a race
- * in registering the struct_ops (under the same name) to
+ /* Error during st_ops->reg(). Can happen if this struct_ops needs to be
+ * verified as a whole, after all init_member() calls. Can also happen if
+ * there was a race in registering the struct_ops (under the same name) to
* a sub-system through different struct_ops's maps.
*/
- set_memory_nx((long)st_map->image, 1);
- set_memory_rw((long)st_map->image, 1);
- bpf_map_put(map);
reset_unlock:
+ bpf_struct_ops_map_free_ksyms(st_map);
+ bpf_struct_ops_map_free_image(st_map);
bpf_struct_ops_map_put_progs(st_map);
memset(uvalue, 0, map->value_size);
memset(kvalue, 0, map->value_size);
unlock:
- kfree(tprogs);
+ kfree(tlinks);
mutex_unlock(&st_map->lock);
+ if (!err)
+ bpf_struct_ops_map_add_ksyms(st_map);
return err;
}
-static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
+static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
{
enum bpf_struct_ops_state prev_state;
struct bpf_struct_ops_map *st_map;
st_map = (struct bpf_struct_ops_map *)map;
- prev_state = cmpxchg(&st_map->kvalue.state,
+ if (st_map->map.map_flags & BPF_F_LINK)
+ return -EOPNOTSUPP;
+
+ prev_state = cmpxchg(&st_map->kvalue.common.state,
BPF_STRUCT_OPS_STATE_INUSE,
BPF_STRUCT_OPS_STATE_TOBEFREE);
switch (prev_state) {
case BPF_STRUCT_OPS_STATE_INUSE:
- st_map->st_ops->unreg(&st_map->kvalue.data);
- if (refcount_dec_and_test(&st_map->kvalue.refcnt))
- bpf_map_put(map);
+ st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, NULL);
+ bpf_map_put(map);
return 0;
case BPF_STRUCT_OPS_STATE_TOBEFREE:
return -EINPROGRESS;
@@ -524,6 +935,7 @@ static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
struct seq_file *m)
{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
void *value;
int err;
@@ -533,54 +945,134 @@ static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
err = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
if (!err) {
- btf_type_seq_show(btf_vmlinux, map->btf_vmlinux_value_type_id,
+ btf_type_seq_show(st_map->btf,
+ map->btf_vmlinux_value_type_id,
value, m);
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
}
kfree(value);
}
-static void bpf_struct_ops_map_free(struct bpf_map *map)
+static void __bpf_struct_ops_map_free(struct bpf_map *map)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
- if (st_map->progs)
+ if (st_map->links)
bpf_struct_ops_map_put_progs(st_map);
- bpf_map_area_free(st_map->progs);
- bpf_jit_free_exec(st_map->image);
+ if (st_map->ksyms)
+ bpf_struct_ops_map_free_ksyms(st_map);
+ bpf_map_area_free(st_map->links);
+ bpf_map_area_free(st_map->ksyms);
+ bpf_struct_ops_map_free_image(st_map);
bpf_map_area_free(st_map->uvalue);
bpf_map_area_free(st_map);
}
+static void bpf_struct_ops_map_free(struct bpf_map *map)
+{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+
+ /* st_ops->owner was acquired during map_alloc to implicitly holds
+ * the btf's refcnt. The acquire was only done when btf_is_module()
+ * st_map->btf cannot be NULL here.
+ */
+ if (btf_is_module(st_map->btf))
+ module_put(st_map->st_ops_desc->st_ops->owner);
+
+ bpf_struct_ops_map_del_ksyms(st_map);
+
+ /* The struct_ops's function may switch to another struct_ops.
+ *
+ * For example, bpf_tcp_cc_x->init() may switch to
+ * another tcp_cc_y by calling
+ * setsockopt(TCP_CONGESTION, "tcp_cc_y").
+ * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
+ * and its refcount may reach 0 which then free its
+ * trampoline image while tcp_cc_x is still running.
+ *
+ * A vanilla rcu gp is to wait for all bpf-tcp-cc prog
+ * to finish. bpf-tcp-cc prog is non sleepable.
+ * A rcu_tasks gp is to wait for the last few insn
+ * in the tramopline image to finish before releasing
+ * the trampoline image.
+ */
+ synchronize_rcu_mult(call_rcu, call_rcu_tasks);
+
+ __bpf_struct_ops_map_free(map);
+}
+
static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
{
if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
- attr->map_flags || !attr->btf_vmlinux_value_type_id)
+ (attr->map_flags & ~(BPF_F_LINK | BPF_F_VTYPE_BTF_OBJ_FD)) ||
+ !attr->btf_vmlinux_value_type_id)
return -EINVAL;
return 0;
}
+static u32 count_func_ptrs(const struct btf *btf, const struct btf_type *t)
+{
+ int i;
+ u32 count;
+ const struct btf_member *member;
+
+ count = 0;
+ for_each_member(i, t, member)
+ if (btf_type_resolve_func_ptr(btf, member->type, NULL))
+ count++;
+ return count;
+}
+
static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
{
- const struct bpf_struct_ops *st_ops;
+ const struct bpf_struct_ops_desc *st_ops_desc;
size_t st_map_size;
struct bpf_struct_ops_map *st_map;
const struct btf_type *t, *vt;
+ struct module *mod = NULL;
struct bpf_map *map;
+ struct btf *btf;
+ int ret;
+
+ if (attr->map_flags & BPF_F_VTYPE_BTF_OBJ_FD) {
+ /* The map holds btf for its whole life time. */
+ btf = btf_get_by_fd(attr->value_type_btf_obj_fd);
+ if (IS_ERR(btf))
+ return ERR_CAST(btf);
+ if (!btf_is_module(btf)) {
+ btf_put(btf);
+ return ERR_PTR(-EINVAL);
+ }
- if (!bpf_capable())
- return ERR_PTR(-EPERM);
+ mod = btf_try_get_module(btf);
+ /* mod holds a refcnt to btf. We don't need an extra refcnt
+ * here.
+ */
+ btf_put(btf);
+ if (!mod)
+ return ERR_PTR(-EINVAL);
+ } else {
+ btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(btf))
+ return ERR_CAST(btf);
+ if (!btf)
+ return ERR_PTR(-ENOTSUPP);
+ }
- st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id);
- if (!st_ops)
- return ERR_PTR(-ENOTSUPP);
+ st_ops_desc = bpf_struct_ops_find_value(btf, attr->btf_vmlinux_value_type_id);
+ if (!st_ops_desc) {
+ ret = -ENOTSUPP;
+ goto errout;
+ }
- vt = st_ops->value_type;
- if (attr->value_size != vt->size)
- return ERR_PTR(-EINVAL);
+ vt = st_ops_desc->value_type;
+ if (attr->value_size != vt->size) {
+ ret = -EINVAL;
+ goto errout;
+ }
- t = st_ops->type;
+ t = st_ops_desc->type;
st_map_size = sizeof(*st_map) +
/* kvalue stores the
@@ -589,30 +1081,59 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
(vt->size - sizeof(struct bpf_struct_ops_value));
st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE);
- if (!st_map)
- return ERR_PTR(-ENOMEM);
+ if (!st_map) {
+ ret = -ENOMEM;
+ goto errout;
+ }
- st_map->st_ops = st_ops;
+ st_map->st_ops_desc = st_ops_desc;
map = &st_map->map;
st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
- st_map->progs =
- bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_prog *),
+ st_map->funcs_cnt = count_func_ptrs(btf, t);
+ st_map->links =
+ bpf_map_area_alloc(st_map->funcs_cnt * sizeof(struct bpf_link *),
NUMA_NO_NODE);
- st_map->image = bpf_jit_alloc_exec(PAGE_SIZE);
- if (!st_map->uvalue || !st_map->progs || !st_map->image) {
- bpf_struct_ops_map_free(map);
- return ERR_PTR(-ENOMEM);
+
+ st_map->ksyms =
+ bpf_map_area_alloc(st_map->funcs_cnt * sizeof(struct bpf_ksym *),
+ NUMA_NO_NODE);
+ if (!st_map->uvalue || !st_map->links || !st_map->ksyms) {
+ ret = -ENOMEM;
+ goto errout_free;
}
+ st_map->btf = btf;
mutex_init(&st_map->lock);
- set_vm_flush_reset_perms(st_map->image);
bpf_map_init_from_attr(map, attr);
return map;
+
+errout_free:
+ __bpf_struct_ops_map_free(map);
+errout:
+ module_put(mod);
+
+ return ERR_PTR(ret);
}
-static int bpf_struct_ops_map_btf_id;
+static u64 bpf_struct_ops_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+ const struct bpf_struct_ops_desc *st_ops_desc = st_map->st_ops_desc;
+ const struct btf_type *vt = st_ops_desc->value_type;
+ u64 usage;
+
+ usage = sizeof(*st_map) +
+ vt->size - sizeof(struct bpf_struct_ops_value);
+ usage += vt->size;
+ usage += st_map->funcs_cnt * sizeof(struct bpf_link *);
+ usage += st_map->funcs_cnt * sizeof(struct bpf_ksym *);
+ usage += PAGE_SIZE;
+ return usage;
+}
+
+BTF_ID_LIST_SINGLE(bpf_struct_ops_map_btf_ids, struct, bpf_struct_ops_map)
const struct bpf_map_ops bpf_struct_ops_map_ops = {
.map_alloc_check = bpf_struct_ops_map_alloc_check,
.map_alloc = bpf_struct_ops_map_alloc,
@@ -622,8 +1143,8 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
.map_delete_elem = bpf_struct_ops_map_delete_elem,
.map_update_elem = bpf_struct_ops_map_update_elem,
.map_seq_show_elem = bpf_struct_ops_map_seq_show_elem,
- .map_btf_name = "bpf_struct_ops_map",
- .map_btf_id = &bpf_struct_ops_map_btf_id,
+ .map_mem_usage = bpf_struct_ops_map_mem_usage,
+ .map_btf_id = &bpf_struct_ops_map_btf_ids[0],
};
/* "const void *" because some subsystem is
@@ -632,41 +1153,252 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
bool bpf_struct_ops_get(const void *kdata)
{
struct bpf_struct_ops_value *kvalue;
+ struct bpf_struct_ops_map *st_map;
+ struct bpf_map *map;
kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+ st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
- return refcount_inc_not_zero(&kvalue->refcnt);
+ map = __bpf_map_inc_not_zero(&st_map->map, false);
+ return !IS_ERR(map);
}
+EXPORT_SYMBOL_GPL(bpf_struct_ops_get);
-static void bpf_struct_ops_put_rcu(struct rcu_head *head)
+void bpf_struct_ops_put(const void *kdata)
{
+ struct bpf_struct_ops_value *kvalue;
struct bpf_struct_ops_map *st_map;
- st_map = container_of(head, struct bpf_struct_ops_map, rcu);
+ kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+ st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
+
bpf_map_put(&st_map->map);
}
+EXPORT_SYMBOL_GPL(bpf_struct_ops_put);
-void bpf_struct_ops_put(const void *kdata)
+u32 bpf_struct_ops_id(const void *kdata)
{
struct bpf_struct_ops_value *kvalue;
+ struct bpf_struct_ops_map *st_map;
kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
- if (refcount_dec_and_test(&kvalue->refcnt)) {
- struct bpf_struct_ops_map *st_map;
+ st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
- st_map = container_of(kvalue, struct bpf_struct_ops_map,
- kvalue);
- /* The struct_ops's function may switch to another struct_ops.
- *
- * For example, bpf_tcp_cc_x->init() may switch to
- * another tcp_cc_y by calling
- * setsockopt(TCP_CONGESTION, "tcp_cc_y").
- * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
- * and its map->refcnt may reach 0 which then free its
- * trampoline image while tcp_cc_x is still running.
- *
- * Thus, a rcu grace period is needed here.
- */
- call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu);
+ return st_map->map.id;
+}
+EXPORT_SYMBOL_GPL(bpf_struct_ops_id);
+
+static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
+{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+
+ return map->map_type == BPF_MAP_TYPE_STRUCT_OPS &&
+ map->map_flags & BPF_F_LINK &&
+ /* Pair with smp_store_release() during map_update */
+ smp_load_acquire(&st_map->kvalue.common.state) == BPF_STRUCT_OPS_STATE_READY;
+}
+
+static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_struct_ops_link *st_link;
+ struct bpf_struct_ops_map *st_map;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ st_map = (struct bpf_struct_ops_map *)
+ rcu_dereference_protected(st_link->map, true);
+ if (st_map) {
+ st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link);
+ bpf_map_put(&st_map->map);
+ }
+ kfree(st_link);
+}
+
+static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_struct_ops_link *st_link;
+ struct bpf_map *map;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ rcu_read_lock();
+ map = rcu_dereference(st_link->map);
+ if (map)
+ seq_printf(seq, "map_id:\t%d\n", map->id);
+ rcu_read_unlock();
+}
+
+static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_struct_ops_link *st_link;
+ struct bpf_map *map;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ rcu_read_lock();
+ map = rcu_dereference(st_link->map);
+ if (map)
+ info->struct_ops.map_id = map->id;
+ rcu_read_unlock();
+ return 0;
+}
+
+static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map *new_map,
+ struct bpf_map *expected_old_map)
+{
+ struct bpf_struct_ops_map *st_map, *old_st_map;
+ struct bpf_map *old_map;
+ struct bpf_struct_ops_link *st_link;
+ int err;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ st_map = container_of(new_map, struct bpf_struct_ops_map, map);
+
+ if (!bpf_struct_ops_valid_to_reg(new_map))
+ return -EINVAL;
+
+ if (!st_map->st_ops_desc->st_ops->update)
+ return -EOPNOTSUPP;
+
+ mutex_lock(&update_mutex);
+
+ old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
+ if (!old_map) {
+ err = -ENOLINK;
+ goto err_out;
+ }
+ if (expected_old_map && old_map != expected_old_map) {
+ err = -EPERM;
+ goto err_out;
+ }
+
+ old_st_map = container_of(old_map, struct bpf_struct_ops_map, map);
+ /* The new and old struct_ops must be the same type. */
+ if (st_map->st_ops_desc != old_st_map->st_ops_desc) {
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ err = st_map->st_ops_desc->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data, link);
+ if (err)
+ goto err_out;
+
+ bpf_map_inc(new_map);
+ rcu_assign_pointer(st_link->map, new_map);
+ bpf_map_put(old_map);
+
+err_out:
+ mutex_unlock(&update_mutex);
+
+ return err;
+}
+
+static int bpf_struct_ops_map_link_detach(struct bpf_link *link)
+{
+ struct bpf_struct_ops_link *st_link = container_of(link, struct bpf_struct_ops_link, link);
+ struct bpf_struct_ops_map *st_map;
+ struct bpf_map *map;
+
+ mutex_lock(&update_mutex);
+
+ map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
+ if (!map) {
+ mutex_unlock(&update_mutex);
+ return 0;
}
+ st_map = container_of(map, struct bpf_struct_ops_map, map);
+
+ st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link);
+
+ RCU_INIT_POINTER(st_link->map, NULL);
+ /* Pair with bpf_map_get() in bpf_struct_ops_link_create() or
+ * bpf_map_inc() in bpf_struct_ops_map_link_update().
+ */
+ bpf_map_put(&st_map->map);
+
+ mutex_unlock(&update_mutex);
+
+ wake_up_interruptible_poll(&st_link->wait_hup, EPOLLHUP);
+
+ return 0;
+}
+
+static __poll_t bpf_struct_ops_map_link_poll(struct file *file,
+ struct poll_table_struct *pts)
+{
+ struct bpf_struct_ops_link *st_link = file->private_data;
+
+ poll_wait(file, &st_link->wait_hup, pts);
+
+ return rcu_access_pointer(st_link->map) ? 0 : EPOLLHUP;
+}
+
+static const struct bpf_link_ops bpf_struct_ops_map_lops = {
+ .dealloc = bpf_struct_ops_map_link_dealloc,
+ .detach = bpf_struct_ops_map_link_detach,
+ .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
+ .fill_link_info = bpf_struct_ops_map_link_fill_link_info,
+ .update_map = bpf_struct_ops_map_link_update,
+ .poll = bpf_struct_ops_map_link_poll,
+};
+
+int bpf_struct_ops_link_create(union bpf_attr *attr)
+{
+ struct bpf_struct_ops_link *link = NULL;
+ struct bpf_link_primer link_primer;
+ struct bpf_struct_ops_map *st_map;
+ struct bpf_map *map;
+ int err;
+
+ map = bpf_map_get(attr->link_create.map_fd);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ st_map = (struct bpf_struct_ops_map *)map;
+
+ if (!bpf_struct_ops_valid_to_reg(map)) {
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ link = kzalloc(sizeof(*link), GFP_USER);
+ if (!link) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL,
+ attr->link_create.attach_type);
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err)
+ goto err_out;
+
+ init_waitqueue_head(&link->wait_hup);
+
+ /* Hold the update_mutex such that the subsystem cannot
+ * do link->ops->detach() before the link is fully initialized.
+ */
+ mutex_lock(&update_mutex);
+ err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data, &link->link);
+ if (err) {
+ mutex_unlock(&update_mutex);
+ bpf_link_cleanup(&link_primer);
+ link = NULL;
+ goto err_out;
+ }
+ RCU_INIT_POINTER(link->map, map);
+ mutex_unlock(&update_mutex);
+
+ return bpf_link_settle(&link_primer);
+
+err_out:
+ bpf_map_put(map);
+ kfree(link);
+ return err;
+}
+
+void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map)
+{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+
+ info->btf_vmlinux_id = btf_obj_id(st_map->btf);
}
diff --git a/kernel/bpf/bpf_struct_ops_types.h b/kernel/bpf/bpf_struct_ops_types.h
deleted file mode 100644
index 5678a9ddf817..000000000000
--- a/kernel/bpf/bpf_struct_ops_types.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* internal file - do not include directly */
-
-#ifdef CONFIG_BPF_JIT
-#ifdef CONFIG_NET
-BPF_STRUCT_OPS_TYPE(bpf_dummy_ops)
-#endif
-#ifdef CONFIG_INET
-#include <net/tcp.h>
-BPF_STRUCT_OPS_TYPE(tcp_congestion_ops)
-#endif
-#endif
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 5da7bed0f5f6..a1dc1bf0848a 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -16,7 +16,6 @@
#include <linux/filter.h>
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>
-#include <linux/fdtable.h>
#include <linux/rcupdate_trace.h>
DEFINE_BPF_STORAGE_CACHE(task_cache);
@@ -25,22 +24,20 @@ static DEFINE_PER_CPU(int, bpf_task_storage_busy);
static void bpf_task_storage_lock(void)
{
- migrate_disable();
- __this_cpu_inc(bpf_task_storage_busy);
+ cant_migrate();
+ this_cpu_inc(bpf_task_storage_busy);
}
static void bpf_task_storage_unlock(void)
{
- __this_cpu_dec(bpf_task_storage_busy);
- migrate_enable();
+ this_cpu_dec(bpf_task_storage_busy);
}
static bool bpf_task_storage_trylock(void)
{
- migrate_disable();
- if (unlikely(__this_cpu_inc_return(bpf_task_storage_busy) != 1)) {
- __this_cpu_dec(bpf_task_storage_busy);
- migrate_enable();
+ cant_migrate();
+ if (unlikely(this_cpu_inc_return(bpf_task_storage_busy) != 1)) {
+ this_cpu_dec(bpf_task_storage_busy);
return false;
}
return true;
@@ -71,48 +68,19 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map,
void bpf_task_storage_free(struct task_struct *task)
{
- struct bpf_local_storage_elem *selem;
struct bpf_local_storage *local_storage;
- bool free_task_storage = false;
- struct hlist_node *n;
- unsigned long flags;
- rcu_read_lock();
+ rcu_read_lock_dont_migrate();
local_storage = rcu_dereference(task->bpf_storage);
- if (!local_storage) {
- rcu_read_unlock();
- return;
- }
+ if (!local_storage)
+ goto out;
- /* Neither the bpf_prog nor the bpf-map's syscall
- * could be modifying the local_storage->list now.
- * Thus, no elem can be added-to or deleted-from the
- * local_storage->list by the bpf_prog or by the bpf-map's syscall.
- *
- * It is racing with bpf_local_storage_map_free() alone
- * when unlinking elem from the local_storage->list and
- * the map's bucket->list.
- */
bpf_task_storage_lock();
- raw_spin_lock_irqsave(&local_storage->lock, flags);
- hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
- /* Always unlink from map before unlinking from
- * local_storage.
- */
- bpf_selem_unlink_map(selem);
- free_task_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, false);
- }
- raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ bpf_local_storage_destroy(local_storage);
bpf_task_storage_unlock();
- rcu_read_unlock();
-
- /* free_task_storage should always be true as long as
- * local_storage->list was non-empty.
- */
- if (free_task_storage)
- kfree_rcu(local_storage, rcu);
+out:
+ rcu_read_unlock_migrate();
}
static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
@@ -148,8 +116,8 @@ out:
return ERR_PTR(err);
}
-static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
{
struct bpf_local_storage_data *sdata;
struct task_struct *task;
@@ -157,6 +125,9 @@ static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
struct pid *pid;
int fd, err;
+ if ((map_flags & BPF_F_LOCK) && btf_record_has_field(map->record, BPF_UPTR))
+ return -EOPNOTSUPP;
+
fd = *(int *)key;
pid = pidfd_get_pid(fd, &f_flags);
if (IS_ERR(pid))
@@ -174,7 +145,8 @@ static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
bpf_task_storage_lock();
sdata = bpf_local_storage_update(
- task, (struct bpf_local_storage_map *)map, value, map_flags);
+ task, (struct bpf_local_storage_map *)map, value, map_flags,
+ true, GFP_ATOMIC);
bpf_task_storage_unlock();
err = PTR_ERR_OR_ZERO(sdata);
@@ -183,7 +155,8 @@ out:
return err;
}
-static int task_storage_delete(struct task_struct *task, struct bpf_map *map)
+static int task_storage_delete(struct task_struct *task, struct bpf_map *map,
+ bool nobusy)
{
struct bpf_local_storage_data *sdata;
@@ -191,12 +164,15 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map)
if (!sdata)
return -ENOENT;
- bpf_selem_unlink(SELEM(sdata));
+ if (!nobusy)
+ return -EBUSY;
+
+ bpf_selem_unlink(SELEM(sdata), false);
return 0;
}
-static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
+static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
{
struct task_struct *task;
unsigned int f_flags;
@@ -219,62 +195,108 @@ static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
}
bpf_task_storage_lock();
- err = task_storage_delete(task, map);
+ err = task_storage_delete(task, map, true);
bpf_task_storage_unlock();
out:
put_pid(pid);
return err;
}
-BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
- task, void *, value, u64, flags)
+/* Called by bpf_task_storage_get*() helpers */
+static void *__bpf_task_storage_get(struct bpf_map *map,
+ struct task_struct *task, void *value,
+ u64 flags, gfp_t gfp_flags, bool nobusy)
{
struct bpf_local_storage_data *sdata;
- WARN_ON_ONCE(!bpf_rcu_lock_held());
- if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
- return (unsigned long)NULL;
-
- if (!task)
- return (unsigned long)NULL;
-
- if (!bpf_task_storage_trylock())
- return (unsigned long)NULL;
-
- sdata = task_storage_lookup(task, map, true);
+ sdata = task_storage_lookup(task, map, nobusy);
if (sdata)
- goto unlock;
+ return sdata->data;
/* only allocate new storage, when the task is refcounted */
if (refcount_read(&task->usage) &&
- (flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
+ (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) {
sdata = bpf_local_storage_update(
task, (struct bpf_local_storage_map *)map, value,
- BPF_NOEXIST);
+ BPF_NOEXIST, false, gfp_flags);
+ return IS_ERR(sdata) ? NULL : sdata->data;
+ }
+
+ return NULL;
+}
+
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_task_storage_get_recur, struct bpf_map *, map, struct task_struct *,
+ task, void *, value, u64, flags, gfp_t, gfp_flags)
+{
+ bool nobusy;
+ void *data;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task)
+ return (unsigned long)NULL;
+
+ nobusy = bpf_task_storage_trylock();
+ data = __bpf_task_storage_get(map, task, value, flags,
+ gfp_flags, nobusy);
+ if (nobusy)
+ bpf_task_storage_unlock();
+ return (unsigned long)data;
+}
+
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
+ task, void *, value, u64, flags, gfp_t, gfp_flags)
+{
+ void *data;
-unlock:
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task)
+ return (unsigned long)NULL;
+
+ bpf_task_storage_lock();
+ data = __bpf_task_storage_get(map, task, value, flags,
+ gfp_flags, true);
bpf_task_storage_unlock();
- return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL :
- (unsigned long)sdata->data;
+ return (unsigned long)data;
}
-BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *,
+BPF_CALL_2(bpf_task_storage_delete_recur, struct bpf_map *, map, struct task_struct *,
task)
{
+ bool nobusy;
int ret;
WARN_ON_ONCE(!bpf_rcu_lock_held());
if (!task)
return -EINVAL;
- if (!bpf_task_storage_trylock())
- return -EBUSY;
+ nobusy = bpf_task_storage_trylock();
+ /* This helper must only be called from places where the lifetime of the task
+ * is guaranteed. Either by being refcounted or by being protected
+ * by an RCU read-side critical section.
+ */
+ ret = task_storage_delete(task, map, nobusy);
+ if (nobusy)
+ bpf_task_storage_unlock();
+ return ret;
+}
+
+BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *,
+ task)
+{
+ int ret;
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (!task)
+ return -EINVAL;
+
+ bpf_task_storage_lock();
/* This helper must only be called from places where the lifetime of the task
* is guaranteed. Either by being refcounted or by being protected
* by an RCU read-side critical section.
*/
- ret = task_storage_delete(task, map);
+ ret = task_storage_delete(task, map, true);
bpf_task_storage_unlock();
return ret;
}
@@ -286,26 +308,15 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr)
{
- struct bpf_local_storage_map *smap;
-
- smap = bpf_local_storage_map_alloc(attr);
- if (IS_ERR(smap))
- return ERR_CAST(smap);
-
- smap->cache_idx = bpf_local_storage_cache_idx_get(&task_cache);
- return &smap->map;
+ return bpf_local_storage_map_alloc(attr, &task_cache, true);
}
static void task_storage_map_free(struct bpf_map *map)
{
- struct bpf_local_storage_map *smap;
-
- smap = (struct bpf_local_storage_map *)map;
- bpf_local_storage_cache_idx_free(&task_cache, smap->cache_idx);
- bpf_local_storage_map_free(smap, &bpf_task_storage_busy);
+ bpf_local_storage_map_free(map, &task_cache, &bpf_task_storage_busy);
}
-static int task_storage_map_btf_id;
+BTF_ID_LIST_GLOBAL_SINGLE(bpf_local_storage_map_btf_id, struct, bpf_local_storage_map)
const struct bpf_map_ops task_storage_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = bpf_local_storage_map_alloc_check,
@@ -316,27 +327,47 @@ const struct bpf_map_ops task_storage_map_ops = {
.map_update_elem = bpf_pid_task_storage_update_elem,
.map_delete_elem = bpf_pid_task_storage_delete_elem,
.map_check_btf = bpf_local_storage_map_check_btf,
- .map_btf_name = "bpf_local_storage_map",
- .map_btf_id = &task_storage_map_btf_id,
+ .map_mem_usage = bpf_local_storage_map_mem_usage,
+ .map_btf_id = &bpf_local_storage_map_btf_id[0],
.map_owner_storage_ptr = task_storage_ptr,
};
+const struct bpf_func_proto bpf_task_storage_get_recur_proto = {
+ .func = bpf_task_storage_get_recur,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
+ .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+ .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
+
const struct bpf_func_proto bpf_task_storage_get_proto = {
.func = bpf_task_storage_get,
.gpl_only = false,
.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
.arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
.arg4_type = ARG_ANYTHING,
};
+const struct bpf_func_proto bpf_task_storage_delete_recur_proto = {
+ .func = bpf_task_storage_delete_recur,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
+ .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+};
+
const struct bpf_func_proto bpf_task_storage_delete_proto = {
.func = bpf_task_storage_delete,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
.arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
};
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index e16dafeb2450..0de8fc8a0e0b 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2018 Facebook */
#include <uapi/linux/btf.h>
@@ -19,12 +19,19 @@
#include <linux/bpf_verifier.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>
+#include <linux/bpf.h>
+#include <linux/bpf_lsm.h>
#include <linux/skmsg.h>
#include <linux/perf_event.h>
#include <linux/bsearch.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
+#include <linux/overflow.h>
+
+#include <net/netfilter/nf_bpf_link.h>
+
#include <net/sock.h>
+#include <net/xdp.h>
#include "../tools/lib/bpf/relo_core.h"
/* BTF (BPF Type Format) is the meta data format which describes
@@ -198,6 +205,51 @@
DEFINE_IDR(btf_idr);
DEFINE_SPINLOCK(btf_idr_lock);
+enum btf_kfunc_hook {
+ BTF_KFUNC_HOOK_COMMON,
+ BTF_KFUNC_HOOK_XDP,
+ BTF_KFUNC_HOOK_TC,
+ BTF_KFUNC_HOOK_STRUCT_OPS,
+ BTF_KFUNC_HOOK_TRACING,
+ BTF_KFUNC_HOOK_SYSCALL,
+ BTF_KFUNC_HOOK_FMODRET,
+ BTF_KFUNC_HOOK_CGROUP,
+ BTF_KFUNC_HOOK_SCHED_ACT,
+ BTF_KFUNC_HOOK_SK_SKB,
+ BTF_KFUNC_HOOK_SOCKET_FILTER,
+ BTF_KFUNC_HOOK_LWT,
+ BTF_KFUNC_HOOK_NETFILTER,
+ BTF_KFUNC_HOOK_KPROBE,
+ BTF_KFUNC_HOOK_MAX,
+};
+
+enum {
+ BTF_KFUNC_SET_MAX_CNT = 256,
+ BTF_DTOR_KFUNC_MAX_CNT = 256,
+ BTF_KFUNC_FILTER_MAX_CNT = 16,
+};
+
+struct btf_kfunc_hook_filter {
+ btf_kfunc_filter_t filters[BTF_KFUNC_FILTER_MAX_CNT];
+ u32 nr_filters;
+};
+
+struct btf_kfunc_set_tab {
+ struct btf_id_set8 *sets[BTF_KFUNC_HOOK_MAX];
+ struct btf_kfunc_hook_filter hook_filters[BTF_KFUNC_HOOK_MAX];
+};
+
+struct btf_id_dtor_kfunc_tab {
+ u32 cnt;
+ struct btf_id_dtor_kfunc dtors[];
+};
+
+struct btf_struct_ops_tab {
+ u32 cnt;
+ u32 capacity;
+ struct bpf_struct_ops_desc ops[];
+};
+
struct btf {
void *data;
struct btf_type **types;
@@ -212,6 +264,10 @@ struct btf {
refcount_t refcnt;
u32 id;
struct rcu_head rcu;
+ struct btf_kfunc_set_tab *kfunc_set_tab;
+ struct btf_id_dtor_kfunc_tab *dtor_kfunc_tab;
+ struct btf_struct_metas *struct_meta_tab;
+ struct btf_struct_ops_tab *struct_ops_tab;
/* split BTF support */
struct btf *base_btf;
@@ -219,6 +275,7 @@ struct btf {
u32 start_str_off; /* first string offset (0 for base BTF) */
char name[MODULE_NAME_LEN];
bool kernel_btf;
+ __u32 *base_id_map; /* map from distilled base BTF -> vmlinux BTF ids */
};
enum verifier_phase {
@@ -284,6 +341,7 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
[BTF_KIND_FLOAT] = "FLOAT",
[BTF_KIND_DECL_TAG] = "DECL_TAG",
[BTF_KIND_TYPE_TAG] = "TYPE_TAG",
+ [BTF_KIND_ENUM64] = "ENUM64",
};
const char *btf_type_str(const struct btf_type *t)
@@ -307,6 +365,12 @@ const char *btf_type_str(const struct btf_type *t)
#define BTF_SHOW_NAME_SIZE 80
/*
+ * The suffix of a type that indicates it cannot alias another type when
+ * comparing BTF IDs for kfunc invocations.
+ */
+#define NOCAST_ALIAS_SUFFIX "___init"
+
+/*
* Common data to all BTF show operations. Private show functions can add
* their own data to a structure containing a struct btf_show and consult it
* in the show callback. See btf_type_show() below.
@@ -352,7 +416,7 @@ const char *btf_type_str(const struct btf_type *t)
struct btf_show {
u64 flags;
void *target; /* target of show operation (seq file, buffer) */
- void (*showfn)(struct btf_show *show, const char *fmt, va_list args);
+ __printf(2, 0) void (*showfn)(struct btf_show *show, const char *fmt, va_list args);
const struct btf *btf;
/* below are used during iteration */
struct {
@@ -403,6 +467,9 @@ static struct btf_type btf_void;
static int btf_resolve(struct btf_verifier_env *env,
const struct btf_type *t, u32 type_id);
+static int btf_func_check(struct btf_verifier_env *env,
+ const struct btf_type *t);
+
static bool btf_type_is_modifier(const struct btf_type *t)
{
/* Some of them is not strictly a C modifier
@@ -432,15 +499,21 @@ bool btf_type_is_void(const struct btf_type *t)
return t == &btf_void;
}
-static bool btf_type_is_fwd(const struct btf_type *t)
+static bool btf_type_is_datasec(const struct btf_type *t)
+{
+ return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
+}
+
+static bool btf_type_is_decl_tag(const struct btf_type *t)
{
- return BTF_INFO_KIND(t->info) == BTF_KIND_FWD;
+ return BTF_INFO_KIND(t->info) == BTF_KIND_DECL_TAG;
}
static bool btf_type_nosize(const struct btf_type *t)
{
return btf_type_is_void(t) || btf_type_is_fwd(t) ||
- btf_type_is_func(t) || btf_type_is_func_proto(t);
+ btf_type_is_func(t) || btf_type_is_func_proto(t) ||
+ btf_type_is_decl_tag(t);
}
static bool btf_type_nosize_or_null(const struct btf_type *t)
@@ -448,32 +521,17 @@ static bool btf_type_nosize_or_null(const struct btf_type *t)
return !t || btf_type_nosize(t);
}
-static bool __btf_type_is_struct(const struct btf_type *t)
-{
- return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT;
-}
-
-static bool btf_type_is_array(const struct btf_type *t)
-{
- return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY;
-}
-
-static bool btf_type_is_datasec(const struct btf_type *t)
-{
- return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
-}
-
-static bool btf_type_is_decl_tag(const struct btf_type *t)
-{
- return BTF_INFO_KIND(t->info) == BTF_KIND_DECL_TAG;
-}
-
static bool btf_type_is_decl_tag_target(const struct btf_type *t)
{
return btf_type_is_func(t) || btf_type_is_struct(t) ||
btf_type_is_var(t) || btf_type_is_typedef(t);
}
+bool btf_is_vmlinux(const struct btf *btf)
+{
+ return btf->kernel_btf && !btf->base_btf;
+}
+
u32 btf_nr_types(const struct btf *btf)
{
u32 total = 0;
@@ -506,6 +564,51 @@ s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind)
return -ENOENT;
}
+s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p)
+{
+ struct btf *btf;
+ s32 ret;
+ int id;
+
+ btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+ if (!btf)
+ return -EINVAL;
+
+ ret = btf_find_by_name_kind(btf, name, kind);
+ /* ret is never zero, since btf_find_by_name_kind returns
+ * positive btf_id or negative error.
+ */
+ if (ret > 0) {
+ btf_get(btf);
+ *btf_p = btf;
+ return ret;
+ }
+
+ /* If name is not found in vmlinux's BTF then search in module's BTFs */
+ spin_lock_bh(&btf_idr_lock);
+ idr_for_each_entry(&btf_idr, btf, id) {
+ if (!btf_is_module(btf))
+ continue;
+ /* linear search could be slow hence unlock/lock
+ * the IDR to avoiding holding it for too long
+ */
+ btf_get(btf);
+ spin_unlock_bh(&btf_idr_lock);
+ ret = btf_find_by_name_kind(btf, name, kind);
+ if (ret > 0) {
+ *btf_p = btf;
+ return ret;
+ }
+ btf_put(btf);
+ spin_lock_bh(&btf_idr_lock);
+ }
+ spin_unlock_bh(&btf_idr_lock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(bpf_find_btf_id);
+
const struct btf_type *btf_type_skip_modifiers(const struct btf *btf,
u32 id, u32 *res_id)
{
@@ -579,6 +682,7 @@ static bool btf_type_needs_resolve(const struct btf_type *t)
btf_type_is_struct(t) ||
btf_type_is_array(t) ||
btf_type_is_var(t) ||
+ btf_type_is_func(t) ||
btf_type_is_decl_tag(t) ||
btf_type_is_datasec(t);
}
@@ -593,6 +697,7 @@ static bool btf_type_has_size(const struct btf_type *t)
case BTF_KIND_ENUM:
case BTF_KIND_DATASEC:
case BTF_KIND_FLOAT:
+ case BTF_KIND_ENUM64:
return true;
}
@@ -638,6 +743,11 @@ static const struct btf_decl_tag *btf_type_decl_tag(const struct btf_type *t)
return (const struct btf_decl_tag *)(t + 1);
}
+static const struct btf_enum64 *btf_type_enum64(const struct btf_type *t)
+{
+ return (const struct btf_enum64 *)(t + 1);
+}
+
static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
{
return kind_ops[BTF_INFO_KIND(t->info)];
@@ -655,18 +765,17 @@ static bool btf_name_offset_valid(const struct btf *btf, u32 offset)
return offset < btf->hdr.str_len;
}
-static bool __btf_name_char_ok(char c, bool first, bool dot_ok)
+static bool __btf_name_char_ok(char c, bool first)
{
if ((first ? !isalpha(c) :
!isalnum(c)) &&
c != '_' &&
- ((c == '.' && !dot_ok) ||
- c != '.'))
+ c != '.')
return false;
return true;
}
-static const char *btf_str_by_offset(const struct btf *btf, u32 offset)
+const char *btf_str_by_offset(const struct btf *btf, u32 offset)
{
while (offset < btf->start_str_off)
btf = btf->base_btf;
@@ -678,20 +787,20 @@ static const char *btf_str_by_offset(const struct btf *btf, u32 offset)
return NULL;
}
-static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok)
+static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
{
/* offset must be valid */
const char *src = btf_str_by_offset(btf, offset);
const char *src_limit;
- if (!__btf_name_char_ok(*src, true, dot_ok))
+ if (!__btf_name_char_ok(*src, true))
return false;
/* set a limit on identifier length */
src_limit = src + KSYM_NAME_LEN;
src++;
while (*src && src < src_limit) {
- if (!__btf_name_char_ok(*src, false, dot_ok))
+ if (!__btf_name_char_ok(*src, false))
return false;
src++;
}
@@ -699,17 +808,25 @@ static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok)
return !*src;
}
-/* Only C-style identifier is permitted. This can be relaxed if
- * necessary.
- */
-static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
-{
- return __btf_name_valid(btf, offset, false);
-}
-
+/* Allow any printable character in DATASEC names */
static bool btf_name_valid_section(const struct btf *btf, u32 offset)
{
- return __btf_name_valid(btf, offset, true);
+ /* offset must be valid */
+ const char *src = btf_str_by_offset(btf, offset);
+ const char *src_limit;
+
+ if (!*src)
+ return false;
+
+ /* set a limit on identifier length */
+ src_limit = src + KSYM_NAME_LEN;
+ while (*src && src < src_limit) {
+ if (!isprint(*src))
+ return false;
+ src++;
+ }
+
+ return !*src;
}
static const char *__btf_name_by_offset(const struct btf *btf, u32 offset)
@@ -738,28 +855,46 @@ const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
return NULL;
return btf->types[type_id];
}
+EXPORT_SYMBOL_GPL(btf_type_by_id);
/*
- * Regular int is not a bit field and it must be either
- * u8/u16/u32/u64 or __int128.
+ * Check that the type @t is a regular int. This means that @t is not
+ * a bit field and it has the same size as either of u8/u16/u32/u64
+ * or __int128. If @expected_size is not zero, then size of @t should
+ * be the same. A caller should already have checked that the type @t
+ * is an integer.
*/
+static bool __btf_type_int_is_regular(const struct btf_type *t, size_t expected_size)
+{
+ u32 int_data = btf_type_int(t);
+ u8 nr_bits = BTF_INT_BITS(int_data);
+ u8 nr_bytes = BITS_ROUNDUP_BYTES(nr_bits);
+
+ return BITS_PER_BYTE_MASKED(nr_bits) == 0 &&
+ BTF_INT_OFFSET(int_data) == 0 &&
+ (nr_bytes <= 16 && is_power_of_2(nr_bytes)) &&
+ (expected_size == 0 || nr_bytes == expected_size);
+}
+
static bool btf_type_int_is_regular(const struct btf_type *t)
{
- u8 nr_bits, nr_bytes;
- u32 int_data;
+ return __btf_type_int_is_regular(t, 0);
+}
+
+bool btf_type_is_i32(const struct btf_type *t)
+{
+ return btf_type_is_int(t) && __btf_type_int_is_regular(t, 4);
+}
- int_data = btf_type_int(t);
- nr_bits = BTF_INT_BITS(int_data);
- nr_bytes = BITS_ROUNDUP_BYTES(nr_bits);
- if (BITS_PER_BYTE_MASKED(nr_bits) ||
- BTF_INT_OFFSET(int_data) ||
- (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) &&
- nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64) &&
- nr_bytes != (2 * sizeof(u64)))) {
- return false;
- }
+bool btf_type_is_i64(const struct btf_type *t)
+{
+ return btf_type_is_int(t) && __btf_type_int_is_regular(t, 8);
+}
- return true;
+bool btf_type_is_primitive(const struct btf_type *t)
+{
+ return (btf_type_is_int(t) && btf_type_int_is_regular(t)) ||
+ btf_is_any_enum(t);
}
/*
@@ -946,6 +1081,7 @@ static const char *btf_show_name(struct btf_show *show)
parens = "{";
break;
case BTF_KIND_ENUM:
+ case BTF_KIND_ENUM64:
prefix = "enum";
break;
default:
@@ -1035,7 +1171,8 @@ __printf(2, 3) static void btf_show(struct btf_show *show, const char *fmt, ...)
*/
#define btf_show_type_value(show, fmt, value) \
do { \
- if ((value) != 0 || (show->flags & BTF_SHOW_ZERO) || \
+ if ((value) != (__typeof__(value))0 || \
+ (show->flags & BTF_SHOW_ZERO) || \
show->state.depth == 0) { \
btf_show(show, "%s%s" fmt "%s%s", \
btf_show_indent(show), \
@@ -1314,23 +1451,28 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env,
const char *fmt, ...)
{
struct bpf_verifier_log *log = &env->log;
- u8 kind = BTF_INFO_KIND(t->info);
struct btf *btf = env->btf;
va_list args;
if (!bpf_verifier_log_needed(log))
return;
- /* btf verifier prints all types it is processing via
- * btf_verifier_log_type(..., fmt = NULL).
- * Skip those prints for in-kernel BTF verification.
- */
- if (log->level == BPF_LOG_KERNEL && !fmt)
- return;
+ if (log->level == BPF_LOG_KERNEL) {
+ /* btf verifier prints all types it is processing via
+ * btf_verifier_log_type(..., fmt = NULL).
+ * Skip those prints for in-kernel BTF verification.
+ */
+ if (!fmt)
+ return;
+
+ /* Skip logging when loading module BTF with mismatches permitted */
+ if (env->btf->base_btf && IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH))
+ return;
+ }
__btf_verifier_log(log, "[%u] %s %s%s",
env->log_type_id,
- btf_kind_str[kind],
+ btf_type_str(t),
__btf_name_by_offset(btf, t->name_off),
log_details ? " " : "");
@@ -1365,8 +1507,15 @@ static void btf_verifier_log_member(struct btf_verifier_env *env,
if (!bpf_verifier_log_needed(log))
return;
- if (log->level == BPF_LOG_KERNEL && !fmt)
- return;
+ if (log->level == BPF_LOG_KERNEL) {
+ if (!fmt)
+ return;
+
+ /* Skip logging when loading module BTF with mismatches permitted */
+ if (env->btf->base_btf && IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH))
+ return;
+ }
+
/* The CHECK_META phase already did a btf dump.
*
* If member is logged again, it must hit an error in
@@ -1531,12 +1680,78 @@ static void btf_free_id(struct btf *btf)
spin_unlock_irqrestore(&btf_idr_lock, flags);
}
+static void btf_free_kfunc_set_tab(struct btf *btf)
+{
+ struct btf_kfunc_set_tab *tab = btf->kfunc_set_tab;
+ int hook;
+
+ if (!tab)
+ return;
+ for (hook = 0; hook < ARRAY_SIZE(tab->sets); hook++)
+ kfree(tab->sets[hook]);
+ kfree(tab);
+ btf->kfunc_set_tab = NULL;
+}
+
+static void btf_free_dtor_kfunc_tab(struct btf *btf)
+{
+ struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab;
+
+ if (!tab)
+ return;
+ kfree(tab);
+ btf->dtor_kfunc_tab = NULL;
+}
+
+static void btf_struct_metas_free(struct btf_struct_metas *tab)
+{
+ int i;
+
+ if (!tab)
+ return;
+ for (i = 0; i < tab->cnt; i++)
+ btf_record_free(tab->types[i].record);
+ kfree(tab);
+}
+
+static void btf_free_struct_meta_tab(struct btf *btf)
+{
+ struct btf_struct_metas *tab = btf->struct_meta_tab;
+
+ btf_struct_metas_free(tab);
+ btf->struct_meta_tab = NULL;
+}
+
+static void btf_free_struct_ops_tab(struct btf *btf)
+{
+ struct btf_struct_ops_tab *tab = btf->struct_ops_tab;
+ u32 i;
+
+ if (!tab)
+ return;
+
+ for (i = 0; i < tab->cnt; i++)
+ bpf_struct_ops_desc_release(&tab->ops[i]);
+
+ kfree(tab);
+ btf->struct_ops_tab = NULL;
+}
+
static void btf_free(struct btf *btf)
{
+ btf_free_struct_meta_tab(btf);
+ btf_free_dtor_kfunc_tab(btf);
+ btf_free_kfunc_set_tab(btf);
+ btf_free_struct_ops_tab(btf);
kvfree(btf->types);
kvfree(btf->resolved_sizes);
kvfree(btf->resolved_ids);
- kvfree(btf->data);
+ /* vmlinux does not allocate btf->data, it simply points it at
+ * __start_BTF.
+ */
+ if (!btf_is_vmlinux(btf))
+ kvfree(btf->data);
+ kvfree(btf->base_id_map);
kfree(btf);
}
@@ -1547,6 +1762,11 @@ static void btf_free_rcu(struct rcu_head *rcu)
btf_free(btf);
}
+const char *btf_get_name(const struct btf *btf)
+{
+ return btf->name;
+}
+
void btf_get(struct btf *btf)
{
refcount_inc(&btf->refcnt);
@@ -1560,6 +1780,23 @@ void btf_put(struct btf *btf)
}
}
+struct btf *btf_base_btf(const struct btf *btf)
+{
+ return btf->base_btf;
+}
+
+const struct btf_header *btf_header(const struct btf *btf)
+{
+ return &btf->hdr;
+}
+
+void btf_set_base_btf(struct btf *btf, const struct btf *base_btf)
+{
+ btf->base_btf = (struct btf *)base_btf;
+ btf->start_id = btf_nr_types(base_btf);
+ btf->start_str_off = base_btf->hdr.str_len;
+}
+
static int env_resolve_init(struct btf_verifier_env *env)
{
struct btf *btf = env->btf;
@@ -1728,6 +1965,7 @@ __btf_resolve_size(const struct btf *btf, const struct btf_type *type,
case BTF_KIND_UNION:
case BTF_KIND_ENUM:
case BTF_KIND_FLOAT:
+ case BTF_KIND_ENUM64:
size = type->size;
goto resolved;
@@ -2356,7 +2594,7 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
- if (btf_type_kflag(t)) {
+ if (btf_type_kflag(t) && !btf_type_is_type_tag(t)) {
btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
return -EINVAL;
}
@@ -2505,7 +2743,7 @@ static int btf_ptr_resolve(struct btf_verifier_env *env,
*
* We now need to continue from the last-resolved-ptr to
* ensure the last-resolved-ptr will not referring back to
- * the currenct ptr (t).
+ * the current ptr (t).
*/
if (btf_type_is_modifier(next_type)) {
const struct btf_type *resolved_type;
@@ -2584,7 +2822,7 @@ static void btf_ref_type_log(struct btf_verifier_env *env,
btf_verifier_log(env, "type_id=%u", t->type);
}
-static struct btf_kind_operations modifier_ops = {
+static const struct btf_kind_operations modifier_ops = {
.check_meta = btf_ref_type_check_meta,
.resolve = btf_modifier_resolve,
.check_member = btf_modifier_check_member,
@@ -2593,7 +2831,7 @@ static struct btf_kind_operations modifier_ops = {
.show = btf_modifier_show,
};
-static struct btf_kind_operations ptr_ops = {
+static const struct btf_kind_operations ptr_ops = {
.check_meta = btf_ref_type_check_meta,
.resolve = btf_ptr_resolve,
.check_member = btf_ptr_check_member,
@@ -2634,7 +2872,7 @@ static void btf_fwd_type_log(struct btf_verifier_env *env,
btf_verifier_log(env, "%s", btf_type_kflag(t) ? "union" : "struct");
}
-static struct btf_kind_operations fwd_ops = {
+static const struct btf_kind_operations fwd_ops = {
.check_meta = btf_fwd_check_meta,
.resolve = btf_df_resolve,
.check_member = btf_df_check_member,
@@ -2885,7 +3123,7 @@ static void btf_array_show(const struct btf *btf, const struct btf_type *t,
__btf_array_show(btf, t, type_id, data, bits_offset, show);
}
-static struct btf_kind_operations array_ops = {
+static const struct btf_kind_operations array_ops = {
.check_meta = btf_array_check_meta,
.resolve = btf_array_resolve,
.check_member = btf_array_check_member,
@@ -3014,7 +3252,7 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
if (v->next_member) {
const struct btf_type *last_member_type;
const struct btf_member *last_member;
- u16 last_member_type_id;
+ u32 last_member_type_id;
last_member = btf_type_member(v->t) + v->next_member - 1;
last_member_type_id = last_member->type;
@@ -3077,90 +3315,852 @@ static void btf_struct_log(struct btf_verifier_env *env,
btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
}
-static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t,
- const char *name, int sz, int align)
+enum {
+ BTF_FIELD_IGNORE = 0,
+ BTF_FIELD_FOUND = 1,
+};
+
+struct btf_field_info {
+ enum btf_field_type type;
+ u32 off;
+ union {
+ struct {
+ u32 type_id;
+ } kptr;
+ struct {
+ const char *node_name;
+ u32 value_btf_id;
+ } graph_root;
+ };
+};
+
+static int btf_find_struct(const struct btf *btf, const struct btf_type *t,
+ u32 off, int sz, enum btf_field_type field_type,
+ struct btf_field_info *info)
{
+ if (!__btf_type_is_struct(t))
+ return BTF_FIELD_IGNORE;
+ if (t->size != sz)
+ return BTF_FIELD_IGNORE;
+ info->type = field_type;
+ info->off = off;
+ return BTF_FIELD_FOUND;
+}
+
+static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
+ u32 off, int sz, struct btf_field_info *info, u32 field_mask)
+{
+ enum btf_field_type type;
+ const char *tag_value;
+ bool is_type_tag;
+ u32 res_id;
+
+ /* Permit modifiers on the pointer itself */
+ if (btf_type_is_volatile(t))
+ t = btf_type_by_id(btf, t->type);
+ /* For PTR, sz is always == 8 */
+ if (!btf_type_is_ptr(t))
+ return BTF_FIELD_IGNORE;
+ t = btf_type_by_id(btf, t->type);
+ is_type_tag = btf_type_is_type_tag(t) && !btf_type_kflag(t);
+ if (!is_type_tag)
+ return BTF_FIELD_IGNORE;
+ /* Reject extra tags */
+ if (btf_type_is_type_tag(btf_type_by_id(btf, t->type)))
+ return -EINVAL;
+ tag_value = __btf_name_by_offset(btf, t->name_off);
+ if (!strcmp("kptr_untrusted", tag_value))
+ type = BPF_KPTR_UNREF;
+ else if (!strcmp("kptr", tag_value))
+ type = BPF_KPTR_REF;
+ else if (!strcmp("percpu_kptr", tag_value))
+ type = BPF_KPTR_PERCPU;
+ else if (!strcmp("uptr", tag_value))
+ type = BPF_UPTR;
+ else
+ return -EINVAL;
+
+ if (!(type & field_mask))
+ return BTF_FIELD_IGNORE;
+
+ /* Get the base type */
+ t = btf_type_skip_modifiers(btf, t->type, &res_id);
+ /* Only pointer to struct is allowed */
+ if (!__btf_type_is_struct(t))
+ return -EINVAL;
+
+ info->type = type;
+ info->off = off;
+ info->kptr.type_id = res_id;
+ return BTF_FIELD_FOUND;
+}
+
+int btf_find_next_decl_tag(const struct btf *btf, const struct btf_type *pt,
+ int comp_idx, const char *tag_key, int last_id)
+{
+ int len = strlen(tag_key);
+ int i, n;
+
+ for (i = last_id + 1, n = btf_nr_types(btf); i < n; i++) {
+ const struct btf_type *t = btf_type_by_id(btf, i);
+
+ if (!btf_type_is_decl_tag(t))
+ continue;
+ if (pt != btf_type_by_id(btf, t->type))
+ continue;
+ if (btf_type_decl_tag(t)->component_idx != comp_idx)
+ continue;
+ if (strncmp(__btf_name_by_offset(btf, t->name_off), tag_key, len))
+ continue;
+ return i;
+ }
+ return -ENOENT;
+}
+
+const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
+ int comp_idx, const char *tag_key)
+{
+ const char *value = NULL;
+ const struct btf_type *t;
+ int len, id;
+
+ id = btf_find_next_decl_tag(btf, pt, comp_idx, tag_key, 0);
+ if (id < 0)
+ return ERR_PTR(id);
+
+ t = btf_type_by_id(btf, id);
+ len = strlen(tag_key);
+ value = __btf_name_by_offset(btf, t->name_off) + len;
+
+ /* Prevent duplicate entries for same type */
+ id = btf_find_next_decl_tag(btf, pt, comp_idx, tag_key, id);
+ if (id >= 0)
+ return ERR_PTR(-EEXIST);
+
+ return value;
+}
+
+static int
+btf_find_graph_root(const struct btf *btf, const struct btf_type *pt,
+ const struct btf_type *t, int comp_idx, u32 off,
+ int sz, struct btf_field_info *info,
+ enum btf_field_type head_type)
+{
+ const char *node_field_name;
+ const char *value_type;
+ s32 id;
+
+ if (!__btf_type_is_struct(t))
+ return BTF_FIELD_IGNORE;
+ if (t->size != sz)
+ return BTF_FIELD_IGNORE;
+ value_type = btf_find_decl_tag_value(btf, pt, comp_idx, "contains:");
+ if (IS_ERR(value_type))
+ return -EINVAL;
+ node_field_name = strstr(value_type, ":");
+ if (!node_field_name)
+ return -EINVAL;
+ value_type = kstrndup(value_type, node_field_name - value_type,
+ GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+ if (!value_type)
+ return -ENOMEM;
+ id = btf_find_by_name_kind(btf, value_type, BTF_KIND_STRUCT);
+ kfree(value_type);
+ if (id < 0)
+ return id;
+ node_field_name++;
+ if (str_is_empty(node_field_name))
+ return -EINVAL;
+ info->type = head_type;
+ info->off = off;
+ info->graph_root.value_btf_id = id;
+ info->graph_root.node_name = node_field_name;
+ return BTF_FIELD_FOUND;
+}
+
+static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_type,
+ u32 field_mask, u32 *seen_mask, int *align, int *sz)
+{
+ const struct {
+ enum btf_field_type type;
+ const char *const name;
+ const bool is_unique;
+ } field_types[] = {
+ { BPF_SPIN_LOCK, "bpf_spin_lock", true },
+ { BPF_RES_SPIN_LOCK, "bpf_res_spin_lock", true },
+ { BPF_TIMER, "bpf_timer", true },
+ { BPF_WORKQUEUE, "bpf_wq", true },
+ { BPF_TASK_WORK, "bpf_task_work", true },
+ { BPF_LIST_HEAD, "bpf_list_head", false },
+ { BPF_LIST_NODE, "bpf_list_node", false },
+ { BPF_RB_ROOT, "bpf_rb_root", false },
+ { BPF_RB_NODE, "bpf_rb_node", false },
+ { BPF_REFCOUNT, "bpf_refcount", false },
+ };
+ int type = 0, i;
+ const char *name = __btf_name_by_offset(btf, var_type->name_off);
+ const char *field_type_name;
+ enum btf_field_type field_type;
+ bool is_unique;
+
+ for (i = 0; i < ARRAY_SIZE(field_types); ++i) {
+ field_type = field_types[i].type;
+ field_type_name = field_types[i].name;
+ is_unique = field_types[i].is_unique;
+ if (!(field_mask & field_type) || strcmp(name, field_type_name))
+ continue;
+ if (is_unique) {
+ if (*seen_mask & field_type)
+ return -E2BIG;
+ *seen_mask |= field_type;
+ }
+ type = field_type;
+ goto end;
+ }
+
+ /* Only return BPF_KPTR when all other types with matchable names fail */
+ if (field_mask & (BPF_KPTR | BPF_UPTR) && !__btf_type_is_struct(var_type)) {
+ type = BPF_KPTR_REF;
+ goto end;
+ }
+ return 0;
+end:
+ *sz = btf_field_type_size(type);
+ *align = btf_field_type_align(type);
+ return type;
+}
+
+/* Repeat a number of fields for a specified number of times.
+ *
+ * Copy the fields starting from the first field and repeat them for
+ * repeat_cnt times. The fields are repeated by adding the offset of each
+ * field with
+ * (i + 1) * elem_size
+ * where i is the repeat index and elem_size is the size of an element.
+ */
+static int btf_repeat_fields(struct btf_field_info *info, int info_cnt,
+ u32 field_cnt, u32 repeat_cnt, u32 elem_size)
+{
+ u32 i, j;
+ u32 cur;
+
+ /* Ensure not repeating fields that should not be repeated. */
+ for (i = 0; i < field_cnt; i++) {
+ switch (info[i].type) {
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
+ case BPF_UPTR:
+ case BPF_LIST_HEAD:
+ case BPF_RB_ROOT:
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ /* The type of struct size or variable size is u32,
+ * so the multiplication will not overflow.
+ */
+ if (field_cnt * (repeat_cnt + 1) > info_cnt)
+ return -E2BIG;
+
+ cur = field_cnt;
+ for (i = 0; i < repeat_cnt; i++) {
+ memcpy(&info[cur], &info[0], field_cnt * sizeof(info[0]));
+ for (j = 0; j < field_cnt; j++)
+ info[cur++].off += (i + 1) * elem_size;
+ }
+
+ return 0;
+}
+
+static int btf_find_struct_field(const struct btf *btf,
+ const struct btf_type *t, u32 field_mask,
+ struct btf_field_info *info, int info_cnt,
+ u32 level);
+
+/* Find special fields in the struct type of a field.
+ *
+ * This function is used to find fields of special types that is not a
+ * global variable or a direct field of a struct type. It also handles the
+ * repetition if it is the element type of an array.
+ */
+static int btf_find_nested_struct(const struct btf *btf, const struct btf_type *t,
+ u32 off, u32 nelems,
+ u32 field_mask, struct btf_field_info *info,
+ int info_cnt, u32 level)
+{
+ int ret, err, i;
+
+ level++;
+ if (level >= MAX_RESOLVE_DEPTH)
+ return -E2BIG;
+
+ ret = btf_find_struct_field(btf, t, field_mask, info, info_cnt, level);
+
+ if (ret <= 0)
+ return ret;
+
+ /* Shift the offsets of the nested struct fields to the offsets
+ * related to the container.
+ */
+ for (i = 0; i < ret; i++)
+ info[i].off += off;
+
+ if (nelems > 1) {
+ err = btf_repeat_fields(info, info_cnt, ret, nelems - 1, t->size);
+ if (err == 0)
+ ret *= nelems;
+ else
+ ret = err;
+ }
+
+ return ret;
+}
+
+static int btf_find_field_one(const struct btf *btf,
+ const struct btf_type *var,
+ const struct btf_type *var_type,
+ int var_idx,
+ u32 off, u32 expected_size,
+ u32 field_mask, u32 *seen_mask,
+ struct btf_field_info *info, int info_cnt,
+ u32 level)
+{
+ int ret, align, sz, field_type;
+ struct btf_field_info tmp;
+ const struct btf_array *array;
+ u32 i, nelems = 1;
+
+ /* Walk into array types to find the element type and the number of
+ * elements in the (flattened) array.
+ */
+ for (i = 0; i < MAX_RESOLVE_DEPTH && btf_type_is_array(var_type); i++) {
+ array = btf_array(var_type);
+ nelems *= array->nelems;
+ var_type = btf_type_by_id(btf, array->type);
+ }
+ if (i == MAX_RESOLVE_DEPTH)
+ return -E2BIG;
+ if (nelems == 0)
+ return 0;
+
+ field_type = btf_get_field_type(btf, var_type,
+ field_mask, seen_mask, &align, &sz);
+ /* Look into variables of struct types */
+ if (!field_type && __btf_type_is_struct(var_type)) {
+ sz = var_type->size;
+ if (expected_size && expected_size != sz * nelems)
+ return 0;
+ ret = btf_find_nested_struct(btf, var_type, off, nelems, field_mask,
+ &info[0], info_cnt, level);
+ return ret;
+ }
+
+ if (field_type == 0)
+ return 0;
+ if (field_type < 0)
+ return field_type;
+
+ if (expected_size && expected_size != sz * nelems)
+ return 0;
+ if (off % align)
+ return 0;
+
+ switch (field_type) {
+ case BPF_SPIN_LOCK:
+ case BPF_RES_SPIN_LOCK:
+ case BPF_TIMER:
+ case BPF_WORKQUEUE:
+ case BPF_LIST_NODE:
+ case BPF_RB_NODE:
+ case BPF_REFCOUNT:
+ case BPF_TASK_WORK:
+ ret = btf_find_struct(btf, var_type, off, sz, field_type,
+ info_cnt ? &info[0] : &tmp);
+ if (ret < 0)
+ return ret;
+ break;
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
+ case BPF_UPTR:
+ ret = btf_find_kptr(btf, var_type, off, sz,
+ info_cnt ? &info[0] : &tmp, field_mask);
+ if (ret < 0)
+ return ret;
+ break;
+ case BPF_LIST_HEAD:
+ case BPF_RB_ROOT:
+ ret = btf_find_graph_root(btf, var, var_type,
+ var_idx, off, sz,
+ info_cnt ? &info[0] : &tmp,
+ field_type);
+ if (ret < 0)
+ return ret;
+ break;
+ default:
+ return -EFAULT;
+ }
+
+ if (ret == BTF_FIELD_IGNORE)
+ return 0;
+ if (!info_cnt)
+ return -E2BIG;
+ if (nelems > 1) {
+ ret = btf_repeat_fields(info, info_cnt, 1, nelems - 1, sz);
+ if (ret < 0)
+ return ret;
+ }
+ return nelems;
+}
+
+static int btf_find_struct_field(const struct btf *btf,
+ const struct btf_type *t, u32 field_mask,
+ struct btf_field_info *info, int info_cnt,
+ u32 level)
+{
+ int ret, idx = 0;
const struct btf_member *member;
- u32 i, off = -ENOENT;
+ u32 i, off, seen_mask = 0;
for_each_member(i, t, member) {
const struct btf_type *member_type = btf_type_by_id(btf,
member->type);
- if (!__btf_type_is_struct(member_type))
- continue;
- if (member_type->size != sz)
- continue;
- if (strcmp(__btf_name_by_offset(btf, member_type->name_off), name))
- continue;
- if (off != -ENOENT)
- /* only one such field is allowed */
- return -E2BIG;
+
off = __btf_member_bit_offset(t, member);
if (off % 8)
/* valid C code cannot generate such BTF */
return -EINVAL;
off /= 8;
- if (off % align)
- return -EINVAL;
+
+ ret = btf_find_field_one(btf, t, member_type, i,
+ off, 0,
+ field_mask, &seen_mask,
+ &info[idx], info_cnt - idx, level);
+ if (ret < 0)
+ return ret;
+ idx += ret;
}
- return off;
+ return idx;
}
static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
- const char *name, int sz, int align)
+ u32 field_mask, struct btf_field_info *info,
+ int info_cnt, u32 level)
{
+ int ret, idx = 0;
const struct btf_var_secinfo *vsi;
- u32 i, off = -ENOENT;
+ u32 i, off, seen_mask = 0;
for_each_vsi(i, t, vsi) {
const struct btf_type *var = btf_type_by_id(btf, vsi->type);
const struct btf_type *var_type = btf_type_by_id(btf, var->type);
- if (!__btf_type_is_struct(var_type))
- continue;
- if (var_type->size != sz)
- continue;
- if (vsi->size != sz)
- continue;
- if (strcmp(__btf_name_by_offset(btf, var_type->name_off), name))
- continue;
- if (off != -ENOENT)
- /* only one such field is allowed */
- return -E2BIG;
off = vsi->offset;
- if (off % align)
- return -EINVAL;
+ ret = btf_find_field_one(btf, var, var_type, -1, off, vsi->size,
+ field_mask, &seen_mask,
+ &info[idx], info_cnt - idx,
+ level);
+ if (ret < 0)
+ return ret;
+ idx += ret;
}
- return off;
+ return idx;
}
static int btf_find_field(const struct btf *btf, const struct btf_type *t,
- const char *name, int sz, int align)
+ u32 field_mask, struct btf_field_info *info,
+ int info_cnt)
{
-
if (__btf_type_is_struct(t))
- return btf_find_struct_field(btf, t, name, sz, align);
+ return btf_find_struct_field(btf, t, field_mask, info, info_cnt, 0);
else if (btf_type_is_datasec(t))
- return btf_find_datasec_var(btf, t, name, sz, align);
+ return btf_find_datasec_var(btf, t, field_mask, info, info_cnt, 0);
return -EINVAL;
}
-/* find 'struct bpf_spin_lock' in map value.
- * return >= 0 offset if found
- * and < 0 in case of error
- */
-int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
+/* Callers have to ensure the life cycle of btf if it is program BTF */
+static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
+ struct btf_field_info *info)
+{
+ struct module *mod = NULL;
+ const struct btf_type *t;
+ /* If a matching btf type is found in kernel or module BTFs, kptr_ref
+ * is that BTF, otherwise it's program BTF
+ */
+ struct btf *kptr_btf;
+ int ret;
+ s32 id;
+
+ /* Find type in map BTF, and use it to look up the matching type
+ * in vmlinux or module BTFs, by name and kind.
+ */
+ t = btf_type_by_id(btf, info->kptr.type_id);
+ id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info),
+ &kptr_btf);
+ if (id == -ENOENT) {
+ /* btf_parse_kptr should only be called w/ btf = program BTF */
+ WARN_ON_ONCE(btf_is_kernel(btf));
+
+ /* Type exists only in program BTF. Assume that it's a MEM_ALLOC
+ * kptr allocated via bpf_obj_new
+ */
+ field->kptr.dtor = NULL;
+ id = info->kptr.type_id;
+ kptr_btf = (struct btf *)btf;
+ goto found_dtor;
+ }
+ if (id < 0)
+ return id;
+
+ /* Find and stash the function pointer for the destruction function that
+ * needs to be eventually invoked from the map free path.
+ */
+ if (info->type == BPF_KPTR_REF) {
+ const struct btf_type *dtor_func;
+ const char *dtor_func_name;
+ unsigned long addr;
+ s32 dtor_btf_id;
+
+ /* This call also serves as a whitelist of allowed objects that
+ * can be used as a referenced pointer and be stored in a map at
+ * the same time.
+ */
+ dtor_btf_id = btf_find_dtor_kfunc(kptr_btf, id);
+ if (dtor_btf_id < 0) {
+ ret = dtor_btf_id;
+ goto end_btf;
+ }
+
+ dtor_func = btf_type_by_id(kptr_btf, dtor_btf_id);
+ if (!dtor_func) {
+ ret = -ENOENT;
+ goto end_btf;
+ }
+
+ if (btf_is_module(kptr_btf)) {
+ mod = btf_try_get_module(kptr_btf);
+ if (!mod) {
+ ret = -ENXIO;
+ goto end_btf;
+ }
+ }
+
+ /* We already verified dtor_func to be btf_type_is_func
+ * in register_btf_id_dtor_kfuncs.
+ */
+ dtor_func_name = __btf_name_by_offset(kptr_btf, dtor_func->name_off);
+ addr = kallsyms_lookup_name(dtor_func_name);
+ if (!addr) {
+ ret = -EINVAL;
+ goto end_mod;
+ }
+ field->kptr.dtor = (void *)addr;
+ }
+
+found_dtor:
+ field->kptr.btf_id = id;
+ field->kptr.btf = kptr_btf;
+ field->kptr.module = mod;
+ return 0;
+end_mod:
+ module_put(mod);
+end_btf:
+ btf_put(kptr_btf);
+ return ret;
+}
+
+static int btf_parse_graph_root(const struct btf *btf,
+ struct btf_field *field,
+ struct btf_field_info *info,
+ const char *node_type_name,
+ size_t node_type_align)
{
- return btf_find_field(btf, t, "bpf_spin_lock",
- sizeof(struct bpf_spin_lock),
- __alignof__(struct bpf_spin_lock));
+ const struct btf_type *t, *n = NULL;
+ const struct btf_member *member;
+ u32 offset;
+ int i;
+
+ t = btf_type_by_id(btf, info->graph_root.value_btf_id);
+ /* We've already checked that value_btf_id is a struct type. We
+ * just need to figure out the offset of the list_node, and
+ * verify its type.
+ */
+ for_each_member(i, t, member) {
+ if (strcmp(info->graph_root.node_name,
+ __btf_name_by_offset(btf, member->name_off)))
+ continue;
+ /* Invalid BTF, two members with same name */
+ if (n)
+ return -EINVAL;
+ n = btf_type_by_id(btf, member->type);
+ if (!__btf_type_is_struct(n))
+ return -EINVAL;
+ if (strcmp(node_type_name, __btf_name_by_offset(btf, n->name_off)))
+ return -EINVAL;
+ offset = __btf_member_bit_offset(n, member);
+ if (offset % 8)
+ return -EINVAL;
+ offset /= 8;
+ if (offset % node_type_align)
+ return -EINVAL;
+
+ field->graph_root.btf = (struct btf *)btf;
+ field->graph_root.value_btf_id = info->graph_root.value_btf_id;
+ field->graph_root.node_offset = offset;
+ }
+ if (!n)
+ return -ENOENT;
+ return 0;
+}
+
+static int btf_parse_list_head(const struct btf *btf, struct btf_field *field,
+ struct btf_field_info *info)
+{
+ return btf_parse_graph_root(btf, field, info, "bpf_list_node",
+ __alignof__(struct bpf_list_node));
+}
+
+static int btf_parse_rb_root(const struct btf *btf, struct btf_field *field,
+ struct btf_field_info *info)
+{
+ return btf_parse_graph_root(btf, field, info, "bpf_rb_node",
+ __alignof__(struct bpf_rb_node));
+}
+
+static int btf_field_cmp(const void *_a, const void *_b, const void *priv)
+{
+ const struct btf_field *a = (const struct btf_field *)_a;
+ const struct btf_field *b = (const struct btf_field *)_b;
+
+ if (a->offset < b->offset)
+ return -1;
+ else if (a->offset > b->offset)
+ return 1;
+ return 0;
+}
+
+struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t,
+ u32 field_mask, u32 value_size)
+{
+ struct btf_field_info info_arr[BTF_FIELDS_MAX];
+ u32 next_off = 0, field_type_size;
+ struct btf_record *rec;
+ int ret, i, cnt;
+
+ ret = btf_find_field(btf, t, field_mask, info_arr, ARRAY_SIZE(info_arr));
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (!ret)
+ return NULL;
+
+ cnt = ret;
+ /* This needs to be kzalloc to zero out padding and unused fields, see
+ * comment in btf_record_equal.
+ */
+ rec = kzalloc(struct_size(rec, fields, cnt), GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+ if (!rec)
+ return ERR_PTR(-ENOMEM);
+
+ rec->spin_lock_off = -EINVAL;
+ rec->res_spin_lock_off = -EINVAL;
+ rec->timer_off = -EINVAL;
+ rec->wq_off = -EINVAL;
+ rec->refcount_off = -EINVAL;
+ rec->task_work_off = -EINVAL;
+ for (i = 0; i < cnt; i++) {
+ field_type_size = btf_field_type_size(info_arr[i].type);
+ if (info_arr[i].off + field_type_size > value_size) {
+ WARN_ONCE(1, "verifier bug off %d size %d", info_arr[i].off, value_size);
+ ret = -EFAULT;
+ goto end;
+ }
+ if (info_arr[i].off < next_off) {
+ ret = -EEXIST;
+ goto end;
+ }
+ next_off = info_arr[i].off + field_type_size;
+
+ rec->field_mask |= info_arr[i].type;
+ rec->fields[i].offset = info_arr[i].off;
+ rec->fields[i].type = info_arr[i].type;
+ rec->fields[i].size = field_type_size;
+
+ switch (info_arr[i].type) {
+ case BPF_SPIN_LOCK:
+ WARN_ON_ONCE(rec->spin_lock_off >= 0);
+ /* Cache offset for faster lookup at runtime */
+ rec->spin_lock_off = rec->fields[i].offset;
+ break;
+ case BPF_RES_SPIN_LOCK:
+ WARN_ON_ONCE(rec->spin_lock_off >= 0);
+ /* Cache offset for faster lookup at runtime */
+ rec->res_spin_lock_off = rec->fields[i].offset;
+ break;
+ case BPF_TIMER:
+ WARN_ON_ONCE(rec->timer_off >= 0);
+ /* Cache offset for faster lookup at runtime */
+ rec->timer_off = rec->fields[i].offset;
+ break;
+ case BPF_WORKQUEUE:
+ WARN_ON_ONCE(rec->wq_off >= 0);
+ /* Cache offset for faster lookup at runtime */
+ rec->wq_off = rec->fields[i].offset;
+ break;
+ case BPF_TASK_WORK:
+ WARN_ON_ONCE(rec->task_work_off >= 0);
+ rec->task_work_off = rec->fields[i].offset;
+ break;
+ case BPF_REFCOUNT:
+ WARN_ON_ONCE(rec->refcount_off >= 0);
+ /* Cache offset for faster lookup at runtime */
+ rec->refcount_off = rec->fields[i].offset;
+ break;
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
+ case BPF_UPTR:
+ ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]);
+ if (ret < 0)
+ goto end;
+ break;
+ case BPF_LIST_HEAD:
+ ret = btf_parse_list_head(btf, &rec->fields[i], &info_arr[i]);
+ if (ret < 0)
+ goto end;
+ break;
+ case BPF_RB_ROOT:
+ ret = btf_parse_rb_root(btf, &rec->fields[i], &info_arr[i]);
+ if (ret < 0)
+ goto end;
+ break;
+ case BPF_LIST_NODE:
+ case BPF_RB_NODE:
+ break;
+ default:
+ ret = -EFAULT;
+ goto end;
+ }
+ rec->cnt++;
+ }
+
+ if (rec->spin_lock_off >= 0 && rec->res_spin_lock_off >= 0) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ /* bpf_{list_head, rb_node} require bpf_spin_lock */
+ if ((btf_record_has_field(rec, BPF_LIST_HEAD) ||
+ btf_record_has_field(rec, BPF_RB_ROOT)) &&
+ (rec->spin_lock_off < 0 && rec->res_spin_lock_off < 0)) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ if (rec->refcount_off < 0 &&
+ btf_record_has_field(rec, BPF_LIST_NODE) &&
+ btf_record_has_field(rec, BPF_RB_NODE)) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ sort_r(rec->fields, rec->cnt, sizeof(struct btf_field), btf_field_cmp,
+ NULL, rec);
+
+ return rec;
+end:
+ btf_record_free(rec);
+ return ERR_PTR(ret);
}
-int btf_find_timer(const struct btf *btf, const struct btf_type *t)
+int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
{
- return btf_find_field(btf, t, "bpf_timer",
- sizeof(struct bpf_timer),
- __alignof__(struct bpf_timer));
+ int i;
+
+ /* There are three types that signify ownership of some other type:
+ * kptr_ref, bpf_list_head, bpf_rb_root.
+ * kptr_ref only supports storing kernel types, which can't store
+ * references to program allocated local types.
+ *
+ * Hence we only need to ensure that bpf_{list_head,rb_root} ownership
+ * does not form cycles.
+ */
+ if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & (BPF_GRAPH_ROOT | BPF_UPTR)))
+ return 0;
+ for (i = 0; i < rec->cnt; i++) {
+ struct btf_struct_meta *meta;
+ const struct btf_type *t;
+ u32 btf_id;
+
+ if (rec->fields[i].type == BPF_UPTR) {
+ /* The uptr only supports pinning one page and cannot
+ * point to a kernel struct
+ */
+ if (btf_is_kernel(rec->fields[i].kptr.btf))
+ return -EINVAL;
+ t = btf_type_by_id(rec->fields[i].kptr.btf,
+ rec->fields[i].kptr.btf_id);
+ if (!t->size)
+ return -EINVAL;
+ if (t->size > PAGE_SIZE)
+ return -E2BIG;
+ continue;
+ }
+
+ if (!(rec->fields[i].type & BPF_GRAPH_ROOT))
+ continue;
+ btf_id = rec->fields[i].graph_root.value_btf_id;
+ meta = btf_find_struct_meta(btf, btf_id);
+ if (!meta)
+ return -EFAULT;
+ rec->fields[i].graph_root.value_rec = meta->record;
+
+ /* We need to set value_rec for all root types, but no need
+ * to check ownership cycle for a type unless it's also a
+ * node type.
+ */
+ if (!(rec->field_mask & BPF_GRAPH_NODE))
+ continue;
+
+ /* We need to ensure ownership acyclicity among all types. The
+ * proper way to do it would be to topologically sort all BTF
+ * IDs based on the ownership edges, since there can be multiple
+ * bpf_{list_head,rb_node} in a type. Instead, we use the
+ * following resaoning:
+ *
+ * - A type can only be owned by another type in user BTF if it
+ * has a bpf_{list,rb}_node. Let's call these node types.
+ * - A type can only _own_ another type in user BTF if it has a
+ * bpf_{list_head,rb_root}. Let's call these root types.
+ *
+ * We ensure that if a type is both a root and node, its
+ * element types cannot be root types.
+ *
+ * To ensure acyclicity:
+ *
+ * When A is an root type but not a node, its ownership
+ * chain can be:
+ * A -> B -> C
+ * Where:
+ * - A is an root, e.g. has bpf_rb_root.
+ * - B is both a root and node, e.g. has bpf_rb_node and
+ * bpf_list_head.
+ * - C is only an root, e.g. has bpf_list_node
+ *
+ * When A is both a root and node, some other type already
+ * owns it in the BTF domain, hence it can not own
+ * another root type through any of the ownership edges.
+ * A -> B
+ * Where:
+ * - A is both an root and node.
+ * - B is only an node.
+ */
+ if (meta->record->field_mask & BPF_GRAPH_ROOT)
+ return -ELOOP;
+ }
+ return 0;
}
static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,
@@ -3244,7 +4244,7 @@ static void btf_struct_show(const struct btf *btf, const struct btf_type *t,
__btf_struct_show(btf, t, type_id, data, bits_offset, show);
}
-static struct btf_kind_operations struct_ops = {
+static const struct btf_kind_operations struct_ops = {
.check_meta = btf_struct_check_meta,
.resolve = btf_struct_resolve,
.check_member = btf_struct_check_member,
@@ -3319,6 +4319,7 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,
{
const struct btf_enum *enums = btf_type_enum(t);
struct btf *btf = env->btf;
+ const char *fmt_str;
u16 i, nr_enums;
u32 meta_needed;
@@ -3332,11 +4333,6 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
- if (btf_type_kflag(t)) {
- btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
- return -EINVAL;
- }
-
if (t->size > 8 || !is_power_of_2(t->size)) {
btf_verifier_log_type(env, t, "Unexpected size");
return -EINVAL;
@@ -3367,7 +4363,8 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,
if (env->log.level == BPF_LOG_KERNEL)
continue;
- btf_verifier_log(env, "\t%s val=%d\n",
+ fmt_str = btf_type_kflag(t) ? "\t%s val=%d\n" : "\t%s val=%u\n";
+ btf_verifier_log(env, fmt_str,
__btf_name_by_offset(btf, enums[i].name_off),
enums[i].val);
}
@@ -3408,11 +4405,14 @@ static void btf_enum_show(const struct btf *btf, const struct btf_type *t,
return;
}
- btf_show_type_value(show, "%d", v);
+ if (btf_type_kflag(t))
+ btf_show_type_value(show, "%d", v);
+ else
+ btf_show_type_value(show, "%u", v);
btf_show_end_type(show);
}
-static struct btf_kind_operations enum_ops = {
+static const struct btf_kind_operations enum_ops = {
.check_meta = btf_enum_check_meta,
.resolve = btf_df_resolve,
.check_member = btf_enum_check_member,
@@ -3421,6 +4421,109 @@ static struct btf_kind_operations enum_ops = {
.show = btf_enum_show,
};
+static s32 btf_enum64_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ const struct btf_enum64 *enums = btf_type_enum64(t);
+ struct btf *btf = env->btf;
+ const char *fmt_str;
+ u16 i, nr_enums;
+ u32 meta_needed;
+
+ nr_enums = btf_type_vlen(t);
+ meta_needed = nr_enums * sizeof(*enums);
+
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ if (t->size > 8 || !is_power_of_2(t->size)) {
+ btf_verifier_log_type(env, t, "Unexpected size");
+ return -EINVAL;
+ }
+
+ /* enum type either no name or a valid one */
+ if (t->name_off &&
+ !btf_name_valid_identifier(env->btf, t->name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ for (i = 0; i < nr_enums; i++) {
+ if (!btf_name_offset_valid(btf, enums[i].name_off)) {
+ btf_verifier_log(env, "\tInvalid name_offset:%u",
+ enums[i].name_off);
+ return -EINVAL;
+ }
+
+ /* enum member must have a valid name */
+ if (!enums[i].name_off ||
+ !btf_name_valid_identifier(btf, enums[i].name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
+ if (env->log.level == BPF_LOG_KERNEL)
+ continue;
+
+ fmt_str = btf_type_kflag(t) ? "\t%s val=%lld\n" : "\t%s val=%llu\n";
+ btf_verifier_log(env, fmt_str,
+ __btf_name_by_offset(btf, enums[i].name_off),
+ btf_enum64_value(enums + i));
+ }
+
+ return meta_needed;
+}
+
+static void btf_enum64_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
+{
+ const struct btf_enum64 *enums = btf_type_enum64(t);
+ u32 i, nr_enums = btf_type_vlen(t);
+ void *safe_data;
+ s64 v;
+
+ safe_data = btf_show_start_type(show, t, type_id, data);
+ if (!safe_data)
+ return;
+
+ v = *(u64 *)safe_data;
+
+ for (i = 0; i < nr_enums; i++) {
+ if (v != btf_enum64_value(enums + i))
+ continue;
+
+ btf_show_type_value(show, "%s",
+ __btf_name_by_offset(btf,
+ enums[i].name_off));
+
+ btf_show_end_type(show);
+ return;
+ }
+
+ if (btf_type_kflag(t))
+ btf_show_type_value(show, "%lld", v);
+ else
+ btf_show_type_value(show, "%llu", v);
+ btf_show_end_type(show);
+}
+
+static const struct btf_kind_operations enum64_ops = {
+ .check_meta = btf_enum64_check_meta,
+ .resolve = btf_df_resolve,
+ .check_member = btf_enum_check_member,
+ .check_kflag_member = btf_enum_check_kflag_member,
+ .log_details = btf_enum_log,
+ .show = btf_enum64_show,
+};
+
static s32 btf_func_proto_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
@@ -3490,7 +4593,7 @@ done:
btf_verifier_log(env, ")");
}
-static struct btf_kind_operations func_proto_ops = {
+static const struct btf_kind_operations func_proto_ops = {
.check_meta = btf_func_proto_check_meta,
.resolve = btf_df_resolve,
/*
@@ -3533,9 +4636,24 @@ static s32 btf_func_check_meta(struct btf_verifier_env *env,
return 0;
}
-static struct btf_kind_operations func_ops = {
+static int btf_func_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ const struct btf_type *t = v->t;
+ u32 next_type_id = t->type;
+ int err;
+
+ err = btf_func_check(env, t);
+ if (err)
+ return err;
+
+ env_stack_pop_resolved(env, next_type_id, 0);
+ return 0;
+}
+
+static const struct btf_kind_operations func_ops = {
.check_meta = btf_func_check_meta,
- .resolve = btf_df_resolve,
+ .resolve = btf_func_resolve,
.check_member = btf_df_check_member,
.check_kflag_member = btf_df_check_kflag_member,
.log_details = btf_ref_type_log,
@@ -3567,7 +4685,7 @@ static s32 btf_var_check_meta(struct btf_verifier_env *env,
}
if (!t->name_off ||
- !__btf_name_valid(env->btf, t->name_off, true)) {
+ !btf_name_valid_identifier(env->btf, t->name_off)) {
btf_verifier_log_type(env, t, "Invalid name");
return -EINVAL;
}
@@ -3686,6 +4804,7 @@ static int btf_datasec_resolve(struct btf_verifier_env *env,
struct btf *btf = env->btf;
u16 i;
+ env->resolve_mode = RESOLVE_TBD;
for_each_vsi_from(i, v->next_member, v->t, vsi) {
u32 var_type_id = vsi->type, type_id, type_size = 0;
const struct btf_type *var_type = btf_type_by_id(env->btf,
@@ -3859,11 +4978,6 @@ static s32 btf_decl_tag_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
- if (btf_type_kflag(t)) {
- btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
- return -EINVAL;
- }
-
component_idx = btf_type_decl_tag(t)->component_idx;
if (component_idx < -1) {
btf_verifier_log_type(env, t, "Invalid component_idx");
@@ -3959,6 +5073,11 @@ static int btf_func_proto_check(struct btf_verifier_env *env,
return -EINVAL;
}
+ if (btf_type_is_resolve_source_only(ret_type)) {
+ btf_verifier_log_type(env, t, "Invalid return type");
+ return -EINVAL;
+ }
+
if (btf_type_needs_resolve(ret_type) &&
!env_type_is_resolved(env, ret_type_id)) {
err = btf_resolve(env, ret_type, ret_type_id);
@@ -3986,7 +5105,6 @@ static int btf_func_proto_check(struct btf_verifier_env *env,
nr_args--;
}
- err = 0;
for (i = 0; i < nr_args; i++) {
const struct btf_type *arg_type;
u32 arg_type_id;
@@ -3995,8 +5113,12 @@ static int btf_func_proto_check(struct btf_verifier_env *env,
arg_type = btf_type_by_id(btf, arg_type_id);
if (!arg_type) {
btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
- err = -EINVAL;
- break;
+ return -EINVAL;
+ }
+
+ if (btf_type_is_resolve_source_only(arg_type)) {
+ btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
+ return -EINVAL;
}
if (args[i].name_off &&
@@ -4004,25 +5126,23 @@ static int btf_func_proto_check(struct btf_verifier_env *env,
!btf_name_valid_identifier(btf, args[i].name_off))) {
btf_verifier_log_type(env, t,
"Invalid arg#%u", i + 1);
- err = -EINVAL;
- break;
+ return -EINVAL;
}
if (btf_type_needs_resolve(arg_type) &&
!env_type_is_resolved(env, arg_type_id)) {
err = btf_resolve(env, arg_type, arg_type_id);
if (err)
- break;
+ return err;
}
if (!btf_type_id_size(btf, &arg_type_id, NULL)) {
btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
- err = -EINVAL;
- break;
+ return -EINVAL;
}
}
- return err;
+ return 0;
}
static int btf_func_check(struct btf_verifier_env *env,
@@ -4072,6 +5192,7 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
[BTF_KIND_FLOAT] = &float_ops,
[BTF_KIND_DECL_TAG] = &decl_tag_ops,
[BTF_KIND_TYPE_TAG] = &modifier_ops,
+ [BTF_KIND_ENUM64] = &enum64_ops,
};
static s32 btf_check_meta(struct btf_verifier_env *env,
@@ -4156,7 +5277,7 @@ static bool btf_resolve_valid(struct btf_verifier_env *env,
return !btf_resolved_type_id(btf, type_id) &&
!btf_resolved_type_size(btf, type_id);
- if (btf_type_is_decl_tag(t))
+ if (btf_type_is_decl_tag(t) || btf_type_is_func(t))
return btf_resolved_type_id(btf, type_id) &&
!btf_resolved_type_size(btf, type_id);
@@ -4246,12 +5367,6 @@ static int btf_check_all_types(struct btf_verifier_env *env)
if (err)
return err;
}
-
- if (btf_type_is_func(t)) {
- err = btf_func_check(env, t);
- if (err)
- return err;
- }
}
return 0;
@@ -4382,13 +5497,11 @@ static int btf_parse_hdr(struct btf_verifier_env *env)
u32 hdr_len, hdr_copy, btf_data_size;
const struct btf_header *hdr;
struct btf *btf;
- int err;
btf = env->btf;
btf_data_size = btf->data_size;
- if (btf_data_size <
- offsetof(struct btf_header, hdr_len) + sizeof(hdr->hdr_len)) {
+ if (btf_data_size < offsetofend(struct btf_header, hdr_len)) {
btf_verifier_log(env, "hdr_len not found");
return -EINVAL;
}
@@ -4440,44 +5553,237 @@ static int btf_parse_hdr(struct btf_verifier_env *env)
return -EINVAL;
}
- err = btf_check_sec_info(env, btf_data_size);
- if (err)
- return err;
+ return btf_check_sec_info(env, btf_data_size);
+}
+
+static const char *alloc_obj_fields[] = {
+ "bpf_spin_lock",
+ "bpf_list_head",
+ "bpf_list_node",
+ "bpf_rb_root",
+ "bpf_rb_node",
+ "bpf_refcount",
+};
+
+static struct btf_struct_metas *
+btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
+{
+ struct btf_struct_metas *tab = NULL;
+ struct btf_id_set *aof;
+ int i, n, id, ret;
+
+ BUILD_BUG_ON(offsetof(struct btf_id_set, cnt) != 0);
+ BUILD_BUG_ON(sizeof(struct btf_id_set) != sizeof(u32));
+
+ aof = kmalloc(sizeof(*aof), GFP_KERNEL | __GFP_NOWARN);
+ if (!aof)
+ return ERR_PTR(-ENOMEM);
+ aof->cnt = 0;
+
+ for (i = 0; i < ARRAY_SIZE(alloc_obj_fields); i++) {
+ /* Try to find whether this special type exists in user BTF, and
+ * if so remember its ID so we can easily find it among members
+ * of structs that we iterate in the next loop.
+ */
+ struct btf_id_set *new_aof;
+
+ id = btf_find_by_name_kind(btf, alloc_obj_fields[i], BTF_KIND_STRUCT);
+ if (id < 0)
+ continue;
+ new_aof = krealloc(aof, struct_size(new_aof, ids, aof->cnt + 1),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!new_aof) {
+ ret = -ENOMEM;
+ goto free_aof;
+ }
+ aof = new_aof;
+ aof->ids[aof->cnt++] = id;
+ }
+
+ n = btf_nr_types(btf);
+ for (i = 1; i < n; i++) {
+ /* Try to find if there are kptrs in user BTF and remember their ID */
+ struct btf_id_set *new_aof;
+ struct btf_field_info tmp;
+ const struct btf_type *t;
+
+ t = btf_type_by_id(btf, i);
+ if (!t) {
+ ret = -EINVAL;
+ goto free_aof;
+ }
+
+ ret = btf_find_kptr(btf, t, 0, 0, &tmp, BPF_KPTR);
+ if (ret != BTF_FIELD_FOUND)
+ continue;
+
+ new_aof = krealloc(aof, struct_size(new_aof, ids, aof->cnt + 1),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!new_aof) {
+ ret = -ENOMEM;
+ goto free_aof;
+ }
+ aof = new_aof;
+ aof->ids[aof->cnt++] = i;
+ }
+
+ if (!aof->cnt) {
+ kfree(aof);
+ return NULL;
+ }
+ sort(&aof->ids, aof->cnt, sizeof(aof->ids[0]), btf_id_cmp_func, NULL);
+
+ for (i = 1; i < n; i++) {
+ struct btf_struct_metas *new_tab;
+ const struct btf_member *member;
+ struct btf_struct_meta *type;
+ struct btf_record *record;
+ const struct btf_type *t;
+ int j, tab_cnt;
+
+ t = btf_type_by_id(btf, i);
+ if (!__btf_type_is_struct(t))
+ continue;
+
+ cond_resched();
+
+ for_each_member(j, t, member) {
+ if (btf_id_set_contains(aof, member->type))
+ goto parse;
+ }
+ continue;
+ parse:
+ tab_cnt = tab ? tab->cnt : 0;
+ new_tab = krealloc(tab, struct_size(new_tab, types, tab_cnt + 1),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!new_tab) {
+ ret = -ENOMEM;
+ goto free;
+ }
+ if (!tab)
+ new_tab->cnt = 0;
+ tab = new_tab;
+
+ type = &tab->types[tab->cnt];
+ type->btf_id = i;
+ record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE |
+ BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT |
+ BPF_KPTR, t->size);
+ /* The record cannot be unset, treat it as an error if so */
+ if (IS_ERR_OR_NULL(record)) {
+ ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT;
+ goto free;
+ }
+ type->record = record;
+ tab->cnt++;
+ }
+ kfree(aof);
+ return tab;
+free:
+ btf_struct_metas_free(tab);
+free_aof:
+ kfree(aof);
+ return ERR_PTR(ret);
+}
+
+struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id)
+{
+ struct btf_struct_metas *tab;
+
+ BUILD_BUG_ON(offsetof(struct btf_struct_meta, btf_id) != 0);
+ tab = btf->struct_meta_tab;
+ if (!tab)
+ return NULL;
+ return bsearch(&btf_id, tab->types, tab->cnt, sizeof(tab->types[0]), btf_id_cmp_func);
+}
+
+static int btf_check_type_tags(struct btf_verifier_env *env,
+ struct btf *btf, int start_id)
+{
+ int i, n, good_id = start_id - 1;
+ bool in_tags;
+
+ n = btf_nr_types(btf);
+ for (i = start_id; i < n; i++) {
+ const struct btf_type *t;
+ int chain_limit = 32;
+ u32 cur_id = i;
+
+ t = btf_type_by_id(btf, i);
+ if (!t)
+ return -EINVAL;
+ if (!btf_type_is_modifier(t))
+ continue;
+
+ cond_resched();
+
+ in_tags = btf_type_is_type_tag(t);
+ while (btf_type_is_modifier(t)) {
+ if (!chain_limit--) {
+ btf_verifier_log(env, "Max chain length or cycle detected");
+ return -ELOOP;
+ }
+ if (btf_type_is_type_tag(t)) {
+ if (!in_tags) {
+ btf_verifier_log(env, "Type tags don't precede modifiers");
+ return -EINVAL;
+ }
+ } else if (in_tags) {
+ in_tags = false;
+ }
+ if (cur_id <= good_id)
+ break;
+ /* Move to next type */
+ cur_id = t->type;
+ t = btf_type_by_id(btf, cur_id);
+ if (!t)
+ return -EINVAL;
+ }
+ good_id = i;
+ }
return 0;
}
-static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
- u32 log_level, char __user *log_ubuf, u32 log_size)
+static int finalize_log(struct bpf_verifier_log *log, bpfptr_t uattr, u32 uattr_size)
+{
+ u32 log_true_size;
+ int err;
+
+ err = bpf_vlog_finalize(log, &log_true_size);
+
+ if (uattr_size >= offsetofend(union bpf_attr, btf_log_true_size) &&
+ copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, btf_log_true_size),
+ &log_true_size, sizeof(log_true_size)))
+ err = -EFAULT;
+
+ return err;
+}
+
+static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
{
+ bpfptr_t btf_data = make_bpfptr(attr->btf, uattr.is_kernel);
+ char __user *log_ubuf = u64_to_user_ptr(attr->btf_log_buf);
+ struct btf_struct_metas *struct_meta_tab;
struct btf_verifier_env *env = NULL;
- struct bpf_verifier_log *log;
struct btf *btf = NULL;
u8 *data;
- int err;
+ int err, ret;
- if (btf_data_size > BTF_MAX_SIZE)
+ if (attr->btf_size > BTF_MAX_SIZE)
return ERR_PTR(-E2BIG);
env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
if (!env)
return ERR_PTR(-ENOMEM);
- log = &env->log;
- if (log_level || log_ubuf || log_size) {
- /* user requested verbose verifier output
- * and supplied buffer to store the verification trace
- */
- log->level = log_level;
- log->ubuf = log_ubuf;
- log->len_total = log_size;
-
- /* log attributes have to be sane */
- if (!bpf_verifier_log_attr_valid(log)) {
- err = -EINVAL;
- goto errout;
- }
- }
+ /* user could have requested verbose verifier output
+ * and supplied buffer to store the verification trace
+ */
+ err = bpf_vlog_init(&env->log, attr->btf_log_level,
+ log_ubuf, attr->btf_log_size);
+ if (err)
+ goto errout_free;
btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
if (!btf) {
@@ -4486,16 +5792,16 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
}
env->btf = btf;
- data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN);
+ data = kvmalloc(attr->btf_size, GFP_KERNEL | __GFP_NOWARN);
if (!data) {
err = -ENOMEM;
goto errout;
}
btf->data = data;
- btf->data_size = btf_data_size;
+ btf->data_size = attr->btf_size;
- if (copy_from_bpfptr(data, btf_data, btf_data_size)) {
+ if (copy_from_bpfptr(data, btf_data, attr->btf_size)) {
err = -EFAULT;
goto errout;
}
@@ -4514,24 +5820,51 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
if (err)
goto errout;
- if (log->level && bpf_verifier_log_full(log)) {
- err = -ENOSPC;
+ err = btf_check_type_tags(env, btf, 1);
+ if (err)
goto errout;
+
+ struct_meta_tab = btf_parse_struct_metas(&env->log, btf);
+ if (IS_ERR(struct_meta_tab)) {
+ err = PTR_ERR(struct_meta_tab);
+ goto errout;
+ }
+ btf->struct_meta_tab = struct_meta_tab;
+
+ if (struct_meta_tab) {
+ int i;
+
+ for (i = 0; i < struct_meta_tab->cnt; i++) {
+ err = btf_check_and_fixup_fields(btf, struct_meta_tab->types[i].record);
+ if (err < 0)
+ goto errout_meta;
+ }
}
+ err = finalize_log(&env->log, uattr, uattr_size);
+ if (err)
+ goto errout_free;
+
btf_verifier_env_free(env);
refcount_set(&btf->refcnt, 1);
return btf;
+errout_meta:
+ btf_free_struct_meta_tab(btf);
errout:
+ /* overwrite err with -ENOSPC or -EFAULT */
+ ret = finalize_log(&env->log, uattr, uattr_size);
+ if (ret)
+ err = ret;
+errout_free:
btf_verifier_env_free(env);
if (btf)
btf_free(btf);
return ERR_PTR(err);
}
-extern char __weak __start_BTF[];
-extern char __weak __stop_BTF[];
+extern char __start_BTF[];
+extern char __stop_BTF[];
extern struct btf *btf_vmlinux;
#define BPF_MAP_TYPE(_id, _ops)
@@ -4564,22 +5897,70 @@ static u8 bpf_ctx_convert_map[] = {
#undef BPF_MAP_TYPE
#undef BPF_LINK_TYPE
-static const struct btf_member *
-btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
- const struct btf_type *t, enum bpf_prog_type prog_type,
- int arg)
+static const struct btf_type *find_canonical_prog_ctx_type(enum bpf_prog_type prog_type)
{
const struct btf_type *conv_struct;
- const struct btf_type *ctx_struct;
const struct btf_member *ctx_type;
- const char *tname, *ctx_tname;
conv_struct = bpf_ctx_convert.t;
- if (!conv_struct) {
- bpf_log(log, "btf_vmlinux is malformed\n");
+ if (!conv_struct)
return NULL;
- }
+ /* prog_type is valid bpf program type. No need for bounds check. */
+ ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2;
+ /* ctx_type is a pointer to prog_ctx_type in vmlinux.
+ * Like 'struct __sk_buff'
+ */
+ return btf_type_by_id(btf_vmlinux, ctx_type->type);
+}
+
+static int find_kern_ctx_type_id(enum bpf_prog_type prog_type)
+{
+ const struct btf_type *conv_struct;
+ const struct btf_member *ctx_type;
+
+ conv_struct = bpf_ctx_convert.t;
+ if (!conv_struct)
+ return -EFAULT;
+ /* prog_type is valid bpf program type. No need for bounds check. */
+ ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2 + 1;
+ /* ctx_type is a pointer to prog_ctx_type in vmlinux.
+ * Like 'struct sk_buff'
+ */
+ return ctx_type->type;
+}
+
+bool btf_is_projection_of(const char *pname, const char *tname)
+{
+ if (strcmp(pname, "__sk_buff") == 0 && strcmp(tname, "sk_buff") == 0)
+ return true;
+ if (strcmp(pname, "xdp_md") == 0 && strcmp(tname, "xdp_buff") == 0)
+ return true;
+ return false;
+}
+
+bool btf_is_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
+ const struct btf_type *t, enum bpf_prog_type prog_type,
+ int arg)
+{
+ const struct btf_type *ctx_type;
+ const char *tname, *ctx_tname;
+
t = btf_type_by_id(btf, t->type);
+
+ /* KPROBE programs allow bpf_user_pt_regs_t typedef, which we need to
+ * check before we skip all the typedef below.
+ */
+ if (prog_type == BPF_PROG_TYPE_KPROBE) {
+ while (btf_type_is_modifier(t) && !btf_type_is_typedef(t))
+ t = btf_type_by_id(btf, t->type);
+
+ if (btf_type_is_typedef(t)) {
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (tname && strcmp(tname, "bpf_user_pt_regs_t") == 0)
+ return true;
+ }
+ }
+
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
if (!btf_type_is_struct(t)) {
@@ -4588,28 +5969,30 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
* is not supported yet.
* BPF_PROG_TYPE_RAW_TRACEPOINT is fine.
*/
- return NULL;
+ return false;
}
tname = btf_name_by_offset(btf, t->name_off);
if (!tname) {
bpf_log(log, "arg#%d struct doesn't have a name\n", arg);
- return NULL;
+ return false;
}
- /* prog_type is valid bpf program type. No need for bounds check. */
- ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2;
- /* ctx_struct is a pointer to prog_ctx_type in vmlinux.
- * Like 'struct __sk_buff'
- */
- ctx_struct = btf_type_by_id(btf_vmlinux, ctx_type->type);
- if (!ctx_struct)
+
+ ctx_type = find_canonical_prog_ctx_type(prog_type);
+ if (!ctx_type) {
+ bpf_log(log, "btf_vmlinux is malformed\n");
/* should not happen */
- return NULL;
- ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_struct->name_off);
+ return false;
+ }
+again:
+ ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_type->name_off);
if (!ctx_tname) {
/* should not happen */
bpf_log(log, "Please fix kernel include/linux/bpf_types.h\n");
- return NULL;
+ return false;
}
+ /* program types without named context types work only with arg:ctx tag */
+ if (ctx_tname[0] == '\0')
+ return false;
/* only compare that prog's ctx type name is the same as
* kernel expects. No need to compare field by field.
* It's ok for bpf prog to do:
@@ -4617,41 +6000,159 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
* int socket_filter_bpf_prog(struct __sk_buff *skb)
* { // no fields of skb are ever used }
*/
- if (strcmp(ctx_tname, tname))
- return NULL;
- return ctx_type;
+ if (btf_is_projection_of(ctx_tname, tname))
+ return true;
+ if (strcmp(ctx_tname, tname)) {
+ /* bpf_user_pt_regs_t is a typedef, so resolve it to
+ * underlying struct and check name again
+ */
+ if (!btf_type_is_modifier(ctx_type))
+ return false;
+ while (btf_type_is_modifier(ctx_type))
+ ctx_type = btf_type_by_id(btf_vmlinux, ctx_type->type);
+ goto again;
+ }
+ return true;
}
-static const struct bpf_map_ops * const btf_vmlinux_map_ops[] = {
-#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
-#define BPF_LINK_TYPE(_id, _name)
-#define BPF_MAP_TYPE(_id, _ops) \
- [_id] = &_ops,
-#include <linux/bpf_types.h>
-#undef BPF_PROG_TYPE
-#undef BPF_LINK_TYPE
-#undef BPF_MAP_TYPE
-};
+/* forward declarations for arch-specific underlying types of
+ * bpf_user_pt_regs_t; this avoids the need for arch-specific #ifdef
+ * compilation guards below for BPF_PROG_TYPE_PERF_EVENT checks, but still
+ * works correctly with __builtin_types_compatible_p() on respective
+ * architectures
+ */
+struct user_regs_struct;
+struct user_pt_regs;
-static int btf_vmlinux_map_ids_init(const struct btf *btf,
- struct bpf_verifier_log *log)
+static int btf_validate_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
+ const struct btf_type *t, int arg,
+ enum bpf_prog_type prog_type,
+ enum bpf_attach_type attach_type)
{
- const struct bpf_map_ops *ops;
- int i, btf_id;
+ const struct btf_type *ctx_type;
+ const char *tname, *ctx_tname;
- for (i = 0; i < ARRAY_SIZE(btf_vmlinux_map_ops); ++i) {
- ops = btf_vmlinux_map_ops[i];
- if (!ops || (!ops->map_btf_name && !ops->map_btf_id))
- continue;
- if (!ops->map_btf_name || !ops->map_btf_id) {
- bpf_log(log, "map type %d is misconfigured\n", i);
- return -EINVAL;
+ if (!btf_is_ptr(t)) {
+ bpf_log(log, "arg#%d type isn't a pointer\n", arg);
+ return -EINVAL;
+ }
+ t = btf_type_by_id(btf, t->type);
+
+ /* KPROBE and PERF_EVENT programs allow bpf_user_pt_regs_t typedef */
+ if (prog_type == BPF_PROG_TYPE_KPROBE || prog_type == BPF_PROG_TYPE_PERF_EVENT) {
+ while (btf_type_is_modifier(t) && !btf_type_is_typedef(t))
+ t = btf_type_by_id(btf, t->type);
+
+ if (btf_type_is_typedef(t)) {
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (tname && strcmp(tname, "bpf_user_pt_regs_t") == 0)
+ return 0;
}
- btf_id = btf_find_by_name_kind(btf, ops->map_btf_name,
- BTF_KIND_STRUCT);
- if (btf_id < 0)
- return btf_id;
- *ops->map_btf_id = btf_id;
+ }
+
+ /* all other program types don't use typedefs for context type */
+ while (btf_type_is_modifier(t))
+ t = btf_type_by_id(btf, t->type);
+
+ /* `void *ctx __arg_ctx` is always valid */
+ if (btf_type_is_void(t))
+ return 0;
+
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (str_is_empty(tname)) {
+ bpf_log(log, "arg#%d type doesn't have a name\n", arg);
+ return -EINVAL;
+ }
+
+ /* special cases */
+ switch (prog_type) {
+ case BPF_PROG_TYPE_KPROBE:
+ if (__btf_type_is_struct(t) && strcmp(tname, "pt_regs") == 0)
+ return 0;
+ break;
+ case BPF_PROG_TYPE_PERF_EVENT:
+ if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct pt_regs) &&
+ __btf_type_is_struct(t) && strcmp(tname, "pt_regs") == 0)
+ return 0;
+ if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct user_pt_regs) &&
+ __btf_type_is_struct(t) && strcmp(tname, "user_pt_regs") == 0)
+ return 0;
+ if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct user_regs_struct) &&
+ __btf_type_is_struct(t) && strcmp(tname, "user_regs_struct") == 0)
+ return 0;
+ break;
+ case BPF_PROG_TYPE_RAW_TRACEPOINT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
+ /* allow u64* as ctx */
+ if (btf_is_int(t) && t->size == 8)
+ return 0;
+ break;
+ case BPF_PROG_TYPE_TRACING:
+ switch (attach_type) {
+ case BPF_TRACE_RAW_TP:
+ /* tp_btf program is TRACING, so need special case here */
+ if (__btf_type_is_struct(t) &&
+ strcmp(tname, "bpf_raw_tracepoint_args") == 0)
+ return 0;
+ /* allow u64* as ctx */
+ if (btf_is_int(t) && t->size == 8)
+ return 0;
+ break;
+ case BPF_TRACE_ITER:
+ /* allow struct bpf_iter__xxx types only */
+ if (__btf_type_is_struct(t) &&
+ strncmp(tname, "bpf_iter__", sizeof("bpf_iter__") - 1) == 0)
+ return 0;
+ break;
+ case BPF_TRACE_FENTRY:
+ case BPF_TRACE_FEXIT:
+ case BPF_MODIFY_RETURN:
+ /* allow u64* as ctx */
+ if (btf_is_int(t) && t->size == 8)
+ return 0;
+ break;
+ default:
+ break;
+ }
+ break;
+ case BPF_PROG_TYPE_LSM:
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ /* allow u64* as ctx */
+ if (btf_is_int(t) && t->size == 8)
+ return 0;
+ break;
+ case BPF_PROG_TYPE_TRACEPOINT:
+ case BPF_PROG_TYPE_SYSCALL:
+ case BPF_PROG_TYPE_EXT:
+ return 0; /* anything goes */
+ default:
+ break;
+ }
+
+ ctx_type = find_canonical_prog_ctx_type(prog_type);
+ if (!ctx_type) {
+ /* should not happen */
+ bpf_log(log, "btf_vmlinux is malformed\n");
+ return -EINVAL;
+ }
+
+ /* resolve typedefs and check that underlying structs are matching as well */
+ while (btf_type_is_modifier(ctx_type))
+ ctx_type = btf_type_by_id(btf_vmlinux, ctx_type->type);
+
+ /* if program type doesn't have distinctly named struct type for
+ * context, then __arg_ctx argument can only be `void *`, which we
+ * already checked above
+ */
+ if (!__btf_type_is_struct(ctx_type)) {
+ bpf_log(log, "arg#%d should be void pointer\n", arg);
+ return -EINVAL;
+ }
+
+ ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_type->name_off);
+ if (!__btf_type_is_struct(t) || strcmp(ctx_tname, tname) != 0) {
+ bpf_log(log, "arg#%d should be `struct %s *`\n", arg, ctx_tname);
+ return -EINVAL;
}
return 0;
@@ -4663,31 +6164,41 @@ static int btf_translate_to_vmlinux(struct bpf_verifier_log *log,
enum bpf_prog_type prog_type,
int arg)
{
- const struct btf_member *prog_ctx_type, *kern_ctx_type;
-
- prog_ctx_type = btf_get_prog_ctx_type(log, btf, t, prog_type, arg);
- if (!prog_ctx_type)
+ if (!btf_is_prog_ctx_type(log, btf, t, prog_type, arg))
return -ENOENT;
- kern_ctx_type = prog_ctx_type + 1;
- return kern_ctx_type->type;
+ return find_kern_ctx_type_id(prog_type);
}
-BTF_ID_LIST(bpf_ctx_convert_btf_id)
-BTF_ID(struct, bpf_ctx_convert)
+int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type)
+{
+ const struct btf_member *kctx_member;
+ const struct btf_type *conv_struct;
+ const struct btf_type *kctx_type;
+ u32 kctx_type_id;
-struct btf *btf_parse_vmlinux(void)
+ conv_struct = bpf_ctx_convert.t;
+ /* get member for kernel ctx type */
+ kctx_member = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2 + 1;
+ kctx_type_id = kctx_member->type;
+ kctx_type = btf_type_by_id(btf_vmlinux, kctx_type_id);
+ if (!btf_type_is_struct(kctx_type)) {
+ bpf_log(log, "kern ctx type id %u is not a struct\n", kctx_type_id);
+ return -EINVAL;
+ }
+
+ return kctx_type_id;
+}
+
+BTF_ID_LIST_SINGLE(bpf_ctx_convert_btf_id, struct, bpf_ctx_convert)
+
+static struct btf *btf_parse_base(struct btf_verifier_env *env, const char *name,
+ void *data, unsigned int data_size)
{
- struct btf_verifier_env *env = NULL;
- struct bpf_verifier_log *log;
struct btf *btf = NULL;
int err;
- env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
- if (!env)
- return ERR_PTR(-ENOMEM);
-
- log = &env->log;
- log->level = BPF_LOG_KERNEL;
+ if (!IS_ENABLED(CONFIG_DEBUG_INFO_BTF))
+ return ERR_PTR(-ENOENT);
btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
if (!btf) {
@@ -4696,10 +6207,10 @@ struct btf *btf_parse_vmlinux(void)
}
env->btf = btf;
- btf->data = __start_BTF;
- btf->data_size = __stop_BTF - __start_BTF;
+ btf->data = data;
+ btf->data_size = data_size;
btf->kernel_btf = true;
- snprintf(btf->name, sizeof(btf->name), "vmlinux");
+ snprintf(btf->name, sizeof(btf->name), "%s", name);
err = btf_parse_hdr(env);
if (err)
@@ -4715,27 +6226,15 @@ struct btf *btf_parse_vmlinux(void)
if (err)
goto errout;
- /* btf_parse_vmlinux() runs under bpf_verifier_lock */
- bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]);
-
- /* find bpf map structs for map_ptr access checking */
- err = btf_vmlinux_map_ids_init(btf, log);
- if (err < 0)
+ err = btf_check_type_tags(env, btf, 1);
+ if (err)
goto errout;
- bpf_struct_ops_init(btf, log);
-
refcount_set(&btf->refcnt, 1);
- err = btf_alloc_id(btf);
- if (err)
- goto errout;
-
- btf_verifier_env_free(env);
return btf;
errout:
- btf_verifier_env_free(env);
if (btf) {
kvfree(btf->types);
kfree(btf);
@@ -4743,19 +6242,61 @@ errout:
return ERR_PTR(err);
}
+struct btf *btf_parse_vmlinux(void)
+{
+ struct btf_verifier_env *env = NULL;
+ struct bpf_verifier_log *log;
+ struct btf *btf;
+ int err;
+
+ env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
+ if (!env)
+ return ERR_PTR(-ENOMEM);
+
+ log = &env->log;
+ log->level = BPF_LOG_KERNEL;
+ btf = btf_parse_base(env, "vmlinux", __start_BTF, __stop_BTF - __start_BTF);
+ if (IS_ERR(btf))
+ goto err_out;
+
+ /* btf_parse_vmlinux() runs under bpf_verifier_lock */
+ bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]);
+ err = btf_alloc_id(btf);
+ if (err) {
+ btf_free(btf);
+ btf = ERR_PTR(err);
+ }
+err_out:
+ btf_verifier_env_free(env);
+ return btf;
+}
+
+/* If .BTF_ids section was created with distilled base BTF, both base and
+ * split BTF ids will need to be mapped to actual base/split ids for
+ * BTF now that it has been relocated.
+ */
+static __u32 btf_relocate_id(const struct btf *btf, __u32 id)
+{
+ if (!btf->base_btf || !btf->base_id_map)
+ return id;
+ return btf->base_id_map[id];
+}
+
#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
-static struct btf *btf_parse_module(const char *module_name, const void *data, unsigned int data_size)
+static struct btf *btf_parse_module(const char *module_name, const void *data,
+ unsigned int data_size, void *base_data,
+ unsigned int base_data_size)
{
+ struct btf *btf = NULL, *vmlinux_btf, *base_btf = NULL;
struct btf_verifier_env *env = NULL;
struct bpf_verifier_log *log;
- struct btf *btf = NULL, *base_btf;
- int err;
+ int err = 0;
- base_btf = bpf_get_btf_vmlinux();
- if (IS_ERR(base_btf))
- return base_btf;
- if (!base_btf)
+ vmlinux_btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(vmlinux_btf))
+ return vmlinux_btf;
+ if (!vmlinux_btf)
return ERR_PTR(-EINVAL);
env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
@@ -4765,6 +6306,16 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, u
log = &env->log;
log->level = BPF_LOG_KERNEL;
+ if (base_data) {
+ base_btf = btf_parse_base(env, ".BTF.base", base_data, base_data_size);
+ if (IS_ERR(base_btf)) {
+ err = PTR_ERR(base_btf);
+ goto errout;
+ }
+ } else {
+ base_btf = vmlinux_btf;
+ }
+
btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
if (!btf) {
err = -ENOMEM;
@@ -4778,12 +6329,11 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, u
btf->kernel_btf = true;
snprintf(btf->name, sizeof(btf->name), "%s", module_name);
- btf->data = kvmalloc(data_size, GFP_KERNEL | __GFP_NOWARN);
+ btf->data = kvmemdup(data, data_size, GFP_KERNEL | __GFP_NOWARN);
if (!btf->data) {
err = -ENOMEM;
goto errout;
}
- memcpy(btf->data, data, data_size);
btf->data_size = data_size;
err = btf_parse_hdr(env);
@@ -4800,12 +6350,26 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, u
if (err)
goto errout;
+ err = btf_check_type_tags(env, btf, btf_nr_types(base_btf));
+ if (err)
+ goto errout;
+
+ if (base_btf != vmlinux_btf) {
+ err = btf_relocate(btf, vmlinux_btf, &btf->base_id_map);
+ if (err)
+ goto errout;
+ btf_free(base_btf);
+ base_btf = vmlinux_btf;
+ }
+
btf_verifier_env_free(env);
refcount_set(&btf->refcnt, 1);
return btf;
errout:
btf_verifier_env_free(env);
+ if (!IS_ERR(base_btf) && base_btf != vmlinux_btf)
+ btf_free(base_btf);
if (btf) {
kvfree(btf->data);
kvfree(btf->types);
@@ -4826,18 +6390,274 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog)
return prog->aux->attach_btf;
}
-static bool is_int_ptr(struct btf *btf, const struct btf_type *t)
+static bool is_void_or_int_ptr(struct btf *btf, const struct btf_type *t)
{
- /* t comes in already as a pointer */
- t = btf_type_by_id(btf, t->type);
+ /* skip modifiers */
+ t = btf_type_skip_modifiers(btf, t->type, NULL);
+ return btf_type_is_void(t) || btf_type_is_int(t);
+}
- /* allow const */
- if (BTF_INFO_KIND(t->info) == BTF_KIND_CONST)
- t = btf_type_by_id(btf, t->type);
+u32 btf_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto,
+ int off)
+{
+ const struct btf_param *args;
+ const struct btf_type *t;
+ u32 offset = 0, nr_args;
+ int i;
+
+ if (!func_proto)
+ return off / 8;
+
+ nr_args = btf_type_vlen(func_proto);
+ args = (const struct btf_param *)(func_proto + 1);
+ for (i = 0; i < nr_args; i++) {
+ t = btf_type_skip_modifiers(btf, args[i].type, NULL);
+ offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8);
+ if (off < offset)
+ return i;
+ }
- return btf_type_is_int(t);
+ t = btf_type_skip_modifiers(btf, func_proto->type, NULL);
+ offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8);
+ if (off < offset)
+ return nr_args;
+
+ return nr_args + 1;
+}
+
+static bool prog_args_trusted(const struct bpf_prog *prog)
+{
+ enum bpf_attach_type atype = prog->expected_attach_type;
+
+ switch (prog->type) {
+ case BPF_PROG_TYPE_TRACING:
+ return atype == BPF_TRACE_RAW_TP || atype == BPF_TRACE_ITER;
+ case BPF_PROG_TYPE_LSM:
+ return bpf_lsm_is_trusted(prog);
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ return true;
+ default:
+ return false;
+ }
+}
+
+int btf_ctx_arg_offset(const struct btf *btf, const struct btf_type *func_proto,
+ u32 arg_no)
+{
+ const struct btf_param *args;
+ const struct btf_type *t;
+ int off = 0, i;
+ u32 sz;
+
+ args = btf_params(func_proto);
+ for (i = 0; i < arg_no; i++) {
+ t = btf_type_by_id(btf, args[i].type);
+ t = btf_resolve_size(btf, t, &sz);
+ if (IS_ERR(t))
+ return PTR_ERR(t);
+ off += roundup(sz, 8);
+ }
+
+ return off;
}
+struct bpf_raw_tp_null_args {
+ const char *func;
+ u64 mask;
+};
+
+static const struct bpf_raw_tp_null_args raw_tp_null_args[] = {
+ /* sched */
+ { "sched_pi_setprio", 0x10 },
+ /* ... from sched_numa_pair_template event class */
+ { "sched_stick_numa", 0x100 },
+ { "sched_swap_numa", 0x100 },
+ /* afs */
+ { "afs_make_fs_call", 0x10 },
+ { "afs_make_fs_calli", 0x10 },
+ { "afs_make_fs_call1", 0x10 },
+ { "afs_make_fs_call2", 0x10 },
+ { "afs_protocol_error", 0x1 },
+ { "afs_flock_ev", 0x10 },
+ /* cachefiles */
+ { "cachefiles_lookup", 0x1 | 0x200 },
+ { "cachefiles_unlink", 0x1 },
+ { "cachefiles_rename", 0x1 },
+ { "cachefiles_prep_read", 0x1 },
+ { "cachefiles_mark_active", 0x1 },
+ { "cachefiles_mark_failed", 0x1 },
+ { "cachefiles_mark_inactive", 0x1 },
+ { "cachefiles_vfs_error", 0x1 },
+ { "cachefiles_io_error", 0x1 },
+ { "cachefiles_ondemand_open", 0x1 },
+ { "cachefiles_ondemand_copen", 0x1 },
+ { "cachefiles_ondemand_close", 0x1 },
+ { "cachefiles_ondemand_read", 0x1 },
+ { "cachefiles_ondemand_cread", 0x1 },
+ { "cachefiles_ondemand_fd_write", 0x1 },
+ { "cachefiles_ondemand_fd_release", 0x1 },
+ /* ext4, from ext4__mballoc event class */
+ { "ext4_mballoc_discard", 0x10 },
+ { "ext4_mballoc_free", 0x10 },
+ /* fib */
+ { "fib_table_lookup", 0x100 },
+ /* filelock */
+ /* ... from filelock_lock event class */
+ { "posix_lock_inode", 0x10 },
+ { "fcntl_setlk", 0x10 },
+ { "locks_remove_posix", 0x10 },
+ { "flock_lock_inode", 0x10 },
+ /* ... from filelock_lease event class */
+ { "break_lease_noblock", 0x10 },
+ { "break_lease_block", 0x10 },
+ { "break_lease_unblock", 0x10 },
+ { "generic_delete_lease", 0x10 },
+ { "time_out_leases", 0x10 },
+ /* host1x */
+ { "host1x_cdma_push_gather", 0x10000 },
+ /* huge_memory */
+ { "mm_khugepaged_scan_pmd", 0x10 },
+ { "mm_collapse_huge_page_isolate", 0x1 },
+ { "mm_khugepaged_scan_file", 0x10 },
+ { "mm_khugepaged_collapse_file", 0x10 },
+ /* kmem */
+ { "mm_page_alloc", 0x1 },
+ { "mm_page_pcpu_drain", 0x1 },
+ /* .. from mm_page event class */
+ { "mm_page_alloc_zone_locked", 0x1 },
+ /* netfs */
+ { "netfs_failure", 0x10 },
+ /* power */
+ { "device_pm_callback_start", 0x10 },
+ /* qdisc */
+ { "qdisc_dequeue", 0x1000 },
+ /* rxrpc */
+ { "rxrpc_recvdata", 0x1 },
+ { "rxrpc_resend", 0x10 },
+ { "rxrpc_tq", 0x10 },
+ { "rxrpc_client", 0x1 },
+ /* skb */
+ {"kfree_skb", 0x1000},
+ /* sunrpc */
+ { "xs_stream_read_data", 0x1 },
+ /* ... from xprt_cong_event event class */
+ { "xprt_reserve_cong", 0x10 },
+ { "xprt_release_cong", 0x10 },
+ { "xprt_get_cong", 0x10 },
+ { "xprt_put_cong", 0x10 },
+ /* tcp */
+ { "tcp_send_reset", 0x11 },
+ { "tcp_sendmsg_locked", 0x100 },
+ /* tegra_apb_dma */
+ { "tegra_dma_tx_status", 0x100 },
+ /* timer_migration */
+ { "tmigr_update_events", 0x1 },
+ /* writeback, from writeback_folio_template event class */
+ { "writeback_dirty_folio", 0x10 },
+ { "folio_wait_writeback", 0x10 },
+ /* rdma */
+ { "mr_integ_alloc", 0x2000 },
+ /* bpf_testmod */
+ { "bpf_testmod_test_read", 0x0 },
+ /* amdgpu */
+ { "amdgpu_vm_bo_map", 0x1 },
+ { "amdgpu_vm_bo_unmap", 0x1 },
+ /* netfs */
+ { "netfs_folioq", 0x1 },
+ /* xfs from xfs_defer_pending_class */
+ { "xfs_defer_create_intent", 0x1 },
+ { "xfs_defer_cancel_list", 0x1 },
+ { "xfs_defer_pending_finish", 0x1 },
+ { "xfs_defer_pending_abort", 0x1 },
+ { "xfs_defer_relog_intent", 0x1 },
+ { "xfs_defer_isolate_paused", 0x1 },
+ { "xfs_defer_item_pause", 0x1 },
+ { "xfs_defer_item_unpause", 0x1 },
+ /* xfs from xfs_defer_pending_item_class */
+ { "xfs_defer_add_item", 0x1 },
+ { "xfs_defer_cancel_item", 0x1 },
+ { "xfs_defer_finish_item", 0x1 },
+ /* xfs from xfs_icwalk_class */
+ { "xfs_ioc_free_eofblocks", 0x10 },
+ { "xfs_blockgc_free_space", 0x10 },
+ /* xfs from xfs_btree_cur_class */
+ { "xfs_btree_updkeys", 0x100 },
+ { "xfs_btree_overlapped_query_range", 0x100 },
+ /* xfs from xfs_imap_class*/
+ { "xfs_map_blocks_found", 0x10000 },
+ { "xfs_map_blocks_alloc", 0x10000 },
+ { "xfs_iomap_alloc", 0x1000 },
+ { "xfs_iomap_found", 0x1000 },
+ /* xfs from xfs_fs_class */
+ { "xfs_inodegc_flush", 0x1 },
+ { "xfs_inodegc_push", 0x1 },
+ { "xfs_inodegc_start", 0x1 },
+ { "xfs_inodegc_stop", 0x1 },
+ { "xfs_inodegc_queue", 0x1 },
+ { "xfs_inodegc_throttle", 0x1 },
+ { "xfs_fs_sync_fs", 0x1 },
+ { "xfs_blockgc_start", 0x1 },
+ { "xfs_blockgc_stop", 0x1 },
+ { "xfs_blockgc_worker", 0x1 },
+ { "xfs_blockgc_flush_all", 0x1 },
+ /* xfs_scrub */
+ { "xchk_nlinks_live_update", 0x10 },
+ /* xfs_scrub from xchk_metapath_class */
+ { "xchk_metapath_lookup", 0x100 },
+ /* nfsd */
+ { "nfsd_dirent", 0x1 },
+ { "nfsd_file_acquire", 0x1001 },
+ { "nfsd_file_insert_err", 0x1 },
+ { "nfsd_file_cons_err", 0x1 },
+ /* nfs4 */
+ { "nfs4_setup_sequence", 0x1 },
+ { "pnfs_update_layout", 0x10000 },
+ { "nfs4_inode_callback_event", 0x200 },
+ { "nfs4_inode_stateid_callback_event", 0x200 },
+ /* nfs from pnfs_layout_event */
+ { "pnfs_mds_fallback_pg_init_read", 0x10000 },
+ { "pnfs_mds_fallback_pg_init_write", 0x10000 },
+ { "pnfs_mds_fallback_pg_get_mirror_count", 0x10000 },
+ { "pnfs_mds_fallback_read_done", 0x10000 },
+ { "pnfs_mds_fallback_write_done", 0x10000 },
+ { "pnfs_mds_fallback_read_pagelist", 0x10000 },
+ { "pnfs_mds_fallback_write_pagelist", 0x10000 },
+ /* coda */
+ { "coda_dec_pic_run", 0x10 },
+ { "coda_dec_pic_done", 0x10 },
+ /* cfg80211 */
+ { "cfg80211_scan_done", 0x11 },
+ { "rdev_set_coalesce", 0x10 },
+ { "cfg80211_report_wowlan_wakeup", 0x100 },
+ { "cfg80211_inform_bss_frame", 0x100 },
+ { "cfg80211_michael_mic_failure", 0x10000 },
+ /* cfg80211 from wiphy_work_event */
+ { "wiphy_work_queue", 0x10 },
+ { "wiphy_work_run", 0x10 },
+ { "wiphy_work_cancel", 0x10 },
+ { "wiphy_work_flush", 0x10 },
+ /* hugetlbfs */
+ { "hugetlbfs_alloc_inode", 0x10 },
+ /* spufs */
+ { "spufs_context", 0x10 },
+ /* kvm_hv */
+ { "kvm_page_fault_enter", 0x100 },
+ /* dpu */
+ { "dpu_crtc_setup_mixer", 0x100 },
+ /* binder */
+ { "binder_transaction", 0x100 },
+ /* bcachefs */
+ { "btree_path_free", 0x100 },
+ /* hfi1_tx */
+ { "hfi1_sdma_progress", 0x1000 },
+ /* iptfs */
+ { "iptfs_ingress_postq_event", 0x1000 },
+ /* neigh */
+ { "neigh_update", 0x10 },
+ /* snd_firewire_lib */
+ { "amdtp_packet", 0x100 },
+};
+
bool btf_ctx_access(int off, int size, enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
@@ -4848,6 +6668,8 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
const char *tname = prog->aux->attach_func_name;
struct bpf_verifier_log *log = info->log;
const struct btf_param *args;
+ bool ptr_err_raw_tp = false;
+ const char *tag_value;
u32 nr_args, arg;
int i, ret;
@@ -4856,7 +6678,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
tname, off);
return false;
}
- arg = off / 8;
+ arg = btf_ctx_arg_idx(btf, t, off);
args = (const struct btf_param *)(t + 1);
/* if (t == NULL) Fall back to default BPF prog with
* MAX_BPF_FUNC_REG_ARGS u64 arguments.
@@ -4877,6 +6699,10 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
if (arg == nr_args) {
switch (prog->expected_attach_type) {
case BPF_LSM_MAC:
+ /* mark we are accessing the return value */
+ info->is_retval = true;
+ fallthrough;
+ case BPF_LSM_CGROUP:
case BPF_TRACE_FEXIT:
/* When LSM programs are attached to void LSM hooks
* they use FEXIT trampolines and when attached to
@@ -4906,7 +6732,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
if (!btf_type_is_small_int(t)) {
bpf_log(log,
"ret type %s not allowed for fmod_ret\n",
- btf_kind_str[BTF_INFO_KIND(t->info)]);
+ btf_type_str(t));
return false;
}
break;
@@ -4925,7 +6751,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
/* skip modifiers */
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
- if (btf_type_is_small_int(t) || btf_type_is_enum(t))
+ if (btf_type_is_small_int(t) || btf_is_any_enum(t) || btf_type_is_struct(t))
/* accessing a scalar */
return true;
if (!btf_type_is_ptr(t)) {
@@ -4933,7 +6759,13 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
"func '%s' arg%d '%s' has type %s. Only pointer access is allowed\n",
tname, arg,
__btf_name_by_offset(btf, t->name_off),
- btf_kind_str[BTF_INFO_KIND(t->info)]);
+ btf_type_str(t));
+ return false;
+ }
+
+ if (size != sizeof(u64)) {
+ bpf_log(log, "func '%s' size %d must be 8\n",
+ tname, size);
return false;
}
@@ -4951,14 +6783,11 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
}
}
- if (t->type == 0)
- /* This is a pointer to void.
- * It is the same as scalar from the verifier safety pov.
- * No further pointer walking is allowed.
- */
- return true;
-
- if (is_int_ptr(btf, t))
+ /*
+ * If it's a pointer to void, it's the same as scalar from the verifier
+ * safety POV. Either way, no futher pointer walking is allowed.
+ */
+ if (is_void_or_int_ptr(btf, t))
return true;
/* this is a pointer to another type */
@@ -4972,13 +6801,53 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
}
info->reg_type = ctx_arg_info->reg_type;
- info->btf = btf_vmlinux;
+ info->btf = ctx_arg_info->btf ? : btf_vmlinux;
info->btf_id = ctx_arg_info->btf_id;
+ info->ref_obj_id = ctx_arg_info->ref_obj_id;
return true;
}
}
info->reg_type = PTR_TO_BTF_ID;
+ if (prog_args_trusted(prog))
+ info->reg_type |= PTR_TRUSTED;
+
+ if (btf_param_match_suffix(btf, &args[arg], "__nullable"))
+ info->reg_type |= PTR_MAYBE_NULL;
+
+ if (prog->expected_attach_type == BPF_TRACE_RAW_TP) {
+ struct btf *btf = prog->aux->attach_btf;
+ const struct btf_type *t;
+ const char *tname;
+
+ /* BTF lookups cannot fail, return false on error */
+ t = btf_type_by_id(btf, prog->aux->attach_btf_id);
+ if (!t)
+ return false;
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (!tname)
+ return false;
+ /* Checked by bpf_check_attach_target */
+ tname += sizeof("btf_trace_") - 1;
+ for (i = 0; i < ARRAY_SIZE(raw_tp_null_args); i++) {
+ /* Is this a func with potential NULL args? */
+ if (strcmp(tname, raw_tp_null_args[i].func))
+ continue;
+ if (raw_tp_null_args[i].mask & (0x1ULL << (arg * 4)))
+ info->reg_type |= PTR_MAYBE_NULL;
+ /* Is the current arg IS_ERR? */
+ if (raw_tp_null_args[i].mask & (0x2ULL << (arg * 4)))
+ ptr_err_raw_tp = true;
+ break;
+ }
+ /* If we don't know NULL-ness specification and the tracepoint
+ * is coming from a loadable module, be conservative and mark
+ * argument as PTR_MAYBE_NULL.
+ */
+ if (i == ARRAY_SIZE(raw_tp_null_args) && btf_is_module(btf))
+ info->reg_type |= PTR_MAYBE_NULL;
+ }
+
if (tgt_prog) {
enum bpf_prog_type tgt_type;
@@ -5000,6 +6869,15 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
info->btf = btf;
info->btf_id = t->type;
t = btf_type_by_id(btf, t->type);
+
+ if (btf_type_is_type_tag(t) && !btf_type_kflag(t)) {
+ tag_value = __btf_name_by_offset(btf, t->name_off);
+ if (strcmp(tag_value, "user") == 0)
+ info->reg_type |= MEM_USER;
+ if (strcmp(tag_value, "percpu") == 0)
+ info->reg_type |= MEM_PERCPU;
+ }
+
/* skip modifiers */
while (btf_type_is_modifier(t)) {
info->btf_id = t->type;
@@ -5008,33 +6886,47 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
if (!btf_type_is_struct(t)) {
bpf_log(log,
"func '%s' arg%d type %s is not a struct\n",
- tname, arg, btf_kind_str[BTF_INFO_KIND(t->info)]);
+ tname, arg, btf_type_str(t));
return false;
}
bpf_log(log, "func '%s' arg%d has btf_id %d type %s '%s'\n",
- tname, arg, info->btf_id, btf_kind_str[BTF_INFO_KIND(t->info)],
+ tname, arg, info->btf_id, btf_type_str(t),
__btf_name_by_offset(btf, t->name_off));
+
+ /* Perform all checks on the validity of type for this argument, but if
+ * we know it can be IS_ERR at runtime, scrub pointer type and mark as
+ * scalar.
+ */
+ if (ptr_err_raw_tp) {
+ bpf_log(log, "marking pointer arg%d as scalar as it may encode error", arg);
+ info->reg_type = SCALAR_VALUE;
+ }
return true;
}
+EXPORT_SYMBOL_GPL(btf_ctx_access);
enum bpf_struct_walk_result {
/* < 0 error */
WALK_SCALAR = 0,
WALK_PTR,
+ WALK_PTR_UNTRUSTED,
WALK_STRUCT,
};
static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
const struct btf_type *t, int off, int size,
- u32 *next_btf_id)
+ u32 *next_btf_id, enum bpf_type_flag *flag,
+ const char **field_name)
{
u32 i, moff, mtrue_end, msize = 0, total_nelems = 0;
const struct btf_type *mtype, *elem_type = NULL;
const struct btf_member *member;
- const char *tname, *mname;
+ const char *tname, *mname, *tag_value;
u32 vlen, elem_id, mid;
again:
+ if (btf_type_is_modifier(t))
+ t = btf_type_skip_modifiers(btf, t->type, NULL);
tname = __btf_name_by_offset(btf, t->name_off);
if (!btf_type_is_struct(t)) {
bpf_log(log, "Type '%s' is not a struct\n", tname);
@@ -5042,6 +6934,14 @@ again:
}
vlen = btf_type_vlen(t);
+ if (BTF_INFO_KIND(t->info) == BTF_KIND_UNION && vlen != 1 && !(*flag & PTR_UNTRUSTED))
+ /*
+ * walking unions yields untrusted pointers
+ * with exception of __bpf_md_ptr and other
+ * unions with a single member
+ */
+ *flag |= PTR_UNTRUSTED;
+
if (off + size > t->size) {
/* If the last element is a variable size array, we may
* need to relax the rule.
@@ -5065,11 +6965,13 @@ again:
if (off < moff)
goto error;
- /* Only allow structure for now, can be relaxed for
- * other types later.
- */
+ /* allow structure and integer */
t = btf_type_skip_modifiers(btf, array_elem->type,
NULL);
+
+ if (btf_type_is_int(t))
+ return WALK_SCALAR;
+
if (!btf_type_is_struct(t))
goto error;
@@ -5215,7 +7117,8 @@ error:
}
if (btf_type_is_ptr(mtype)) {
- const struct btf_type *stype;
+ const struct btf_type *stype, *t;
+ enum bpf_type_flag tmp_flag = 0;
u32 id;
if (msize != size || off != moff) {
@@ -5224,11 +7127,32 @@ error:
mname, moff, tname, off, size);
return -EACCES;
}
+
+ /* check type tag */
+ t = btf_type_by_id(btf, mtype->type);
+ if (btf_type_is_type_tag(t) && !btf_type_kflag(t)) {
+ tag_value = __btf_name_by_offset(btf, t->name_off);
+ /* check __user tag */
+ if (strcmp(tag_value, "user") == 0)
+ tmp_flag = MEM_USER;
+ /* check __percpu tag */
+ if (strcmp(tag_value, "percpu") == 0)
+ tmp_flag = MEM_PERCPU;
+ /* check __rcu tag */
+ if (strcmp(tag_value, "rcu") == 0)
+ tmp_flag = MEM_RCU;
+ }
+
stype = btf_type_skip_modifiers(btf, mtype->type, &id);
if (btf_type_is_struct(stype)) {
*next_btf_id = id;
+ *flag |= tmp_flag;
+ if (field_name)
+ *field_name = mname;
return WALK_PTR;
}
+
+ return WALK_PTR_UNTRUSTED;
}
/* Allow more flexible access within an int as long as
@@ -5237,7 +7161,7 @@ error:
* that also allows using an array of int as a scratch
* space. e.g. skb->cb[].
*/
- if (off + size > mtrue_end) {
+ if (off + size > mtrue_end && !(*flag & PTR_UNTRUSTED)) {
bpf_log(log,
"access beyond the end of member %s (mend:%u) in struct %s with off %u size %u\n",
mname, mtrue_end, tname, off, size);
@@ -5250,24 +7174,60 @@ error:
return -EINVAL;
}
-int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf,
- const struct btf_type *t, int off, int size,
- enum bpf_access_type atype __maybe_unused,
- u32 *next_btf_id)
+int btf_struct_access(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off, int size, enum bpf_access_type atype __maybe_unused,
+ u32 *next_btf_id, enum bpf_type_flag *flag,
+ const char **field_name)
{
+ const struct btf *btf = reg->btf;
+ enum bpf_type_flag tmp_flag = 0;
+ const struct btf_type *t;
+ u32 id = reg->btf_id;
int err;
- u32 id;
+ while (type_is_alloc(reg->type)) {
+ struct btf_struct_meta *meta;
+ struct btf_record *rec;
+ int i;
+
+ meta = btf_find_struct_meta(btf, id);
+ if (!meta)
+ break;
+ rec = meta->record;
+ for (i = 0; i < rec->cnt; i++) {
+ struct btf_field *field = &rec->fields[i];
+ u32 offset = field->offset;
+ if (off < offset + field->size && offset < off + size) {
+ bpf_log(log,
+ "direct access to %s is disallowed\n",
+ btf_field_type_name(field->type));
+ return -EACCES;
+ }
+ }
+ break;
+ }
+
+ t = btf_type_by_id(btf, id);
do {
- err = btf_struct_walk(log, btf, t, off, size, &id);
+ err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag, field_name);
switch (err) {
case WALK_PTR:
+ /* For local types, the destination register cannot
+ * become a pointer again.
+ */
+ if (type_is_alloc(reg->type))
+ return SCALAR_VALUE;
/* If we found the pointer or scalar on t+off,
* we're done.
*/
*next_btf_id = id;
+ *flag = tmp_flag;
return PTR_TO_BTF_ID;
+ case WALK_PTR_UNTRUSTED:
+ *flag = MEM_RDONLY | PTR_UNTRUSTED;
+ return PTR_TO_MEM;
case WALK_SCALAR:
return SCALAR_VALUE;
case WALK_STRUCT:
@@ -5296,8 +7256,8 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf,
* end up with two different module BTFs, but IDs point to the common type in
* vmlinux BTF.
*/
-static bool btf_types_are_same(const struct btf *btf1, u32 id1,
- const struct btf *btf2, u32 id2)
+bool btf_types_are_same(const struct btf *btf1, u32 id1,
+ const struct btf *btf2, u32 id2)
{
if (id1 != id2)
return false;
@@ -5308,20 +7268,27 @@ static bool btf_types_are_same(const struct btf *btf1, u32 id1,
bool btf_struct_ids_match(struct bpf_verifier_log *log,
const struct btf *btf, u32 id, int off,
- const struct btf *need_btf, u32 need_type_id)
+ const struct btf *need_btf, u32 need_type_id,
+ bool strict)
{
const struct btf_type *type;
+ enum bpf_type_flag flag = 0;
int err;
/* Are we already done? */
if (off == 0 && btf_types_are_same(btf, id, need_btf, need_type_id))
return true;
-
+ /* In case of strict type match, we do not walk struct, the top level
+ * type match must succeed. When strict is true, off should have already
+ * been 0.
+ */
+ if (strict)
+ return false;
again:
type = btf_type_by_id(btf, id);
if (!type)
return false;
- err = btf_struct_walk(log, btf, type, off, 1, &id);
+ err = btf_struct_walk(log, btf, type, off, 1, &id, &flag, NULL);
if (err != WALK_STRUCT)
return false;
@@ -5339,29 +7306,40 @@ again:
}
static int __get_type_size(struct btf *btf, u32 btf_id,
- const struct btf_type **bad_type)
+ const struct btf_type **ret_type)
{
const struct btf_type *t;
+ *ret_type = btf_type_by_id(btf, 0);
if (!btf_id)
/* void */
return 0;
t = btf_type_by_id(btf, btf_id);
while (t && btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
- if (!t) {
- *bad_type = btf_type_by_id(btf, 0);
+ if (!t)
return -EINVAL;
- }
+ *ret_type = t;
if (btf_type_is_ptr(t))
/* kernel size of pointer. Not BPF's size of pointer*/
return sizeof(void *);
- if (btf_type_is_int(t) || btf_type_is_enum(t))
+ if (btf_type_is_int(t) || btf_is_any_enum(t) || btf_type_is_struct(t))
return t->size;
- *bad_type = t;
return -EINVAL;
}
+static u8 __get_type_fmodel_flags(const struct btf_type *t)
+{
+ u8 flags = 0;
+
+ if (btf_type_is_struct(t))
+ flags |= BTF_FMODEL_STRUCT_ARG;
+ if (btf_type_is_signed_int(t))
+ flags |= BTF_FMODEL_SIGNED_ARG;
+
+ return flags;
+}
+
int btf_distill_func_proto(struct bpf_verifier_log *log,
struct btf *btf,
const struct btf_type *func,
@@ -5377,28 +7355,32 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
/* BTF function prototype doesn't match the verifier types.
* Fall back to MAX_BPF_FUNC_REG_ARGS u64 args.
*/
- for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++)
+ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
m->arg_size[i] = 8;
+ m->arg_flags[i] = 0;
+ }
m->ret_size = 8;
+ m->ret_flags = 0;
m->nr_args = MAX_BPF_FUNC_REG_ARGS;
return 0;
}
args = (const struct btf_param *)(func + 1);
nargs = btf_type_vlen(func);
- if (nargs >= MAX_BPF_FUNC_ARGS) {
+ if (nargs > MAX_BPF_FUNC_ARGS) {
bpf_log(log,
"The function %s has %d arguments. Too many.\n",
tname, nargs);
return -EINVAL;
}
ret = __get_type_size(btf, func->type, &t);
- if (ret < 0) {
+ if (ret < 0 || btf_type_is_struct(t)) {
bpf_log(log,
"The function %s return type %s is unsupported.\n",
- tname, btf_kind_str[BTF_INFO_KIND(t->info)]);
+ tname, btf_type_str(t));
return -EINVAL;
}
m->ret_size = ret;
+ m->ret_flags = __get_type_fmodel_flags(t);
for (i = 0; i < nargs; i++) {
if (i == nargs - 1 && args[i].type == 0) {
@@ -5408,10 +7390,12 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
return -EINVAL;
}
ret = __get_type_size(btf, args[i].type, &t);
- if (ret < 0) {
+
+ /* No support of struct argument size greater than 16 bytes */
+ if (ret < 0 || ret > 16) {
bpf_log(log,
"The function %s arg%d type %s is unsupported.\n",
- tname, i, btf_kind_str[BTF_INFO_KIND(t->info)]);
+ tname, i, btf_type_str(t));
return -EINVAL;
}
if (ret == 0) {
@@ -5421,6 +7405,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
return -EINVAL;
}
m->arg_size[i] = ret;
+ m->arg_flags[i] = __get_type_fmodel_flags(t);
}
m->nr_args = nargs;
return 0;
@@ -5504,7 +7489,7 @@ static int btf_check_func_type_match(struct bpf_verifier_log *log,
* to context only. And only global functions can be replaced.
* Hence type check only those types.
*/
- if (btf_type_is_int(t1) || btf_type_is_enum(t1))
+ if (btf_type_is_int(t1) || btf_is_any_enum(t1))
continue;
if (!btf_type_is_ptr(t1)) {
bpf_log(log,
@@ -5568,277 +7553,140 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr
return btf_check_func_type_match(log, btf1, t1, btf2, t2);
}
-static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
-#ifdef CONFIG_NET
- [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
- [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
- [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
-#endif
-};
-
-/* Returns true if struct is composed of scalars, 4 levels of nesting allowed */
-static bool __btf_type_is_scalar_struct(struct bpf_verifier_log *log,
- const struct btf *btf,
- const struct btf_type *t, int rec)
+static bool btf_is_dynptr_ptr(const struct btf *btf, const struct btf_type *t)
{
- const struct btf_type *member_type;
- const struct btf_member *member;
- u32 i;
-
- if (!btf_type_is_struct(t))
- return false;
-
- for_each_member(i, t, member) {
- const struct btf_array *array;
-
- member_type = btf_type_skip_modifiers(btf, member->type, NULL);
- if (btf_type_is_struct(member_type)) {
- if (rec >= 3) {
- bpf_log(log, "max struct nesting depth exceeded\n");
- return false;
- }
- if (!__btf_type_is_scalar_struct(log, btf, member_type, rec + 1))
- return false;
- continue;
- }
- if (btf_type_is_array(member_type)) {
- array = btf_type_array(member_type);
- if (!array->nelems)
- return false;
- member_type = btf_type_skip_modifiers(btf, array->type, NULL);
- if (!btf_type_is_scalar(member_type))
- return false;
- continue;
- }
- if (!btf_type_is_scalar(member_type))
- return false;
- }
- return true;
-}
-
-static int btf_check_func_arg_match(struct bpf_verifier_env *env,
- const struct btf *btf, u32 func_id,
- struct bpf_reg_state *regs,
- bool ptr_to_mem_ok)
-{
- struct bpf_verifier_log *log = &env->log;
- bool is_kfunc = btf_is_kernel(btf);
- const char *func_name, *ref_tname;
- const struct btf_type *t, *ref_t;
- const struct btf_param *args;
- u32 i, nargs, ref_id;
-
- t = btf_type_by_id(btf, func_id);
- if (!t || !btf_type_is_func(t)) {
- /* These checks were already done by the verifier while loading
- * struct bpf_func_info or in add_kfunc_call().
- */
- bpf_log(log, "BTF of func_id %u doesn't point to KIND_FUNC\n",
- func_id);
- return -EFAULT;
- }
- func_name = btf_name_by_offset(btf, t->name_off);
-
- t = btf_type_by_id(btf, t->type);
- if (!t || !btf_type_is_func_proto(t)) {
- bpf_log(log, "Invalid BTF of func %s\n", func_name);
- return -EFAULT;
- }
- args = (const struct btf_param *)(t + 1);
- nargs = btf_type_vlen(t);
- if (nargs > MAX_BPF_FUNC_REG_ARGS) {
- bpf_log(log, "Function %s has %d > %d args\n", func_name, nargs,
- MAX_BPF_FUNC_REG_ARGS);
- return -EINVAL;
- }
-
- /* check that BTF function arguments match actual types that the
- * verifier sees.
- */
- for (i = 0; i < nargs; i++) {
- u32 regno = i + 1;
- struct bpf_reg_state *reg = &regs[regno];
-
- t = btf_type_skip_modifiers(btf, args[i].type, NULL);
- if (btf_type_is_scalar(t)) {
- if (reg->type == SCALAR_VALUE)
- continue;
- bpf_log(log, "R%d is not a scalar\n", regno);
- return -EINVAL;
- }
-
- if (!btf_type_is_ptr(t)) {
- bpf_log(log, "Unrecognized arg#%d type %s\n",
- i, btf_type_str(t));
- return -EINVAL;
- }
-
- ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
- ref_tname = btf_name_by_offset(btf, ref_t->name_off);
- if (btf_get_prog_ctx_type(log, btf, t,
- env->prog->type, i)) {
- /* If function expects ctx type in BTF check that caller
- * is passing PTR_TO_CTX.
- */
- if (reg->type != PTR_TO_CTX) {
- bpf_log(log,
- "arg#%d expected pointer to ctx, but got %s\n",
- i, btf_type_str(t));
- return -EINVAL;
- }
- if (check_ptr_off_reg(env, reg, regno))
- return -EINVAL;
- } else if (is_kfunc && (reg->type == PTR_TO_BTF_ID || reg2btf_ids[reg->type])) {
- const struct btf_type *reg_ref_t;
- const struct btf *reg_btf;
- const char *reg_ref_tname;
- u32 reg_ref_id;
-
- if (!btf_type_is_struct(ref_t)) {
- bpf_log(log, "kernel function %s args#%d pointer type %s %s is not supported\n",
- func_name, i, btf_type_str(ref_t),
- ref_tname);
- return -EINVAL;
- }
-
- if (reg->type == PTR_TO_BTF_ID) {
- reg_btf = reg->btf;
- reg_ref_id = reg->btf_id;
- } else {
- reg_btf = btf_vmlinux;
- reg_ref_id = *reg2btf_ids[reg->type];
- }
+ const char *name;
- reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id,
- &reg_ref_id);
- reg_ref_tname = btf_name_by_offset(reg_btf,
- reg_ref_t->name_off);
- if (!btf_struct_ids_match(log, reg_btf, reg_ref_id,
- reg->off, btf, ref_id)) {
- bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
- func_name, i,
- btf_type_str(ref_t), ref_tname,
- regno, btf_type_str(reg_ref_t),
- reg_ref_tname);
- return -EINVAL;
- }
- } else if (ptr_to_mem_ok) {
- const struct btf_type *resolve_ret;
- u32 type_size;
-
- if (is_kfunc) {
- /* Permit pointer to mem, but only when argument
- * type is pointer to scalar, or struct composed
- * (recursively) of scalars.
- */
- if (!btf_type_is_scalar(ref_t) &&
- !__btf_type_is_scalar_struct(log, btf, ref_t, 0)) {
- bpf_log(log,
- "arg#%d pointer type %s %s must point to scalar or struct with scalar\n",
- i, btf_type_str(ref_t), ref_tname);
- return -EINVAL;
- }
- }
+ t = btf_type_by_id(btf, t->type); /* skip PTR */
- resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
- if (IS_ERR(resolve_ret)) {
- bpf_log(log,
- "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
- i, btf_type_str(ref_t), ref_tname,
- PTR_ERR(resolve_ret));
- return -EINVAL;
- }
+ while (btf_type_is_modifier(t))
+ t = btf_type_by_id(btf, t->type);
- if (check_mem_reg(env, reg, regno, type_size))
- return -EINVAL;
- } else {
- bpf_log(log, "reg type unsupported for arg#%d %sfunction %s#%d\n", i,
- is_kfunc ? "kernel " : "", func_name, func_id);
- return -EINVAL;
- }
+ /* allow either struct or struct forward declaration */
+ if (btf_type_is_struct(t) ||
+ (btf_type_is_fwd(t) && btf_type_kflag(t) == 0)) {
+ name = btf_str_by_offset(btf, t->name_off);
+ return name && strcmp(name, "bpf_dynptr") == 0;
}
- return 0;
+ return false;
}
-/* Compare BTF of a function with given bpf_reg_state.
- * Returns:
- * EFAULT - there is a verifier bug. Abort verification.
- * EINVAL - there is a type mismatch or BTF is not available.
- * 0 - BTF matches with what bpf_reg_state expects.
- * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
- */
-int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
- struct bpf_reg_state *regs)
-{
- struct bpf_prog *prog = env->prog;
- struct btf *btf = prog->aux->btf;
- bool is_global;
- u32 btf_id;
- int err;
+struct bpf_cand_cache {
+ const char *name;
+ u32 name_len;
+ u16 kind;
+ u16 cnt;
+ struct {
+ const struct btf *btf;
+ u32 id;
+ } cands[];
+};
- if (!prog->aux->func_info)
- return -EINVAL;
+static DEFINE_MUTEX(cand_cache_mutex);
- btf_id = prog->aux->func_info[subprog].type_id;
- if (!btf_id)
- return -EFAULT;
+static struct bpf_cand_cache *
+bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id);
- if (prog->aux->func_info_aux[subprog].unreliable)
- return -EINVAL;
+static int btf_get_ptr_to_btf_id(struct bpf_verifier_log *log, int arg_idx,
+ const struct btf *btf, const struct btf_type *t)
+{
+ struct bpf_cand_cache *cc;
+ struct bpf_core_ctx ctx = {
+ .btf = btf,
+ .log = log,
+ };
+ u32 kern_type_id, type_id;
+ int err = 0;
- is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
- err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global);
+ /* skip PTR and modifiers */
+ type_id = t->type;
+ t = btf_type_by_id(btf, t->type);
+ while (btf_type_is_modifier(t)) {
+ type_id = t->type;
+ t = btf_type_by_id(btf, t->type);
+ }
- /* Compiler optimizations can remove arguments from static functions
- * or mismatched type can be passed into a global function.
- * In such cases mark the function as unreliable from BTF point of view.
- */
+ mutex_lock(&cand_cache_mutex);
+ cc = bpf_core_find_cands(&ctx, type_id);
+ if (IS_ERR(cc)) {
+ err = PTR_ERR(cc);
+ bpf_log(log, "arg#%d reference type('%s %s') candidate matching error: %d\n",
+ arg_idx, btf_type_str(t), __btf_name_by_offset(btf, t->name_off),
+ err);
+ goto cand_cache_unlock;
+ }
+ if (cc->cnt != 1) {
+ bpf_log(log, "arg#%d reference type('%s %s') %s\n",
+ arg_idx, btf_type_str(t), __btf_name_by_offset(btf, t->name_off),
+ cc->cnt == 0 ? "has no matches" : "is ambiguous");
+ err = cc->cnt == 0 ? -ENOENT : -ESRCH;
+ goto cand_cache_unlock;
+ }
+ if (btf_is_module(cc->cands[0].btf)) {
+ bpf_log(log, "arg#%d reference type('%s %s') points to kernel module type (unsupported)\n",
+ arg_idx, btf_type_str(t), __btf_name_by_offset(btf, t->name_off));
+ err = -EOPNOTSUPP;
+ goto cand_cache_unlock;
+ }
+ kern_type_id = cc->cands[0].id;
+
+cand_cache_unlock:
+ mutex_unlock(&cand_cache_mutex);
if (err)
- prog->aux->func_info_aux[subprog].unreliable = true;
- return err;
-}
+ return err;
-int btf_check_kfunc_arg_match(struct bpf_verifier_env *env,
- const struct btf *btf, u32 func_id,
- struct bpf_reg_state *regs)
-{
- return btf_check_func_arg_match(env, btf, func_id, regs, true);
+ return kern_type_id;
}
-/* Convert BTF of a function into bpf_reg_state if possible
+enum btf_arg_tag {
+ ARG_TAG_CTX = BIT_ULL(0),
+ ARG_TAG_NONNULL = BIT_ULL(1),
+ ARG_TAG_TRUSTED = BIT_ULL(2),
+ ARG_TAG_UNTRUSTED = BIT_ULL(3),
+ ARG_TAG_NULLABLE = BIT_ULL(4),
+ ARG_TAG_ARENA = BIT_ULL(5),
+};
+
+/* Process BTF of a function to produce high-level expectation of function
+ * arguments (like ARG_PTR_TO_CTX, or ARG_PTR_TO_MEM, etc). This information
+ * is cached in subprog info for reuse.
* Returns:
* EFAULT - there is a verifier bug. Abort verification.
* EINVAL - cannot convert BTF.
- * 0 - Successfully converted BTF into bpf_reg_state
- * (either PTR_TO_CTX or SCALAR_VALUE).
+ * 0 - Successfully processed BTF and constructed argument expectations.
*/
-int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
- struct bpf_reg_state *regs)
+int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
{
+ bool is_global = subprog_aux(env, subprog)->linkage == BTF_FUNC_GLOBAL;
+ struct bpf_subprog_info *sub = subprog_info(env, subprog);
struct bpf_verifier_log *log = &env->log;
struct bpf_prog *prog = env->prog;
enum bpf_prog_type prog_type = prog->type;
struct btf *btf = prog->aux->btf;
const struct btf_param *args;
- const struct btf_type *t, *ref_t;
+ const struct btf_type *t, *ref_t, *fn_t;
u32 i, nargs, btf_id;
const char *tname;
- if (!prog->aux->func_info ||
- prog->aux->func_info_aux[subprog].linkage != BTF_FUNC_GLOBAL) {
- bpf_log(log, "Verifier bug\n");
+ if (sub->args_cached)
+ return 0;
+
+ if (!prog->aux->func_info) {
+ verifier_bug(env, "func_info undefined");
return -EFAULT;
}
btf_id = prog->aux->func_info[subprog].type_id;
if (!btf_id) {
+ if (!is_global) /* not fatal for static funcs */
+ return -EINVAL;
bpf_log(log, "Global functions need valid BTF\n");
return -EFAULT;
}
- t = btf_type_by_id(btf, btf_id);
- if (!t || !btf_type_is_func(t)) {
+ fn_t = btf_type_by_id(btf, btf_id);
+ if (!fn_t || !btf_type_is_func(fn_t)) {
/* These checks were already done by the verifier while loading
* struct bpf_func_info
*/
@@ -5846,20 +7694,16 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
subprog);
return -EFAULT;
}
- tname = btf_name_by_offset(btf, t->name_off);
-
- if (log->level & BPF_LOG_LEVEL)
- bpf_log(log, "Validating %s() func#%d...\n",
- tname, subprog);
+ tname = btf_name_by_offset(btf, fn_t->name_off);
if (prog->aux->func_info_aux[subprog].unreliable) {
- bpf_log(log, "Verifier bug in function %s()\n", tname);
+ verifier_bug(env, "unreliable BTF for function %s()", tname);
return -EFAULT;
}
if (prog_type == BPF_PROG_TYPE_EXT)
prog_type = prog->aux->dst_prog->type;
- t = btf_type_by_id(btf, t->type);
+ t = btf_type_by_id(btf, fn_t->type);
if (!t || !btf_type_is_func_proto(t)) {
bpf_log(log, "Invalid type of function %s()\n", tname);
return -EFAULT;
@@ -5867,15 +7711,19 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
args = (const struct btf_param *)(t + 1);
nargs = btf_type_vlen(t);
if (nargs > MAX_BPF_FUNC_REG_ARGS) {
+ if (!is_global)
+ return -EINVAL;
bpf_log(log, "Global function %s() with %d > %d args. Buggy compiler.\n",
tname, nargs, MAX_BPF_FUNC_REG_ARGS);
return -EINVAL;
}
- /* check that function returns int */
+ /* check that function returns int, exception cb also requires this */
t = btf_type_by_id(btf, t->type);
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
- if (!btf_type_is_int(t) && !btf_type_is_enum(t)) {
+ if (!btf_type_is_int(t) && !btf_is_any_enum(t)) {
+ if (!is_global)
+ return -EINVAL;
bpf_log(log,
"Global function %s() doesn't return scalar. Only those are supported.\n",
tname);
@@ -5885,41 +7733,171 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
* Only PTR_TO_CTX and SCALAR are supported atm.
*/
for (i = 0; i < nargs; i++) {
- struct bpf_reg_state *reg = &regs[i + 1];
+ u32 tags = 0;
+ int id = 0;
+
+ /* 'arg:<tag>' decl_tag takes precedence over derivation of
+ * register type from BTF type itself
+ */
+ while ((id = btf_find_next_decl_tag(btf, fn_t, i, "arg:", id)) > 0) {
+ const struct btf_type *tag_t = btf_type_by_id(btf, id);
+ const char *tag = __btf_name_by_offset(btf, tag_t->name_off) + 4;
+
+ /* disallow arg tags in static subprogs */
+ if (!is_global) {
+ bpf_log(log, "arg#%d type tag is not supported in static functions\n", i);
+ return -EOPNOTSUPP;
+ }
+
+ if (strcmp(tag, "ctx") == 0) {
+ tags |= ARG_TAG_CTX;
+ } else if (strcmp(tag, "trusted") == 0) {
+ tags |= ARG_TAG_TRUSTED;
+ } else if (strcmp(tag, "untrusted") == 0) {
+ tags |= ARG_TAG_UNTRUSTED;
+ } else if (strcmp(tag, "nonnull") == 0) {
+ tags |= ARG_TAG_NONNULL;
+ } else if (strcmp(tag, "nullable") == 0) {
+ tags |= ARG_TAG_NULLABLE;
+ } else if (strcmp(tag, "arena") == 0) {
+ tags |= ARG_TAG_ARENA;
+ } else {
+ bpf_log(log, "arg#%d has unsupported set of tags\n", i);
+ return -EOPNOTSUPP;
+ }
+ }
+ if (id != -ENOENT) {
+ bpf_log(log, "arg#%d type tag fetching failure: %d\n", i, id);
+ return id;
+ }
t = btf_type_by_id(btf, args[i].type);
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
- if (btf_type_is_int(t) || btf_type_is_enum(t)) {
- reg->type = SCALAR_VALUE;
+ if (!btf_type_is_ptr(t))
+ goto skip_pointer;
+
+ if ((tags & ARG_TAG_CTX) || btf_is_prog_ctx_type(log, btf, t, prog_type, i)) {
+ if (tags & ~ARG_TAG_CTX) {
+ bpf_log(log, "arg#%d has invalid combination of tags\n", i);
+ return -EINVAL;
+ }
+ if ((tags & ARG_TAG_CTX) &&
+ btf_validate_prog_ctx_type(log, btf, t, i, prog_type,
+ prog->expected_attach_type))
+ return -EINVAL;
+ sub->args[i].arg_type = ARG_PTR_TO_CTX;
+ continue;
+ }
+ if (btf_is_dynptr_ptr(btf, t)) {
+ if (tags) {
+ bpf_log(log, "arg#%d has invalid combination of tags\n", i);
+ return -EINVAL;
+ }
+ sub->args[i].arg_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY;
+ continue;
+ }
+ if (tags & ARG_TAG_TRUSTED) {
+ int kern_type_id;
+
+ if (tags & ARG_TAG_NONNULL) {
+ bpf_log(log, "arg#%d has invalid combination of tags\n", i);
+ return -EINVAL;
+ }
+
+ kern_type_id = btf_get_ptr_to_btf_id(log, i, btf, t);
+ if (kern_type_id < 0)
+ return kern_type_id;
+
+ sub->args[i].arg_type = ARG_PTR_TO_BTF_ID | PTR_TRUSTED;
+ if (tags & ARG_TAG_NULLABLE)
+ sub->args[i].arg_type |= PTR_MAYBE_NULL;
+ sub->args[i].btf_id = kern_type_id;
continue;
}
- if (btf_type_is_ptr(t)) {
- if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
- reg->type = PTR_TO_CTX;
+ if (tags & ARG_TAG_UNTRUSTED) {
+ struct btf *vmlinux_btf;
+ int kern_type_id;
+
+ if (tags & ~ARG_TAG_UNTRUSTED) {
+ bpf_log(log, "arg#%d untrusted cannot be combined with any other tags\n", i);
+ return -EINVAL;
+ }
+
+ ref_t = btf_type_skip_modifiers(btf, t->type, NULL);
+ if (btf_type_is_void(ref_t) || btf_type_is_primitive(ref_t)) {
+ sub->args[i].arg_type = ARG_PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED;
+ sub->args[i].mem_size = 0;
continue;
}
- t = btf_type_skip_modifiers(btf, t->type, NULL);
+ kern_type_id = btf_get_ptr_to_btf_id(log, i, btf, t);
+ if (kern_type_id < 0)
+ return kern_type_id;
+
+ vmlinux_btf = bpf_get_btf_vmlinux();
+ ref_t = btf_type_by_id(vmlinux_btf, kern_type_id);
+ if (!btf_type_is_struct(ref_t)) {
+ tname = __btf_name_by_offset(vmlinux_btf, t->name_off);
+ bpf_log(log, "arg#%d has type %s '%s', but only struct or primitive types are allowed\n",
+ i, btf_type_str(ref_t), tname);
+ return -EINVAL;
+ }
+ sub->args[i].arg_type = ARG_PTR_TO_BTF_ID | PTR_UNTRUSTED;
+ sub->args[i].btf_id = kern_type_id;
+ continue;
+ }
+ if (tags & ARG_TAG_ARENA) {
+ if (tags & ~ARG_TAG_ARENA) {
+ bpf_log(log, "arg#%d arena cannot be combined with any other tags\n", i);
+ return -EINVAL;
+ }
+ sub->args[i].arg_type = ARG_PTR_TO_ARENA;
+ continue;
+ }
+ if (is_global) { /* generic user data pointer */
+ u32 mem_size;
+
+ if (tags & ARG_TAG_NULLABLE) {
+ bpf_log(log, "arg#%d has invalid combination of tags\n", i);
+ return -EINVAL;
+ }
- ref_t = btf_resolve_size(btf, t, &reg->mem_size);
+ t = btf_type_skip_modifiers(btf, t->type, NULL);
+ ref_t = btf_resolve_size(btf, t, &mem_size);
if (IS_ERR(ref_t)) {
- bpf_log(log,
- "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
- i, btf_type_str(t), btf_name_by_offset(btf, t->name_off),
+ bpf_log(log, "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
+ i, btf_type_str(t), btf_name_by_offset(btf, t->name_off),
PTR_ERR(ref_t));
return -EINVAL;
}
- reg->type = PTR_TO_MEM | PTR_MAYBE_NULL;
- reg->id = ++env->id_gen;
+ sub->args[i].arg_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL;
+ if (tags & ARG_TAG_NONNULL)
+ sub->args[i].arg_type &= ~PTR_MAYBE_NULL;
+ sub->args[i].mem_size = mem_size;
+ continue;
+ }
+skip_pointer:
+ if (tags) {
+ bpf_log(log, "arg#%d has pointer tag, but is not a pointer type\n", i);
+ return -EINVAL;
+ }
+ if (btf_type_is_int(t) || btf_is_any_enum(t)) {
+ sub->args[i].arg_type = ARG_ANYTHING;
continue;
}
+ if (!is_global)
+ return -EINVAL;
bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n",
- i, btf_kind_str[BTF_INFO_KIND(t->info)], tname);
+ i, btf_type_str(t), tname);
return -EINVAL;
}
+
+ sub->arg_cnt = nargs;
+ sub->args_cached = true;
+
return 0;
}
@@ -5935,8 +7913,8 @@ static void btf_type_show(const struct btf *btf, u32 type_id, void *obj,
btf_type_ops(t)->show(btf, t, type_id, obj, 0, show);
}
-static void btf_seq_show(struct btf_show *show, const char *fmt,
- va_list args)
+__printf(2, 0) static void btf_seq_show(struct btf_show *show, const char *fmt,
+ va_list args)
{
seq_vprintf((struct seq_file *)show->target, fmt, args);
}
@@ -5969,8 +7947,8 @@ struct btf_show_snprintf {
int len; /* length we would have written */
};
-static void btf_snprintf_show(struct btf_show *show, const char *fmt,
- va_list args)
+__printf(2, 0) static void btf_snprintf_show(struct btf_show *show, const char *fmt,
+ va_list args)
{
struct btf_show_snprintf *ssnprintf = (struct btf_show_snprintf *)show;
int len;
@@ -5980,7 +7958,7 @@ static void btf_snprintf_show(struct btf_show *show, const char *fmt,
if (len < 0) {
ssnprintf->len_left = 0;
ssnprintf->len = len;
- } else if (len > ssnprintf->len_left) {
+ } else if (len >= ssnprintf->len_left) {
/* no space, drive on to get length we would have written */
ssnprintf->len_left = 0;
ssnprintf->len += len;
@@ -6004,7 +7982,7 @@ int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj,
btf_type_show(btf, type_id, obj, (struct btf_show *)&ssnprintf);
- /* If we encontered an error, return it. */
+ /* If we encountered an error, return it. */
if (ssnprintf.show.state.status)
return ssnprintf.show.state.status;
@@ -6039,15 +8017,12 @@ static int __btf_new_fd(struct btf *btf)
return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC);
}
-int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr)
+int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
{
struct btf *btf;
int ret;
- btf = btf_parse(make_bpfptr(attr->btf, uattr.is_kernel),
- attr->btf_size, attr->btf_log_level,
- u64_to_user_ptr(attr->btf_log_buf),
- attr->btf_log_size);
+ btf = btf_parse(attr, uattr, uattr_size);
if (IS_ERR(btf))
return PTR_ERR(btf);
@@ -6073,21 +8048,11 @@ int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr)
struct btf *btf_get_by_fd(int fd)
{
struct btf *btf;
- struct fd f;
-
- f = fdget(fd);
+ CLASS(fd, f)(fd);
- if (!f.file)
- return ERR_PTR(-EBADF);
-
- if (f.file->f_op != &btf_fops) {
- fdput(f);
- return ERR_PTR(-EINVAL);
- }
-
- btf = f.file->private_data;
- refcount_inc(&btf->refcnt);
- fdput(f);
+ btf = __btf_get_by_fd(f);
+ if (!IS_ERR(btf))
+ refcount_inc(&btf->refcnt);
return btf;
}
@@ -6188,17 +8153,9 @@ bool btf_is_module(const struct btf *btf)
return btf->kernel_btf && strcmp(btf->name, "vmlinux") != 0;
}
-static int btf_id_cmp_func(const void *a, const void *b)
-{
- const int *pa = a, *pb = b;
-
- return *pa - *pb;
-}
-
-bool btf_id_set_contains(const struct btf_id_set *set, u32 id)
-{
- return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL;
-}
+enum {
+ BTF_MODULE_F_LIVE = (1 << 0),
+};
#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
struct btf_module {
@@ -6206,22 +8163,12 @@ struct btf_module {
struct module *module;
struct btf *btf;
struct bin_attribute *sysfs_attr;
+ int flags;
};
static LIST_HEAD(btf_modules);
static DEFINE_MUTEX(btf_module_mutex);
-static ssize_t
-btf_module_read(struct file *file, struct kobject *kobj,
- struct bin_attribute *bin_attr,
- char *buf, loff_t off, size_t len)
-{
- const struct btf *btf = bin_attr->private;
-
- memcpy(buf, btf->data + off, len);
- return len;
-}
-
static void purge_cand_cache(struct btf *btf);
static int btf_module_notify(struct notifier_block *nb, unsigned long op,
@@ -6233,7 +8180,8 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
int err = 0;
if (mod->btf_data_size == 0 ||
- (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING))
+ (op != MODULE_STATE_COMING && op != MODULE_STATE_LIVE &&
+ op != MODULE_STATE_GOING))
goto out;
switch (op) {
@@ -6243,12 +8191,17 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
err = -ENOMEM;
goto out;
}
- btf = btf_parse_module(mod->name, mod->btf_data, mod->btf_data_size);
+ btf = btf_parse_module(mod->name, mod->btf_data, mod->btf_data_size,
+ mod->btf_base_data, mod->btf_base_data_size);
if (IS_ERR(btf)) {
- pr_warn("failed to validate module [%s] BTF: %ld\n",
- mod->name, PTR_ERR(btf));
kfree(btf_mod);
- err = PTR_ERR(btf);
+ if (!IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) {
+ pr_warn("failed to validate module [%s] BTF: %ld\n",
+ mod->name, PTR_ERR(btf));
+ err = PTR_ERR(btf);
+ } else {
+ pr_warn_once("Kernel module BTF mismatch detected, BTF debug info may be unavailable for some modules\n");
+ }
goto out;
}
err = btf_alloc_id(btf);
@@ -6276,8 +8229,8 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
attr->attr.name = btf->name;
attr->attr.mode = 0444;
attr->size = btf->data_size;
- attr->private = btf;
- attr->read = btf_module_read;
+ attr->private = btf->data;
+ attr->read = sysfs_bin_attr_simple_read;
err = sysfs_create_bin_file(btf_kobj, attr);
if (err) {
@@ -6292,6 +8245,17 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
}
break;
+ case MODULE_STATE_LIVE:
+ mutex_lock(&btf_module_mutex);
+ list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
+ if (btf_mod->module != module)
+ continue;
+
+ btf_mod->flags |= BTF_MODULE_F_LIVE;
+ break;
+ }
+ mutex_unlock(&btf_module_mutex);
+ break;
case MODULE_STATE_GOING:
mutex_lock(&btf_module_mutex);
list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
@@ -6338,7 +8302,12 @@ struct module *btf_try_get_module(const struct btf *btf)
if (btf_mod->btf != btf)
continue;
- if (try_module_get(btf_mod->module))
+ /* We must only consider module whose __init routine has
+ * finished, hence we must check for BTF_MODULE_F_LIVE flag,
+ * which is set from the notifier callback for
+ * MODULE_STATE_LIVE.
+ */
+ if ((btf_mod->flags & BTF_MODULE_F_LIVE) && try_module_get(btf_mod->module))
res = btf_mod->module;
break;
@@ -6349,9 +8318,54 @@ struct module *btf_try_get_module(const struct btf *btf)
return res;
}
+/* Returns struct btf corresponding to the struct module.
+ * This function can return NULL or ERR_PTR.
+ */
+static struct btf *btf_get_module_btf(const struct module *module)
+{
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+ struct btf_module *btf_mod, *tmp;
+#endif
+ struct btf *btf = NULL;
+
+ if (!module) {
+ btf = bpf_get_btf_vmlinux();
+ if (!IS_ERR_OR_NULL(btf))
+ btf_get(btf);
+ return btf;
+ }
+
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+ mutex_lock(&btf_module_mutex);
+ list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
+ if (btf_mod->module != module)
+ continue;
+
+ btf_get(btf_mod->btf);
+ btf = btf_mod->btf;
+ break;
+ }
+ mutex_unlock(&btf_module_mutex);
+#endif
+
+ return btf;
+}
+
+static int check_btf_kconfigs(const struct module *module, const char *feature)
+{
+ if (!module && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
+ pr_err("missing vmlinux BTF, cannot register %s\n", feature);
+ return -ENOENT;
+ }
+ if (module && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
+ pr_warn("missing module BTF, cannot register %s\n", feature);
+ return 0;
+}
+
BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int, flags)
{
- struct btf *btf;
+ struct btf *btf = NULL;
+ int btf_obj_fd = 0;
long ret;
if (flags)
@@ -6360,44 +8374,17 @@ BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int
if (name_sz <= 1 || name[name_sz - 1])
return -EINVAL;
- btf = bpf_get_btf_vmlinux();
- if (IS_ERR(btf))
- return PTR_ERR(btf);
-
- ret = btf_find_by_name_kind(btf, name, kind);
- /* ret is never zero, since btf_find_by_name_kind returns
- * positive btf_id or negative error.
- */
- if (ret < 0) {
- struct btf *mod_btf;
- int id;
-
- /* If name is not found in vmlinux's BTF then search in module's BTFs */
- spin_lock_bh(&btf_idr_lock);
- idr_for_each_entry(&btf_idr, mod_btf, id) {
- if (!btf_is_module(mod_btf))
- continue;
- /* linear search could be slow hence unlock/lock
- * the IDR to avoiding holding it for too long
- */
- btf_get(mod_btf);
- spin_unlock_bh(&btf_idr_lock);
- ret = btf_find_by_name_kind(mod_btf, name, kind);
- if (ret > 0) {
- int btf_obj_fd;
-
- btf_obj_fd = __btf_new_fd(mod_btf);
- if (btf_obj_fd < 0) {
- btf_put(mod_btf);
- return btf_obj_fd;
- }
- return ret | (((u64)btf_obj_fd) << 32);
- }
- spin_lock_bh(&btf_idr_lock);
- btf_put(mod_btf);
+ ret = bpf_find_btf_id(name, kind, &btf);
+ if (ret > 0 && btf_is_module(btf)) {
+ btf_obj_fd = __btf_new_fd(btf);
+ if (btf_obj_fd < 0) {
+ btf_put(btf);
+ return btf_obj_fd;
}
- spin_unlock_bh(&btf_idr_lock);
+ return ret | (((u64)btf_obj_fd) << 32);
}
+ if (ret > 0)
+ btf_put(btf);
return ret;
}
@@ -6416,58 +8403,563 @@ BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE)
BTF_TRACING_TYPE_xxx
#undef BTF_TRACING_TYPE
-/* BTF ID set registration API for modules */
+/* Validate well-formedness of iter argument type.
+ * On success, return positive BTF ID of iter state's STRUCT type.
+ * On error, negative error is returned.
+ */
+int btf_check_iter_arg(struct btf *btf, const struct btf_type *func, int arg_idx)
+{
+ const struct btf_param *arg;
+ const struct btf_type *t;
+ const char *name;
+ int btf_id;
-#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+ if (btf_type_vlen(func) <= arg_idx)
+ return -EINVAL;
+
+ arg = &btf_params(func)[arg_idx];
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!t || !btf_type_is_ptr(t))
+ return -EINVAL;
+ t = btf_type_skip_modifiers(btf, t->type, &btf_id);
+ if (!t || !__btf_type_is_struct(t))
+ return -EINVAL;
+
+ name = btf_name_by_offset(btf, t->name_off);
+ if (!name || strncmp(name, ITER_PREFIX, sizeof(ITER_PREFIX) - 1))
+ return -EINVAL;
-void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
- struct kfunc_btf_id_set *s)
+ return btf_id;
+}
+
+static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name,
+ const struct btf_type *func, u32 func_flags)
{
- mutex_lock(&l->mutex);
- list_add(&s->list, &l->list);
- mutex_unlock(&l->mutex);
+ u32 flags = func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY);
+ const char *sfx, *iter_name;
+ const struct btf_type *t;
+ char exp_name[128];
+ u32 nr_args;
+ int btf_id;
+
+ /* exactly one of KF_ITER_{NEW,NEXT,DESTROY} can be set */
+ if (!flags || (flags & (flags - 1)))
+ return -EINVAL;
+
+ /* any BPF iter kfunc should have `struct bpf_iter_<type> *` first arg */
+ nr_args = btf_type_vlen(func);
+ if (nr_args < 1)
+ return -EINVAL;
+
+ btf_id = btf_check_iter_arg(btf, func, 0);
+ if (btf_id < 0)
+ return btf_id;
+
+ /* sizeof(struct bpf_iter_<type>) should be a multiple of 8 to
+ * fit nicely in stack slots
+ */
+ t = btf_type_by_id(btf, btf_id);
+ if (t->size == 0 || (t->size % 8))
+ return -EINVAL;
+
+ /* validate bpf_iter_<type>_{new,next,destroy}(struct bpf_iter_<type> *)
+ * naming pattern
+ */
+ iter_name = btf_name_by_offset(btf, t->name_off) + sizeof(ITER_PREFIX) - 1;
+ if (flags & KF_ITER_NEW)
+ sfx = "new";
+ else if (flags & KF_ITER_NEXT)
+ sfx = "next";
+ else /* (flags & KF_ITER_DESTROY) */
+ sfx = "destroy";
+
+ snprintf(exp_name, sizeof(exp_name), "bpf_iter_%s_%s", iter_name, sfx);
+ if (strcmp(func_name, exp_name))
+ return -EINVAL;
+
+ /* only iter constructor should have extra arguments */
+ if (!(flags & KF_ITER_NEW) && nr_args != 1)
+ return -EINVAL;
+
+ if (flags & KF_ITER_NEXT) {
+ /* bpf_iter_<type>_next() should return pointer */
+ t = btf_type_skip_modifiers(btf, func->type, NULL);
+ if (!t || !btf_type_is_ptr(t))
+ return -EINVAL;
+ }
+
+ if (flags & KF_ITER_DESTROY) {
+ /* bpf_iter_<type>_destroy() should return void */
+ t = btf_type_by_id(btf, func->type);
+ if (!t || !btf_type_is_void(t))
+ return -EINVAL;
+ }
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(register_kfunc_btf_id_set);
-void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
- struct kfunc_btf_id_set *s)
+static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags)
{
- mutex_lock(&l->mutex);
- list_del_init(&s->list);
- mutex_unlock(&l->mutex);
+ const struct btf_type *func;
+ const char *func_name;
+ int err;
+
+ /* any kfunc should be FUNC -> FUNC_PROTO */
+ func = btf_type_by_id(btf, func_id);
+ if (!func || !btf_type_is_func(func))
+ return -EINVAL;
+
+ /* sanity check kfunc name */
+ func_name = btf_name_by_offset(btf, func->name_off);
+ if (!func_name || !func_name[0])
+ return -EINVAL;
+
+ func = btf_type_by_id(btf, func->type);
+ if (!func || !btf_type_is_func_proto(func))
+ return -EINVAL;
+
+ if (func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY)) {
+ err = btf_check_iter_kfuncs(btf, func_name, func, func_flags);
+ if (err)
+ return err;
+ }
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(unregister_kfunc_btf_id_set);
-bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id,
- struct module *owner)
+/* Kernel Function (kfunc) BTF ID set registration API */
+
+static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
+ const struct btf_kfunc_id_set *kset)
{
- struct kfunc_btf_id_set *s;
+ struct btf_kfunc_hook_filter *hook_filter;
+ struct btf_id_set8 *add_set = kset->set;
+ bool vmlinux_set = !btf_is_module(btf);
+ bool add_filter = !!kset->filter;
+ struct btf_kfunc_set_tab *tab;
+ struct btf_id_set8 *set;
+ u32 set_cnt, i;
+ int ret;
- mutex_lock(&klist->mutex);
- list_for_each_entry(s, &klist->list, list) {
- if (s->owner == owner && btf_id_set_contains(s->set, kfunc_id)) {
- mutex_unlock(&klist->mutex);
- return true;
+ if (hook >= BTF_KFUNC_HOOK_MAX) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ if (!add_set->cnt)
+ return 0;
+
+ tab = btf->kfunc_set_tab;
+
+ if (tab && add_filter) {
+ u32 i;
+
+ hook_filter = &tab->hook_filters[hook];
+ for (i = 0; i < hook_filter->nr_filters; i++) {
+ if (hook_filter->filters[i] == kset->filter) {
+ add_filter = false;
+ break;
+ }
+ }
+
+ if (add_filter && hook_filter->nr_filters == BTF_KFUNC_FILTER_MAX_CNT) {
+ ret = -E2BIG;
+ goto end;
}
}
- mutex_unlock(&klist->mutex);
- return false;
+
+ if (!tab) {
+ tab = kzalloc(sizeof(*tab), GFP_KERNEL | __GFP_NOWARN);
+ if (!tab)
+ return -ENOMEM;
+ btf->kfunc_set_tab = tab;
+ }
+
+ set = tab->sets[hook];
+ /* Warn when register_btf_kfunc_id_set is called twice for the same hook
+ * for module sets.
+ */
+ if (WARN_ON_ONCE(set && !vmlinux_set)) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ /* In case of vmlinux sets, there may be more than one set being
+ * registered per hook. To create a unified set, we allocate a new set
+ * and concatenate all individual sets being registered. While each set
+ * is individually sorted, they may become unsorted when concatenated,
+ * hence re-sorting the final set again is required to make binary
+ * searching the set using btf_id_set8_contains function work.
+ *
+ * For module sets, we need to allocate as we may need to relocate
+ * BTF ids.
+ */
+ set_cnt = set ? set->cnt : 0;
+
+ if (set_cnt > U32_MAX - add_set->cnt) {
+ ret = -EOVERFLOW;
+ goto end;
+ }
+
+ if (set_cnt + add_set->cnt > BTF_KFUNC_SET_MAX_CNT) {
+ ret = -E2BIG;
+ goto end;
+ }
+
+ /* Grow set */
+ set = krealloc(tab->sets[hook],
+ struct_size(set, pairs, set_cnt + add_set->cnt),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!set) {
+ ret = -ENOMEM;
+ goto end;
+ }
+
+ /* For newly allocated set, initialize set->cnt to 0 */
+ if (!tab->sets[hook])
+ set->cnt = 0;
+ tab->sets[hook] = set;
+
+ /* Concatenate the two sets */
+ memcpy(set->pairs + set->cnt, add_set->pairs, add_set->cnt * sizeof(set->pairs[0]));
+ /* Now that the set is copied, update with relocated BTF ids */
+ for (i = set->cnt; i < set->cnt + add_set->cnt; i++)
+ set->pairs[i].id = btf_relocate_id(btf, set->pairs[i].id);
+
+ set->cnt += add_set->cnt;
+
+ sort(set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func, NULL);
+
+ if (add_filter) {
+ hook_filter = &tab->hook_filters[hook];
+ hook_filter->filters[hook_filter->nr_filters++] = kset->filter;
+ }
+ return 0;
+end:
+ btf_free_kfunc_set_tab(btf);
+ return ret;
+}
+
+static u32 *__btf_kfunc_id_set_contains(const struct btf *btf,
+ enum btf_kfunc_hook hook,
+ u32 kfunc_btf_id,
+ const struct bpf_prog *prog)
+{
+ struct btf_kfunc_hook_filter *hook_filter;
+ struct btf_id_set8 *set;
+ u32 *id, i;
+
+ if (hook >= BTF_KFUNC_HOOK_MAX)
+ return NULL;
+ if (!btf->kfunc_set_tab)
+ return NULL;
+ hook_filter = &btf->kfunc_set_tab->hook_filters[hook];
+ for (i = 0; i < hook_filter->nr_filters; i++) {
+ if (hook_filter->filters[i](prog, kfunc_btf_id))
+ return NULL;
+ }
+ set = btf->kfunc_set_tab->sets[hook];
+ if (!set)
+ return NULL;
+ id = btf_id_set8_contains(set, kfunc_btf_id);
+ if (!id)
+ return NULL;
+ /* The flags for BTF ID are located next to it */
+ return id + 1;
+}
+
+static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
+{
+ switch (prog_type) {
+ case BPF_PROG_TYPE_UNSPEC:
+ return BTF_KFUNC_HOOK_COMMON;
+ case BPF_PROG_TYPE_XDP:
+ return BTF_KFUNC_HOOK_XDP;
+ case BPF_PROG_TYPE_SCHED_CLS:
+ return BTF_KFUNC_HOOK_TC;
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ return BTF_KFUNC_HOOK_STRUCT_OPS;
+ case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ case BPF_PROG_TYPE_PERF_EVENT:
+ case BPF_PROG_TYPE_LSM:
+ return BTF_KFUNC_HOOK_TRACING;
+ case BPF_PROG_TYPE_SYSCALL:
+ return BTF_KFUNC_HOOK_SYSCALL;
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ case BPF_PROG_TYPE_CGROUP_DEVICE:
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ case BPF_PROG_TYPE_CGROUP_SYSCTL:
+ case BPF_PROG_TYPE_SOCK_OPS:
+ return BTF_KFUNC_HOOK_CGROUP;
+ case BPF_PROG_TYPE_SCHED_ACT:
+ return BTF_KFUNC_HOOK_SCHED_ACT;
+ case BPF_PROG_TYPE_SK_SKB:
+ return BTF_KFUNC_HOOK_SK_SKB;
+ case BPF_PROG_TYPE_SOCKET_FILTER:
+ return BTF_KFUNC_HOOK_SOCKET_FILTER;
+ case BPF_PROG_TYPE_LWT_OUT:
+ case BPF_PROG_TYPE_LWT_IN:
+ case BPF_PROG_TYPE_LWT_XMIT:
+ case BPF_PROG_TYPE_LWT_SEG6LOCAL:
+ return BTF_KFUNC_HOOK_LWT;
+ case BPF_PROG_TYPE_NETFILTER:
+ return BTF_KFUNC_HOOK_NETFILTER;
+ case BPF_PROG_TYPE_KPROBE:
+ return BTF_KFUNC_HOOK_KPROBE;
+ default:
+ return BTF_KFUNC_HOOK_MAX;
+ }
}
-#define DEFINE_KFUNC_BTF_ID_LIST(name) \
- struct kfunc_btf_id_list name = { LIST_HEAD_INIT(name.list), \
- __MUTEX_INITIALIZER(name.mutex) }; \
- EXPORT_SYMBOL_GPL(name)
+/* Caution:
+ * Reference to the module (obtained using btf_try_get_module) corresponding to
+ * the struct btf *MUST* be held when calling this function from verifier
+ * context. This is usually true as we stash references in prog's kfunc_btf_tab;
+ * keeping the reference for the duration of the call provides the necessary
+ * protection for looking up a well-formed btf->kfunc_set_tab.
+ */
+u32 *btf_kfunc_id_set_contains(const struct btf *btf,
+ u32 kfunc_btf_id,
+ const struct bpf_prog *prog)
+{
+ enum bpf_prog_type prog_type = resolve_prog_type(prog);
+ enum btf_kfunc_hook hook;
+ u32 *kfunc_flags;
+
+ kfunc_flags = __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id, prog);
+ if (kfunc_flags)
+ return kfunc_flags;
-DEFINE_KFUNC_BTF_ID_LIST(bpf_tcp_ca_kfunc_list);
-DEFINE_KFUNC_BTF_ID_LIST(prog_test_kfunc_list);
+ hook = bpf_prog_type_to_kfunc_hook(prog_type);
+ return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id, prog);
+}
-#endif
+u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id,
+ const struct bpf_prog *prog)
+{
+ return __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id, prog);
+}
+static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
+ const struct btf_kfunc_id_set *kset)
+{
+ struct btf *btf;
+ int ret, i;
+
+ btf = btf_get_module_btf(kset->owner);
+ if (!btf)
+ return check_btf_kconfigs(kset->owner, "kfunc");
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+
+ for (i = 0; i < kset->set->cnt; i++) {
+ ret = btf_check_kfunc_protos(btf, btf_relocate_id(btf, kset->set->pairs[i].id),
+ kset->set->pairs[i].flags);
+ if (ret)
+ goto err_out;
+ }
+
+ ret = btf_populate_kfunc_set(btf, hook, kset);
+
+err_out:
+ btf_put(btf);
+ return ret;
+}
+
+/* This function must be invoked only from initcalls/module init functions */
+int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
+ const struct btf_kfunc_id_set *kset)
+{
+ enum btf_kfunc_hook hook;
+
+ /* All kfuncs need to be tagged as such in BTF.
+ * WARN() for initcall registrations that do not check errors.
+ */
+ if (!(kset->set->flags & BTF_SET8_KFUNCS)) {
+ WARN_ON(!kset->owner);
+ return -EINVAL;
+ }
+
+ hook = bpf_prog_type_to_kfunc_hook(prog_type);
+ return __register_btf_kfunc_id_set(hook, kset);
+}
+EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set);
+
+/* This function must be invoked only from initcalls/module init functions */
+int register_btf_fmodret_id_set(const struct btf_kfunc_id_set *kset)
+{
+ return __register_btf_kfunc_id_set(BTF_KFUNC_HOOK_FMODRET, kset);
+}
+EXPORT_SYMBOL_GPL(register_btf_fmodret_id_set);
+
+s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id)
+{
+ struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab;
+ struct btf_id_dtor_kfunc *dtor;
+
+ if (!tab)
+ return -ENOENT;
+ /* Even though the size of tab->dtors[0] is > sizeof(u32), we only need
+ * to compare the first u32 with btf_id, so we can reuse btf_id_cmp_func.
+ */
+ BUILD_BUG_ON(offsetof(struct btf_id_dtor_kfunc, btf_id) != 0);
+ dtor = bsearch(&btf_id, tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func);
+ if (!dtor)
+ return -ENOENT;
+ return dtor->kfunc_btf_id;
+}
+
+static int btf_check_dtor_kfuncs(struct btf *btf, const struct btf_id_dtor_kfunc *dtors, u32 cnt)
+{
+ const struct btf_type *dtor_func, *dtor_func_proto, *t;
+ const struct btf_param *args;
+ s32 dtor_btf_id;
+ u32 nr_args, i;
+
+ for (i = 0; i < cnt; i++) {
+ dtor_btf_id = btf_relocate_id(btf, dtors[i].kfunc_btf_id);
+
+ dtor_func = btf_type_by_id(btf, dtor_btf_id);
+ if (!dtor_func || !btf_type_is_func(dtor_func))
+ return -EINVAL;
+
+ dtor_func_proto = btf_type_by_id(btf, dtor_func->type);
+ if (!dtor_func_proto || !btf_type_is_func_proto(dtor_func_proto))
+ return -EINVAL;
+
+ /* Make sure the prototype of the destructor kfunc is 'void func(type *)' */
+ t = btf_type_by_id(btf, dtor_func_proto->type);
+ if (!t || !btf_type_is_void(t))
+ return -EINVAL;
+
+ nr_args = btf_type_vlen(dtor_func_proto);
+ if (nr_args != 1)
+ return -EINVAL;
+ args = btf_params(dtor_func_proto);
+ t = btf_type_by_id(btf, args[0].type);
+ /* Allow any pointer type, as width on targets Linux supports
+ * will be same for all pointer types (i.e. sizeof(void *))
+ */
+ if (!t || !btf_type_is_ptr(t))
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/* This function must be invoked only from initcalls/module init functions */
+int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt,
+ struct module *owner)
+{
+ struct btf_id_dtor_kfunc_tab *tab;
+ struct btf *btf;
+ u32 tab_cnt, i;
+ int ret;
+
+ btf = btf_get_module_btf(owner);
+ if (!btf)
+ return check_btf_kconfigs(owner, "dtor kfuncs");
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+
+ if (add_cnt >= BTF_DTOR_KFUNC_MAX_CNT) {
+ pr_err("cannot register more than %d kfunc destructors\n", BTF_DTOR_KFUNC_MAX_CNT);
+ ret = -E2BIG;
+ goto end;
+ }
+
+ /* Ensure that the prototype of dtor kfuncs being registered is sane */
+ ret = btf_check_dtor_kfuncs(btf, dtors, add_cnt);
+ if (ret < 0)
+ goto end;
+
+ tab = btf->dtor_kfunc_tab;
+ /* Only one call allowed for modules */
+ if (WARN_ON_ONCE(tab && btf_is_module(btf))) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ tab_cnt = tab ? tab->cnt : 0;
+ if (tab_cnt > U32_MAX - add_cnt) {
+ ret = -EOVERFLOW;
+ goto end;
+ }
+ if (tab_cnt + add_cnt >= BTF_DTOR_KFUNC_MAX_CNT) {
+ pr_err("cannot register more than %d kfunc destructors\n", BTF_DTOR_KFUNC_MAX_CNT);
+ ret = -E2BIG;
+ goto end;
+ }
+
+ tab = krealloc(btf->dtor_kfunc_tab,
+ struct_size(tab, dtors, tab_cnt + add_cnt),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!tab) {
+ ret = -ENOMEM;
+ goto end;
+ }
+
+ if (!btf->dtor_kfunc_tab)
+ tab->cnt = 0;
+ btf->dtor_kfunc_tab = tab;
+
+ memcpy(tab->dtors + tab->cnt, dtors, add_cnt * sizeof(tab->dtors[0]));
+
+ /* remap BTF ids based on BTF relocation (if any) */
+ for (i = tab_cnt; i < tab_cnt + add_cnt; i++) {
+ tab->dtors[i].btf_id = btf_relocate_id(btf, tab->dtors[i].btf_id);
+ tab->dtors[i].kfunc_btf_id = btf_relocate_id(btf, tab->dtors[i].kfunc_btf_id);
+ }
+
+ tab->cnt += add_cnt;
+
+ sort(tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func, NULL);
+
+end:
+ if (ret)
+ btf_free_dtor_kfunc_tab(btf);
+ btf_put(btf);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_btf_id_dtor_kfuncs);
+
+#define MAX_TYPES_ARE_COMPAT_DEPTH 2
+
+/* Check local and target types for compatibility. This check is used for
+ * type-based CO-RE relocations and follow slightly different rules than
+ * field-based relocations. This function assumes that root types were already
+ * checked for name match. Beyond that initial root-level name check, names
+ * are completely ignored. Compatibility rules are as follows:
+ * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs/ENUM64s are considered compatible, but
+ * kind should match for local and target types (i.e., STRUCT is not
+ * compatible with UNION);
+ * - for ENUMs/ENUM64s, the size is ignored;
+ * - for INT, size and signedness are ignored;
+ * - for ARRAY, dimensionality is ignored, element types are checked for
+ * compatibility recursively;
+ * - CONST/VOLATILE/RESTRICT modifiers are ignored;
+ * - TYPEDEFs/PTRs are compatible if types they pointing to are compatible;
+ * - FUNC_PROTOs are compatible if they have compatible signature: same
+ * number of input args and compatible return and argument types.
+ * These rules are not set in stone and probably will be adjusted as we get
+ * more experience with using BPF CO-RE relocations.
+ */
int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id,
const struct btf *targ_btf, __u32 targ_id)
{
- return -EOPNOTSUPP;
+ return __bpf_core_types_are_compat(local_btf, local_id, targ_btf, targ_id,
+ MAX_TYPES_ARE_COMPAT_DEPTH);
+}
+
+#define MAX_TYPES_MATCH_DEPTH 2
+
+int bpf_core_types_match(const struct btf *local_btf, u32 local_id,
+ const struct btf *targ_btf, u32 targ_id)
+{
+ return __bpf_core_types_match(local_btf, local_id, targ_btf, targ_id, false,
+ MAX_TYPES_MATCH_DEPTH);
}
static bool bpf_core_is_flavor_sep(const char *s)
@@ -6490,17 +8982,6 @@ size_t bpf_core_essential_name_len(const char *name)
return n;
}
-struct bpf_cand_cache {
- const char *name;
- u32 name_len;
- u16 kind;
- u16 cnt;
- struct {
- const struct btf *btf;
- u32 id;
- } cands[];
-};
-
static void bpf_free_cands(struct bpf_cand_cache *cands)
{
if (!cands->cnt)
@@ -6521,8 +9002,6 @@ static struct bpf_cand_cache *vmlinux_cand_cache[VMLINUX_CAND_CACHE_SIZE];
#define MODULE_CAND_CACHE_SIZE 31
static struct bpf_cand_cache *module_cand_cache[MODULE_CAND_CACHE_SIZE];
-static DEFINE_MUTEX(cand_cache_mutex);
-
static void __print_cand_cache(struct bpf_verifier_log *log,
struct bpf_cand_cache **cache,
int cache_size)
@@ -6587,7 +9066,7 @@ static struct bpf_cand_cache *populate_cand_cache(struct bpf_cand_cache *cands,
bpf_free_cands_from_cache(*cc);
*cc = NULL;
}
- new_cands = kmemdup(cands, sizeof_cands(cands->cnt), GFP_KERNEL);
+ new_cands = kmemdup(cands, sizeof_cands(cands->cnt), GFP_KERNEL_ACCOUNT);
if (!new_cands) {
bpf_free_cands(cands);
return ERR_PTR(-ENOMEM);
@@ -6595,7 +9074,7 @@ static struct bpf_cand_cache *populate_cand_cache(struct bpf_cand_cache *cands,
/* strdup the name, since it will stay in cache.
* the cands->name points to strings in prog's BTF and the prog can be unloaded.
*/
- new_cands->name = kmemdup_nul(cands->name, cands->name_len, GFP_KERNEL);
+ new_cands->name = kmemdup_nul(cands->name, cands->name_len, GFP_KERNEL_ACCOUNT);
bpf_free_cands(cands);
if (!new_cands->name) {
kfree(new_cands);
@@ -6679,7 +9158,7 @@ bpf_core_add_cands(struct bpf_cand_cache *cands, const struct btf *targ_btf,
continue;
/* most of the time there is only one candidate for a given kind+name pair */
- new_cands = kmalloc(sizeof_cands(cands->cnt + 1), GFP_KERNEL);
+ new_cands = kmalloc(sizeof_cands(cands->cnt + 1), GFP_KERNEL_ACCOUNT);
if (!new_cands) {
bpf_free_cands(cands);
return ERR_PTR(-ENOMEM);
@@ -6710,6 +9189,8 @@ bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id)
main_btf = bpf_get_btf_vmlinux();
if (IS_ERR(main_btf))
return ERR_CAST(main_btf);
+ if (!main_btf)
+ return ERR_PTR(-EINVAL);
local_type = btf_type_by_id(local_btf, local_type_id);
if (!local_type)
@@ -6767,12 +9248,10 @@ check_modules:
btf_get(mod_btf);
spin_unlock_bh(&btf_idr_lock);
cands = bpf_core_add_cands(cands, mod_btf, btf_nr_types(main_btf));
- if (IS_ERR(cands)) {
- btf_put(mod_btf);
+ btf_put(mod_btf);
+ if (IS_ERR(cands))
return ERR_CAST(cands);
- }
spin_lock_bh(&btf_idr_lock);
- btf_put(mod_btf);
}
spin_unlock_bh(&btf_idr_lock);
/* cands is a pointer to kmalloced memory here if cands->cnt > 0
@@ -6788,16 +9267,26 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
{
bool need_cands = relo->kind != BPF_CORE_TYPE_ID_LOCAL;
struct bpf_core_cand_list cands = {};
+ struct bpf_core_relo_res targ_res;
struct bpf_core_spec *specs;
+ const struct btf_type *type;
int err;
/* ~4k of temp memory necessary to convert LLVM spec like "0:1:0:5"
* into arrays of btf_ids of struct fields and array indices.
*/
- specs = kcalloc(3, sizeof(*specs), GFP_KERNEL);
+ specs = kcalloc(3, sizeof(*specs), GFP_KERNEL_ACCOUNT);
if (!specs)
return -ENOMEM;
+ type = btf_type_by_id(ctx->btf, relo->type_id);
+ if (!type) {
+ bpf_log(ctx->log, "relo #%u: bad type id %u\n",
+ relo_idx, relo->type_id);
+ kfree(specs);
+ return -EINVAL;
+ }
+
if (need_cands) {
struct bpf_cand_cache *cc;
int i;
@@ -6811,7 +9300,7 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
goto out;
}
if (cc->cnt) {
- cands.cands = kcalloc(cc->cnt, sizeof(*cands.cands), GFP_KERNEL);
+ cands.cands = kcalloc(cc->cnt, sizeof(*cands.cands), GFP_KERNEL_ACCOUNT);
if (!cands.cands) {
err = -ENOMEM;
goto out;
@@ -6827,13 +9316,19 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
cands.len = cc->cnt;
/* cand_cache_mutex needs to span the cache lookup and
* copy of btf pointer into bpf_core_cand_list,
- * since module can be unloaded while bpf_core_apply_relo_insn
+ * since module can be unloaded while bpf_core_calc_relo_insn
* is working with module's btf.
*/
}
- err = bpf_core_apply_relo_insn((void *)ctx->log, insn, relo->insn_off / 8,
- relo, relo_idx, ctx->btf, &cands, specs);
+ err = bpf_core_calc_relo_insn((void *)ctx->log, relo, relo_idx, ctx->btf, &cands, specs,
+ &targ_res);
+ if (err)
+ goto out;
+
+ err = bpf_core_patch_insn((void *)ctx->log, insn, relo->insn_off / 8, relo, relo_idx,
+ &targ_res);
+
out:
kfree(specs);
if (need_cands) {
@@ -6844,3 +9339,241 @@ out:
}
return err;
}
+
+bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ const char *field_name, u32 btf_id, const char *suffix)
+{
+ struct btf *btf = reg->btf;
+ const struct btf_type *walk_type, *safe_type;
+ const char *tname;
+ char safe_tname[64];
+ long ret, safe_id;
+ const struct btf_member *member;
+ u32 i;
+
+ walk_type = btf_type_by_id(btf, reg->btf_id);
+ if (!walk_type)
+ return false;
+
+ tname = btf_name_by_offset(btf, walk_type->name_off);
+
+ ret = snprintf(safe_tname, sizeof(safe_tname), "%s%s", tname, suffix);
+ if (ret >= sizeof(safe_tname))
+ return false;
+
+ safe_id = btf_find_by_name_kind(btf, safe_tname, BTF_INFO_KIND(walk_type->info));
+ if (safe_id < 0)
+ return false;
+
+ safe_type = btf_type_by_id(btf, safe_id);
+ if (!safe_type)
+ return false;
+
+ for_each_member(i, safe_type, member) {
+ const char *m_name = __btf_name_by_offset(btf, member->name_off);
+ const struct btf_type *mtype = btf_type_by_id(btf, member->type);
+ u32 id;
+
+ if (!btf_type_is_ptr(mtype))
+ continue;
+
+ btf_type_skip_modifiers(btf, mtype->type, &id);
+ /* If we match on both type and name, the field is considered trusted. */
+ if (btf_id == id && !strcmp(field_name, m_name))
+ return true;
+ }
+
+ return false;
+}
+
+bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log,
+ const struct btf *reg_btf, u32 reg_id,
+ const struct btf *arg_btf, u32 arg_id)
+{
+ const char *reg_name, *arg_name, *search_needle;
+ const struct btf_type *reg_type, *arg_type;
+ int reg_len, arg_len, cmp_len;
+ size_t pattern_len = sizeof(NOCAST_ALIAS_SUFFIX) - sizeof(char);
+
+ reg_type = btf_type_by_id(reg_btf, reg_id);
+ if (!reg_type)
+ return false;
+
+ arg_type = btf_type_by_id(arg_btf, arg_id);
+ if (!arg_type)
+ return false;
+
+ reg_name = btf_name_by_offset(reg_btf, reg_type->name_off);
+ arg_name = btf_name_by_offset(arg_btf, arg_type->name_off);
+
+ reg_len = strlen(reg_name);
+ arg_len = strlen(arg_name);
+
+ /* Exactly one of the two type names may be suffixed with ___init, so
+ * if the strings are the same size, they can't possibly be no-cast
+ * aliases of one another. If you have two of the same type names, e.g.
+ * they're both nf_conn___init, it would be improper to return true
+ * because they are _not_ no-cast aliases, they are the same type.
+ */
+ if (reg_len == arg_len)
+ return false;
+
+ /* Either of the two names must be the other name, suffixed with ___init. */
+ if ((reg_len != arg_len + pattern_len) &&
+ (arg_len != reg_len + pattern_len))
+ return false;
+
+ if (reg_len < arg_len) {
+ search_needle = strstr(arg_name, NOCAST_ALIAS_SUFFIX);
+ cmp_len = reg_len;
+ } else {
+ search_needle = strstr(reg_name, NOCAST_ALIAS_SUFFIX);
+ cmp_len = arg_len;
+ }
+
+ if (!search_needle)
+ return false;
+
+ /* ___init suffix must come at the end of the name */
+ if (*(search_needle + pattern_len) != '\0')
+ return false;
+
+ return !strncmp(reg_name, arg_name, cmp_len);
+}
+
+#ifdef CONFIG_BPF_JIT
+static int
+btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops,
+ struct bpf_verifier_log *log)
+{
+ struct btf_struct_ops_tab *tab, *new_tab;
+ int i, err;
+
+ tab = btf->struct_ops_tab;
+ if (!tab) {
+ tab = kzalloc(struct_size(tab, ops, 4), GFP_KERNEL);
+ if (!tab)
+ return -ENOMEM;
+ tab->capacity = 4;
+ btf->struct_ops_tab = tab;
+ }
+
+ for (i = 0; i < tab->cnt; i++)
+ if (tab->ops[i].st_ops == st_ops)
+ return -EEXIST;
+
+ if (tab->cnt == tab->capacity) {
+ new_tab = krealloc(tab,
+ struct_size(tab, ops, tab->capacity * 2),
+ GFP_KERNEL);
+ if (!new_tab)
+ return -ENOMEM;
+ tab = new_tab;
+ tab->capacity *= 2;
+ btf->struct_ops_tab = tab;
+ }
+
+ tab->ops[btf->struct_ops_tab->cnt].st_ops = st_ops;
+
+ err = bpf_struct_ops_desc_init(&tab->ops[btf->struct_ops_tab->cnt], btf, log);
+ if (err)
+ return err;
+
+ btf->struct_ops_tab->cnt++;
+
+ return 0;
+}
+
+const struct bpf_struct_ops_desc *
+bpf_struct_ops_find_value(struct btf *btf, u32 value_id)
+{
+ const struct bpf_struct_ops_desc *st_ops_list;
+ unsigned int i;
+ u32 cnt;
+
+ if (!value_id)
+ return NULL;
+ if (!btf->struct_ops_tab)
+ return NULL;
+
+ cnt = btf->struct_ops_tab->cnt;
+ st_ops_list = btf->struct_ops_tab->ops;
+ for (i = 0; i < cnt; i++) {
+ if (st_ops_list[i].value_id == value_id)
+ return &st_ops_list[i];
+ }
+
+ return NULL;
+}
+
+const struct bpf_struct_ops_desc *
+bpf_struct_ops_find(struct btf *btf, u32 type_id)
+{
+ const struct bpf_struct_ops_desc *st_ops_list;
+ unsigned int i;
+ u32 cnt;
+
+ if (!type_id)
+ return NULL;
+ if (!btf->struct_ops_tab)
+ return NULL;
+
+ cnt = btf->struct_ops_tab->cnt;
+ st_ops_list = btf->struct_ops_tab->ops;
+ for (i = 0; i < cnt; i++) {
+ if (st_ops_list[i].type_id == type_id)
+ return &st_ops_list[i];
+ }
+
+ return NULL;
+}
+
+int __register_bpf_struct_ops(struct bpf_struct_ops *st_ops)
+{
+ struct bpf_verifier_log *log;
+ struct btf *btf;
+ int err = 0;
+
+ btf = btf_get_module_btf(st_ops->owner);
+ if (!btf)
+ return check_btf_kconfigs(st_ops->owner, "struct_ops");
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+
+ log = kzalloc(sizeof(*log), GFP_KERNEL | __GFP_NOWARN);
+ if (!log) {
+ err = -ENOMEM;
+ goto errout;
+ }
+
+ log->level = BPF_LOG_KERNEL;
+
+ err = btf_add_struct_ops(btf, st_ops, log);
+
+errout:
+ kfree(log);
+ btf_put(btf);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(__register_bpf_struct_ops);
+#endif
+
+bool btf_param_match_suffix(const struct btf *btf,
+ const struct btf_param *arg,
+ const char *suffix)
+{
+ int suffix_len = strlen(suffix), len;
+ const char *param_name;
+
+ /* In the future, this can be ported to use BTF tagging */
+ param_name = btf_name_by_offset(btf, arg->name_off);
+ if (str_is_empty(param_name))
+ return false;
+ len = strlen(param_name);
+ if (len <= suffix_len)
+ return false;
+ param_name += len - suffix_len;
+ return !strncmp(param_name, suffix, suffix_len);
+}
diff --git a/kernel/bpf/btf_iter.c b/kernel/bpf/btf_iter.c
new file mode 100644
index 000000000000..0e2c66a52df9
--- /dev/null
+++ b/kernel/bpf/btf_iter.c
@@ -0,0 +1,2 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+#include "../../tools/lib/bpf/btf_iter.c"
diff --git a/kernel/bpf/btf_relocate.c b/kernel/bpf/btf_relocate.c
new file mode 100644
index 000000000000..c12ccbf66507
--- /dev/null
+++ b/kernel/bpf/btf_relocate.c
@@ -0,0 +1,2 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+#include "../../tools/lib/bpf/btf_relocate.c"
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 514b4681a90a..69988af44b37 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -14,6 +14,8 @@
#include <linux/string.h>
#include <linux/bpf.h>
#include <linux/bpf-cgroup.h>
+#include <linux/bpf_lsm.h>
+#include <linux/bpf_verifier.h>
#include <net/sock.h>
#include <net/bpf_sk_storage.h>
@@ -22,7 +24,201 @@
DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);
-void cgroup_bpf_offline(struct cgroup *cgrp)
+/*
+ * cgroup bpf destruction makes heavy use of work items and there can be a lot
+ * of concurrent destructions. Use a separate workqueue so that cgroup bpf
+ * destruction work items don't end up filling up max_active of system_percpu_wq
+ * which may lead to deadlock.
+ */
+static struct workqueue_struct *cgroup_bpf_destroy_wq;
+
+static int __init cgroup_bpf_wq_init(void)
+{
+ cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy",
+ WQ_PERCPU, 1);
+ if (!cgroup_bpf_destroy_wq)
+ panic("Failed to alloc workqueue for cgroup bpf destroy.\n");
+ return 0;
+}
+core_initcall(cgroup_bpf_wq_init);
+
+static int cgroup_bpf_lifetime_notify(struct notifier_block *nb,
+ unsigned long action, void *data);
+
+static struct notifier_block cgroup_bpf_lifetime_nb = {
+ .notifier_call = cgroup_bpf_lifetime_notify,
+};
+
+void __init cgroup_bpf_lifetime_notifier_init(void)
+{
+ BUG_ON(blocking_notifier_chain_register(&cgroup_lifetime_notifier,
+ &cgroup_bpf_lifetime_nb));
+}
+
+/* __always_inline is necessary to prevent indirect call through run_prog
+ * function pointer.
+ */
+static __always_inline int
+bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
+ enum cgroup_bpf_attach_type atype,
+ const void *ctx, bpf_prog_run_fn run_prog,
+ int retval, u32 *ret_flags)
+{
+ const struct bpf_prog_array_item *item;
+ const struct bpf_prog *prog;
+ const struct bpf_prog_array *array;
+ struct bpf_run_ctx *old_run_ctx;
+ struct bpf_cg_run_ctx run_ctx;
+ u32 func_ret;
+
+ run_ctx.retval = retval;
+ rcu_read_lock_dont_migrate();
+ array = rcu_dereference(cgrp->effective[atype]);
+ item = &array->items[0];
+ old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+ while ((prog = READ_ONCE(item->prog))) {
+ run_ctx.prog_item = item;
+ func_ret = run_prog(prog, ctx);
+ if (ret_flags) {
+ *(ret_flags) |= (func_ret >> 1);
+ func_ret &= 1;
+ }
+ if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval))
+ run_ctx.retval = -EPERM;
+ item++;
+ }
+ bpf_reset_run_ctx(old_run_ctx);
+ rcu_read_unlock_migrate();
+ return run_ctx.retval;
+}
+
+unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx,
+ const struct bpf_insn *insn)
+{
+ const struct bpf_prog *shim_prog;
+ struct sock *sk;
+ struct cgroup *cgrp;
+ int ret = 0;
+ u64 *args;
+
+ args = (u64 *)ctx;
+ sk = (void *)(unsigned long)args[0];
+ /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
+ shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
+
+ cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ if (likely(cgrp))
+ ret = bpf_prog_run_array_cg(&cgrp->bpf,
+ shim_prog->aux->cgroup_atype,
+ ctx, bpf_prog_run, 0, NULL);
+ return ret;
+}
+
+unsigned int __cgroup_bpf_run_lsm_socket(const void *ctx,
+ const struct bpf_insn *insn)
+{
+ const struct bpf_prog *shim_prog;
+ struct socket *sock;
+ struct cgroup *cgrp;
+ int ret = 0;
+ u64 *args;
+
+ args = (u64 *)ctx;
+ sock = (void *)(unsigned long)args[0];
+ /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
+ shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
+
+ cgrp = sock_cgroup_ptr(&sock->sk->sk_cgrp_data);
+ if (likely(cgrp))
+ ret = bpf_prog_run_array_cg(&cgrp->bpf,
+ shim_prog->aux->cgroup_atype,
+ ctx, bpf_prog_run, 0, NULL);
+ return ret;
+}
+
+unsigned int __cgroup_bpf_run_lsm_current(const void *ctx,
+ const struct bpf_insn *insn)
+{
+ const struct bpf_prog *shim_prog;
+ struct cgroup *cgrp;
+ int ret = 0;
+
+ /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
+ shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
+
+ /* We rely on trampoline's __bpf_prog_enter_lsm_cgroup to grab RCU read lock. */
+ cgrp = task_dfl_cgroup(current);
+ if (likely(cgrp))
+ ret = bpf_prog_run_array_cg(&cgrp->bpf,
+ shim_prog->aux->cgroup_atype,
+ ctx, bpf_prog_run, 0, NULL);
+ return ret;
+}
+
+#ifdef CONFIG_BPF_LSM
+struct cgroup_lsm_atype {
+ u32 attach_btf_id;
+ int refcnt;
+};
+
+static struct cgroup_lsm_atype cgroup_lsm_atype[CGROUP_LSM_NUM];
+
+static enum cgroup_bpf_attach_type
+bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
+{
+ int i;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (attach_type != BPF_LSM_CGROUP)
+ return to_cgroup_bpf_attach_type(attach_type);
+
+ for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++)
+ if (cgroup_lsm_atype[i].attach_btf_id == attach_btf_id)
+ return CGROUP_LSM_START + i;
+
+ for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++)
+ if (cgroup_lsm_atype[i].attach_btf_id == 0)
+ return CGROUP_LSM_START + i;
+
+ return -E2BIG;
+
+}
+
+void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype)
+{
+ int i = cgroup_atype - CGROUP_LSM_START;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ WARN_ON_ONCE(cgroup_lsm_atype[i].attach_btf_id &&
+ cgroup_lsm_atype[i].attach_btf_id != attach_btf_id);
+
+ cgroup_lsm_atype[i].attach_btf_id = attach_btf_id;
+ cgroup_lsm_atype[i].refcnt++;
+}
+
+void bpf_cgroup_atype_put(int cgroup_atype)
+{
+ int i = cgroup_atype - CGROUP_LSM_START;
+
+ cgroup_lock();
+ if (--cgroup_lsm_atype[i].refcnt <= 0)
+ cgroup_lsm_atype[i].attach_btf_id = 0;
+ WARN_ON_ONCE(cgroup_lsm_atype[i].refcnt < 0);
+ cgroup_unlock();
+}
+#else
+static enum cgroup_bpf_attach_type
+bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
+{
+ if (attach_type != BPF_LSM_CGROUP)
+ return to_cgroup_bpf_attach_type(attach_type);
+ return -EOPNOTSUPP;
+}
+#endif /* CONFIG_BPF_LSM */
+
+static void cgroup_bpf_offline(struct cgroup *cgrp)
{
cgroup_get(cgrp);
percpu_ref_kill(&cgrp->bpf.refcnt);
@@ -115,18 +311,25 @@ static void cgroup_bpf_release(struct work_struct *work)
unsigned int atype;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) {
- struct list_head *progs = &cgrp->bpf.progs[atype];
- struct bpf_prog_list *pl, *pltmp;
+ struct hlist_head *progs = &cgrp->bpf.progs[atype];
+ struct bpf_prog_list *pl;
+ struct hlist_node *pltmp;
- list_for_each_entry_safe(pl, pltmp, progs, node) {
- list_del(&pl->node);
- if (pl->prog)
+ hlist_for_each_entry_safe(pl, pltmp, progs, node) {
+ hlist_del(&pl->node);
+ if (pl->prog) {
+ if (pl->prog->expected_attach_type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(pl->prog);
bpf_prog_put(pl->prog);
- if (pl->link)
+ }
+ if (pl->link) {
+ if (pl->link->link.prog->expected_attach_type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(pl->link->link.prog);
bpf_cgroup_link_auto_detach(pl->link);
+ }
kfree(pl);
static_branch_dec(&cgroup_bpf_enabled_key[atype]);
}
@@ -141,7 +344,7 @@ static void cgroup_bpf_release(struct work_struct *work)
bpf_cgroup_storage_free(storage);
}
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
cgroup_bpf_put(p);
@@ -160,7 +363,7 @@ static void cgroup_bpf_release_fn(struct percpu_ref *ref)
struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
- queue_work(system_wq, &cgrp->bpf.release_work);
+ queue_work(cgroup_bpf_destroy_wq, &cgrp->bpf.release_work);
}
/* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through
@@ -178,14 +381,16 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
/* count number of elements in the list.
* it's slow but the list cannot be long
*/
-static u32 prog_list_length(struct list_head *head)
+static u32 prog_list_length(struct hlist_head *head, int *preorder_cnt)
{
struct bpf_prog_list *pl;
u32 cnt = 0;
- list_for_each_entry(pl, head, node) {
+ hlist_for_each_entry(pl, head, node) {
if (!prog_list_prog(pl))
continue;
+ if (preorder_cnt && (pl->flags & BPF_F_PREORDER))
+ (*preorder_cnt)++;
cnt++;
}
return cnt;
@@ -209,7 +414,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp,
if (flags & BPF_F_ALLOW_MULTI)
return true;
- cnt = prog_list_length(&p->bpf.progs[atype]);
+ cnt = prog_list_length(&p->bpf.progs[atype], NULL);
WARN_ON_ONCE(cnt > 1);
if (cnt == 1)
return !!(flags & BPF_F_ALLOW_OVERRIDE);
@@ -232,12 +437,12 @@ static int compute_effective_progs(struct cgroup *cgrp,
struct bpf_prog_array *progs;
struct bpf_prog_list *pl;
struct cgroup *p = cgrp;
- int cnt = 0;
+ int i, j, cnt = 0, preorder_cnt = 0, fstart, bstart, init_bstart;
/* count number of effective programs by walking parents */
do {
if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
- cnt += prog_list_length(&p->bpf.progs[atype]);
+ cnt += prog_list_length(&p->bpf.progs[atype], &preorder_cnt);
p = cgroup_parent(p);
} while (p);
@@ -248,20 +453,34 @@ static int compute_effective_progs(struct cgroup *cgrp,
/* populate the array with effective progs */
cnt = 0;
p = cgrp;
+ fstart = preorder_cnt;
+ bstart = preorder_cnt - 1;
do {
if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
continue;
- list_for_each_entry(pl, &p->bpf.progs[atype], node) {
+ init_bstart = bstart;
+ hlist_for_each_entry(pl, &p->bpf.progs[atype], node) {
if (!prog_list_prog(pl))
continue;
- item = &progs->items[cnt];
+ if (pl->flags & BPF_F_PREORDER) {
+ item = &progs->items[bstart];
+ bstart--;
+ } else {
+ item = &progs->items[fstart];
+ fstart++;
+ }
item->prog = prog_list_prog(pl);
bpf_cgroup_storages_assign(item->cgroup_storage,
pl->storage);
cnt++;
}
+
+ /* reverse pre-ordering progs at this cgroup level */
+ for (i = bstart + 1, j = init_bstart; i < j; i++, j--)
+ swap(progs->items[i], progs->items[j]);
+
} while ((p = cgroup_parent(p)));
*array = progs;
@@ -284,7 +503,7 @@ static void activate_effective_progs(struct cgroup *cgrp,
* cgroup_bpf_inherit() - inherit effective programs from parent
* @cgrp: the cgroup to modify
*/
-int cgroup_bpf_inherit(struct cgroup *cgrp)
+static int cgroup_bpf_inherit(struct cgroup *cgrp)
{
/* has to use marco instead of const int, since compiler thinks
* that array below is variable length
@@ -303,7 +522,7 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
cgroup_bpf_get(p);
for (i = 0; i < NR; i++)
- INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
+ INIT_HLIST_HEAD(&cgrp->bpf.progs[i]);
INIT_LIST_HEAD(&cgrp->bpf.storages);
@@ -327,6 +546,27 @@ cleanup:
return -ENOMEM;
}
+static int cgroup_bpf_lifetime_notify(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct cgroup *cgrp = data;
+ int ret = 0;
+
+ if (cgrp->root != &cgrp_dfl_root)
+ return NOTIFY_OK;
+
+ switch (action) {
+ case CGROUP_LIFETIME_ONLINE:
+ ret = cgroup_bpf_inherit(cgrp);
+ break;
+ case CGROUP_LIFETIME_OFFLINE:
+ cgroup_bpf_offline(cgrp);
+ break;
+ }
+
+ return notifier_from_errno(ret);
+}
+
static int update_effective_progs(struct cgroup *cgrp,
enum cgroup_bpf_attach_type atype)
{
@@ -379,7 +619,7 @@ cleanup:
#define BPF_CGROUP_MAX_PROGS 64
-static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
+static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
struct bpf_prog *prog,
struct bpf_cgroup_link *link,
struct bpf_prog *replace_prog,
@@ -389,12 +629,12 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
/* single-attach case */
if (!allow_multi) {
- if (list_empty(progs))
+ if (hlist_empty(progs))
return NULL;
- return list_first_entry(progs, typeof(*pl), node);
+ return hlist_entry(progs->first, typeof(*pl), node);
}
- list_for_each_entry(pl, progs, node) {
+ hlist_for_each_entry(pl, progs, node) {
if (prog && pl->prog == prog && prog != replace_prog)
/* disallow attaching the same prog twice */
return ERR_PTR(-EINVAL);
@@ -405,7 +645,7 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
/* direct prog multi-attach w/ replacement case */
if (replace_prog) {
- list_for_each_entry(pl, progs, node) {
+ hlist_for_each_entry(pl, progs, node) {
if (pl->prog == replace_prog)
/* a match found */
return pl;
@@ -417,6 +657,116 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
return NULL;
}
+static struct bpf_link *bpf_get_anchor_link(u32 flags, u32 id_or_fd)
+{
+ struct bpf_link *link = ERR_PTR(-EINVAL);
+
+ if (flags & BPF_F_ID)
+ link = bpf_link_by_id(id_or_fd);
+ else if (id_or_fd)
+ link = bpf_link_get_from_fd(id_or_fd);
+ return link;
+}
+
+static struct bpf_prog *bpf_get_anchor_prog(u32 flags, u32 id_or_fd)
+{
+ struct bpf_prog *prog = ERR_PTR(-EINVAL);
+
+ if (flags & BPF_F_ID)
+ prog = bpf_prog_by_id(id_or_fd);
+ else if (id_or_fd)
+ prog = bpf_prog_get(id_or_fd);
+ return prog;
+}
+
+static struct bpf_prog_list *get_prog_list(struct hlist_head *progs, struct bpf_prog *prog,
+ struct bpf_cgroup_link *link, u32 flags, u32 id_or_fd)
+{
+ bool is_link = flags & BPF_F_LINK, is_id = flags & BPF_F_ID;
+ struct bpf_prog_list *pltmp, *pl = ERR_PTR(-EINVAL);
+ bool preorder = flags & BPF_F_PREORDER;
+ struct bpf_link *anchor_link = NULL;
+ struct bpf_prog *anchor_prog = NULL;
+ bool is_before, is_after;
+
+ is_before = flags & BPF_F_BEFORE;
+ is_after = flags & BPF_F_AFTER;
+ if (is_link || is_id || id_or_fd) {
+ /* flags must have either BPF_F_BEFORE or BPF_F_AFTER */
+ if (is_before == is_after)
+ return ERR_PTR(-EINVAL);
+ if ((is_link && !link) || (!is_link && !prog))
+ return ERR_PTR(-EINVAL);
+ } else if (!hlist_empty(progs)) {
+ /* flags cannot have both BPF_F_BEFORE and BPF_F_AFTER */
+ if (is_before && is_after)
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (is_link) {
+ anchor_link = bpf_get_anchor_link(flags, id_or_fd);
+ if (IS_ERR(anchor_link))
+ return ERR_CAST(anchor_link);
+ } else if (is_id || id_or_fd) {
+ anchor_prog = bpf_get_anchor_prog(flags, id_or_fd);
+ if (IS_ERR(anchor_prog))
+ return ERR_CAST(anchor_prog);
+ }
+
+ if (!anchor_prog && !anchor_link) {
+ /* if there is no anchor_prog/anchor_link, then BPF_F_PREORDER
+ * doesn't matter since either prepend or append to a combined
+ * list of progs will end up with correct result.
+ */
+ hlist_for_each_entry(pltmp, progs, node) {
+ if (is_before)
+ return pltmp;
+ if (pltmp->node.next)
+ continue;
+ return pltmp;
+ }
+ return NULL;
+ }
+
+ hlist_for_each_entry(pltmp, progs, node) {
+ if ((anchor_prog && anchor_prog == pltmp->prog) ||
+ (anchor_link && anchor_link == &pltmp->link->link)) {
+ if (!!(pltmp->flags & BPF_F_PREORDER) != preorder)
+ goto out;
+ pl = pltmp;
+ goto out;
+ }
+ }
+
+ pl = ERR_PTR(-ENOENT);
+out:
+ if (anchor_link)
+ bpf_link_put(anchor_link);
+ else
+ bpf_prog_put(anchor_prog);
+ return pl;
+}
+
+static int insert_pl_to_hlist(struct bpf_prog_list *pl, struct hlist_head *progs,
+ struct bpf_prog *prog, struct bpf_cgroup_link *link,
+ u32 flags, u32 id_or_fd)
+{
+ struct bpf_prog_list *pltmp;
+
+ pltmp = get_prog_list(progs, prog, link, flags, id_or_fd);
+ if (IS_ERR(pltmp))
+ return PTR_ERR(pltmp);
+
+ if (!pltmp)
+ hlist_add_head(&pl->node, progs);
+ else if (flags & BPF_F_BEFORE)
+ hlist_add_before(&pl->node, &pltmp->node);
+ else
+ hlist_add_behind(&pl->node, &pltmp->node);
+
+ return 0;
+}
+
/**
* __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and
* propagate the change to descendants
@@ -426,6 +776,8 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
* @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set
* @type: Type of attach operation
* @flags: Option flags
+ * @id_or_fd: Relative prog id or fd
+ * @revision: bpf_prog_list revision
*
* Exactly one of @prog or @link can be non-null.
* Must be called with cgroup_mutex held.
@@ -433,21 +785,26 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
static int __cgroup_bpf_attach(struct cgroup *cgrp,
struct bpf_prog *prog, struct bpf_prog *replace_prog,
struct bpf_cgroup_link *link,
- enum bpf_attach_type type, u32 flags)
+ enum bpf_attach_type type, u32 flags, u32 id_or_fd,
+ u64 revision)
{
u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
struct bpf_prog *old_prog = NULL;
struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
+ struct bpf_prog *new_prog = prog ? : link->link.prog;
enum cgroup_bpf_attach_type atype;
struct bpf_prog_list *pl;
- struct list_head *progs;
+ struct hlist_head *progs;
int err;
if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI)))
/* invalid combination */
return -EINVAL;
+ if ((flags & BPF_F_REPLACE) && (flags & (BPF_F_BEFORE | BPF_F_AFTER)))
+ /* only either replace or insertion with before/after */
+ return -EINVAL;
if (link && (prog || replace_prog))
/* only either link or prog/replace_prog can be specified */
return -EINVAL;
@@ -455,23 +812,25 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
/* replace_prog implies BPF_F_REPLACE, and vice versa */
return -EINVAL;
- atype = to_cgroup_bpf_attach_type(type);
+ atype = bpf_cgroup_atype_find(type, new_prog->aux->attach_btf_id);
if (atype < 0)
return -EINVAL;
+ if (revision && revision != cgrp->bpf.revisions[atype])
+ return -ESTALE;
progs = &cgrp->bpf.progs[atype];
if (!hierarchy_allows_attach(cgrp, atype))
return -EPERM;
- if (!list_empty(progs) && cgrp->bpf.flags[atype] != saved_flags)
+ if (!hlist_empty(progs) && cgrp->bpf.flags[atype] != saved_flags)
/* Disallow attaching non-overridable on top
* of existing overridable in this cgroup.
* Disallow attaching multi-prog if overridable or none
*/
return -EPERM;
- if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
+ if (prog_list_length(progs, NULL) >= BPF_CGROUP_MAX_PROGS)
return -E2BIG;
pl = find_attach_entry(progs, prog, link, replace_prog,
@@ -491,25 +850,46 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
bpf_cgroup_storages_free(new_storage);
return -ENOMEM;
}
- list_add_tail(&pl->node, progs);
+
+ err = insert_pl_to_hlist(pl, progs, prog, link, flags, id_or_fd);
+ if (err) {
+ kfree(pl);
+ bpf_cgroup_storages_free(new_storage);
+ return err;
+ }
}
pl->prog = prog;
pl->link = link;
+ pl->flags = flags;
bpf_cgroup_storages_assign(pl->storage, storage);
cgrp->bpf.flags[atype] = saved_flags;
+ if (type == BPF_LSM_CGROUP) {
+ err = bpf_trampoline_link_cgroup_shim(new_prog, atype, type);
+ if (err)
+ goto cleanup;
+ }
+
err = update_effective_progs(cgrp, atype);
if (err)
- goto cleanup;
+ goto cleanup_trampoline;
- if (old_prog)
+ cgrp->bpf.revisions[atype] += 1;
+ if (old_prog) {
+ if (type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(old_prog);
bpf_prog_put(old_prog);
- else
+ } else {
static_branch_inc(&cgroup_bpf_enabled_key[atype]);
+ }
bpf_cgroup_storages_link(new_storage, cgrp, type);
return 0;
+cleanup_trampoline:
+ if (type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(new_prog);
+
cleanup:
if (old_prog) {
pl->prog = old_prog;
@@ -517,7 +897,7 @@ cleanup:
}
bpf_cgroup_storages_free(new_storage);
if (!old_prog) {
- list_del(&pl->node);
+ hlist_del(&pl->node);
kfree(pl);
}
return err;
@@ -527,13 +907,14 @@ static int cgroup_bpf_attach(struct cgroup *cgrp,
struct bpf_prog *prog, struct bpf_prog *replace_prog,
struct bpf_cgroup_link *link,
enum bpf_attach_type type,
- u32 flags)
+ u32 flags, u32 id_or_fd, u64 revision)
{
int ret;
- mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
- mutex_unlock(&cgroup_mutex);
+ cgroup_lock();
+ ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags,
+ id_or_fd, revision);
+ cgroup_unlock();
return ret;
}
@@ -548,7 +929,7 @@ static void replace_effective_prog(struct cgroup *cgrp,
struct cgroup_subsys_state *css;
struct bpf_prog_array *progs;
struct bpf_prog_list *pl;
- struct list_head *head;
+ struct hlist_head *head;
struct cgroup *cg;
int pos;
@@ -564,7 +945,7 @@ static void replace_effective_prog(struct cgroup *cgrp,
continue;
head = &cg->bpf.progs[atype];
- list_for_each_entry(pl, head, node) {
+ hlist_for_each_entry(pl, head, node) {
if (!prog_list_prog(pl))
continue;
if (pl->link == link)
@@ -587,7 +968,8 @@ found:
* to descendants
* @cgrp: The cgroup which descendants to traverse
* @link: A link for which to replace BPF program
- * @type: Type of attach operation
+ * @new_prog: &struct bpf_prog for the target BPF program with its refcnt
+ * incremented
*
* Must be called with cgroup_mutex held.
*/
@@ -598,10 +980,10 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
enum cgroup_bpf_attach_type atype;
struct bpf_prog *old_prog;
struct bpf_prog_list *pl;
- struct list_head *progs;
+ struct hlist_head *progs;
bool found = false;
- atype = to_cgroup_bpf_attach_type(link->type);
+ atype = bpf_cgroup_atype_find(link->link.attach_type, new_prog->aux->attach_btf_id);
if (atype < 0)
return -EINVAL;
@@ -610,7 +992,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
if (link->link.prog->type != new_prog->type)
return -EINVAL;
- list_for_each_entry(pl, progs, node) {
+ hlist_for_each_entry(pl, progs, node) {
if (pl->link == link) {
found = true;
break;
@@ -619,6 +1001,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
if (!found)
return -ENOENT;
+ cgrp->bpf.revisions[atype] += 1;
old_prog = xchg(&link->link.prog, new_prog);
replace_effective_prog(cgrp, atype, link);
bpf_prog_put(old_prog);
@@ -633,7 +1016,7 @@ static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,
cg_link = container_of(link, struct bpf_cgroup_link, link);
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
/* link might have been auto-released by dying cgroup, so fail */
if (!cg_link->cgroup) {
ret = -ENOLINK;
@@ -645,11 +1028,11 @@ static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,
}
ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog);
out_unlock:
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
return ret;
}
-static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
+static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs,
struct bpf_prog *prog,
struct bpf_cgroup_link *link,
bool allow_multi)
@@ -657,14 +1040,14 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
struct bpf_prog_list *pl;
if (!allow_multi) {
- if (list_empty(progs))
+ if (hlist_empty(progs))
/* report error when trying to detach and nothing is attached */
return ERR_PTR(-ENOENT);
/* to maintain backward compatibility NONE and OVERRIDE cgroups
* allow detaching with invalid FD (prog==NULL) in legacy mode
*/
- return list_first_entry(progs, typeof(*pl), node);
+ return hlist_entry(progs->first, typeof(*pl), node);
}
if (!prog && !link)
@@ -674,7 +1057,7 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
return ERR_PTR(-EINVAL);
/* find the prog or link and detach it */
- list_for_each_entry(pl, progs, node) {
+ hlist_for_each_entry(pl, progs, node) {
if (pl->prog == prog && pl->link == link)
return pl;
}
@@ -682,30 +1065,96 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
}
/**
+ * purge_effective_progs() - After compute_effective_progs fails to alloc new
+ * cgrp->bpf.inactive table we can recover by
+ * recomputing the array in place.
+ *
+ * @cgrp: The cgroup which descendants to travers
+ * @prog: A program to detach or NULL
+ * @link: A link to detach or NULL
+ * @atype: Type of detach operation
+ */
+static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
+ struct bpf_cgroup_link *link,
+ enum cgroup_bpf_attach_type atype)
+{
+ struct cgroup_subsys_state *css;
+ struct bpf_prog_array *progs;
+ struct bpf_prog_list *pl;
+ struct hlist_head *head;
+ struct cgroup *cg;
+ int pos;
+
+ /* recompute effective prog array in place */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ if (percpu_ref_is_zero(&desc->bpf.refcnt))
+ continue;
+
+ /* find position of link or prog in effective progs array */
+ for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
+ if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
+ continue;
+
+ head = &cg->bpf.progs[atype];
+ hlist_for_each_entry(pl, head, node) {
+ if (!prog_list_prog(pl))
+ continue;
+ if (pl->prog == prog && pl->link == link)
+ goto found;
+ pos++;
+ }
+ }
+
+ /* no link or prog match, skip the cgroup of this layer */
+ continue;
+found:
+ progs = rcu_dereference_protected(
+ desc->bpf.effective[atype],
+ lockdep_is_held(&cgroup_mutex));
+
+ /* Remove the program from the array */
+ WARN_ONCE(bpf_prog_array_delete_safe_at(progs, pos),
+ "Failed to purge a prog from array at index %d", pos);
+ }
+}
+
+/**
* __cgroup_bpf_detach() - Detach the program or link from a cgroup, and
* propagate the change to descendants
* @cgrp: The cgroup which descendants to traverse
* @prog: A program to detach or NULL
* @link: A link to detach or NULL
* @type: Type of detach operation
+ * @revision: bpf_prog_list revision
*
* At most one of @prog or @link can be non-NULL.
* Must be called with cgroup_mutex held.
*/
static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
- struct bpf_cgroup_link *link, enum bpf_attach_type type)
+ struct bpf_cgroup_link *link, enum bpf_attach_type type,
+ u64 revision)
{
enum cgroup_bpf_attach_type atype;
struct bpf_prog *old_prog;
struct bpf_prog_list *pl;
- struct list_head *progs;
+ struct hlist_head *progs;
+ u32 attach_btf_id = 0;
u32 flags;
- int err;
- atype = to_cgroup_bpf_attach_type(type);
+ if (prog)
+ attach_btf_id = prog->aux->attach_btf_id;
+ if (link)
+ attach_btf_id = link->link.prog->aux->attach_btf_id;
+
+ atype = bpf_cgroup_atype_find(type, attach_btf_id);
if (atype < 0)
return -EINVAL;
+ if (revision && revision != cgrp->bpf.revisions[atype])
+ return -ESTALE;
+
progs = &cgrp->bpf.progs[atype];
flags = cgrp->bpf.flags[atype];
@@ -722,36 +1171,38 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
pl->prog = NULL;
pl->link = NULL;
- err = update_effective_progs(cgrp, atype);
- if (err)
- goto cleanup;
+ if (update_effective_progs(cgrp, atype)) {
+ /* if update effective array failed replace the prog with a dummy prog*/
+ pl->prog = old_prog;
+ pl->link = link;
+ purge_effective_progs(cgrp, old_prog, link, atype);
+ }
/* now can actually delete it from this cgroup list */
- list_del(&pl->node);
+ hlist_del(&pl->node);
+ cgrp->bpf.revisions[atype] += 1;
+
kfree(pl);
- if (list_empty(progs))
+ if (hlist_empty(progs))
/* last program was detached, reset flags to zero */
cgrp->bpf.flags[atype] = 0;
- if (old_prog)
+ if (old_prog) {
+ if (type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(old_prog);
bpf_prog_put(old_prog);
+ }
static_branch_dec(&cgroup_bpf_enabled_key[atype]);
return 0;
-
-cleanup:
- /* restore back prog or link */
- pl->prog = old_prog;
- pl->link = link;
- return err;
}
static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
- enum bpf_attach_type type)
+ enum bpf_attach_type type, u64 revision)
{
int ret;
- mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
- mutex_unlock(&cgroup_mutex);
+ cgroup_lock();
+ ret = __cgroup_bpf_detach(cgrp, prog, NULL, type, revision);
+ cgroup_unlock();
return ret;
}
@@ -759,57 +1210,103 @@ static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
+ __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
+ bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE;
__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
enum bpf_attach_type type = attr->query.attach_type;
+ enum cgroup_bpf_attach_type from_atype, to_atype;
enum cgroup_bpf_attach_type atype;
struct bpf_prog_array *effective;
- struct list_head *progs;
- struct bpf_prog *prog;
int cnt, ret = 0, i;
+ int total_cnt = 0;
+ u64 revision = 0;
u32 flags;
- atype = to_cgroup_bpf_attach_type(type);
- if (atype < 0)
+ if (effective_query && prog_attach_flags)
return -EINVAL;
- progs = &cgrp->bpf.progs[atype];
- flags = cgrp->bpf.flags[atype];
+ if (type == BPF_LSM_CGROUP) {
+ if (!effective_query && attr->query.prog_cnt &&
+ prog_ids && !prog_attach_flags)
+ return -EINVAL;
- effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
- lockdep_is_held(&cgroup_mutex));
+ from_atype = CGROUP_LSM_START;
+ to_atype = CGROUP_LSM_END;
+ flags = 0;
+ } else {
+ from_atype = to_cgroup_bpf_attach_type(type);
+ if (from_atype < 0)
+ return -EINVAL;
+ to_atype = from_atype;
+ flags = cgrp->bpf.flags[from_atype];
+ }
- if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
- cnt = bpf_prog_array_length(effective);
- else
- cnt = prog_list_length(progs);
+ for (atype = from_atype; atype <= to_atype; atype++) {
+ if (effective_query) {
+ effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
+ lockdep_is_held(&cgroup_mutex));
+ total_cnt += bpf_prog_array_length(effective);
+ } else {
+ total_cnt += prog_list_length(&cgrp->bpf.progs[atype], NULL);
+ }
+ }
+ /* always output uattr->query.attach_flags as 0 during effective query */
+ flags = effective_query ? 0 : flags;
if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
return -EFAULT;
- if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt)))
+ if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt)))
+ return -EFAULT;
+ if (!effective_query && from_atype == to_atype)
+ revision = cgrp->bpf.revisions[from_atype];
+ if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision)))
return -EFAULT;
- if (attr->query.prog_cnt == 0 || !prog_ids || !cnt)
+ if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt)
/* return early if user requested only program count + flags */
return 0;
- if (attr->query.prog_cnt < cnt) {
- cnt = attr->query.prog_cnt;
+
+ if (attr->query.prog_cnt < total_cnt) {
+ total_cnt = attr->query.prog_cnt;
ret = -ENOSPC;
}
- if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
- return bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
- } else {
- struct bpf_prog_list *pl;
- u32 id;
-
- i = 0;
- list_for_each_entry(pl, progs, node) {
- prog = prog_list_prog(pl);
- id = prog->aux->id;
- if (copy_to_user(prog_ids + i, &id, sizeof(id)))
- return -EFAULT;
- if (++i == cnt)
- break;
+ for (atype = from_atype; atype <= to_atype && total_cnt; atype++) {
+ if (effective_query) {
+ effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
+ lockdep_is_held(&cgroup_mutex));
+ cnt = min_t(int, bpf_prog_array_length(effective), total_cnt);
+ ret = bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
+ } else {
+ struct hlist_head *progs;
+ struct bpf_prog_list *pl;
+ struct bpf_prog *prog;
+ u32 id;
+
+ progs = &cgrp->bpf.progs[atype];
+ cnt = min_t(int, prog_list_length(progs, NULL), total_cnt);
+ i = 0;
+ hlist_for_each_entry(pl, progs, node) {
+ prog = prog_list_prog(pl);
+ id = prog->aux->id;
+ if (copy_to_user(prog_ids + i, &id, sizeof(id)))
+ return -EFAULT;
+ if (++i == cnt)
+ break;
+ }
+
+ if (prog_attach_flags) {
+ flags = cgrp->bpf.flags[atype];
+
+ for (i = 0; i < cnt; i++)
+ if (copy_to_user(prog_attach_flags + i,
+ &flags, sizeof(flags)))
+ return -EFAULT;
+ prog_attach_flags += cnt;
+ }
}
+
+ prog_ids += cnt;
+ total_cnt -= cnt;
}
return ret;
}
@@ -819,9 +1316,9 @@ static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
{
int ret;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
ret = __cgroup_bpf_query(cgrp, attr, uattr);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
return ret;
}
@@ -846,7 +1343,8 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr,
}
ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL,
- attr->attach_type, attr->attach_flags);
+ attr->attach_type, attr->attach_flags,
+ attr->relative_fd, attr->expected_revision);
if (replace_prog)
bpf_prog_put(replace_prog);
@@ -868,7 +1366,7 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
if (IS_ERR(prog))
prog = NULL;
- ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type);
+ ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, attr->expected_revision);
if (prog)
bpf_prog_put(prog);
@@ -888,21 +1386,23 @@ static void bpf_cgroup_link_release(struct bpf_link *link)
if (!cg_link->cgroup)
return;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
/* re-check cgroup under lock again */
if (!cg_link->cgroup) {
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
return;
}
WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link,
- cg_link->type));
+ link->attach_type, 0));
+ if (link->attach_type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog);
cg = cg_link->cgroup;
cg_link->cgroup = NULL;
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
cgroup_put(cg);
}
@@ -929,16 +1429,16 @@ static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link,
container_of(link, struct bpf_cgroup_link, link);
u64 cg_id = 0;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
if (cg_link->cgroup)
cg_id = cgroup_id(cg_link->cgroup);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
seq_printf(seq,
"cgroup_id:\t%llu\n"
"attach_type:\t%d\n",
cg_id,
- cg_link->type);
+ link->attach_type);
}
static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link,
@@ -948,13 +1448,13 @@ static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link,
container_of(link, struct bpf_cgroup_link, link);
u64 cg_id = 0;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
if (cg_link->cgroup)
cg_id = cgroup_id(cg_link->cgroup);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
info->cgroup.cgroup_id = cg_id;
- info->cgroup.attach_type = cg_link->type;
+ info->cgroup.attach_type = link->attach_type;
return 0;
}
@@ -967,6 +1467,13 @@ static const struct bpf_link_ops bpf_cgroup_link_lops = {
.fill_link_info = bpf_cgroup_link_fill_link_info,
};
+#define BPF_F_LINK_ATTACH_MASK \
+ (BPF_F_ID | \
+ BPF_F_BEFORE | \
+ BPF_F_AFTER | \
+ BPF_F_PREORDER | \
+ BPF_F_LINK)
+
int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
struct bpf_link_primer link_primer;
@@ -974,7 +1481,7 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
struct cgroup *cgrp;
int err;
- if (attr->link_create.flags)
+ if (attr->link_create.flags & (~BPF_F_LINK_ATTACH_MASK))
return -EINVAL;
cgrp = cgroup_get_from_fd(attr->link_create.target_fd);
@@ -987,9 +1494,8 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
goto out_put_cgroup;
}
bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops,
- prog);
+ prog, attr->link_create.attach_type);
link->cgroup = cgrp;
- link->type = attr->link_create.attach_type;
err = bpf_link_prime(&link->link, &link_primer);
if (err) {
@@ -998,7 +1504,9 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
}
err = cgroup_bpf_attach(cgrp, NULL, NULL, link,
- link->type, BPF_F_ALLOW_MULTI);
+ link->link.attach_type, BPF_F_ALLOW_MULTI | attr->link_create.flags,
+ attr->link_create.cgroup.relative_fd,
+ attr->link_create.cgroup.expected_revision);
if (err) {
bpf_link_cleanup(&link_primer);
goto out_put_cgroup;
@@ -1031,7 +1539,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
* __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
* @sk: The socket sending or receiving traffic
* @skb: The skb that is being sent or received
- * @type: The type of program to be exectuted
+ * @atype: The type of program to be executed
*
* If no socket is passed, or the socket is not of type INET or INET6,
* this function does nothing and returns 0.
@@ -1044,7 +1552,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
* NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr
* NET_XMIT_CN (2) - continue with packet output and notify TCP
* to call cwr
- * -EPERM - drop packet
+ * -err - drop packet
*
* For ingress packets, this function will return -EPERM if any
* attached program was found and if it returned != 1 during execution.
@@ -1054,15 +1562,12 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
struct sk_buff *skb,
enum cgroup_bpf_attach_type atype)
{
- unsigned int offset = skb->data - skb_network_header(skb);
+ unsigned int offset = -skb_network_offset(skb);
struct sock *save_sk;
void *saved_data_end;
struct cgroup *cgrp;
int ret;
- if (!sk || !sk_fullsock(sk))
- return 0;
-
if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
return 0;
@@ -1075,12 +1580,40 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
bpf_compute_and_save_data_end(skb, &saved_data_end);
if (atype == CGROUP_INET_EGRESS) {
- ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(
- cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb);
+ u32 flags = 0;
+ bool cn;
+
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, skb,
+ __bpf_prog_run_save_cb, 0, &flags);
+
+ /* Return values of CGROUP EGRESS BPF programs are:
+ * 0: drop packet
+ * 1: keep packet
+ * 2: drop packet and cn
+ * 3: keep packet and cn
+ *
+ * The returned value is then converted to one of the NET_XMIT
+ * or an error code that is then interpreted as drop packet
+ * (and no cn):
+ * 0: NET_XMIT_SUCCESS skb should be transmitted
+ * 1: NET_XMIT_DROP skb should be dropped and cn
+ * 2: NET_XMIT_CN skb should be transmitted and cn
+ * 3: -err skb should be dropped
+ */
+
+ cn = flags & BPF_RET_SET_CN;
+ if (ret && !IS_ERR_VALUE((long)ret))
+ ret = -EFAULT;
+ if (!ret)
+ ret = (cn ? NET_XMIT_CN : NET_XMIT_SUCCESS);
+ else
+ ret = (cn ? NET_XMIT_DROP : ret);
} else {
- ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb,
- __bpf_prog_run_save_cb);
- ret = (ret == 1 ? 0 : -EPERM);
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, atype,
+ skb, __bpf_prog_run_save_cb, 0,
+ NULL);
+ if (ret && !IS_ERR_VALUE((long)ret))
+ ret = -EFAULT;
}
bpf_restore_data_end(skb, saved_data_end);
__skb_pull(skb, offset);
@@ -1093,7 +1626,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
/**
* __cgroup_bpf_run_filter_sk() - Run a program on a sock
* @sk: sock structure to manipulate
- * @type: The type of program to be exectuted
+ * @atype: The type of program to be executed
*
* socket is passed is expected to be of type INET or INET6.
*
@@ -1107,10 +1640,9 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
enum cgroup_bpf_attach_type atype)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- int ret;
- ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk, bpf_prog_run);
- return ret == 1 ? 0 : -EPERM;
+ return bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0,
+ NULL);
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
@@ -1119,18 +1651,22 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
* provided by user sockaddr
* @sk: sock struct that will use sockaddr
* @uaddr: sockaddr struct provided by user
- * @type: The type of program to be exectuted
+ * @uaddrlen: Pointer to the size of the sockaddr struct provided by user. It is
+ * read-only for AF_INET[6] uaddr but can be modified for AF_UNIX
+ * uaddr.
+ * @atype: The type of program to be executed
* @t_ctx: Pointer to attach type specific context
* @flags: Pointer to u32 which contains higher bits of BPF program
* return value (OR'ed together).
*
- * socket is expected to be of type INET or INET6.
+ * socket is expected to be of type INET, INET6 or UNIX.
*
* This function will return %-EPERM if an attached program is found and
* returned value != 1 during execution. In all other cases, 0 is returned.
*/
int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
- struct sockaddr *uaddr,
+ struct sockaddr_unsized *uaddr,
+ int *uaddrlen,
enum cgroup_bpf_attach_type atype,
void *t_ctx,
u32 *flags)
@@ -1140,26 +1676,33 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
.uaddr = uaddr,
.t_ctx = t_ctx,
};
- struct sockaddr_storage unspec;
+ struct sockaddr_storage storage;
struct cgroup *cgrp;
int ret;
/* Check socket family since not all sockets represent network
* endpoint (e.g. AF_UNIX).
*/
- if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
+ if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6 &&
+ sk->sk_family != AF_UNIX)
return 0;
if (!ctx.uaddr) {
- memset(&unspec, 0, sizeof(unspec));
- ctx.uaddr = (struct sockaddr *)&unspec;
+ memset(&storage, 0, sizeof(storage));
+ ctx.uaddr = (struct sockaddr_unsized *)&storage;
+ ctx.uaddrlen = 0;
+ } else {
+ ctx.uaddrlen = *uaddrlen;
}
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx,
- bpf_prog_run, flags);
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run,
+ 0, flags);
- return ret == 1 ? 0 : -EPERM;
+ if (!ret && uaddr)
+ *uaddrlen = ctx.uaddrlen;
+
+ return ret;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
@@ -1169,7 +1712,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
* @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
* sk with connection information (IP addresses, etc.) May not contain
* cgroup info if it is a req sock.
- * @type: The type of program to be exectuted
+ * @atype: The type of program to be executed
*
* socket passed is expected to be of type INET or INET6.
*
@@ -1184,11 +1727,9 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
enum cgroup_bpf_attach_type atype)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- int ret;
- ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops,
- bpf_prog_run);
- return ret == 1 ? 0 : -EPERM;
+ return bpf_prog_run_array_cg(&cgrp->bpf, atype, sock_ops, bpf_prog_run,
+ 0, NULL);
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
@@ -1201,40 +1742,95 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
.major = major,
.minor = minor,
};
- int allow;
+ int ret;
rcu_read_lock();
cgrp = task_dfl_cgroup(current);
- allow = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx,
- bpf_prog_run);
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0,
+ NULL);
rcu_read_unlock();
- return !allow;
+ return ret;
+}
+
+BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
+{
+ /* flags argument is not used now,
+ * but provides an ability to extend the API.
+ * verifier checks that its value is correct.
+ */
+ enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
+ struct bpf_cgroup_storage *storage;
+ struct bpf_cg_run_ctx *ctx;
+ void *ptr;
+
+ /* get current cgroup storage from BPF run context */
+ ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
+ storage = ctx->prog_item->cgroup_storage[stype];
+
+ if (stype == BPF_CGROUP_STORAGE_SHARED)
+ ptr = &READ_ONCE(storage->buf)->data[0];
+ else
+ ptr = this_cpu_ptr(storage->percpu_buf);
+
+ return (unsigned long)ptr;
}
+const struct bpf_func_proto bpf_get_local_storage_proto = {
+ .func = bpf_get_local_storage,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_0(bpf_get_retval)
+{
+ struct bpf_cg_run_ctx *ctx =
+ container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
+
+ return ctx->retval;
+}
+
+const struct bpf_func_proto bpf_get_retval_proto = {
+ .func = bpf_get_retval,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+BPF_CALL_1(bpf_set_retval, int, retval)
+{
+ struct bpf_cg_run_ctx *ctx =
+ container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
+
+ ctx->retval = retval;
+ return 0;
+}
+
+const struct bpf_func_proto bpf_set_retval_proto = {
+ .func = bpf_set_retval,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+};
+
static const struct bpf_func_proto *
-cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
- case BPF_FUNC_get_current_uid_gid:
- return &bpf_get_current_uid_gid_proto;
- case BPF_FUNC_get_local_storage:
- return &bpf_get_local_storage_proto;
- case BPF_FUNC_get_current_cgroup_id:
- return &bpf_get_current_cgroup_id_proto;
case BPF_FUNC_perf_event_output:
return &bpf_event_output_data_proto;
default:
- return bpf_base_func_proto(func_id);
+ return bpf_base_func_proto(func_id, prog);
}
}
-static const struct bpf_func_proto *
-cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
-{
- return cgroup_base_func_proto(func_id, prog);
-}
-
static bool cgroup_dev_is_valid_access(int off, int size,
enum bpf_access_type type,
const struct bpf_prog *prog,
@@ -1286,7 +1882,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = {
* @ppos: value-result argument: value is position at which read from or write
* to sysctl is happening, result is new position if program overrode it,
* initial value otherwise
- * @type: type of program to be executed
+ * @atype: type of program to be executed
*
* Program is run when sysctl is being accessed, either read or written, and
* can allow or deny such access.
@@ -1295,7 +1891,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = {
* returned value != 1 during execution. In all other cases 0 is returned.
*/
int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
- struct ctl_table *table, int write,
+ const struct ctl_table *table, int write,
char **buf, size_t *pcount, loff_t *ppos,
enum cgroup_bpf_attach_type atype)
{
@@ -1337,7 +1933,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
rcu_read_lock();
cgrp = task_dfl_cgroup(current);
- ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, bpf_prog_run);
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0,
+ NULL);
rcu_read_unlock();
kfree(ctx.cur_val);
@@ -1350,24 +1947,10 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
kfree(ctx.new_val);
}
- return ret == 1 ? 0 : -EPERM;
+ return ret;
}
#ifdef CONFIG_NET
-static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
- enum cgroup_bpf_attach_type attach_type)
-{
- struct bpf_prog_array *prog_array;
- bool empty;
-
- rcu_read_lock();
- prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]);
- empty = bpf_prog_array_is_empty(prog_array);
- rcu_read_unlock();
-
- return empty;
-}
-
static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
struct bpf_sockopt_buf *buf)
{
@@ -1414,7 +1997,7 @@ static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
}
int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
- int *optname, char __user *optval,
+ int *optname, sockptr_t optval,
int *optlen, char **kernel_optval)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
@@ -1426,45 +2009,42 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
};
int ret, max_optlen;
- /* Opportunistic check to see whether we have any BPF program
- * attached to the hook so we don't waste time allocating
- * memory and locking the socket.
- */
- if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_SETSOCKOPT))
- return 0;
-
/* Allocate a bit more than the initial user buffer for
* BPF program. The canonical use case is overriding
* TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
*/
max_optlen = max_t(int, 16, *optlen);
-
max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
if (max_optlen < 0)
return max_optlen;
ctx.optlen = *optlen;
- if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
+ if (copy_from_sockptr(ctx.optval, optval,
+ min(*optlen, max_optlen))) {
ret = -EFAULT;
goto out;
}
lock_sock(sk);
- ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_SETSOCKOPT],
- &ctx, bpf_prog_run);
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_SETSOCKOPT,
+ &ctx, bpf_prog_run, 0, NULL);
release_sock(sk);
- if (!ret) {
- ret = -EPERM;
+ if (ret)
goto out;
- }
if (ctx.optlen == -1) {
/* optlen set to -1, bypass kernel */
ret = 1;
} else if (ctx.optlen > max_optlen || ctx.optlen < -1) {
/* optlen is out of bounds */
+ if (*optlen > PAGE_SIZE && ctx.optlen >= 0) {
+ pr_info_once("bpf setsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
+ ctx.optlen, max_optlen);
+ ret = 0;
+ goto out;
+ }
ret = -EFAULT;
} else {
/* optlen within bounds, run kernel handler */
@@ -1508,8 +2088,8 @@ out:
}
int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
- int optname, char __user *optval,
- int __user *optlen, int max_optlen,
+ int optname, sockptr_t optval,
+ sockptr_t optlen, int max_optlen,
int retval)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
@@ -1518,19 +2098,13 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
.sk = sk,
.level = level,
.optname = optname,
- .retval = retval,
+ .current_task = current,
};
+ int orig_optlen;
int ret;
- /* Opportunistic check to see whether we have any BPF program
- * attached to the hook so we don't waste time allocating
- * memory and locking the socket.
- */
- if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_GETSOCKOPT))
- return retval;
-
+ orig_optlen = max_optlen;
ctx.optlen = max_optlen;
-
max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
if (max_optlen < 0)
return max_optlen;
@@ -1542,8 +2116,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
* one that kernel returned as well to let
* BPF programs inspect the value.
*/
-
- if (get_user(ctx.optlen, optlen)) {
+ if (copy_from_sockptr(&ctx.optlen, optlen,
+ sizeof(ctx.optlen))) {
ret = -EFAULT;
goto out;
}
@@ -1552,47 +2126,47 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
ret = -EFAULT;
goto out;
}
+ orig_optlen = ctx.optlen;
- if (copy_from_user(ctx.optval, optval,
- min(ctx.optlen, max_optlen)) != 0) {
+ if (copy_from_sockptr(ctx.optval, optval,
+ min(ctx.optlen, max_optlen))) {
ret = -EFAULT;
goto out;
}
}
lock_sock(sk);
- ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT],
- &ctx, bpf_prog_run);
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
+ &ctx, bpf_prog_run, retval, NULL);
release_sock(sk);
- if (!ret) {
- ret = -EPERM;
- goto out;
- }
-
- if (ctx.optlen > max_optlen || ctx.optlen < 0) {
- ret = -EFAULT;
+ if (ret < 0)
goto out;
- }
- /* BPF programs only allowed to set retval to 0, not some
- * arbitrary value.
- */
- if (ctx.retval != 0 && ctx.retval != retval) {
+ if (!sockptr_is_null(optval) &&
+ (ctx.optlen > max_optlen || ctx.optlen < 0)) {
+ if (orig_optlen > PAGE_SIZE && ctx.optlen >= 0) {
+ pr_info_once("bpf getsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
+ ctx.optlen, max_optlen);
+ ret = retval;
+ goto out;
+ }
ret = -EFAULT;
goto out;
}
if (ctx.optlen != 0) {
- if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
- put_user(ctx.optlen, optlen)) {
+ if (!sockptr_is_null(optval) &&
+ copy_to_sockptr(optval, ctx.optval, ctx.optlen)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ if (copy_to_sockptr(optlen, &ctx.optlen, sizeof(ctx.optlen))) {
ret = -EFAULT;
goto out;
}
}
- ret = ctx.retval;
-
out:
sockopt_free_buf(&ctx, &buf);
return ret;
@@ -1607,10 +2181,10 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
.sk = sk,
.level = level,
.optname = optname,
- .retval = retval,
.optlen = *optlen,
.optval = optval,
.optval_end = optval + *optlen,
+ .current_task = current,
};
int ret;
@@ -1622,26 +2196,20 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
* be called if that data shouldn't be "exported".
*/
- ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT],
- &ctx, bpf_prog_run);
- if (!ret)
- return -EPERM;
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
+ &ctx, bpf_prog_run, retval, NULL);
+ if (ret < 0)
+ return ret;
if (ctx.optlen > *optlen)
return -EFAULT;
- /* BPF programs only allowed to set retval to 0, not some
- * arbitrary value.
- */
- if (ctx.retval != 0 && ctx.retval != retval)
- return -EFAULT;
-
/* BPF programs can shrink the buffer, export the modifications.
*/
if (ctx.optlen != 0)
*optlen = ctx.optlen;
- return ctx.retval;
+ return ret;
}
#endif
@@ -1702,7 +2270,7 @@ static const struct bpf_func_proto bpf_sysctl_get_name_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE,
.arg3_type = ARG_CONST_SIZE,
.arg4_type = ARG_ANYTHING,
};
@@ -1796,11 +2364,13 @@ static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
static const struct bpf_func_proto *
sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
- case BPF_FUNC_strtol:
- return &bpf_strtol_proto;
- case BPF_FUNC_strtoul:
- return &bpf_strtoul_proto;
case BPF_FUNC_sysctl_get_name:
return &bpf_sysctl_get_name_proto;
case BPF_FUNC_sysctl_get_current_value:
@@ -1811,8 +2381,10 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sysctl_set_new_value_proto;
case BPF_FUNC_ktime_get_coarse_ns:
return &bpf_ktime_get_coarse_ns_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
default:
- return cgroup_base_func_proto(func_id, prog);
+ return bpf_base_func_proto(func_id, prog);
}
}
@@ -1880,10 +2452,12 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
treg, si->dst_reg,
offsetof(struct bpf_sysctl_kern, ppos));
- *insn++ = BPF_STX_MEM(
- BPF_SIZEOF(u32), treg, si->src_reg,
+ *insn++ = BPF_RAW_INSN(
+ BPF_CLASS(si->code) | BPF_MEM | BPF_SIZEOF(u32),
+ treg, si->src_reg,
bpf_ctx_narrow_access_offset(
- 0, sizeof(u32), sizeof(loff_t)));
+ 0, sizeof(u32), sizeof(loff_t)),
+ si->imm);
*insn++ = BPF_LDX_MEM(
BPF_DW, treg, si->dst_reg,
offsetof(struct bpf_sysctl_kern, tmp_reg));
@@ -1933,6 +2507,12 @@ static const struct bpf_func_proto bpf_get_netns_cookie_sockopt_proto = {
static const struct bpf_func_proto *
cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
#ifdef CONFIG_NET
case BPF_FUNC_get_netns_cookie:
@@ -1954,8 +2534,10 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_tcp_sock:
return &bpf_tcp_sock_proto;
#endif
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
default:
- return cgroup_base_func_proto(func_id, prog);
+ return bpf_base_func_proto(func_id, prog);
}
}
@@ -1994,22 +2576,22 @@ static bool cg_sockopt_is_valid_access(int off, int size,
}
switch (off) {
- case offsetof(struct bpf_sockopt, sk):
+ case bpf_ctx_range_ptr(struct bpf_sockopt, sk):
if (size != sizeof(__u64))
return false;
info->reg_type = PTR_TO_SOCKET;
break;
- case offsetof(struct bpf_sockopt, optval):
+ case bpf_ctx_range_ptr(struct bpf_sockopt, optval):
if (size != sizeof(__u64))
return false;
info->reg_type = PTR_TO_PACKET;
break;
- case offsetof(struct bpf_sockopt, optval_end):
+ case bpf_ctx_range_ptr(struct bpf_sockopt, optval_end):
if (size != sizeof(__u64))
return false;
info->reg_type = PTR_TO_PACKET_END;
break;
- case offsetof(struct bpf_sockopt, retval):
+ case bpf_ctx_range(struct bpf_sockopt, retval):
if (size != size_default)
return false;
return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
@@ -2021,10 +2603,17 @@ static bool cg_sockopt_is_valid_access(int off, int size,
return true;
}
-#define CG_SOCKOPT_ACCESS_FIELD(T, F) \
- T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \
- si->dst_reg, si->src_reg, \
- offsetof(struct bpf_sockopt_kern, F))
+#define CG_SOCKOPT_READ_FIELD(F) \
+ BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \
+ si->dst_reg, si->src_reg, \
+ offsetof(struct bpf_sockopt_kern, F))
+
+#define CG_SOCKOPT_WRITE_FIELD(F) \
+ BPF_RAW_INSN((BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F) | \
+ BPF_MEM | BPF_CLASS(si->code)), \
+ si->dst_reg, si->src_reg, \
+ offsetof(struct bpf_sockopt_kern, F), \
+ si->imm)
static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
@@ -2036,37 +2625,68 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
switch (si->off) {
case offsetof(struct bpf_sockopt, sk):
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk);
+ *insn++ = CG_SOCKOPT_READ_FIELD(sk);
break;
case offsetof(struct bpf_sockopt, level):
if (type == BPF_WRITE)
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level);
+ *insn++ = CG_SOCKOPT_WRITE_FIELD(level);
else
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level);
+ *insn++ = CG_SOCKOPT_READ_FIELD(level);
break;
case offsetof(struct bpf_sockopt, optname):
if (type == BPF_WRITE)
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname);
+ *insn++ = CG_SOCKOPT_WRITE_FIELD(optname);
else
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname);
+ *insn++ = CG_SOCKOPT_READ_FIELD(optname);
break;
case offsetof(struct bpf_sockopt, optlen):
if (type == BPF_WRITE)
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen);
+ *insn++ = CG_SOCKOPT_WRITE_FIELD(optlen);
else
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
+ *insn++ = CG_SOCKOPT_READ_FIELD(optlen);
break;
case offsetof(struct bpf_sockopt, retval):
- if (type == BPF_WRITE)
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval);
- else
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval);
+ BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != 0);
+
+ if (type == BPF_WRITE) {
+ int treg = BPF_REG_9;
+
+ if (si->src_reg == treg || si->dst_reg == treg)
+ --treg;
+ if (si->src_reg == treg || si->dst_reg == treg)
+ --treg;
+ *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, treg,
+ offsetof(struct bpf_sockopt_kern, tmp_reg));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
+ treg, si->dst_reg,
+ offsetof(struct bpf_sockopt_kern, current_task));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
+ treg, treg,
+ offsetof(struct task_struct, bpf_ctx));
+ *insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_MEM |
+ BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
+ treg, si->src_reg,
+ offsetof(struct bpf_cg_run_ctx, retval),
+ si->imm);
+ *insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg,
+ offsetof(struct bpf_sockopt_kern, tmp_reg));
+ } else {
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sockopt_kern, current_task));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct task_struct, bpf_ctx));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct bpf_cg_run_ctx, retval));
+ }
break;
case offsetof(struct bpf_sockopt, optval):
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
+ *insn++ = CG_SOCKOPT_READ_FIELD(optval);
break;
case offsetof(struct bpf_sockopt, optval_end):
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end);
+ *insn++ = CG_SOCKOPT_READ_FIELD(optval_end);
break;
}
@@ -2091,3 +2711,51 @@ const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
const struct bpf_prog_ops cg_sockopt_prog_ops = {
};
+
+/* Common helpers for cgroup hooks. */
+const struct bpf_func_proto *
+cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_get_local_storage:
+ return &bpf_get_local_storage_proto;
+ case BPF_FUNC_get_retval:
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ case BPF_CGROUP_SOCK_OPS:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
+ return NULL;
+ default:
+ return &bpf_get_retval_proto;
+ }
+ case BPF_FUNC_set_retval:
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ case BPF_CGROUP_SOCK_OPS:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
+ return NULL;
+ default:
+ return &bpf_set_retval_proto;
+ }
+ default:
+ return NULL;
+ }
+}
diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
new file mode 100644
index 000000000000..f04a468cf6a7
--- /dev/null
+++ b/kernel/bpf/cgroup_iter.c
@@ -0,0 +1,359 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2022 Google */
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <linux/cgroup.h>
+#include <linux/kernel.h>
+#include <linux/seq_file.h>
+
+#include "../cgroup/cgroup-internal.h" /* cgroup_mutex and cgroup_is_dead */
+
+/* cgroup_iter provides four modes of traversal to the cgroup hierarchy.
+ *
+ * 1. Walk the descendants of a cgroup in pre-order.
+ * 2. Walk the descendants of a cgroup in post-order.
+ * 3. Walk the ancestors of a cgroup.
+ * 4. Show the given cgroup only.
+ *
+ * For walking descendants, cgroup_iter can walk in either pre-order or
+ * post-order. For walking ancestors, the iter walks up from a cgroup to
+ * the root.
+ *
+ * The iter program can terminate the walk early by returning 1. Walk
+ * continues if prog returns 0.
+ *
+ * The prog can check (seq->num == 0) to determine whether this is
+ * the first element. The prog may also be passed a NULL cgroup,
+ * which means the walk has completed and the prog has a chance to
+ * do post-processing, such as outputting an epilogue.
+ *
+ * Note: the iter_prog is called with cgroup_mutex held.
+ *
+ * Currently only one session is supported, which means, depending on the
+ * volume of data bpf program intends to send to user space, the number
+ * of cgroups that can be walked is limited. For example, given the current
+ * buffer size is 8 * PAGE_SIZE, if the program sends 64B data for each
+ * cgroup, assuming PAGE_SIZE is 4kb, the total number of cgroups that can
+ * be walked is 512. This is a limitation of cgroup_iter. If the output data
+ * is larger than the kernel buffer size, after all data in the kernel buffer
+ * is consumed by user space, the subsequent read() syscall will signal
+ * EOPNOTSUPP. In order to work around, the user may have to update their
+ * program to reduce the volume of data sent to output. For example, skip
+ * some uninteresting cgroups.
+ */
+
+struct bpf_iter__cgroup {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct cgroup *, cgroup);
+};
+
+struct cgroup_iter_priv {
+ struct cgroup_subsys_state *start_css;
+ bool visited_all;
+ bool terminate;
+ int order;
+};
+
+static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct cgroup_iter_priv *p = seq->private;
+
+ cgroup_lock();
+
+ /* cgroup_iter doesn't support read across multiple sessions. */
+ if (*pos > 0) {
+ if (p->visited_all)
+ return NULL;
+
+ /* Haven't visited all, but because cgroup_mutex has dropped,
+ * return -EOPNOTSUPP to indicate incomplete iteration.
+ */
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ ++*pos;
+ p->terminate = false;
+ p->visited_all = false;
+ if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE)
+ return css_next_descendant_pre(NULL, p->start_css);
+ else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST)
+ return css_next_descendant_post(NULL, p->start_css);
+ else /* BPF_CGROUP_ITER_SELF_ONLY and BPF_CGROUP_ITER_ANCESTORS_UP */
+ return p->start_css;
+}
+
+static int __cgroup_iter_seq_show(struct seq_file *seq,
+ struct cgroup_subsys_state *css, int in_stop);
+
+static void cgroup_iter_seq_stop(struct seq_file *seq, void *v)
+{
+ struct cgroup_iter_priv *p = seq->private;
+
+ cgroup_unlock();
+
+ /* pass NULL to the prog for post-processing */
+ if (!v) {
+ __cgroup_iter_seq_show(seq, NULL, true);
+ p->visited_all = true;
+ }
+}
+
+static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v;
+ struct cgroup_iter_priv *p = seq->private;
+
+ ++*pos;
+ if (p->terminate)
+ return NULL;
+
+ if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE)
+ return css_next_descendant_pre(curr, p->start_css);
+ else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST)
+ return css_next_descendant_post(curr, p->start_css);
+ else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP)
+ return curr->parent;
+ else /* BPF_CGROUP_ITER_SELF_ONLY */
+ return NULL;
+}
+
+static int __cgroup_iter_seq_show(struct seq_file *seq,
+ struct cgroup_subsys_state *css, int in_stop)
+{
+ struct cgroup_iter_priv *p = seq->private;
+ struct bpf_iter__cgroup ctx;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ /* cgroup is dead, skip this element */
+ if (css && cgroup_is_dead(css->cgroup))
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.cgroup = css ? css->cgroup : NULL;
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (prog)
+ ret = bpf_iter_run_prog(prog, &ctx);
+
+ /* if prog returns > 0, terminate after this element. */
+ if (ret != 0)
+ p->terminate = true;
+
+ return 0;
+}
+
+static int cgroup_iter_seq_show(struct seq_file *seq, void *v)
+{
+ return __cgroup_iter_seq_show(seq, (struct cgroup_subsys_state *)v,
+ false);
+}
+
+static const struct seq_operations cgroup_iter_seq_ops = {
+ .start = cgroup_iter_seq_start,
+ .next = cgroup_iter_seq_next,
+ .stop = cgroup_iter_seq_stop,
+ .show = cgroup_iter_seq_show,
+};
+
+BTF_ID_LIST_GLOBAL_SINGLE(bpf_cgroup_btf_id, struct, cgroup)
+
+static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux)
+{
+ struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv;
+ struct cgroup *cgrp = aux->cgroup.start;
+
+ /* bpf_iter_attach_cgroup() has already acquired an extra reference
+ * for the start cgroup, but the reference may be released after
+ * cgroup_iter_seq_init(), so acquire another reference for the
+ * start cgroup.
+ */
+ p->start_css = &cgrp->self;
+ css_get(p->start_css);
+ p->terminate = false;
+ p->visited_all = false;
+ p->order = aux->cgroup.order;
+ return 0;
+}
+
+static void cgroup_iter_seq_fini(void *priv)
+{
+ struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv;
+
+ css_put(p->start_css);
+}
+
+static const struct bpf_iter_seq_info cgroup_iter_seq_info = {
+ .seq_ops = &cgroup_iter_seq_ops,
+ .init_seq_private = cgroup_iter_seq_init,
+ .fini_seq_private = cgroup_iter_seq_fini,
+ .seq_priv_size = sizeof(struct cgroup_iter_priv),
+};
+
+static int bpf_iter_attach_cgroup(struct bpf_prog *prog,
+ union bpf_iter_link_info *linfo,
+ struct bpf_iter_aux_info *aux)
+{
+ int fd = linfo->cgroup.cgroup_fd;
+ u64 id = linfo->cgroup.cgroup_id;
+ int order = linfo->cgroup.order;
+ struct cgroup *cgrp;
+
+ if (order != BPF_CGROUP_ITER_DESCENDANTS_PRE &&
+ order != BPF_CGROUP_ITER_DESCENDANTS_POST &&
+ order != BPF_CGROUP_ITER_ANCESTORS_UP &&
+ order != BPF_CGROUP_ITER_SELF_ONLY)
+ return -EINVAL;
+
+ if (fd && id)
+ return -EINVAL;
+
+ if (fd)
+ cgrp = cgroup_v1v2_get_from_fd(fd);
+ else if (id)
+ cgrp = cgroup_get_from_id(id);
+ else /* walk the entire hierarchy by default. */
+ cgrp = cgroup_get_from_path("/");
+
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+
+ aux->cgroup.start = cgrp;
+ aux->cgroup.order = order;
+ return 0;
+}
+
+static void bpf_iter_detach_cgroup(struct bpf_iter_aux_info *aux)
+{
+ cgroup_put(aux->cgroup.start);
+}
+
+static void bpf_iter_cgroup_show_fdinfo(const struct bpf_iter_aux_info *aux,
+ struct seq_file *seq)
+{
+ char *buf;
+
+ buf = kzalloc(PATH_MAX, GFP_KERNEL);
+ if (!buf) {
+ seq_puts(seq, "cgroup_path:\t<unknown>\n");
+ goto show_order;
+ }
+
+ /* If cgroup_path_ns() fails, buf will be an empty string, cgroup_path
+ * will print nothing.
+ *
+ * Path is in the calling process's cgroup namespace.
+ */
+ cgroup_path_ns(aux->cgroup.start, buf, PATH_MAX,
+ current->nsproxy->cgroup_ns);
+ seq_printf(seq, "cgroup_path:\t%s\n", buf);
+ kfree(buf);
+
+show_order:
+ if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_PRE)
+ seq_puts(seq, "order: descendants_pre\n");
+ else if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_POST)
+ seq_puts(seq, "order: descendants_post\n");
+ else if (aux->cgroup.order == BPF_CGROUP_ITER_ANCESTORS_UP)
+ seq_puts(seq, "order: ancestors_up\n");
+ else /* BPF_CGROUP_ITER_SELF_ONLY */
+ seq_puts(seq, "order: self_only\n");
+}
+
+static int bpf_iter_cgroup_fill_link_info(const struct bpf_iter_aux_info *aux,
+ struct bpf_link_info *info)
+{
+ info->iter.cgroup.order = aux->cgroup.order;
+ info->iter.cgroup.cgroup_id = cgroup_id(aux->cgroup.start);
+ return 0;
+}
+
+DEFINE_BPF_ITER_FUNC(cgroup, struct bpf_iter_meta *meta,
+ struct cgroup *cgroup)
+
+static struct bpf_iter_reg bpf_cgroup_reg_info = {
+ .target = "cgroup",
+ .feature = BPF_ITER_RESCHED,
+ .attach_target = bpf_iter_attach_cgroup,
+ .detach_target = bpf_iter_detach_cgroup,
+ .show_fdinfo = bpf_iter_cgroup_show_fdinfo,
+ .fill_link_info = bpf_iter_cgroup_fill_link_info,
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__cgroup, cgroup),
+ PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
+ },
+ .seq_info = &cgroup_iter_seq_info,
+};
+
+static int __init bpf_cgroup_iter_init(void)
+{
+ bpf_cgroup_reg_info.ctx_arg_info[0].btf_id = bpf_cgroup_btf_id[0];
+ return bpf_iter_reg_target(&bpf_cgroup_reg_info);
+}
+
+late_initcall(bpf_cgroup_iter_init);
+
+struct bpf_iter_css {
+ __u64 __opaque[3];
+} __attribute__((aligned(8)));
+
+struct bpf_iter_css_kern {
+ struct cgroup_subsys_state *start;
+ struct cgroup_subsys_state *pos;
+ unsigned int flags;
+} __attribute__((aligned(8)));
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_iter_css_new(struct bpf_iter_css *it,
+ struct cgroup_subsys_state *start, unsigned int flags)
+{
+ struct bpf_iter_css_kern *kit = (void *)it;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_css_kern) > sizeof(struct bpf_iter_css));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_css_kern) != __alignof__(struct bpf_iter_css));
+
+ kit->start = NULL;
+ switch (flags) {
+ case BPF_CGROUP_ITER_DESCENDANTS_PRE:
+ case BPF_CGROUP_ITER_DESCENDANTS_POST:
+ case BPF_CGROUP_ITER_ANCESTORS_UP:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ kit->start = start;
+ kit->pos = NULL;
+ kit->flags = flags;
+ return 0;
+}
+
+__bpf_kfunc struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *it)
+{
+ struct bpf_iter_css_kern *kit = (void *)it;
+
+ if (!kit->start)
+ return NULL;
+
+ switch (kit->flags) {
+ case BPF_CGROUP_ITER_DESCENDANTS_PRE:
+ kit->pos = css_next_descendant_pre(kit->pos, kit->start);
+ break;
+ case BPF_CGROUP_ITER_DESCENDANTS_POST:
+ kit->pos = css_next_descendant_post(kit->pos, kit->start);
+ break;
+ case BPF_CGROUP_ITER_ANCESTORS_UP:
+ kit->pos = kit->pos ? kit->pos->parent : kit->start;
+ }
+
+ return kit->pos;
+}
+
+__bpf_kfunc void bpf_iter_css_destroy(struct bpf_iter_css *it)
+{
+}
+
+__bpf_kfunc_end_defs();
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index de3e5bc6781f..c8ae6ab31651 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -18,14 +18,15 @@
*/
#include <uapi/linux/btf.h>
+#include <crypto/sha1.h>
#include <linux/filter.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
-#include <linux/random.h>
-#include <linux/moduleloader.h>
+#include <linux/prandom.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/objtool.h>
+#include <linux/overflow.h>
#include <linux/rbtree_latch.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
@@ -33,9 +34,15 @@
#include <linux/extable.h>
#include <linux/log2.h>
#include <linux/bpf_verifier.h>
+#include <linux/nodemask.h>
+#include <linux/nospec.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/memcontrol.h>
+#include <linux/execmem.h>
+#include <crypto/sha2.h>
#include <asm/barrier.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
/* Registers */
#define BPF_R0 regs[BPF_REG_0]
@@ -57,8 +64,12 @@
#define AX regs[BPF_REG_AX]
#define ARG1 regs[BPF_REG_ARG1]
#define CTX regs[BPF_REG_CTX]
+#define OFF insn->off
#define IMM insn->imm
+struct bpf_mem_alloc bpf_global_ma;
+bool bpf_global_ma_set;
+
/* No hurry in this branch
*
* Exported for the bpf jit load helper.
@@ -67,34 +78,41 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
{
u8 *ptr = NULL;
- if (k >= SKF_NET_OFF)
+ if (k >= SKF_NET_OFF) {
ptr = skb_network_header(skb) + k - SKF_NET_OFF;
- else if (k >= SKF_LL_OFF)
+ } else if (k >= SKF_LL_OFF) {
+ if (unlikely(!skb_mac_header_was_set(skb)))
+ return NULL;
ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
-
+ }
if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
return ptr;
return NULL;
}
+/* tell bpf programs that include vmlinux.h kernel's PAGE_SIZE */
+enum page_size_enum {
+ __PAGE_SIZE = PAGE_SIZE
+};
+
struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
{
- gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags;
+ gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
struct bpf_prog_aux *aux;
struct bpf_prog *fp;
- size = round_up(size, PAGE_SIZE);
+ size = round_up(size, __PAGE_SIZE);
fp = __vmalloc(size, gfp_flags);
if (fp == NULL)
return NULL;
- aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT | gfp_extra_flags);
+ aux = kzalloc(sizeof(*aux), bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
if (aux == NULL) {
vfree(fp);
return NULL;
}
- fp->active = alloc_percpu_gfp(int, GFP_KERNEL_ACCOUNT | gfp_extra_flags);
+ fp->active = alloc_percpu_gfp(int, bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
if (!fp->active) {
vfree(fp);
kfree(aux);
@@ -103,19 +121,32 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
fp->pages = size / PAGE_SIZE;
fp->aux = aux;
+ fp->aux->main_prog_aux = aux;
fp->aux->prog = fp;
fp->jit_requested = ebpf_jit_enabled();
+ fp->blinding_requested = bpf_jit_blinding_enabled(fp);
+#ifdef CONFIG_CGROUP_BPF
+ aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID;
+#endif
INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode);
+#ifdef CONFIG_FINEIBT
+ INIT_LIST_HEAD_RCU(&fp->aux->ksym_prefix.lnode);
+#endif
mutex_init(&fp->aux->used_maps_mutex);
+ mutex_init(&fp->aux->ext_mutex);
mutex_init(&fp->aux->dst_mutex);
+#ifdef CONFIG_BPF_SYSCALL
+ bpf_prog_stream_init(fp);
+#endif
+
return fp;
}
struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
{
- gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags;
+ gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
struct bpf_prog *prog;
int cpu;
@@ -148,7 +179,7 @@ int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog)
prog->aux->jited_linfo = kvcalloc(prog->aux->nr_linfo,
sizeof(*prog->aux->jited_linfo),
- GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+ bpf_memcg_flags(GFP_KERNEL | __GFP_NOWARN));
if (!prog->aux->jited_linfo)
return -ENOMEM;
@@ -174,7 +205,7 @@ void bpf_prog_jit_attempt_done(struct bpf_prog *prog)
* here is relative to the prog itself instead of the main prog.
* This array has one entry for each xlated bpf insn.
*
- * jited_off is the byte off to the last byte of the jited insn.
+ * jited_off is the byte off to the end of the jited insn.
*
* Hence, with
* insn_start:
@@ -198,7 +229,7 @@ void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
const struct bpf_line_info *linfo;
void **jited_linfo;
- if (!prog->aux->jited_linfo)
+ if (!prog->aux->jited_linfo || prog->aux->func_idx > prog->aux->func_cnt)
/* Userspace did not provide linfo */
return;
@@ -223,7 +254,7 @@ void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
gfp_t gfp_extra_flags)
{
- gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags;
+ gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
struct bpf_prog *fp;
u32 pages;
@@ -265,28 +296,18 @@ void __bpf_prog_free(struct bpf_prog *fp)
int bpf_prog_calc_tag(struct bpf_prog *fp)
{
- const u32 bits_offset = SHA1_BLOCK_SIZE - sizeof(__be64);
- u32 raw_size = bpf_prog_tag_scratch_size(fp);
- u32 digest[SHA1_DIGEST_WORDS];
- u32 ws[SHA1_WORKSPACE_WORDS];
- u32 i, bsize, psize, blocks;
+ size_t size = bpf_prog_insn_size(fp);
struct bpf_insn *dst;
bool was_ld_map;
- u8 *raw, *todo;
- __be32 *result;
- __be64 *bits;
+ u32 i;
- raw = vmalloc(raw_size);
- if (!raw)
+ dst = vmalloc(size);
+ if (!dst)
return -ENOMEM;
- sha1_init(digest);
- memset(ws, 0, sizeof(ws));
-
/* We need to take out the map fd for the digest calculation
* since they are unstable from user space side.
*/
- dst = (void *)raw;
for (i = 0, was_ld_map = false; i < fp->len; i++) {
dst[i] = fp->insnsi[i];
if (!was_ld_map &&
@@ -306,33 +327,8 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)
was_ld_map = false;
}
}
-
- psize = bpf_prog_insn_size(fp);
- memset(&raw[psize], 0, raw_size - psize);
- raw[psize++] = 0x80;
-
- bsize = round_up(psize, SHA1_BLOCK_SIZE);
- blocks = bsize / SHA1_BLOCK_SIZE;
- todo = raw;
- if (bsize - psize >= sizeof(__be64)) {
- bits = (__be64 *)(todo + bsize - sizeof(__be64));
- } else {
- bits = (__be64 *)(todo + bsize + bits_offset);
- blocks++;
- }
- *bits = cpu_to_be64((psize - 1) << 3);
-
- while (blocks--) {
- sha1_transform(digest, todo, ws);
- todo += SHA1_BLOCK_SIZE;
- }
-
- result = (__force __be32 *)digest;
- for (i = 0; i < SHA1_DIGEST_WORDS; i++)
- result[i] = cpu_to_be32(digest[i]);
- memcpy(fp->tag, result, sizeof(fp->tag));
-
- vfree(raw);
+ sha256((u8 *)dst, size, fp->digest);
+ vfree(dst);
return 0;
}
@@ -357,9 +353,18 @@ static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old,
static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old,
s32 end_new, s32 curr, const bool probe_pass)
{
- const s32 off_min = S16_MIN, off_max = S16_MAX;
+ s64 off_min, off_max, off;
s32 delta = end_new - end_old;
- s32 off = insn->off;
+
+ if (insn->code == (BPF_JMP32 | BPF_JA)) {
+ off = insn->imm;
+ off_min = S32_MIN;
+ off_max = S32_MAX;
+ } else {
+ off = insn->off;
+ off_min = S16_MIN;
+ off_max = S16_MAX;
+ }
if (curr < pos && curr + off + 1 >= end_old)
off += delta;
@@ -367,8 +372,12 @@ static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old,
off -= delta;
if (off < off_min || off > off_max)
return -ERANGE;
- if (!probe_pass)
- insn->off = off;
+ if (!probe_pass) {
+ if (insn->code == (BPF_JMP32 | BPF_JA))
+ insn->imm = off;
+ else
+ insn->off = off;
+ }
return 0;
}
@@ -502,6 +511,8 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt)
{
+ int err;
+
/* Branch offsets can't overflow when program is shrinking, no need
* to call bpf_adj_branches(..., true) here
*/
@@ -509,14 +520,16 @@ int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt)
sizeof(struct bpf_insn) * (prog->len - off - cnt));
prog->len -= cnt;
- return WARN_ON_ONCE(bpf_adj_branches(prog, off, off + cnt, off, false));
+ err = bpf_adj_branches(prog, off, off + cnt, off, false);
+ WARN_ON_ONCE(err);
+ return err;
}
static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp)
{
int i;
- for (i = 0; i < fp->aux->func_cnt; i++)
+ for (i = 0; i < fp->aux->real_func_cnt; i++)
bpf_prog_kallsyms_del(fp->aux->func[i]);
}
@@ -537,13 +550,10 @@ long bpf_jit_limit_max __read_mostly;
static void
bpf_prog_ksym_set_addr(struct bpf_prog *prog)
{
- const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(prog);
- unsigned long addr = (unsigned long)hdr;
-
WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog));
prog->aux->ksym.start = (unsigned long) prog->bpf_func;
- prog->aux->ksym.end = addr + hdr->pages * PAGE_SIZE;
+ prog->aux->ksym.end = prog->aux->ksym.start + prog->jited_len;
}
static void
@@ -569,7 +579,7 @@ bpf_prog_ksym_set_name(struct bpf_prog *prog)
sym = bin2hex(sym, prog->tag, sizeof(prog->tag));
/* prog->aux->name will be ignored if full btf name is available */
- if (prog->aux->func_info_cnt) {
+ if (prog->aux->func_info_cnt && prog->aux->func_idx < prog->aux->func_info_cnt) {
type = btf_type_by_id(prog->aux->btf,
prog->aux->func_info[prog->aux->func_idx].type_id);
func_name = btf_name_by_offset(prog->aux->btf, type->name_off);
@@ -603,7 +613,11 @@ static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n)
if (val < ksym->start)
return -1;
- if (val >= ksym->end)
+ /* Ensure that we detect return addresses as part of the program, when
+ * the final instruction is a call for a program part of the stack
+ * trace. Therefore, do val > ksym->end instead of val >= ksym->end.
+ */
+ if (val > ksym->end)
return 1;
return 0;
@@ -648,16 +662,10 @@ static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp)
return fp->jited && !bpf_prog_was_classic(fp);
}
-static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
-{
- return list_empty(&fp->aux->ksym.lnode) ||
- fp->aux->ksym.lnode.prev == LIST_POISON2;
-}
-
void bpf_prog_kallsyms_add(struct bpf_prog *fp)
{
if (!bpf_prog_kallsyms_candidate(fp) ||
- !bpf_capable())
+ !bpf_token_capable(fp->aux->token, CAP_BPF))
return;
bpf_prog_ksym_set_addr(fp);
@@ -665,6 +673,23 @@ void bpf_prog_kallsyms_add(struct bpf_prog *fp)
fp->aux->ksym.prog = true;
bpf_ksym_add(&fp->aux->ksym);
+
+#ifdef CONFIG_FINEIBT
+ /*
+ * When FineIBT, code in the __cfi_foo() symbols can get executed
+ * and hence unwinder needs help.
+ */
+ if (cfi_mode != CFI_FINEIBT)
+ return;
+
+ snprintf(fp->aux->ksym_prefix.name, KSYM_NAME_LEN,
+ "__cfi_%s", fp->aux->ksym.name);
+
+ fp->aux->ksym_prefix.start = (unsigned long) fp->bpf_func - 16;
+ fp->aux->ksym_prefix.end = (unsigned long) fp->bpf_func;
+
+ bpf_ksym_add(&fp->aux->ksym_prefix);
+#endif
}
void bpf_prog_kallsyms_del(struct bpf_prog *fp)
@@ -673,6 +698,11 @@ void bpf_prog_kallsyms_del(struct bpf_prog *fp)
return;
bpf_ksym_del(&fp->aux->ksym);
+#ifdef CONFIG_FINEIBT
+ if (cfi_mode != CFI_FINEIBT)
+ return;
+ bpf_ksym_del(&fp->aux->ksym_prefix);
+#endif
}
static struct bpf_ksym *bpf_ksym_find(unsigned long addr)
@@ -683,11 +713,11 @@ static struct bpf_ksym *bpf_ksym_find(unsigned long addr)
return n ? container_of(n, struct bpf_ksym, tnode) : NULL;
}
-const char *__bpf_address_lookup(unsigned long addr, unsigned long *size,
+int __bpf_address_lookup(unsigned long addr, unsigned long *size,
unsigned long *off, char *sym)
{
struct bpf_ksym *ksym;
- char *ret = NULL;
+ int ret = 0;
rcu_read_lock();
ksym = bpf_ksym_find(addr);
@@ -695,9 +725,8 @@ const char *__bpf_address_lookup(unsigned long addr, unsigned long *size,
unsigned long symbol_start = ksym->start;
unsigned long symbol_end = ksym->end;
- strncpy(sym, ksym->name, KSYM_NAME_LEN);
+ ret = strscpy(sym, ksym->name, KSYM_NAME_LEN);
- ret = sym;
if (size)
*size = symbol_end - symbol_start;
if (off)
@@ -719,9 +748,12 @@ bool is_bpf_text_address(unsigned long addr)
return ret;
}
-static struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
+struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
{
- struct bpf_ksym *ksym = bpf_ksym_find(addr);
+ struct bpf_ksym *ksym;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ ksym = bpf_ksym_find(addr);
return ksym && ksym->prog ?
container_of(ksym, struct bpf_prog_aux, ksym)->prog :
@@ -761,7 +793,7 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
if (it++ != symnum)
continue;
- strncpy(sym, ksym->name, KSYM_NAME_LEN);
+ strscpy(sym, ksym->name, KSYM_NAME_LEN);
*value = ksym->start;
*type = BPF_SYM_ELF_TYPE;
@@ -797,7 +829,7 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
return -EINVAL;
}
- tab = krealloc(tab, size * sizeof(*poke), GFP_KERNEL);
+ tab = krealloc_array(tab, size, sizeof(*poke), GFP_KERNEL);
if (!tab)
return -ENOMEM;
@@ -808,6 +840,166 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
return slot;
}
+/*
+ * BPF program pack allocator.
+ *
+ * Most BPF programs are pretty small. Allocating a hole page for each
+ * program is sometime a waste. Many small bpf program also adds pressure
+ * to instruction TLB. To solve this issue, we introduce a BPF program pack
+ * allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86)
+ * to host BPF programs.
+ */
+#define BPF_PROG_CHUNK_SHIFT 6
+#define BPF_PROG_CHUNK_SIZE (1 << BPF_PROG_CHUNK_SHIFT)
+#define BPF_PROG_CHUNK_MASK (~(BPF_PROG_CHUNK_SIZE - 1))
+
+struct bpf_prog_pack {
+ struct list_head list;
+ void *ptr;
+ unsigned long bitmap[];
+};
+
+void bpf_jit_fill_hole_with_zero(void *area, unsigned int size)
+{
+ memset(area, 0, size);
+}
+
+#define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)
+
+static DEFINE_MUTEX(pack_mutex);
+static LIST_HEAD(pack_list);
+
+/* PMD_SIZE is not available in some special config, e.g. ARCH=arm with
+ * CONFIG_MMU=n. Use PAGE_SIZE in these cases.
+ */
+#ifdef PMD_SIZE
+/* PMD_SIZE is really big for some archs. It doesn't make sense to
+ * reserve too much memory in one allocation. Hardcode BPF_PROG_PACK_SIZE to
+ * 2MiB * num_possible_nodes(). On most architectures PMD_SIZE will be
+ * greater than or equal to 2MB.
+ */
+#define BPF_PROG_PACK_SIZE (SZ_2M * num_possible_nodes())
+#else
+#define BPF_PROG_PACK_SIZE PAGE_SIZE
+#endif
+
+#define BPF_PROG_CHUNK_COUNT (BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE)
+
+static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns)
+{
+ struct bpf_prog_pack *pack;
+ int err;
+
+ pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)),
+ GFP_KERNEL);
+ if (!pack)
+ return NULL;
+ pack->ptr = bpf_jit_alloc_exec(BPF_PROG_PACK_SIZE);
+ if (!pack->ptr)
+ goto out;
+ bpf_fill_ill_insns(pack->ptr, BPF_PROG_PACK_SIZE);
+ bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE);
+
+ set_vm_flush_reset_perms(pack->ptr);
+ err = set_memory_rox((unsigned long)pack->ptr,
+ BPF_PROG_PACK_SIZE / PAGE_SIZE);
+ if (err)
+ goto out;
+ list_add_tail(&pack->list, &pack_list);
+ return pack;
+
+out:
+ bpf_jit_free_exec(pack->ptr);
+ kfree(pack);
+ return NULL;
+}
+
+void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
+{
+ unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size);
+ struct bpf_prog_pack *pack;
+ unsigned long pos;
+ void *ptr = NULL;
+
+ mutex_lock(&pack_mutex);
+ if (size > BPF_PROG_PACK_SIZE) {
+ size = round_up(size, PAGE_SIZE);
+ ptr = bpf_jit_alloc_exec(size);
+ if (ptr) {
+ int err;
+
+ bpf_fill_ill_insns(ptr, size);
+ set_vm_flush_reset_perms(ptr);
+ err = set_memory_rox((unsigned long)ptr,
+ size / PAGE_SIZE);
+ if (err) {
+ bpf_jit_free_exec(ptr);
+ ptr = NULL;
+ }
+ }
+ goto out;
+ }
+ list_for_each_entry(pack, &pack_list, list) {
+ pos = bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
+ nbits, 0);
+ if (pos < BPF_PROG_CHUNK_COUNT)
+ goto found_free_area;
+ }
+
+ pack = alloc_new_pack(bpf_fill_ill_insns);
+ if (!pack)
+ goto out;
+
+ pos = 0;
+
+found_free_area:
+ bitmap_set(pack->bitmap, pos, nbits);
+ ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT);
+
+out:
+ mutex_unlock(&pack_mutex);
+ return ptr;
+}
+
+void bpf_prog_pack_free(void *ptr, u32 size)
+{
+ struct bpf_prog_pack *pack = NULL, *tmp;
+ unsigned int nbits;
+ unsigned long pos;
+
+ mutex_lock(&pack_mutex);
+ if (size > BPF_PROG_PACK_SIZE) {
+ bpf_jit_free_exec(ptr);
+ goto out;
+ }
+
+ list_for_each_entry(tmp, &pack_list, list) {
+ if (ptr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > ptr) {
+ pack = tmp;
+ break;
+ }
+ }
+
+ if (WARN_ONCE(!pack, "bpf_prog_pack bug\n"))
+ goto out;
+
+ nbits = BPF_PROG_SIZE_TO_NBITS(size);
+ pos = ((unsigned long)ptr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT;
+
+ WARN_ONCE(bpf_arch_text_invalidate(ptr, size),
+ "bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n");
+
+ bitmap_clear(pack->bitmap, pos, nbits);
+ if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
+ BPF_PROG_CHUNK_COUNT, 0) == 0) {
+ list_del(&pack->list);
+ bpf_jit_free_exec(pack->ptr);
+ kfree(pack);
+ }
+out:
+ mutex_unlock(&pack_mutex);
+}
+
static atomic_long_t bpf_jit_current;
/* Can be overridden by an arch's JIT compiler if it has a custom,
@@ -827,18 +1019,17 @@ static int __init bpf_jit_charge_init(void)
{
/* Only used as heuristic here to derive limit. */
bpf_jit_limit_max = bpf_jit_alloc_exec_limit();
- bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 2,
+ bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 1,
PAGE_SIZE), LONG_MAX);
return 0;
}
pure_initcall(bpf_jit_charge_init);
-int bpf_jit_charge_modmem(u32 pages)
+int bpf_jit_charge_modmem(u32 size)
{
- if (atomic_long_add_return(pages, &bpf_jit_current) >
- (bpf_jit_limit >> PAGE_SHIFT)) {
+ if (atomic_long_add_return(size, &bpf_jit_current) > READ_ONCE(bpf_jit_limit)) {
if (!bpf_capable()) {
- atomic_long_sub(pages, &bpf_jit_current);
+ atomic_long_sub(size, &bpf_jit_current);
return -EPERM;
}
}
@@ -846,19 +1037,19 @@ int bpf_jit_charge_modmem(u32 pages)
return 0;
}
-void bpf_jit_uncharge_modmem(u32 pages)
+void bpf_jit_uncharge_modmem(u32 size)
{
- atomic_long_sub(pages, &bpf_jit_current);
+ atomic_long_sub(size, &bpf_jit_current);
}
void *__weak bpf_jit_alloc_exec(unsigned long size)
{
- return module_alloc(size);
+ return execmem_alloc(EXECMEM_BPF, size);
}
void __weak bpf_jit_free_exec(void *addr)
{
- module_memfree(addr);
+ execmem_free(addr);
}
struct bpf_binary_header *
@@ -867,7 +1058,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
struct bpf_binary_header *hdr;
- u32 size, hole, start, pages;
+ u32 size, hole, start;
WARN_ON_ONCE(!is_power_of_2(alignment) ||
alignment > BPF_IMAGE_ALIGNMENT);
@@ -877,23 +1068,22 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
* random section of illegal instructions.
*/
size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
- pages = size / PAGE_SIZE;
- if (bpf_jit_charge_modmem(pages))
+ if (bpf_jit_charge_modmem(size))
return NULL;
hdr = bpf_jit_alloc_exec(size);
if (!hdr) {
- bpf_jit_uncharge_modmem(pages);
+ bpf_jit_uncharge_modmem(size);
return NULL;
}
/* Fill space with illegal/arch-dep instructions. */
bpf_fill_ill_insns(hdr, size);
- hdr->pages = pages;
+ hdr->size = size;
hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
PAGE_SIZE - sizeof(*hdr));
- start = (get_random_int() % hole) & ~(alignment - 1);
+ start = get_random_u32_below(hole) & ~(alignment - 1);
/* Leave a random number of instructions before BPF code. */
*image_ptr = &hdr->image[start];
@@ -903,10 +1093,120 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
void bpf_jit_binary_free(struct bpf_binary_header *hdr)
{
- u32 pages = hdr->pages;
+ u32 size = hdr->size;
bpf_jit_free_exec(hdr);
- bpf_jit_uncharge_modmem(pages);
+ bpf_jit_uncharge_modmem(size);
+}
+
+/* Allocate jit binary from bpf_prog_pack allocator.
+ * Since the allocated memory is RO+X, the JIT engine cannot write directly
+ * to the memory. To solve this problem, a RW buffer is also allocated at
+ * as the same time. The JIT engine should calculate offsets based on the
+ * RO memory address, but write JITed program to the RW buffer. Once the
+ * JIT engine finishes, it calls bpf_jit_binary_pack_finalize, which copies
+ * the JITed program to the RO memory.
+ */
+struct bpf_binary_header *
+bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
+ unsigned int alignment,
+ struct bpf_binary_header **rw_header,
+ u8 **rw_image,
+ bpf_jit_fill_hole_t bpf_fill_ill_insns)
+{
+ struct bpf_binary_header *ro_header;
+ u32 size, hole, start;
+
+ WARN_ON_ONCE(!is_power_of_2(alignment) ||
+ alignment > BPF_IMAGE_ALIGNMENT);
+
+ /* add 16 bytes for a random section of illegal instructions */
+ size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE);
+
+ if (bpf_jit_charge_modmem(size))
+ return NULL;
+ ro_header = bpf_prog_pack_alloc(size, bpf_fill_ill_insns);
+ if (!ro_header) {
+ bpf_jit_uncharge_modmem(size);
+ return NULL;
+ }
+
+ *rw_header = kvmalloc(size, GFP_KERNEL);
+ if (!*rw_header) {
+ bpf_prog_pack_free(ro_header, size);
+ bpf_jit_uncharge_modmem(size);
+ return NULL;
+ }
+
+ /* Fill space with illegal/arch-dep instructions. */
+ bpf_fill_ill_insns(*rw_header, size);
+ (*rw_header)->size = size;
+
+ hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)),
+ BPF_PROG_CHUNK_SIZE - sizeof(*ro_header));
+ start = get_random_u32_below(hole) & ~(alignment - 1);
+
+ *image_ptr = &ro_header->image[start];
+ *rw_image = &(*rw_header)->image[start];
+
+ return ro_header;
+}
+
+/* Copy JITed text from rw_header to its final location, the ro_header. */
+int bpf_jit_binary_pack_finalize(struct bpf_binary_header *ro_header,
+ struct bpf_binary_header *rw_header)
+{
+ void *ptr;
+
+ ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size);
+
+ kvfree(rw_header);
+
+ if (IS_ERR(ptr)) {
+ bpf_prog_pack_free(ro_header, ro_header->size);
+ return PTR_ERR(ptr);
+ }
+ return 0;
+}
+
+/* bpf_jit_binary_pack_free is called in two different scenarios:
+ * 1) when the program is freed after;
+ * 2) when the JIT engine fails (before bpf_jit_binary_pack_finalize).
+ * For case 2), we need to free both the RO memory and the RW buffer.
+ *
+ * bpf_jit_binary_pack_free requires proper ro_header->size. However,
+ * bpf_jit_binary_pack_alloc does not set it. Therefore, ro_header->size
+ * must be set with either bpf_jit_binary_pack_finalize (normal path) or
+ * bpf_arch_text_copy (when jit fails).
+ */
+void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
+ struct bpf_binary_header *rw_header)
+{
+ u32 size = ro_header->size;
+
+ bpf_prog_pack_free(ro_header, size);
+ kvfree(rw_header);
+ bpf_jit_uncharge_modmem(size);
+}
+
+struct bpf_binary_header *
+bpf_jit_binary_pack_hdr(const struct bpf_prog *fp)
+{
+ unsigned long real_start = (unsigned long)fp->bpf_func;
+ unsigned long addr;
+
+ addr = real_start & BPF_PROG_CHUNK_MASK;
+ return (void *)addr;
+}
+
+static inline struct bpf_binary_header *
+bpf_jit_binary_hdr(const struct bpf_prog *fp)
+{
+ unsigned long real_start = (unsigned long)fp->bpf_func;
+ unsigned long addr;
+
+ addr = real_start & PAGE_MASK;
+ return (void *)addr;
}
/* This symbol is only overridden by archs that have different
@@ -919,7 +1219,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
bpf_jit_binary_free(hdr);
-
WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
}
@@ -933,6 +1232,7 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog,
s16 off = insn->off;
s32 imm = insn->imm;
u8 *addr;
+ int err;
*func_addr_fixed = insn->src_reg != BPF_PSEUDO_CALL;
if (!*func_addr_fixed) {
@@ -943,10 +1243,15 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog,
if (!extra_pass)
addr = NULL;
else if (prog->aux->func &&
- off >= 0 && off < prog->aux->func_cnt)
+ off >= 0 && off < prog->aux->real_func_cnt)
addr = (u8 *)prog->aux->func[off]->bpf_func;
else
return -EINVAL;
+ } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
+ bpf_jit_supports_far_kfunc_call()) {
+ err = bpf_get_kfunc_addr(prog, insn->imm, insn->off, &addr);
+ if (err)
+ return err;
} else {
/* Address of a BPF helper call. Since part of the core
* kernel, it's always at a fixed location. __bpf_call_base
@@ -960,13 +1265,20 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog,
return 0;
}
+const char *bpf_jit_get_prog_name(struct bpf_prog *prog)
+{
+ if (prog->aux->ksym.prog)
+ return prog->aux->ksym.name;
+ return prog->aux->name;
+}
+
static int bpf_jit_blind_insn(const struct bpf_insn *from,
const struct bpf_insn *aux,
struct bpf_insn *to_buff,
bool emit_zext)
{
struct bpf_insn *to = to_buff;
- u32 imm_rnd = get_random_int();
+ u32 imm_rnd = get_random_u32();
s16 off;
BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG);
@@ -1011,7 +1323,7 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
case BPF_ALU | BPF_MOD | BPF_K:
*to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
*to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
- *to++ = BPF_ALU32_REG(from->code, from->dst_reg, BPF_REG_AX);
+ *to++ = BPF_ALU32_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off);
break;
case BPF_ALU64 | BPF_ADD | BPF_K:
@@ -1025,7 +1337,7 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
case BPF_ALU64 | BPF_MOD | BPF_K:
*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
- *to++ = BPF_ALU64_REG(from->code, from->dst_reg, BPF_REG_AX);
+ *to++ = BPF_ALU64_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off);
break;
case BPF_JMP | BPF_JEQ | BPF_K:
@@ -1138,6 +1450,23 @@ void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other)
bpf_prog_clone_free(fp_other);
}
+static void adjust_insn_arrays(struct bpf_prog *prog, u32 off, u32 len)
+{
+#ifdef CONFIG_BPF_SYSCALL
+ struct bpf_map *map;
+ int i;
+
+ if (len <= 1)
+ return;
+
+ for (i = 0; i < prog->aux->used_map_cnt; i++) {
+ map = prog->aux->used_maps[i];
+ if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY)
+ bpf_insn_array_adjust(map, off, len);
+ }
+#endif
+}
+
struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
{
struct bpf_insn insn_buff[16], aux[2];
@@ -1146,7 +1475,7 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
struct bpf_insn *insn;
int i, rewritten;
- if (!bpf_jit_blinding_enabled(prog) || prog->blinded)
+ if (!prog->blinding_requested || prog->blinded)
return prog;
clone = bpf_prog_clone_create(prog, GFP_USER);
@@ -1157,6 +1486,16 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
insn = clone->insnsi;
for (i = 0; i < insn_cnt; i++, insn++) {
+ if (bpf_pseudo_func(insn)) {
+ /* ld_imm64 with an address of bpf subprog is not
+ * a user controlled constant. Don't randomize it,
+ * since it will conflict with jit_subprogs() logic.
+ */
+ insn++;
+ i++;
+ continue;
+ }
+
/* We temporarily need to hold the original ld64 insn
* so that we can still access the first part in the
* second blinding run.
@@ -1183,6 +1522,9 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
clone = tmp;
insn_delta = rewritten - 1;
+ /* Instructions arrays must be updated using absolute xlated offsets */
+ adjust_insn_arrays(clone, prog->aux->subprog_start + i, rewritten);
+
/* Walk new program and skip insns we just inserted. */
insn = clone->insnsi + i + insn_delta;
insn_cnt += insn_delta;
@@ -1253,6 +1595,7 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
INSN_3(ALU64, DIV, X), \
INSN_3(ALU64, MOD, X), \
INSN_2(ALU64, NEG), \
+ INSN_3(ALU64, END, TO_LE), \
/* Immediate based. */ \
INSN_3(ALU64, ADD, K), \
INSN_3(ALU64, SUB, K), \
@@ -1321,14 +1664,18 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
INSN_3(JMP, JSLE, K), \
INSN_3(JMP, JSET, K), \
INSN_2(JMP, JA), \
+ INSN_2(JMP32, JA), \
+ /* Atomic operations. */ \
+ INSN_3(STX, ATOMIC, B), \
+ INSN_3(STX, ATOMIC, H), \
+ INSN_3(STX, ATOMIC, W), \
+ INSN_3(STX, ATOMIC, DW), \
/* Store instructions. */ \
/* Register based. */ \
INSN_3(STX, MEM, B), \
INSN_3(STX, MEM, H), \
INSN_3(STX, MEM, W), \
INSN_3(STX, MEM, DW), \
- INSN_3(STX, ATOMIC, W), \
- INSN_3(STX, ATOMIC, DW), \
/* Immediate based. */ \
INSN_3(ST, MEM, B), \
INSN_3(ST, MEM, H), \
@@ -1340,6 +1687,9 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
INSN_3(LDX, MEM, H), \
INSN_3(LDX, MEM, W), \
INSN_3(LDX, MEM, DW), \
+ INSN_3(LDX, MEMSX, B), \
+ INSN_3(LDX, MEMSX, H), \
+ INSN_3(LDX, MEMSX, W), \
/* Immediate based. */ \
INSN_3(LD, IMM, DW)
@@ -1358,6 +1708,8 @@ bool bpf_opcode_in_insntable(u8 code)
[BPF_LD | BPF_IND | BPF_B] = true,
[BPF_LD | BPF_IND | BPF_H] = true,
[BPF_LD | BPF_IND | BPF_W] = true,
+ [BPF_JMP | BPF_JA | BPF_X] = true,
+ [BPF_JMP | BPF_JCOND] = true,
};
#undef BPF_INSN_3_TBL
#undef BPF_INSN_2_TBL
@@ -1365,12 +1717,6 @@ bool bpf_opcode_in_insntable(u8 code)
}
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
-u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr)
-{
- memset(dst, 0, size);
- return -EFAULT;
-}
-
/**
* ___bpf_prog_run - run eBPF program on a given context
* @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers
@@ -1396,6 +1742,9 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
[BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H,
[BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W,
[BPF_LDX | BPF_PROBE_MEM | BPF_DW] = &&LDX_PROBE_MEM_DW,
+ [BPF_LDX | BPF_PROBE_MEMSX | BPF_B] = &&LDX_PROBE_MEMSX_B,
+ [BPF_LDX | BPF_PROBE_MEMSX | BPF_H] = &&LDX_PROBE_MEMSX_H,
+ [BPF_LDX | BPF_PROBE_MEMSX | BPF_W] = &&LDX_PROBE_MEMSX_W,
};
#undef BPF_INSN_3_LBL
#undef BPF_INSN_2_LBL
@@ -1463,13 +1812,36 @@ select_insn:
DST = -DST;
CONT;
ALU_MOV_X:
- DST = (u32) SRC;
+ switch (OFF) {
+ case 0:
+ DST = (u32) SRC;
+ break;
+ case 8:
+ DST = (u32)(s8) SRC;
+ break;
+ case 16:
+ DST = (u32)(s16) SRC;
+ break;
+ }
CONT;
ALU_MOV_K:
DST = (u32) IMM;
CONT;
ALU64_MOV_X:
- DST = SRC;
+ switch (OFF) {
+ case 0:
+ DST = SRC;
+ break;
+ case 8:
+ DST = (s8) SRC;
+ break;
+ case 16:
+ DST = (s16) SRC;
+ break;
+ case 32:
+ DST = (s32) SRC;
+ break;
+ }
CONT;
ALU64_MOV_K:
DST = IMM;
@@ -1491,36 +1863,114 @@ select_insn:
(*(s64 *) &DST) >>= IMM;
CONT;
ALU64_MOD_X:
- div64_u64_rem(DST, SRC, &AX);
- DST = AX;
+ switch (OFF) {
+ case 0:
+ div64_u64_rem(DST, SRC, &AX);
+ DST = AX;
+ break;
+ case 1:
+ AX = div64_s64(DST, SRC);
+ DST = DST - AX * SRC;
+ break;
+ }
CONT;
ALU_MOD_X:
- AX = (u32) DST;
- DST = do_div(AX, (u32) SRC);
+ switch (OFF) {
+ case 0:
+ AX = (u32) DST;
+ DST = do_div(AX, (u32) SRC);
+ break;
+ case 1:
+ AX = abs((s32)DST);
+ AX = do_div(AX, abs((s32)SRC));
+ if ((s32)DST < 0)
+ DST = (u32)-AX;
+ else
+ DST = (u32)AX;
+ break;
+ }
CONT;
ALU64_MOD_K:
- div64_u64_rem(DST, IMM, &AX);
- DST = AX;
+ switch (OFF) {
+ case 0:
+ div64_u64_rem(DST, IMM, &AX);
+ DST = AX;
+ break;
+ case 1:
+ AX = div64_s64(DST, IMM);
+ DST = DST - AX * IMM;
+ break;
+ }
CONT;
ALU_MOD_K:
- AX = (u32) DST;
- DST = do_div(AX, (u32) IMM);
+ switch (OFF) {
+ case 0:
+ AX = (u32) DST;
+ DST = do_div(AX, (u32) IMM);
+ break;
+ case 1:
+ AX = abs((s32)DST);
+ AX = do_div(AX, abs((s32)IMM));
+ if ((s32)DST < 0)
+ DST = (u32)-AX;
+ else
+ DST = (u32)AX;
+ break;
+ }
CONT;
ALU64_DIV_X:
- DST = div64_u64(DST, SRC);
+ switch (OFF) {
+ case 0:
+ DST = div64_u64(DST, SRC);
+ break;
+ case 1:
+ DST = div64_s64(DST, SRC);
+ break;
+ }
CONT;
ALU_DIV_X:
- AX = (u32) DST;
- do_div(AX, (u32) SRC);
- DST = (u32) AX;
+ switch (OFF) {
+ case 0:
+ AX = (u32) DST;
+ do_div(AX, (u32) SRC);
+ DST = (u32) AX;
+ break;
+ case 1:
+ AX = abs((s32)DST);
+ do_div(AX, abs((s32)SRC));
+ if (((s32)DST < 0) == ((s32)SRC < 0))
+ DST = (u32)AX;
+ else
+ DST = (u32)-AX;
+ break;
+ }
CONT;
ALU64_DIV_K:
- DST = div64_u64(DST, IMM);
+ switch (OFF) {
+ case 0:
+ DST = div64_u64(DST, IMM);
+ break;
+ case 1:
+ DST = div64_s64(DST, IMM);
+ break;
+ }
CONT;
ALU_DIV_K:
- AX = (u32) DST;
- do_div(AX, (u32) IMM);
- DST = (u32) AX;
+ switch (OFF) {
+ case 0:
+ AX = (u32) DST;
+ do_div(AX, (u32) IMM);
+ DST = (u32) AX;
+ break;
+ case 1:
+ AX = abs((s32)DST);
+ do_div(AX, abs((s32)IMM));
+ if (((s32)DST < 0) == ((s32)IMM < 0))
+ DST = (u32)AX;
+ else
+ DST = (u32)-AX;
+ break;
+ }
CONT;
ALU_END_TO_BE:
switch (IMM) {
@@ -1548,6 +1998,19 @@ select_insn:
break;
}
CONT;
+ ALU64_END_TO_LE:
+ switch (IMM) {
+ case 16:
+ DST = (__force u16) __swab16(DST);
+ break;
+ case 32:
+ DST = (__force u32) __swab32(DST);
+ break;
+ case 64:
+ DST = (__force u64) __swab64(DST);
+ break;
+ }
+ CONT;
/* CALL */
JMP_CALL:
@@ -1597,6 +2060,9 @@ out:
JMP_JA:
insn += insn->off;
CONT;
+ JMP32_JA:
+ insn += insn->imm;
+ CONT;
JMP_EXIT:
return BPF_R0;
/* JMP */
@@ -1639,18 +2105,17 @@ out:
#undef COND_JMP
/* ST, STX and LDX*/
ST_NOSPEC:
- /* Speculation barrier for mitigating Speculative Store Bypass.
- * In case of arm64, we rely on the firmware mitigation as
- * controlled via the ssbd kernel parameter. Whenever the
- * mitigation is enabled, it works for all of the kernel code
- * with no need to provide any additional instructions here.
- * In case of x86, we use 'lfence' insn for mitigation. We
- * reuse preexisting logic from Spectre v1 mitigation that
- * happens to produce the required code on x86 for v4 as well.
+ /* Speculation barrier for mitigating Speculative Store Bypass,
+ * Bounds-Check Bypass and Type Confusion. In case of arm64, we
+ * rely on the firmware mitigation as controlled via the ssbd
+ * kernel parameter. Whenever the mitigation is enabled, it
+ * works for all of the kernel code with no need to provide any
+ * additional instructions here. In case of x86, we use 'lfence'
+ * insn for mitigation. We reuse preexisting logic from Spectre
+ * v1 mitigation that happens to produce the required code on
+ * x86 for v4 as well.
*/
-#ifdef CONFIG_X86
barrier_nospec();
-#endif
CONT;
#define LDST(SIZEOP, SIZE) \
STX_MEM_##SIZEOP: \
@@ -1661,6 +2126,11 @@ out:
CONT; \
LDX_MEM_##SIZEOP: \
DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
+ CONT; \
+ LDX_PROBE_MEM_##SIZEOP: \
+ bpf_probe_read_kernel_common(&DST, sizeof(SIZE), \
+ (const void *)(long) (SRC + insn->off)); \
+ DST = *((SIZE *)&DST); \
CONT;
LDST(B, u8)
@@ -1668,39 +2138,54 @@ out:
LDST(W, u32)
LDST(DW, u64)
#undef LDST
-#define LDX_PROBE(SIZEOP, SIZE) \
- LDX_PROBE_MEM_##SIZEOP: \
- bpf_probe_read_kernel(&DST, SIZE, (const void *)(long) (SRC + insn->off)); \
+
+#define LDSX(SIZEOP, SIZE) \
+ LDX_MEMSX_##SIZEOP: \
+ DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
+ CONT; \
+ LDX_PROBE_MEMSX_##SIZEOP: \
+ bpf_probe_read_kernel_common(&DST, sizeof(SIZE), \
+ (const void *)(long) (SRC + insn->off)); \
+ DST = *((SIZE *)&DST); \
CONT;
- LDX_PROBE(B, 1)
- LDX_PROBE(H, 2)
- LDX_PROBE(W, 4)
- LDX_PROBE(DW, 8)
-#undef LDX_PROBE
+
+ LDSX(B, s8)
+ LDSX(H, s16)
+ LDSX(W, s32)
+#undef LDSX
#define ATOMIC_ALU_OP(BOP, KOP) \
case BOP: \
if (BPF_SIZE(insn->code) == BPF_W) \
atomic_##KOP((u32) SRC, (atomic_t *)(unsigned long) \
(DST + insn->off)); \
- else \
+ else if (BPF_SIZE(insn->code) == BPF_DW) \
atomic64_##KOP((u64) SRC, (atomic64_t *)(unsigned long) \
(DST + insn->off)); \
+ else \
+ goto default_label; \
break; \
case BOP | BPF_FETCH: \
if (BPF_SIZE(insn->code) == BPF_W) \
SRC = (u32) atomic_fetch_##KOP( \
(u32) SRC, \
(atomic_t *)(unsigned long) (DST + insn->off)); \
- else \
+ else if (BPF_SIZE(insn->code) == BPF_DW) \
SRC = (u64) atomic64_fetch_##KOP( \
(u64) SRC, \
(atomic64_t *)(unsigned long) (DST + insn->off)); \
+ else \
+ goto default_label; \
break;
STX_ATOMIC_DW:
STX_ATOMIC_W:
+ STX_ATOMIC_H:
+ STX_ATOMIC_B:
switch (IMM) {
+ /* Atomic read-modify-write instructions support only W and DW
+ * size modifiers.
+ */
ATOMIC_ALU_OP(BPF_ADD, add)
ATOMIC_ALU_OP(BPF_AND, and)
ATOMIC_ALU_OP(BPF_OR, or)
@@ -1712,20 +2197,63 @@ out:
SRC = (u32) atomic_xchg(
(atomic_t *)(unsigned long) (DST + insn->off),
(u32) SRC);
- else
+ else if (BPF_SIZE(insn->code) == BPF_DW)
SRC = (u64) atomic64_xchg(
(atomic64_t *)(unsigned long) (DST + insn->off),
(u64) SRC);
+ else
+ goto default_label;
break;
case BPF_CMPXCHG:
if (BPF_SIZE(insn->code) == BPF_W)
BPF_R0 = (u32) atomic_cmpxchg(
(atomic_t *)(unsigned long) (DST + insn->off),
(u32) BPF_R0, (u32) SRC);
- else
+ else if (BPF_SIZE(insn->code) == BPF_DW)
BPF_R0 = (u64) atomic64_cmpxchg(
(atomic64_t *)(unsigned long) (DST + insn->off),
(u64) BPF_R0, (u64) SRC);
+ else
+ goto default_label;
+ break;
+ /* Atomic load and store instructions support all size
+ * modifiers.
+ */
+ case BPF_LOAD_ACQ:
+ switch (BPF_SIZE(insn->code)) {
+#define LOAD_ACQUIRE(SIZEOP, SIZE) \
+ case BPF_##SIZEOP: \
+ DST = (SIZE)smp_load_acquire( \
+ (SIZE *)(unsigned long)(SRC + insn->off)); \
+ break;
+ LOAD_ACQUIRE(B, u8)
+ LOAD_ACQUIRE(H, u16)
+ LOAD_ACQUIRE(W, u32)
+#ifdef CONFIG_64BIT
+ LOAD_ACQUIRE(DW, u64)
+#endif
+#undef LOAD_ACQUIRE
+ default:
+ goto default_label;
+ }
+ break;
+ case BPF_STORE_REL:
+ switch (BPF_SIZE(insn->code)) {
+#define STORE_RELEASE(SIZEOP, SIZE) \
+ case BPF_##SIZEOP: \
+ smp_store_release( \
+ (SIZE *)(unsigned long)(DST + insn->off), (SIZE)SRC); \
+ break;
+ STORE_RELEASE(B, u8)
+ STORE_RELEASE(H, u16)
+ STORE_RELEASE(W, u32)
+#ifdef CONFIG_64BIT
+ STORE_RELEASE(DW, u64)
+#endif
+#undef STORE_RELEASE
+ default:
+ goto default_label;
+ }
break;
default:
@@ -1751,8 +2279,9 @@ out:
static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \
{ \
u64 stack[stack_size / sizeof(u64)]; \
- u64 regs[MAX_BPF_EXT_REG]; \
+ u64 regs[MAX_BPF_EXT_REG] = {}; \
\
+ kmsan_unpoison_memory(stack, sizeof(stack)); \
FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
ARG1 = (u64) (unsigned long) ctx; \
return ___bpf_prog_run(regs, insn); \
@@ -1766,6 +2295,7 @@ static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \
u64 stack[stack_size / sizeof(u64)]; \
u64 regs[MAX_BPF_EXT_REG]; \
\
+ kmsan_unpoison_memory(stack, sizeof(stack)); \
FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
BPF_R1 = r1; \
BPF_R2 = r2; \
@@ -1800,14 +2330,16 @@ EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
};
#undef PROG_NAME_LIST
#define PROG_NAME_LIST(stack_size) PROG_NAME_ARGS(stack_size),
-static u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5,
- const struct bpf_insn *insn) = {
+static __maybe_unused
+u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5,
+ const struct bpf_insn *insn) = {
EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
};
#undef PROG_NAME_LIST
+#ifdef CONFIG_BPF_SYSCALL
void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth)
{
stack_depth = max_t(u32, stack_depth, 1);
@@ -1816,8 +2348,9 @@ void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth)
__bpf_call_base_args;
insn->code = BPF_JMP | BPF_CALL_ARGS;
}
+#endif
+#endif
-#else
static unsigned int __bpf_prog_ret0_warn(const void *ctx,
const struct bpf_insn *insn)
{
@@ -1827,33 +2360,84 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx,
WARN_ON_ONCE(1);
return 0;
}
-#endif
-bool bpf_prog_array_compatible(struct bpf_array *array,
- const struct bpf_prog *fp)
+static bool __bpf_prog_map_compatible(struct bpf_map *map,
+ const struct bpf_prog *fp)
{
- bool ret;
+ enum bpf_prog_type prog_type = resolve_prog_type(fp);
+ struct bpf_prog_aux *aux = fp->aux;
+ enum bpf_cgroup_storage_type i;
+ bool ret = false;
+ u64 cookie;
if (fp->kprobe_override)
- return false;
-
- spin_lock(&array->aux->owner.lock);
+ return ret;
- if (!array->aux->owner.type) {
- /* There's no owner yet where we could check for
- * compatibility.
- */
- array->aux->owner.type = fp->type;
- array->aux->owner.jited = fp->jited;
+ spin_lock(&map->owner_lock);
+ /* There's no owner yet where we could check for compatibility. */
+ if (!map->owner) {
+ map->owner = bpf_map_owner_alloc(map);
+ if (!map->owner)
+ goto err;
+ map->owner->type = prog_type;
+ map->owner->jited = fp->jited;
+ map->owner->xdp_has_frags = aux->xdp_has_frags;
+ map->owner->expected_attach_type = fp->expected_attach_type;
+ map->owner->attach_func_proto = aux->attach_func_proto;
+ for_each_cgroup_storage_type(i) {
+ map->owner->storage_cookie[i] =
+ aux->cgroup_storage[i] ?
+ aux->cgroup_storage[i]->cookie : 0;
+ }
ret = true;
} else {
- ret = array->aux->owner.type == fp->type &&
- array->aux->owner.jited == fp->jited;
+ ret = map->owner->type == prog_type &&
+ map->owner->jited == fp->jited &&
+ map->owner->xdp_has_frags == aux->xdp_has_frags;
+ if (ret &&
+ map->map_type == BPF_MAP_TYPE_PROG_ARRAY &&
+ map->owner->expected_attach_type != fp->expected_attach_type)
+ ret = false;
+ for_each_cgroup_storage_type(i) {
+ if (!ret)
+ break;
+ cookie = aux->cgroup_storage[i] ?
+ aux->cgroup_storage[i]->cookie : 0;
+ ret = map->owner->storage_cookie[i] == cookie ||
+ !cookie;
+ }
+ if (ret &&
+ map->owner->attach_func_proto != aux->attach_func_proto) {
+ switch (prog_type) {
+ case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_LSM:
+ case BPF_PROG_TYPE_EXT:
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ ret = false;
+ break;
+ default:
+ break;
+ }
+ }
}
- spin_unlock(&array->aux->owner.lock);
+err:
+ spin_unlock(&map->owner_lock);
return ret;
}
+bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp)
+{
+ /* XDP programs inserted into maps are not guaranteed to run on
+ * a particular netdev (and can run outside driver context entirely
+ * in the case of devmap and cpumap). Until device checks
+ * are implemented, prohibit adding dev-bound programs to program maps.
+ */
+ if (bpf_prog_is_dev_bound(fp->aux))
+ return false;
+
+ return __bpf_prog_map_compatible(map, fp);
+}
+
static int bpf_check_tail_call(const struct bpf_prog *fp)
{
struct bpf_prog_aux *aux = fp->aux;
@@ -1862,13 +2446,11 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
mutex_lock(&aux->used_maps_mutex);
for (i = 0; i < aux->used_map_cnt; i++) {
struct bpf_map *map = aux->used_maps[i];
- struct bpf_array *array;
- if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+ if (!map_type_contains_progs(map))
continue;
- array = container_of(map, struct bpf_array, map);
- if (!bpf_prog_array_compatible(array, fp)) {
+ if (!__bpf_prog_map_compatible(map, fp)) {
ret = -EINVAL;
goto out;
}
@@ -1879,15 +2461,27 @@ out:
return ret;
}
-static void bpf_prog_select_func(struct bpf_prog *fp)
+static bool bpf_prog_select_interpreter(struct bpf_prog *fp)
{
+ bool select_interpreter = false;
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
+ u32 idx = (round_up(stack_depth, 32) / 32) - 1;
- fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
+ /* may_goto may cause stack size > 512, leading to idx out-of-bounds.
+ * But for non-JITed programs, we don't need bpf_func, so no bounds
+ * check needed.
+ */
+ if (idx < ARRAY_SIZE(interpreters)) {
+ fp->bpf_func = interpreters[idx];
+ select_interpreter = true;
+ } else {
+ fp->bpf_func = __bpf_prog_ret0_warn;
+ }
#else
fp->bpf_func = __bpf_prog_ret0_warn;
#endif
+ return select_interpreter;
}
/**
@@ -1915,7 +2509,8 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
bpf_prog_has_kfunc_call(fp))
jit_needed = true;
- bpf_prog_select_func(fp);
+ if (!bpf_prog_select_interpreter(fp))
+ jit_needed = true;
/* eBPF JITs can rewrite the program in case constant
* blinding is active. However, in case of error during
@@ -1923,7 +2518,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
* valid program, which in this case would simply not
* be JITed, but falls back to the interpreter.
*/
- if (!bpf_prog_is_dev_bound(fp->aux)) {
+ if (!bpf_prog_is_offloaded(fp->aux)) {
*err = bpf_prog_alloc_jited_linfo(fp);
if (*err)
return fp;
@@ -1941,7 +2536,9 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
}
finalize:
- bpf_prog_lock_ro(fp);
+ *err = bpf_prog_lock_ro(fp);
+ if (*err)
+ return fp;
/* The tail call compatibility check can only be done at
* this late stage as we need to determine, if we deal
@@ -1968,37 +2565,51 @@ static struct bpf_prog_dummy {
},
};
-/* to avoid allocating empty bpf_prog_array for cgroups that
- * don't have bpf program attached use one global 'empty_prog_array'
- * It will not be modified the caller of bpf_prog_array_alloc()
- * (since caller requested prog_cnt == 0)
- * that pointer should be 'freed' by bpf_prog_array_free()
- */
-static struct {
- struct bpf_prog_array hdr;
- struct bpf_prog *null_prog;
-} empty_prog_array = {
+struct bpf_empty_prog_array bpf_empty_prog_array = {
.null_prog = NULL,
};
+EXPORT_SYMBOL(bpf_empty_prog_array);
struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
{
+ struct bpf_prog_array *p;
+
if (prog_cnt)
- return kzalloc(sizeof(struct bpf_prog_array) +
- sizeof(struct bpf_prog_array_item) *
- (prog_cnt + 1),
- flags);
+ p = kzalloc(struct_size(p, items, prog_cnt + 1), flags);
+ else
+ p = &bpf_empty_prog_array.hdr;
- return &empty_prog_array.hdr;
+ return p;
}
void bpf_prog_array_free(struct bpf_prog_array *progs)
{
- if (!progs || progs == &empty_prog_array.hdr)
+ if (!progs || progs == &bpf_empty_prog_array.hdr)
return;
kfree_rcu(progs, rcu);
}
+static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu)
+{
+ struct bpf_prog_array *progs;
+
+ /* If RCU Tasks Trace grace period implies RCU grace period, there is
+ * no need to call kfree_rcu(), just call kfree() directly.
+ */
+ progs = container_of(rcu, struct bpf_prog_array, rcu);
+ if (rcu_trace_implies_rcu_gp())
+ kfree(progs);
+ else
+ kfree_rcu(progs, rcu);
+}
+
+void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs)
+{
+ if (!progs || progs == &bpf_empty_prog_array.hdr)
+ return;
+ call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb);
+}
+
int bpf_prog_array_length(struct bpf_prog_array *array)
{
struct bpf_prog_array_item *item;
@@ -2228,12 +2839,16 @@ void __bpf_free_used_maps(struct bpf_prog_aux *aux,
struct bpf_map **used_maps, u32 len)
{
struct bpf_map *map;
+ bool sleepable;
u32 i;
+ sleepable = aux->prog->sleepable;
for (i = 0; i < len; i++) {
map = used_maps[i];
if (map->ops->map_poke_untrack)
map->ops->map_poke_untrack(map, aux);
+ if (sleepable)
+ atomic64_dec(&map->sleepable_refcnt);
bpf_map_put(map);
}
}
@@ -2244,8 +2859,7 @@ static void bpf_free_used_maps(struct bpf_prog_aux *aux)
kfree(aux->used_maps);
}
-void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
- struct btf_mod_pair *used_btfs, u32 len)
+void __bpf_free_used_btfs(struct btf_mod_pair *used_btfs, u32 len)
{
#ifdef CONFIG_BPF_SYSCALL
struct btf_mod_pair *btf_mod;
@@ -2262,7 +2876,7 @@ void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
static void bpf_free_used_btfs(struct bpf_prog_aux *aux)
{
- __bpf_free_used_btfs(aux, aux->used_btfs, aux->used_btf_cnt);
+ __bpf_free_used_btfs(aux->used_btfs, aux->used_btf_cnt);
kfree(aux->used_btfs);
}
@@ -2274,18 +2888,23 @@ static void bpf_prog_free_deferred(struct work_struct *work)
aux = container_of(work, struct bpf_prog_aux, work);
#ifdef CONFIG_BPF_SYSCALL
bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab);
+ bpf_prog_stream_free(aux->prog);
+#endif
+#ifdef CONFIG_CGROUP_BPF
+ if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID)
+ bpf_cgroup_atype_put(aux->cgroup_atype);
#endif
bpf_free_used_maps(aux);
bpf_free_used_btfs(aux);
if (bpf_prog_is_dev_bound(aux))
- bpf_prog_offload_destroy(aux->prog);
+ bpf_prog_dev_bound_destroy(aux->prog);
#ifdef CONFIG_PERF_EVENTS
if (aux->prog->has_callchain_buf)
put_callchain_buffers();
#endif
if (aux->dst_trampoline)
bpf_trampoline_put(aux->dst_trampoline);
- for (i = 0; i < aux->func_cnt; i++) {
+ for (i = 0; i < aux->real_func_cnt; i++) {
/* We can just unlink the subprog poke descriptor table as
* it was originally linked to the main program and is also
* released along with it.
@@ -2293,7 +2912,7 @@ static void bpf_prog_free_deferred(struct work_struct *work)
aux->func[i]->aux->poke_tab = NULL;
bpf_jit_free(aux->func[i]);
}
- if (aux->func_cnt) {
+ if (aux->real_func_cnt) {
kfree(aux->func);
bpf_prog_unlock_free(aux->prog);
} else {
@@ -2307,12 +2926,13 @@ void bpf_prog_free(struct bpf_prog *fp)
if (aux->dst_prog)
bpf_prog_put(aux->dst_prog);
+ bpf_token_put(aux->token);
INIT_WORK(&aux->work, bpf_prog_free_deferred);
schedule_work(&aux->work);
}
EXPORT_SYMBOL_GPL(bpf_prog_free);
-/* RNG for unpriviledged user space with separated state from prandom_u32(). */
+/* RNG for unprivileged user space with separated state from prandom_u32(). */
static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state);
void bpf_user_rnd_init_once(void)
@@ -2350,6 +2970,7 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
const struct bpf_func_proto bpf_map_push_elem_proto __weak;
const struct bpf_func_proto bpf_map_pop_elem_proto __weak;
const struct bpf_func_proto bpf_map_peek_elem_proto __weak;
+const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto __weak;
const struct bpf_func_proto bpf_spin_lock_proto __weak;
const struct bpf_func_proto bpf_spin_unlock_proto __weak;
const struct bpf_func_proto bpf_jiffies64_proto __weak;
@@ -2360,6 +2981,7 @@ const struct bpf_func_proto bpf_get_numa_node_id_proto __weak;
const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak;
const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak;
+const struct bpf_func_proto bpf_ktime_get_tai_ns_proto __weak;
const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
@@ -2370,6 +2992,8 @@ const struct bpf_func_proto bpf_get_local_storage_proto __weak;
const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_snprintf_btf_proto __weak;
const struct bpf_func_proto bpf_seq_printf_btf_proto __weak;
+const struct bpf_func_proto bpf_set_retval_proto __weak;
+const struct bpf_func_proto bpf_get_retval_proto __weak;
const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
{
@@ -2381,6 +3005,11 @@ const struct bpf_func_proto * __weak bpf_get_trace_vprintk_proto(void)
return NULL;
}
+const struct bpf_func_proto * __weak bpf_get_perf_event_read_value_proto(void)
+{
+ return NULL;
+}
+
u64 __weak
bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
@@ -2391,7 +3020,10 @@ EXPORT_SYMBOL_GPL(bpf_event_output);
/* Always built-in helper functions. */
const struct bpf_func_proto bpf_tail_call_proto = {
- .func = NULL,
+ /* func is unused for tail_call, we set it to pass the
+ * get_helper_proto check
+ */
+ .func = BPF_PTR_POISON,
.gpl_only = false,
.ret_type = RET_VOID,
.arg1_type = ARG_PTR_TO_CTX,
@@ -2415,7 +3047,7 @@ void __weak bpf_jit_compile(struct bpf_prog *prog)
{
}
-bool __weak bpf_helper_changes_pkt_data(void *func)
+bool __weak bpf_helper_changes_pkt_data(enum bpf_func_id func_id)
{
return false;
}
@@ -2433,11 +3065,82 @@ bool __weak bpf_jit_needs_zext(void)
return false;
}
+/* By default, enable the verifier's mitigations against Spectre v1 and v4 for
+ * all archs. The value returned must not change at runtime as there is
+ * currently no support for reloading programs that were loaded without
+ * mitigations.
+ */
+bool __weak bpf_jit_bypass_spec_v1(void)
+{
+ return false;
+}
+
+bool __weak bpf_jit_bypass_spec_v4(void)
+{
+ return false;
+}
+
+/* Return true if the JIT inlines the call to the helper corresponding to
+ * the imm.
+ *
+ * The verifier will not patch the insn->imm for the call to the helper if
+ * this returns true.
+ */
+bool __weak bpf_jit_inlines_helper_call(s32 imm)
+{
+ return false;
+}
+
+/* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. */
+bool __weak bpf_jit_supports_subprog_tailcalls(void)
+{
+ return false;
+}
+
+bool __weak bpf_jit_supports_percpu_insn(void)
+{
+ return false;
+}
+
bool __weak bpf_jit_supports_kfunc_call(void)
{
return false;
}
+bool __weak bpf_jit_supports_far_kfunc_call(void)
+{
+ return false;
+}
+
+bool __weak bpf_jit_supports_arena(void)
+{
+ return false;
+}
+
+bool __weak bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena)
+{
+ return false;
+}
+
+u64 __weak bpf_arch_uaddress_limit(void)
+{
+#if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE)
+ return TASK_SIZE;
+#else
+ return 0;
+#endif
+}
+
+/* Return TRUE if the JIT backend satisfies the following two conditions:
+ * 1) JIT backend supports atomic_xchg() on pointer-sized words.
+ * 2) Under the specific arch, the implementation of xchg() is the same
+ * as atomic_xchg() on pointer-sized words.
+ */
+bool __weak bpf_jit_supports_ptr_xchg(void)
+{
+ return false;
+}
+
/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
* skb_copy_bits(), so provide a weak definition of it for NET-less config.
*/
@@ -2447,12 +3150,104 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
return -EFAULT;
}
-int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
- void *addr1, void *addr2)
+int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t,
+ enum bpf_text_poke_type new_t, void *old_addr,
+ void *new_addr)
+{
+ return -ENOTSUPP;
+}
+
+void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len)
+{
+ return ERR_PTR(-ENOTSUPP);
+}
+
+int __weak bpf_arch_text_invalidate(void *dst, size_t len)
{
return -ENOTSUPP;
}
+bool __weak bpf_jit_supports_exceptions(void)
+{
+ return false;
+}
+
+bool __weak bpf_jit_supports_private_stack(void)
+{
+ return false;
+}
+
+void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
+{
+}
+
+bool __weak bpf_jit_supports_timed_may_goto(void)
+{
+ return false;
+}
+
+u64 __weak arch_bpf_timed_may_goto(void)
+{
+ return 0;
+}
+
+static noinline void bpf_prog_report_may_goto_violation(void)
+{
+#ifdef CONFIG_BPF_SYSCALL
+ struct bpf_stream_stage ss;
+ struct bpf_prog *prog;
+
+ prog = bpf_prog_find_from_stack();
+ if (!prog)
+ return;
+ bpf_stream_stage(ss, prog, BPF_STDERR, ({
+ bpf_stream_printk(ss, "ERROR: Timeout detected for may_goto instruction\n");
+ bpf_stream_dump_stack(ss);
+ }));
+#endif
+}
+
+u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p)
+{
+ u64 time = ktime_get_mono_fast_ns();
+
+ /* Populate the timestamp for this stack frame, and refresh count. */
+ if (!p->timestamp) {
+ p->timestamp = time;
+ return BPF_MAX_TIMED_LOOPS;
+ }
+ /* Check if we've exhausted our time slice, and zero count. */
+ if (unlikely(time - p->timestamp >= (NSEC_PER_SEC / 4))) {
+ bpf_prog_report_may_goto_violation();
+ return 0;
+ }
+ /* Refresh the count for the stack frame. */
+ return BPF_MAX_TIMED_LOOPS;
+}
+
+/* for configs without MMU or 32-bit */
+__weak const struct bpf_map_ops arena_map_ops;
+__weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
+{
+ return 0;
+}
+__weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
+{
+ return 0;
+}
+
+#ifdef CONFIG_BPF_SYSCALL
+static int __init bpf_global_ma_init(void)
+{
+ int ret;
+
+ ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false);
+ bpf_global_ma_set = !ret;
+ return ret;
+}
+late_initcall(bpf_global_ma_init);
+#endif
+
DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
EXPORT_SYMBOL(bpf_stats_enabled_key);
@@ -2462,3 +3257,84 @@ EXPORT_SYMBOL(bpf_stats_enabled_key);
EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx);
+
+#ifdef CONFIG_BPF_SYSCALL
+
+int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char **filep,
+ const char **linep, int *nump)
+{
+ int idx = -1, insn_start, insn_end, len;
+ struct bpf_line_info *linfo;
+ void **jited_linfo;
+ struct btf *btf;
+ int nr_linfo;
+
+ btf = prog->aux->btf;
+ linfo = prog->aux->linfo;
+ jited_linfo = prog->aux->jited_linfo;
+
+ if (!btf || !linfo || !jited_linfo)
+ return -EINVAL;
+ len = prog->aux->func ? prog->aux->func[prog->aux->func_idx]->len : prog->len;
+
+ linfo = &prog->aux->linfo[prog->aux->linfo_idx];
+ jited_linfo = &prog->aux->jited_linfo[prog->aux->linfo_idx];
+
+ insn_start = linfo[0].insn_off;
+ insn_end = insn_start + len;
+ nr_linfo = prog->aux->nr_linfo - prog->aux->linfo_idx;
+
+ for (int i = 0; i < nr_linfo &&
+ linfo[i].insn_off >= insn_start && linfo[i].insn_off < insn_end; i++) {
+ if (jited_linfo[i] >= (void *)ip)
+ break;
+ idx = i;
+ }
+
+ if (idx == -1)
+ return -ENOENT;
+
+ /* Get base component of the file path. */
+ *filep = btf_name_by_offset(btf, linfo[idx].file_name_off);
+ *filep = kbasename(*filep);
+ /* Obtain the source line, and strip whitespace in prefix. */
+ *linep = btf_name_by_offset(btf, linfo[idx].line_off);
+ while (isspace(**linep))
+ *linep += 1;
+ *nump = BPF_LINE_INFO_LINE_NUM(linfo[idx].line_col);
+ return 0;
+}
+
+struct walk_stack_ctx {
+ struct bpf_prog *prog;
+};
+
+static bool find_from_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp)
+{
+ struct walk_stack_ctx *ctxp = cookie;
+ struct bpf_prog *prog;
+
+ /*
+ * The RCU read lock is held to safely traverse the latch tree, but we
+ * don't need its protection when accessing the prog, since it has an
+ * active stack frame on the current stack trace, and won't disappear.
+ */
+ rcu_read_lock();
+ prog = bpf_prog_ksym_find(ip);
+ rcu_read_unlock();
+ if (!prog)
+ return true;
+ /* Make sure we return the main prog if we found a subprog */
+ ctxp->prog = prog->aux->main_prog_aux->prog;
+ return false;
+}
+
+struct bpf_prog *bpf_prog_find_from_stack(void)
+{
+ struct walk_stack_ctx ctx = {};
+
+ arch_bpf_stack_walk(find_from_stack_cb, &ctx);
+ return ctx.prog;
+}
+
+#endif
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index b3e6b9422238..703e5df1f4ef 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -4,13 +4,16 @@
* Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
*/
-/* The 'cpumap' is primarily used as a backend map for XDP BPF helper
+/**
+ * DOC: cpu map
+ * The 'cpumap' is primarily used as a backend map for XDP BPF helper
* call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'.
*
- * Unlike devmap which redirects XDP frames out another NIC device,
+ * Unlike devmap which redirects XDP frames out to another NIC device,
* this map type redirects raw XDP frames to another CPU. The remote
* CPU will do SKB-allocation and call the normal network stack.
- *
+ */
+/*
* This is a scalability and isolation mechanism, that allow
* separating the early driver network XDP layer, from the rest of the
* netstack, and assigning dedicated CPUs for this stage. This
@@ -21,15 +24,17 @@
#include <linux/filter.h>
#include <linux/ptr_ring.h>
#include <net/xdp.h>
+#include <net/hotdata.h>
#include <linux/sched.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
-#include <linux/capability.h>
+#include <linux/completion.h>
#include <trace/events/xdp.h>
+#include <linux/btf_ids.h>
-#include <linux/netdevice.h> /* netif_receive_skb_list */
-#include <linux/etherdevice.h> /* eth_type_trans */
+#include <linux/netdevice.h>
+#include <net/gro.h>
/* General idea: XDP packets getting XDP redirected to another CPU,
* will maximum be stored/queued for one driver ->poll() call. It is
@@ -57,19 +62,16 @@ struct bpf_cpu_map_entry {
/* XDP can run multiple RX-ring queues, need __percpu enqueue store */
struct xdp_bulk_queue __percpu *bulkq;
- struct bpf_cpu_map *cmap;
-
/* Queue with potential multi-producers, and single-consumer kthread */
struct ptr_ring *queue;
struct task_struct *kthread;
struct bpf_cpumap_val value;
struct bpf_prog *prog;
+ struct gro_node gro;
- atomic_t refcnt; /* Control when this struct can be free'ed */
- struct rcu_head rcu;
-
- struct work_struct kthread_stop_wq;
+ struct completion kthread_running;
+ struct rcu_work free_work;
};
struct bpf_cpu_map {
@@ -78,16 +80,10 @@ struct bpf_cpu_map {
struct bpf_cpu_map_entry __rcu **cpu_map;
};
-static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list);
-
static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
{
u32 value_size = attr->value_size;
struct bpf_cpu_map *cmap;
- int err = -ENOMEM;
-
- if (!bpf_capable())
- return ERR_PTR(-EPERM);
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
@@ -96,50 +92,26 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
attr->map_flags & ~BPF_F_NUMA_NODE)
return ERR_PTR(-EINVAL);
- cmap = kzalloc(sizeof(*cmap), GFP_USER | __GFP_ACCOUNT);
+ /* Pre-limit array size based on NR_CPUS, not final CPU check */
+ if (attr->max_entries > NR_CPUS)
+ return ERR_PTR(-E2BIG);
+
+ cmap = bpf_map_area_alloc(sizeof(*cmap), NUMA_NO_NODE);
if (!cmap)
return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&cmap->map, attr);
- /* Pre-limit array size based on NR_CPUS, not final CPU check */
- if (cmap->map.max_entries > NR_CPUS) {
- err = -E2BIG;
- goto free_cmap;
- }
-
/* Alloc array for possible remote "destination" CPUs */
cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
sizeof(struct bpf_cpu_map_entry *),
cmap->map.numa_node);
- if (!cmap->cpu_map)
- goto free_cmap;
+ if (!cmap->cpu_map) {
+ bpf_map_area_free(cmap);
+ return ERR_PTR(-ENOMEM);
+ }
return &cmap->map;
-free_cmap:
- kfree(cmap);
- return ERR_PTR(err);
-}
-
-static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
-{
- atomic_inc(&rcpu->refcnt);
-}
-
-/* called from workqueue, to workaround syscall using preempt_disable */
-static void cpu_map_kthread_stop(struct work_struct *work)
-{
- struct bpf_cpu_map_entry *rcpu;
-
- rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq);
-
- /* Wait for flush in __cpu_map_entry_free(), via full RCU barrier,
- * as it waits until all in-flight call_rcu() callbacks complete.
- */
- rcu_barrier();
-
- /* kthread_stop will wake_up_process and wait for it to complete */
- kthread_stop(rcpu->kthread);
}
static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
@@ -149,42 +121,36 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
* invoked cpu_map_kthread_stop(). Catch any broken behaviour
* gracefully and warn once.
*/
- struct xdp_frame *xdpf;
-
- while ((xdpf = ptr_ring_consume(ring)))
- if (WARN_ON_ONCE(xdpf))
- xdp_return_frame(xdpf);
-}
+ void *ptr;
-static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
-{
- if (atomic_dec_and_test(&rcpu->refcnt)) {
- if (rcpu->prog)
- bpf_prog_put(rcpu->prog);
- /* The queue should be empty at this point */
- __cpu_map_ring_cleanup(rcpu->queue);
- ptr_ring_cleanup(rcpu->queue, NULL);
- kfree(rcpu->queue);
- kfree(rcpu);
+ while ((ptr = ptr_ring_consume(ring))) {
+ WARN_ON_ONCE(1);
+ if (unlikely(__ptr_test_bit(0, &ptr))) {
+ __ptr_clear_bit(0, &ptr);
+ kfree_skb(ptr);
+ continue;
+ }
+ xdp_return_frame(ptr);
}
}
-static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
- struct list_head *listp,
- struct xdp_cpumap_stats *stats)
+static u32 cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
+ void **skbs, u32 skb_n,
+ struct xdp_cpumap_stats *stats)
{
- struct sk_buff *skb, *tmp;
struct xdp_buff xdp;
- u32 act;
+ u32 act, pass = 0;
int err;
- list_for_each_entry_safe(skb, tmp, listp, list) {
+ for (u32 i = 0; i < skb_n; i++) {
+ struct sk_buff *skb = skbs[i];
+
act = bpf_prog_run_generic_xdp(skb, &xdp, rcpu->prog);
switch (act) {
case XDP_PASS:
+ skbs[pass++] = skb;
break;
case XDP_REDIRECT:
- skb_list_del_init(skb);
err = xdp_do_generic_redirect(skb->dev, skb, &xdp,
rcpu->prog);
if (unlikely(err)) {
@@ -193,7 +159,7 @@ static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
} else {
stats->redirect++;
}
- return;
+ break;
default:
bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act);
fallthrough;
@@ -201,23 +167,25 @@ static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
trace_xdp_exception(skb->dev, rcpu->prog, act);
fallthrough;
case XDP_DROP:
- skb_list_del_init(skb);
- kfree_skb(skb);
+ napi_consume_skb(skb, true);
stats->drop++;
- return;
+ break;
}
}
+
+ stats->pass += pass;
+
+ return pass;
}
static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
void **frames, int n,
struct xdp_cpumap_stats *stats)
{
- struct xdp_rxq_info rxq;
+ struct xdp_rxq_info rxq = {};
struct xdp_buff xdp;
int i, nframes = 0;
- xdp_set_return_frame_no_direct();
xdp.rxq = &rxq;
for (i = 0; i < n; i++) {
@@ -226,7 +194,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
int err;
rxq.dev = xdpf->dev_rx;
- rxq.mem = xdpf->mem;
+ rxq.mem.type = xdpf->mem_type;
/* TODO: report queue_index to xdp_rxq_info */
xdp_convert_frame_to_buff(xdpf, &xdp);
@@ -240,7 +208,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
stats->drop++;
} else {
frames[nframes++] = xdpf;
- stats->pass++;
}
break;
case XDP_REDIRECT:
@@ -263,42 +230,68 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
}
}
- xdp_clear_return_frame_no_direct();
+ stats->pass += nframes;
return nframes;
}
#define CPUMAP_BATCH 8
-static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
- int xdp_n, struct xdp_cpumap_stats *stats,
- struct list_head *list)
+struct cpu_map_ret {
+ u32 xdp_n;
+ u32 skb_n;
+};
+
+static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
+ void **skbs, struct cpu_map_ret *ret,
+ struct xdp_cpumap_stats *stats)
{
- int nframes;
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
if (!rcpu->prog)
- return xdp_n;
+ goto out;
- rcu_read_lock_bh();
+ rcu_read_lock();
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+ xdp_set_return_frame_no_direct();
- nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, xdp_n, stats);
+ ret->xdp_n = cpu_map_bpf_prog_run_xdp(rcpu, frames, ret->xdp_n, stats);
+ if (unlikely(ret->skb_n))
+ ret->skb_n = cpu_map_bpf_prog_run_skb(rcpu, skbs, ret->skb_n,
+ stats);
if (stats->redirect)
xdp_do_flush();
- if (unlikely(!list_empty(list)))
- cpu_map_bpf_prog_run_skb(rcpu, list, stats);
-
- rcu_read_unlock_bh(); /* resched point, may call do_softirq() */
+ xdp_clear_return_frame_no_direct();
+ bpf_net_ctx_clear(bpf_net_ctx);
+ rcu_read_unlock();
- return nframes;
+out:
+ if (unlikely(ret->skb_n) && ret->xdp_n)
+ memmove(&skbs[ret->xdp_n], skbs, ret->skb_n * sizeof(*skbs));
}
+static void cpu_map_gro_flush(struct bpf_cpu_map_entry *rcpu, bool empty)
+{
+ /*
+ * If the ring is not empty, there'll be a new iteration soon, and we
+ * only need to do a full flush if a tick is long (> 1 ms).
+ * If the ring is empty, to not hold GRO packets in the stack for too
+ * long, do a full flush.
+ * This is equivalent to how NAPI decides whether to perform a full
+ * flush.
+ */
+ gro_flush_normal(&rcpu->gro, !empty && HZ >= 1000);
+}
static int cpu_map_kthread_run(void *data)
{
struct bpf_cpu_map_entry *rcpu = data;
+ unsigned long last_qs = jiffies;
+ u32 packets = 0;
+ complete(&rcpu->kthread_running);
set_current_state(TASK_INTERRUPTIBLE);
/* When kthread gives stop order, then rcpu have been disconnected
@@ -309,11 +302,11 @@ static int cpu_map_kthread_run(void *data)
while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
struct xdp_cpumap_stats stats = {}; /* zero stats */
unsigned int kmem_alloc_drops = 0, sched = 0;
- gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
- int i, n, m, nframes, xdp_n;
+ struct cpu_map_ret ret = { };
void *frames[CPUMAP_BATCH];
void *skbs[CPUMAP_BATCH];
- LIST_HEAD(list);
+ u32 i, n, m;
+ bool empty;
/* Release CPU reschedule checks */
if (__ptr_ring_empty(rcpu->queue)) {
@@ -322,10 +315,12 @@ static int cpu_map_kthread_run(void *data)
if (__ptr_ring_empty(rcpu->queue)) {
schedule();
sched = 1;
+ last_qs = jiffies;
} else {
__set_current_state(TASK_RUNNING);
}
} else {
+ rcu_softirq_qs_periodic(last_qs);
sched = cond_resched();
}
@@ -336,7 +331,7 @@ static int cpu_map_kthread_run(void *data)
*/
n = __ptr_ring_consume_batched(rcpu->queue, frames,
CPUMAP_BATCH);
- for (i = 0, xdp_n = 0; i < n; i++) {
+ for (i = 0; i < n; i++) {
void *f = frames[i];
struct page *page;
@@ -344,11 +339,11 @@ static int cpu_map_kthread_run(void *data)
struct sk_buff *skb = f;
__ptr_clear_bit(0, &skb);
- list_add_tail(&skb->list, &list);
+ skbs[ret.skb_n++] = skb;
continue;
}
- frames[xdp_n++] = f;
+ frames[ret.xdp_n++] = f;
page = virt_to_page(f);
/* Bring struct page memory area to curr CPU. Read by
@@ -358,46 +353,60 @@ static int cpu_map_kthread_run(void *data)
prefetchw(page);
}
+ local_bh_disable();
+
/* Support running another XDP prog on this CPU */
- nframes = cpu_map_bpf_prog_run(rcpu, frames, xdp_n, &stats, &list);
- if (nframes) {
- m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, nframes, skbs);
- if (unlikely(m == 0)) {
- for (i = 0; i < nframes; i++)
- skbs[i] = NULL; /* effect: xdp_return_frame */
- kmem_alloc_drops += nframes;
- }
+ cpu_map_bpf_prog_run(rcpu, frames, skbs, &ret, &stats);
+ if (!ret.xdp_n)
+ goto stats;
+
+ m = napi_skb_cache_get_bulk(skbs, ret.xdp_n);
+ if (unlikely(m < ret.xdp_n)) {
+ for (i = m; i < ret.xdp_n; i++)
+ xdp_return_frame(frames[i]);
+
+ if (ret.skb_n)
+ memmove(&skbs[m], &skbs[ret.xdp_n],
+ ret.skb_n * sizeof(*skbs));
+
+ kmem_alloc_drops += ret.xdp_n - m;
+ ret.xdp_n = m;
}
- local_bh_disable();
- for (i = 0; i < nframes; i++) {
+ for (i = 0; i < ret.xdp_n; i++) {
struct xdp_frame *xdpf = frames[i];
- struct sk_buff *skb = skbs[i];
- skb = __xdp_build_skb_from_frame(xdpf, skb,
- xdpf->dev_rx);
- if (!skb) {
- xdp_return_frame(xdpf);
- continue;
- }
-
- list_add_tail(&skb->list, &list);
+ /* Can fail only when !skb -- already handled above */
+ __xdp_build_skb_from_frame(xdpf, skbs[i], xdpf->dev_rx);
}
- netif_receive_skb_list(&list);
- /* Feedback loop via tracepoint */
+stats:
+ /* Feedback loop via tracepoint.
+ * NB: keep before recv to allow measuring enqueue/dequeue latency.
+ */
trace_xdp_cpumap_kthread(rcpu->map_id, n, kmem_alloc_drops,
sched, &stats);
+ for (i = 0; i < ret.xdp_n + ret.skb_n; i++)
+ gro_receive_skb(&rcpu->gro, skbs[i]);
+
+ /* Flush either every 64 packets or in case of empty ring */
+ packets += n;
+ empty = __ptr_ring_empty(rcpu->queue);
+ if (packets >= NAPI_POLL_WEIGHT || empty) {
+ cpu_map_gro_flush(rcpu, empty);
+ packets = 0;
+ }
+
local_bh_enable(); /* resched point, may call do_softirq() */
}
__set_current_state(TASK_RUNNING);
- put_cpu_map_entry(rcpu);
return 0;
}
-static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd)
+static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu,
+ struct bpf_map *map, int fd)
{
struct bpf_prog *prog;
@@ -405,7 +414,8 @@ static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd)
if (IS_ERR(prog))
return PTR_ERR(prog);
- if (prog->expected_attach_type != BPF_XDP_CPUMAP) {
+ if (prog->expected_attach_type != BPF_XDP_CPUMAP ||
+ !bpf_prog_map_compatible(map, prog)) {
bpf_prog_put(prog);
return -EINVAL;
}
@@ -456,30 +466,36 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
rcpu->cpu = cpu;
rcpu->map_id = map->id;
rcpu->value.qsize = value->qsize;
+ gro_init(&rcpu->gro);
- if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd))
+ if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd))
goto free_ptr_ring;
/* Setup kthread */
+ init_completion(&rcpu->kthread_running);
rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
"cpumap/%d/map:%d", cpu,
map->id);
if (IS_ERR(rcpu->kthread))
goto free_prog;
- get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */
- get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */
-
/* Make sure kthread runs on a single CPU */
kthread_bind(rcpu->kthread, cpu);
wake_up_process(rcpu->kthread);
+ /* Make sure kthread has been running, so kthread_stop() will not
+ * stop the kthread prematurely and all pending frames or skbs
+ * will be handled by the kthread before kthread_stop() returns.
+ */
+ wait_for_completion(&rcpu->kthread_running);
+
return rcpu;
free_prog:
if (rcpu->prog)
bpf_prog_put(rcpu->prog);
free_ptr_ring:
+ gro_cleanup(&rcpu->gro);
ptr_ring_cleanup(rcpu->queue, NULL);
free_queue:
kfree(rcpu->queue);
@@ -490,40 +506,41 @@ free_rcu:
return NULL;
}
-static void __cpu_map_entry_free(struct rcu_head *rcu)
+static void __cpu_map_entry_free(struct work_struct *work)
{
struct bpf_cpu_map_entry *rcpu;
/* This cpu_map_entry have been disconnected from map and one
- * RCU grace-period have elapsed. Thus, XDP cannot queue any
+ * RCU grace-period have elapsed. Thus, XDP cannot queue any
* new packets and cannot change/set flush_needed that can
* find this entry.
*/
- rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu);
+ rcpu = container_of(to_rcu_work(work), struct bpf_cpu_map_entry, free_work);
+ /* kthread_stop will wake_up_process and wait for it to complete.
+ * cpu_map_kthread_run() makes sure the pointer ring is empty
+ * before exiting.
+ */
+ kthread_stop(rcpu->kthread);
+
+ if (rcpu->prog)
+ bpf_prog_put(rcpu->prog);
+ gro_cleanup(&rcpu->gro);
+ /* The queue should be empty at this point */
+ __cpu_map_ring_cleanup(rcpu->queue);
+ ptr_ring_cleanup(rcpu->queue, NULL);
+ kfree(rcpu->queue);
free_percpu(rcpu->bulkq);
- /* Cannot kthread_stop() here, last put free rcpu resources */
- put_cpu_map_entry(rcpu);
+ kfree(rcpu);
}
-/* After xchg pointer to bpf_cpu_map_entry, use the call_rcu() to
- * ensure any driver rcu critical sections have completed, but this
- * does not guarantee a flush has happened yet. Because driver side
- * rcu_read_lock/unlock only protects the running XDP program. The
- * atomic xchg and NULL-ptr check in __cpu_map_flush() makes sure a
- * pending flush op doesn't fail.
- *
- * The bpf_cpu_map_entry is still used by the kthread, and there can
- * still be pending packets (in queue and percpu bulkq). A refcnt
- * makes sure to last user (kthread_stop vs. call_rcu) free memory
- * resources.
- *
- * The rcu callback __cpu_map_entry_free flush remaining packets in
- * percpu bulkq to queue. Due to caller map_delete_elem() disable
- * preemption, cannot call kthread_stop() to make sure queue is empty.
- * Instead a work_queue is started for stopping kthread,
- * cpu_map_kthread_stop, which waits for an RCU grace period before
- * stopping kthread, emptying the queue.
+/* After the xchg of the bpf_cpu_map_entry pointer, we need to make sure the old
+ * entry is no longer in use before freeing. We use queue_rcu_work() to call
+ * __cpu_map_entry_free() in a separate workqueue after waiting for an RCU grace
+ * period. This means that (a) all pending enqueue and flush operations have
+ * completed (because of the RCU callback), and (b) we are in a workqueue
+ * context where we can stop the kthread and wait for it to exit before freeing
+ * everything.
*/
static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
@@ -532,13 +549,12 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
old_rcpu = unrcu_pointer(xchg(&cmap->cpu_map[key_cpu], RCU_INITIALIZER(rcpu)));
if (old_rcpu) {
- call_rcu(&old_rcpu->rcu, __cpu_map_entry_free);
- INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop);
- schedule_work(&old_rcpu->kthread_stop_wq);
+ INIT_RCU_WORK(&old_rcpu->free_work, __cpu_map_entry_free);
+ queue_rcu_work(system_percpu_wq, &old_rcpu->free_work);
}
}
-static int cpu_map_delete_elem(struct bpf_map *map, void *key)
+static long cpu_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
u32 key_cpu = *(u32 *)key;
@@ -546,13 +562,13 @@ static int cpu_map_delete_elem(struct bpf_map *map, void *key)
if (key_cpu >= map->max_entries)
return -EINVAL;
- /* notice caller map_delete_elem() use preempt_disable() */
+ /* notice caller map_delete_elem() uses rcu_read_lock() */
__cpu_map_entry_replace(cmap, key_cpu, NULL);
return 0;
}
-static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
struct bpf_cpumap_val cpumap_value = {};
@@ -582,7 +598,6 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
rcpu = __cpu_map_entry_alloc(map, &cpumap_value, key_cpu);
if (!rcpu)
return -ENOMEM;
- rcpu->cmap = cmap;
}
rcu_read_lock();
__cpu_map_entry_replace(cmap, key_cpu, rcpu);
@@ -598,16 +613,15 @@ static void cpu_map_free(struct bpf_map *map)
/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
* so the bpf programs (can be more than one that used this map) were
* disconnected from events. Wait for outstanding critical sections in
- * these programs to complete. The rcu critical section only guarantees
- * no further "XDP/bpf-side" reads against bpf_cpu_map->cpu_map.
- * It does __not__ ensure pending flush operations (if any) are
- * complete.
+ * these programs to complete. synchronize_rcu() below not only
+ * guarantees no further "XDP/bpf-side" reads against
+ * bpf_cpu_map->cpu_map, but also ensure pending flush operations
+ * (if any) are completed.
*/
-
synchronize_rcu();
- /* For cpu_map the remote CPUs can still be using the entries
- * (struct bpf_cpu_map_entry).
+ /* The only possible user of bpf_cpu_map_entry is
+ * cpu_map_kthread_run().
*/
for (i = 0; i < cmap->map.max_entries; i++) {
struct bpf_cpu_map_entry *rcpu;
@@ -616,11 +630,11 @@ static void cpu_map_free(struct bpf_map *map)
if (!rcpu)
continue;
- /* bq flush and cleanup happens after RCU grace-period */
- __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
+ /* Stop kthread and cleanup entry directly */
+ __cpu_map_entry_free(&rcpu->free_work.work);
}
bpf_map_area_free(cmap->cpu_map);
- kfree(cmap);
+ bpf_map_area_free(cmap);
}
/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
@@ -665,13 +679,22 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
return 0;
}
-static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
+static long cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags)
{
- return __bpf_xdp_redirect_map(map, ifindex, flags, 0,
+ return __bpf_xdp_redirect_map(map, index, flags, 0,
__cpu_map_lookup_elem);
}
-static int cpu_map_btf_id;
+static u64 cpu_map_mem_usage(const struct bpf_map *map)
+{
+ u64 usage = sizeof(struct bpf_cpu_map);
+
+ /* Currently the dynamically allocated elements are not counted */
+ usage += (u64)map->max_entries * sizeof(struct bpf_cpu_map_entry *);
+ return usage;
+}
+
+BTF_ID_LIST_SINGLE(cpu_map_btf_ids, struct, bpf_cpu_map)
const struct bpf_map_ops cpu_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc = cpu_map_alloc,
@@ -681,8 +704,8 @@ const struct bpf_map_ops cpu_map_ops = {
.map_lookup_elem = cpu_map_lookup_elem,
.map_get_next_key = cpu_map_get_next_key,
.map_check_btf = map_check_no_btf,
- .map_btf_name = "bpf_cpu_map",
- .map_btf_id = &cpu_map_btf_id,
+ .map_mem_usage = cpu_map_mem_usage,
+ .map_btf_id = &cpu_map_btf_ids[0],
.map_redirect = cpu_map_redirect,
};
@@ -725,7 +748,6 @@ static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
*/
static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
{
- struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
@@ -742,8 +764,11 @@ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
*/
bq->q[bq->count++] = xdpf;
- if (!bq->flush_node.prev)
+ if (!bq->flush_node.prev) {
+ struct list_head *flush_list = bpf_net_ctx_get_cpu_map_flush_list();
+
list_add(&bq->flush_node, flush_list);
+ }
}
int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf,
@@ -775,9 +800,8 @@ trace:
return ret;
}
-void __cpu_map_flush(void)
+void __cpu_map_flush(struct list_head *flush_list)
{
- struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
struct xdp_bulk_queue *bq, *tmp;
list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
@@ -787,14 +811,3 @@ void __cpu_map_flush(void)
wake_up_process(bq->obj->kthread);
}
}
-
-static int __init cpu_map_init(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- INIT_LIST_HEAD(&per_cpu(cpu_map_flush_list, cpu));
- return 0;
-}
-
-subsys_initcall(cpu_map_init);
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
new file mode 100644
index 000000000000..9876c5fe6c2a
--- /dev/null
+++ b/kernel/bpf/cpumask.c
@@ -0,0 +1,534 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2023 Meta, Inc */
+#include <linux/bpf.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/cpumask.h>
+
+/**
+ * struct bpf_cpumask - refcounted BPF cpumask wrapper structure
+ * @cpumask: The actual cpumask embedded in the struct.
+ * @usage: Object reference counter. When the refcount goes to 0, the
+ * memory is released back to the BPF allocator, which provides
+ * RCU safety.
+ *
+ * Note that we explicitly embed a cpumask_t rather than a cpumask_var_t. This
+ * is done to avoid confusing the verifier due to the typedef of cpumask_var_t
+ * changing depending on whether CONFIG_CPUMASK_OFFSTACK is defined or not. See
+ * the details in <linux/cpumask.h>. The consequence is that this structure is
+ * likely a bit larger than it needs to be when CONFIG_CPUMASK_OFFSTACK is
+ * defined due to embedding the whole NR_CPUS-size bitmap, but the extra memory
+ * overhead is minimal. For the more typical case of CONFIG_CPUMASK_OFFSTACK
+ * not being defined, the structure is the same size regardless.
+ */
+struct bpf_cpumask {
+ cpumask_t cpumask;
+ refcount_t usage;
+};
+
+static struct bpf_mem_alloc bpf_cpumask_ma;
+
+static bool cpu_valid(u32 cpu)
+{
+ return cpu < nr_cpu_ids;
+}
+
+__bpf_kfunc_start_defs();
+
+/**
+ * bpf_cpumask_create() - Create a mutable BPF cpumask.
+ *
+ * Allocates a cpumask that can be queried, mutated, acquired, and released by
+ * a BPF program. The cpumask returned by this function must either be embedded
+ * in a map as a kptr, or freed with bpf_cpumask_release().
+ *
+ * bpf_cpumask_create() allocates memory using the BPF memory allocator, and
+ * will not block. It may return NULL if no memory is available.
+ *
+ * Return:
+ * * A pointer to a new struct bpf_cpumask instance on success.
+ * * NULL if the BPF memory allocator is out of memory.
+ */
+__bpf_kfunc struct bpf_cpumask *bpf_cpumask_create(void)
+{
+ struct bpf_cpumask *cpumask;
+
+ /* cpumask must be the first element so struct bpf_cpumask be cast to struct cpumask. */
+ BUILD_BUG_ON(offsetof(struct bpf_cpumask, cpumask) != 0);
+
+ cpumask = bpf_mem_cache_alloc(&bpf_cpumask_ma);
+ if (!cpumask)
+ return NULL;
+
+ memset(cpumask, 0, sizeof(*cpumask));
+ refcount_set(&cpumask->usage, 1);
+
+ return cpumask;
+}
+
+/**
+ * bpf_cpumask_acquire() - Acquire a reference to a BPF cpumask.
+ * @cpumask: The BPF cpumask being acquired. The cpumask must be a trusted
+ * pointer.
+ *
+ * Acquires a reference to a BPF cpumask. The cpumask returned by this function
+ * must either be embedded in a map as a kptr, or freed with
+ * bpf_cpumask_release().
+ *
+ * Return:
+ * * The struct bpf_cpumask pointer passed to the function.
+ *
+ */
+__bpf_kfunc struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask)
+{
+ refcount_inc(&cpumask->usage);
+ return cpumask;
+}
+
+/**
+ * bpf_cpumask_release() - Release a previously acquired BPF cpumask.
+ * @cpumask: The cpumask being released.
+ *
+ * Releases a previously acquired reference to a BPF cpumask. When the final
+ * reference of the BPF cpumask has been released, it is subsequently freed in
+ * an RCU callback in the BPF memory allocator.
+ */
+__bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask)
+{
+ if (!refcount_dec_and_test(&cpumask->usage))
+ return;
+
+ bpf_mem_cache_free_rcu(&bpf_cpumask_ma, cpumask);
+}
+
+__bpf_kfunc void bpf_cpumask_release_dtor(void *cpumask)
+{
+ bpf_cpumask_release(cpumask);
+}
+CFI_NOSEAL(bpf_cpumask_release_dtor);
+
+/**
+ * bpf_cpumask_first() - Get the index of the first nonzero bit in the cpumask.
+ * @cpumask: The cpumask being queried.
+ *
+ * Find the index of the first nonzero bit of the cpumask. A struct bpf_cpumask
+ * pointer may be safely passed to this function.
+ *
+ * Return:
+ * * The index of the first nonzero bit in the struct cpumask.
+ */
+__bpf_kfunc u32 bpf_cpumask_first(const struct cpumask *cpumask)
+{
+ return cpumask_first(cpumask);
+}
+
+/**
+ * bpf_cpumask_first_zero() - Get the index of the first unset bit in the
+ * cpumask.
+ * @cpumask: The cpumask being queried.
+ *
+ * Find the index of the first unset bit of the cpumask. A struct bpf_cpumask
+ * pointer may be safely passed to this function.
+ *
+ * Return:
+ * * The index of the first zero bit in the struct cpumask.
+ */
+__bpf_kfunc u32 bpf_cpumask_first_zero(const struct cpumask *cpumask)
+{
+ return cpumask_first_zero(cpumask);
+}
+
+/**
+ * bpf_cpumask_first_and() - Return the index of the first nonzero bit from the
+ * AND of two cpumasks.
+ * @src1: The first cpumask.
+ * @src2: The second cpumask.
+ *
+ * Find the index of the first nonzero bit of the AND of two cpumasks.
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ *
+ * Return:
+ * * The index of the first bit that is nonzero in both cpumask instances.
+ */
+__bpf_kfunc u32 bpf_cpumask_first_and(const struct cpumask *src1,
+ const struct cpumask *src2)
+{
+ return cpumask_first_and(src1, src2);
+}
+
+/**
+ * bpf_cpumask_set_cpu() - Set a bit for a CPU in a BPF cpumask.
+ * @cpu: The CPU to be set in the cpumask.
+ * @cpumask: The BPF cpumask in which a bit is being set.
+ */
+__bpf_kfunc void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return;
+
+ cpumask_set_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_clear_cpu() - Clear a bit for a CPU in a BPF cpumask.
+ * @cpu: The CPU to be cleared from the cpumask.
+ * @cpumask: The BPF cpumask in which a bit is being cleared.
+ */
+__bpf_kfunc void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return;
+
+ cpumask_clear_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_test_cpu() - Test whether a CPU is set in a cpumask.
+ * @cpu: The CPU being queried for.
+ * @cpumask: The cpumask being queried for containing a CPU.
+ *
+ * Return:
+ * * true - @cpu is set in the cpumask
+ * * false - @cpu was not set in the cpumask, or @cpu is an invalid cpu.
+ */
+__bpf_kfunc bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return false;
+
+ return cpumask_test_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_test_and_set_cpu() - Atomically test and set a CPU in a BPF cpumask.
+ * @cpu: The CPU being set and queried for.
+ * @cpumask: The BPF cpumask being set and queried for containing a CPU.
+ *
+ * Return:
+ * * true - @cpu is set in the cpumask
+ * * false - @cpu was not set in the cpumask, or @cpu is invalid.
+ */
+__bpf_kfunc bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return false;
+
+ return cpumask_test_and_set_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_test_and_clear_cpu() - Atomically test and clear a CPU in a BPF
+ * cpumask.
+ * @cpu: The CPU being cleared and queried for.
+ * @cpumask: The BPF cpumask being cleared and queried for containing a CPU.
+ *
+ * Return:
+ * * true - @cpu is set in the cpumask
+ * * false - @cpu was not set in the cpumask, or @cpu is invalid.
+ */
+__bpf_kfunc bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return false;
+
+ return cpumask_test_and_clear_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_setall() - Set all of the bits in a BPF cpumask.
+ * @cpumask: The BPF cpumask having all of its bits set.
+ */
+__bpf_kfunc void bpf_cpumask_setall(struct bpf_cpumask *cpumask)
+{
+ cpumask_setall((struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_clear() - Clear all of the bits in a BPF cpumask.
+ * @cpumask: The BPF cpumask being cleared.
+ */
+__bpf_kfunc void bpf_cpumask_clear(struct bpf_cpumask *cpumask)
+{
+ cpumask_clear((struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_and() - AND two cpumasks and store the result.
+ * @dst: The BPF cpumask where the result is being stored.
+ * @src1: The first input.
+ * @src2: The second input.
+ *
+ * Return:
+ * * true - @dst has at least one bit set following the operation
+ * * false - @dst is empty following the operation
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc bool bpf_cpumask_and(struct bpf_cpumask *dst,
+ const struct cpumask *src1,
+ const struct cpumask *src2)
+{
+ return cpumask_and((struct cpumask *)dst, src1, src2);
+}
+
+/**
+ * bpf_cpumask_or() - OR two cpumasks and store the result.
+ * @dst: The BPF cpumask where the result is being stored.
+ * @src1: The first input.
+ * @src2: The second input.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc void bpf_cpumask_or(struct bpf_cpumask *dst,
+ const struct cpumask *src1,
+ const struct cpumask *src2)
+{
+ cpumask_or((struct cpumask *)dst, src1, src2);
+}
+
+/**
+ * bpf_cpumask_xor() - XOR two cpumasks and store the result.
+ * @dst: The BPF cpumask where the result is being stored.
+ * @src1: The first input.
+ * @src2: The second input.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc void bpf_cpumask_xor(struct bpf_cpumask *dst,
+ const struct cpumask *src1,
+ const struct cpumask *src2)
+{
+ cpumask_xor((struct cpumask *)dst, src1, src2);
+}
+
+/**
+ * bpf_cpumask_equal() - Check two cpumasks for equality.
+ * @src1: The first input.
+ * @src2: The second input.
+ *
+ * Return:
+ * * true - @src1 and @src2 have the same bits set.
+ * * false - @src1 and @src2 differ in at least one bit.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2)
+{
+ return cpumask_equal(src1, src2);
+}
+
+/**
+ * bpf_cpumask_intersects() - Check two cpumasks for overlap.
+ * @src1: The first input.
+ * @src2: The second input.
+ *
+ * Return:
+ * * true - @src1 and @src2 have at least one of the same bits set.
+ * * false - @src1 and @src2 don't have any of the same bits set.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2)
+{
+ return cpumask_intersects(src1, src2);
+}
+
+/**
+ * bpf_cpumask_subset() - Check if a cpumask is a subset of another.
+ * @src1: The first cpumask being checked as a subset.
+ * @src2: The second cpumask being checked as a superset.
+ *
+ * Return:
+ * * true - All of the bits of @src1 are set in @src2.
+ * * false - At least one bit in @src1 is not set in @src2.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2)
+{
+ return cpumask_subset(src1, src2);
+}
+
+/**
+ * bpf_cpumask_empty() - Check if a cpumask is empty.
+ * @cpumask: The cpumask being checked.
+ *
+ * Return:
+ * * true - None of the bits in @cpumask are set.
+ * * false - At least one bit in @cpumask is set.
+ *
+ * A struct bpf_cpumask pointer may be safely passed to @cpumask.
+ */
+__bpf_kfunc bool bpf_cpumask_empty(const struct cpumask *cpumask)
+{
+ return cpumask_empty(cpumask);
+}
+
+/**
+ * bpf_cpumask_full() - Check if a cpumask has all bits set.
+ * @cpumask: The cpumask being checked.
+ *
+ * Return:
+ * * true - All of the bits in @cpumask are set.
+ * * false - At least one bit in @cpumask is cleared.
+ *
+ * A struct bpf_cpumask pointer may be safely passed to @cpumask.
+ */
+__bpf_kfunc bool bpf_cpumask_full(const struct cpumask *cpumask)
+{
+ return cpumask_full(cpumask);
+}
+
+/**
+ * bpf_cpumask_copy() - Copy the contents of a cpumask into a BPF cpumask.
+ * @dst: The BPF cpumask being copied into.
+ * @src: The cpumask being copied.
+ *
+ * A struct bpf_cpumask pointer may be safely passed to @src.
+ */
+__bpf_kfunc void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src)
+{
+ cpumask_copy((struct cpumask *)dst, src);
+}
+
+/**
+ * bpf_cpumask_any_distribute() - Return a random set CPU from a cpumask.
+ * @cpumask: The cpumask being queried.
+ *
+ * Return:
+ * * A random set bit within [0, num_cpus) if at least one bit is set.
+ * * >= num_cpus if no bit is set.
+ *
+ * A struct bpf_cpumask pointer may be safely passed to @src.
+ */
+__bpf_kfunc u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask)
+{
+ return cpumask_any_distribute(cpumask);
+}
+
+/**
+ * bpf_cpumask_any_and_distribute() - Return a random set CPU from the AND of
+ * two cpumasks.
+ * @src1: The first cpumask.
+ * @src2: The second cpumask.
+ *
+ * Return:
+ * * A random set bit within [0, num_cpus) from the AND of two cpumasks, if at
+ * least one bit is set.
+ * * >= num_cpus if no bit is set.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1,
+ const struct cpumask *src2)
+{
+ return cpumask_any_and_distribute(src1, src2);
+}
+
+/**
+ * bpf_cpumask_weight() - Return the number of bits in @cpumask.
+ * @cpumask: The cpumask being queried.
+ *
+ * Count the number of set bits in the given cpumask.
+ *
+ * Return:
+ * * The number of bits set in the mask.
+ */
+__bpf_kfunc u32 bpf_cpumask_weight(const struct cpumask *cpumask)
+{
+ return cpumask_weight(cpumask);
+}
+
+/**
+ * bpf_cpumask_populate() - Populate the CPU mask from the contents of
+ * a BPF memory region.
+ *
+ * @cpumask: The cpumask being populated.
+ * @src: The BPF memory holding the bit pattern.
+ * @src__sz: Length of the BPF memory region in bytes.
+ *
+ * Return:
+ * * 0 if the struct cpumask * instance was populated successfully.
+ * * -EACCES if the memory region is too small to populate the cpumask.
+ * * -EINVAL if the memory region is not aligned to the size of a long
+ * and the architecture does not support efficient unaligned accesses.
+ */
+__bpf_kfunc int bpf_cpumask_populate(struct cpumask *cpumask, void *src, size_t src__sz)
+{
+ unsigned long source = (unsigned long)src;
+
+ /* The memory region must be large enough to populate the entire CPU mask. */
+ if (src__sz < bitmap_size(nr_cpu_ids))
+ return -EACCES;
+
+ /* If avoiding unaligned accesses, the input region must be aligned to the nearest long. */
+ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
+ !IS_ALIGNED(source, sizeof(long)))
+ return -EINVAL;
+
+ bitmap_copy(cpumask_bits(cpumask), src, nr_cpu_ids);
+
+ return 0;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(cpumask_kfunc_btf_ids)
+BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_first, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_first_and, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_set_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_clear_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_test_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_test_and_set_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_test_and_clear_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_setall, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_clear, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_and, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_or, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_xor, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_equal, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_intersects, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_subset, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_empty, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_full, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_weight, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_populate, KF_RCU)
+BTF_KFUNCS_END(cpumask_kfunc_btf_ids)
+
+static const struct btf_kfunc_id_set cpumask_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &cpumask_kfunc_btf_ids,
+};
+
+BTF_ID_LIST(cpumask_dtor_ids)
+BTF_ID(struct, bpf_cpumask)
+BTF_ID(func, bpf_cpumask_release_dtor)
+
+static int __init cpumask_kfunc_init(void)
+{
+ int ret;
+ const struct btf_id_dtor_kfunc cpumask_dtors[] = {
+ {
+ .btf_id = cpumask_dtor_ids[0],
+ .kfunc_btf_id = cpumask_dtor_ids[1]
+ },
+ };
+
+ ret = bpf_mem_alloc_init(&bpf_cpumask_ma, sizeof(struct bpf_cpumask), false);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &cpumask_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &cpumask_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &cpumask_kfunc_set);
+ return ret ?: register_btf_id_dtor_kfuncs(cpumask_dtors,
+ ARRAY_SIZE(cpumask_dtors),
+ THIS_MODULE);
+}
+
+late_initcall(cpumask_kfunc_init);
diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c
new file mode 100644
index 000000000000..83c4d9943084
--- /dev/null
+++ b/kernel/bpf/crypto.c
@@ -0,0 +1,393 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024 Meta, Inc */
+#include <linux/bpf.h>
+#include <linux/bpf_crypto.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/filter.h>
+#include <linux/scatterlist.h>
+#include <linux/skbuff.h>
+#include <crypto/skcipher.h>
+
+struct bpf_crypto_type_list {
+ const struct bpf_crypto_type *type;
+ struct list_head list;
+};
+
+/* BPF crypto initialization parameters struct */
+/**
+ * struct bpf_crypto_params - BPF crypto initialization parameters structure
+ * @type: The string of crypto operation type.
+ * @reserved: Reserved member, will be reused for more options in future
+ * Values:
+ * 0
+ * @algo: The string of algorithm to initialize.
+ * @key: The cipher key used to init crypto algorithm.
+ * @key_len: The length of cipher key.
+ * @authsize: The length of authentication tag used by algorithm.
+ */
+struct bpf_crypto_params {
+ char type[14];
+ u8 reserved[2];
+ char algo[128];
+ u8 key[256];
+ u32 key_len;
+ u32 authsize;
+};
+
+static LIST_HEAD(bpf_crypto_types);
+static DECLARE_RWSEM(bpf_crypto_types_sem);
+
+/**
+ * struct bpf_crypto_ctx - refcounted BPF crypto context structure
+ * @type: The pointer to bpf crypto type
+ * @tfm: The pointer to instance of crypto API struct.
+ * @siv_len: Size of IV and state storage for cipher
+ * @rcu: The RCU head used to free the crypto context with RCU safety.
+ * @usage: Object reference counter. When the refcount goes to 0, the
+ * memory is released back to the BPF allocator, which provides
+ * RCU safety.
+ */
+struct bpf_crypto_ctx {
+ const struct bpf_crypto_type *type;
+ void *tfm;
+ u32 siv_len;
+ struct rcu_head rcu;
+ refcount_t usage;
+};
+
+int bpf_crypto_register_type(const struct bpf_crypto_type *type)
+{
+ struct bpf_crypto_type_list *node;
+ int err = -EEXIST;
+
+ down_write(&bpf_crypto_types_sem);
+ list_for_each_entry(node, &bpf_crypto_types, list) {
+ if (!strcmp(node->type->name, type->name))
+ goto unlock;
+ }
+
+ node = kmalloc(sizeof(*node), GFP_KERNEL);
+ err = -ENOMEM;
+ if (!node)
+ goto unlock;
+
+ node->type = type;
+ list_add(&node->list, &bpf_crypto_types);
+ err = 0;
+
+unlock:
+ up_write(&bpf_crypto_types_sem);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(bpf_crypto_register_type);
+
+int bpf_crypto_unregister_type(const struct bpf_crypto_type *type)
+{
+ struct bpf_crypto_type_list *node;
+ int err = -ENOENT;
+
+ down_write(&bpf_crypto_types_sem);
+ list_for_each_entry(node, &bpf_crypto_types, list) {
+ if (strcmp(node->type->name, type->name))
+ continue;
+
+ list_del(&node->list);
+ kfree(node);
+ err = 0;
+ break;
+ }
+ up_write(&bpf_crypto_types_sem);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(bpf_crypto_unregister_type);
+
+static const struct bpf_crypto_type *bpf_crypto_get_type(const char *name)
+{
+ const struct bpf_crypto_type *type = ERR_PTR(-ENOENT);
+ struct bpf_crypto_type_list *node;
+
+ down_read(&bpf_crypto_types_sem);
+ list_for_each_entry(node, &bpf_crypto_types, list) {
+ if (strcmp(node->type->name, name))
+ continue;
+
+ if (try_module_get(node->type->owner))
+ type = node->type;
+ break;
+ }
+ up_read(&bpf_crypto_types_sem);
+
+ return type;
+}
+
+__bpf_kfunc_start_defs();
+
+/**
+ * bpf_crypto_ctx_create() - Create a mutable BPF crypto context.
+ *
+ * Allocates a crypto context that can be used, acquired, and released by
+ * a BPF program. The crypto context returned by this function must either
+ * be embedded in a map as a kptr, or freed with bpf_crypto_ctx_release().
+ * As crypto API functions use GFP_KERNEL allocations, this function can
+ * only be used in sleepable BPF programs.
+ *
+ * bpf_crypto_ctx_create() allocates memory for crypto context.
+ * It may return NULL if no memory is available.
+ * @params: pointer to struct bpf_crypto_params which contains all the
+ * details needed to initialise crypto context.
+ * @params__sz: size of steuct bpf_crypto_params usef by bpf program
+ * @err: integer to store error code when NULL is returned.
+ */
+__bpf_kfunc struct bpf_crypto_ctx *
+bpf_crypto_ctx_create(const struct bpf_crypto_params *params, u32 params__sz,
+ int *err)
+{
+ const struct bpf_crypto_type *type;
+ struct bpf_crypto_ctx *ctx;
+
+ if (!params || params->reserved[0] || params->reserved[1] ||
+ params__sz != sizeof(struct bpf_crypto_params)) {
+ *err = -EINVAL;
+ return NULL;
+ }
+
+ type = bpf_crypto_get_type(params->type);
+ if (IS_ERR(type)) {
+ *err = PTR_ERR(type);
+ return NULL;
+ }
+
+ if (!type->has_algo(params->algo)) {
+ *err = -EOPNOTSUPP;
+ goto err_module_put;
+ }
+
+ if (!!params->authsize ^ !!type->setauthsize) {
+ *err = -EOPNOTSUPP;
+ goto err_module_put;
+ }
+
+ if (!params->key_len || params->key_len > sizeof(params->key)) {
+ *err = -EINVAL;
+ goto err_module_put;
+ }
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx) {
+ *err = -ENOMEM;
+ goto err_module_put;
+ }
+
+ ctx->type = type;
+ ctx->tfm = type->alloc_tfm(params->algo);
+ if (IS_ERR(ctx->tfm)) {
+ *err = PTR_ERR(ctx->tfm);
+ goto err_free_ctx;
+ }
+
+ if (params->authsize) {
+ *err = type->setauthsize(ctx->tfm, params->authsize);
+ if (*err)
+ goto err_free_tfm;
+ }
+
+ *err = type->setkey(ctx->tfm, params->key, params->key_len);
+ if (*err)
+ goto err_free_tfm;
+
+ if (type->get_flags(ctx->tfm) & CRYPTO_TFM_NEED_KEY) {
+ *err = -EINVAL;
+ goto err_free_tfm;
+ }
+
+ ctx->siv_len = type->ivsize(ctx->tfm) + type->statesize(ctx->tfm);
+
+ refcount_set(&ctx->usage, 1);
+
+ return ctx;
+
+err_free_tfm:
+ type->free_tfm(ctx->tfm);
+err_free_ctx:
+ kfree(ctx);
+err_module_put:
+ module_put(type->owner);
+
+ return NULL;
+}
+
+static void crypto_free_cb(struct rcu_head *head)
+{
+ struct bpf_crypto_ctx *ctx;
+
+ ctx = container_of(head, struct bpf_crypto_ctx, rcu);
+ ctx->type->free_tfm(ctx->tfm);
+ module_put(ctx->type->owner);
+ kfree(ctx);
+}
+
+/**
+ * bpf_crypto_ctx_acquire() - Acquire a reference to a BPF crypto context.
+ * @ctx: The BPF crypto context being acquired. The ctx must be a trusted
+ * pointer.
+ *
+ * Acquires a reference to a BPF crypto context. The context returned by this function
+ * must either be embedded in a map as a kptr, or freed with
+ * bpf_crypto_ctx_release().
+ */
+__bpf_kfunc struct bpf_crypto_ctx *
+bpf_crypto_ctx_acquire(struct bpf_crypto_ctx *ctx)
+{
+ if (!refcount_inc_not_zero(&ctx->usage))
+ return NULL;
+ return ctx;
+}
+
+/**
+ * bpf_crypto_ctx_release() - Release a previously acquired BPF crypto context.
+ * @ctx: The crypto context being released.
+ *
+ * Releases a previously acquired reference to a BPF crypto context. When the final
+ * reference of the BPF crypto context has been released, its memory
+ * will be released.
+ */
+__bpf_kfunc void bpf_crypto_ctx_release(struct bpf_crypto_ctx *ctx)
+{
+ if (refcount_dec_and_test(&ctx->usage))
+ call_rcu(&ctx->rcu, crypto_free_cb);
+}
+
+static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx,
+ const struct bpf_dynptr_kern *src,
+ const struct bpf_dynptr_kern *dst,
+ const struct bpf_dynptr_kern *siv,
+ bool decrypt)
+{
+ u32 src_len, dst_len, siv_len;
+ const u8 *psrc;
+ u8 *pdst, *piv;
+ int err;
+
+ if (__bpf_dynptr_is_rdonly(dst))
+ return -EINVAL;
+
+ siv_len = siv ? __bpf_dynptr_size(siv) : 0;
+ src_len = __bpf_dynptr_size(src);
+ dst_len = __bpf_dynptr_size(dst);
+ if (!src_len || !dst_len || src_len > dst_len)
+ return -EINVAL;
+
+ if (siv_len != ctx->siv_len)
+ return -EINVAL;
+
+ psrc = __bpf_dynptr_data(src, src_len);
+ if (!psrc)
+ return -EINVAL;
+ pdst = __bpf_dynptr_data_rw(dst, dst_len);
+ if (!pdst)
+ return -EINVAL;
+
+ piv = siv_len ? __bpf_dynptr_data_rw(siv, siv_len) : NULL;
+ if (siv_len && !piv)
+ return -EINVAL;
+
+ err = decrypt ? ctx->type->decrypt(ctx->tfm, psrc, pdst, src_len, piv)
+ : ctx->type->encrypt(ctx->tfm, psrc, pdst, src_len, piv);
+
+ return err;
+}
+
+/**
+ * bpf_crypto_decrypt() - Decrypt buffer using configured context and IV provided.
+ * @ctx: The crypto context being used. The ctx must be a trusted pointer.
+ * @src: bpf_dynptr to the encrypted data. Must be a trusted pointer.
+ * @dst: bpf_dynptr to the buffer where to store the result. Must be a trusted pointer.
+ * @siv__nullable: bpf_dynptr to IV data and state data to be used by decryptor. May be NULL.
+ *
+ * Decrypts provided buffer using IV data and the crypto context. Crypto context must be configured.
+ */
+__bpf_kfunc int bpf_crypto_decrypt(struct bpf_crypto_ctx *ctx,
+ const struct bpf_dynptr *src,
+ const struct bpf_dynptr *dst,
+ const struct bpf_dynptr *siv__nullable)
+{
+ const struct bpf_dynptr_kern *src_kern = (struct bpf_dynptr_kern *)src;
+ const struct bpf_dynptr_kern *dst_kern = (struct bpf_dynptr_kern *)dst;
+ const struct bpf_dynptr_kern *siv_kern = (struct bpf_dynptr_kern *)siv__nullable;
+
+ return bpf_crypto_crypt(ctx, src_kern, dst_kern, siv_kern, true);
+}
+
+/**
+ * bpf_crypto_encrypt() - Encrypt buffer using configured context and IV provided.
+ * @ctx: The crypto context being used. The ctx must be a trusted pointer.
+ * @src: bpf_dynptr to the plain data. Must be a trusted pointer.
+ * @dst: bpf_dynptr to the buffer where to store the result. Must be a trusted pointer.
+ * @siv__nullable: bpf_dynptr to IV data and state data to be used by decryptor. May be NULL.
+ *
+ * Encrypts provided buffer using IV data and the crypto context. Crypto context must be configured.
+ */
+__bpf_kfunc int bpf_crypto_encrypt(struct bpf_crypto_ctx *ctx,
+ const struct bpf_dynptr *src,
+ const struct bpf_dynptr *dst,
+ const struct bpf_dynptr *siv__nullable)
+{
+ const struct bpf_dynptr_kern *src_kern = (struct bpf_dynptr_kern *)src;
+ const struct bpf_dynptr_kern *dst_kern = (struct bpf_dynptr_kern *)dst;
+ const struct bpf_dynptr_kern *siv_kern = (struct bpf_dynptr_kern *)siv__nullable;
+
+ return bpf_crypto_crypt(ctx, src_kern, dst_kern, siv_kern, false);
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(crypt_init_kfunc_btf_ids)
+BTF_ID_FLAGS(func, bpf_crypto_ctx_create, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_crypto_ctx_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_crypto_ctx_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
+BTF_KFUNCS_END(crypt_init_kfunc_btf_ids)
+
+static const struct btf_kfunc_id_set crypt_init_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &crypt_init_kfunc_btf_ids,
+};
+
+BTF_KFUNCS_START(crypt_kfunc_btf_ids)
+BTF_ID_FLAGS(func, bpf_crypto_decrypt, KF_RCU)
+BTF_ID_FLAGS(func, bpf_crypto_encrypt, KF_RCU)
+BTF_KFUNCS_END(crypt_kfunc_btf_ids)
+
+static const struct btf_kfunc_id_set crypt_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &crypt_kfunc_btf_ids,
+};
+
+BTF_ID_LIST(bpf_crypto_dtor_ids)
+BTF_ID(struct, bpf_crypto_ctx)
+BTF_ID(func, bpf_crypto_ctx_release)
+
+static int __init crypto_kfunc_init(void)
+{
+ int ret;
+ const struct btf_id_dtor_kfunc bpf_crypto_dtors[] = {
+ {
+ .btf_id = bpf_crypto_dtor_ids[0],
+ .kfunc_btf_id = bpf_crypto_dtor_ids[1]
+ },
+ };
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &crypt_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &crypt_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &crypt_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL,
+ &crypt_init_kfunc_set);
+ return ret ?: register_btf_id_dtor_kfuncs(bpf_crypto_dtors,
+ ARRAY_SIZE(bpf_crypto_dtors),
+ THIS_MODULE);
+}
+
+late_initcall(crypto_kfunc_init);
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index fe019dbdb3f0..2625601de76e 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -48,6 +48,7 @@
#include <net/xdp.h>
#include <linux/filter.h>
#include <trace/events/xdp.h>
+#include <linux/btf_ids.h>
#define DEV_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
@@ -64,7 +65,6 @@ struct xdp_dev_bulk_queue {
struct bpf_dtab_netdev {
struct net_device *dev; /* must be first member, due to tracepoint */
struct hlist_node index_hlist;
- struct bpf_dtab *dtab;
struct bpf_prog *xdp_prog;
struct rcu_head rcu;
unsigned int idx;
@@ -83,7 +83,6 @@ struct bpf_dtab {
u32 n_buckets;
};
-static DEFINE_PER_CPU(struct list_head, dev_flush_list);
static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);
@@ -107,7 +106,7 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)];
}
-static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
+static int dev_map_alloc_check(union bpf_attr *attr)
{
u32 valsize = attr->value_size;
@@ -121,22 +120,28 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
attr->map_flags & ~DEV_CREATE_FLAG_MASK)
return -EINVAL;
+ if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
+ /* Hash table size must be power of 2; roundup_pow_of_two()
+ * can overflow into UB on 32-bit arches
+ */
+ if (attr->max_entries > 1UL << 31)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
+{
/* Lookup returns a pointer straight to dev->ifindex, so make sure the
* verifier prevents writes from the BPF side
*/
attr->map_flags |= BPF_F_RDONLY_PROG;
-
-
bpf_map_init_from_attr(&dtab->map, attr);
if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
+ /* Hash table size must be power of 2 */
dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries);
-
- if (!dtab->n_buckets) /* Overflow check */
- return -EINVAL;
- }
-
- if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets,
dtab->map.numa_node);
if (!dtab->dev_index_head)
@@ -159,16 +164,13 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
struct bpf_dtab *dtab;
int err;
- if (!capable(CAP_NET_ADMIN))
- return ERR_PTR(-EPERM);
-
- dtab = kzalloc(sizeof(*dtab), GFP_USER | __GFP_ACCOUNT);
+ dtab = bpf_map_area_alloc(sizeof(*dtab), NUMA_NO_NODE);
if (!dtab)
return ERR_PTR(-ENOMEM);
err = dev_map_init_map(dtab, attr);
if (err) {
- kfree(dtab);
+ bpf_map_area_free(dtab);
return ERR_PTR(err);
}
@@ -182,7 +184,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
static void dev_map_free(struct bpf_map *map)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
- int i;
+ u32 i;
/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
* so the programs (can be more than one that used this map) were
@@ -198,7 +200,14 @@ static void dev_map_free(struct bpf_map *map)
list_del_rcu(&dtab->list);
spin_unlock(&dev_map_lock);
- bpf_clear_redirect_map(map);
+ /* bpf_redirect_info->map is assigned in __bpf_xdp_redirect_map()
+ * during NAPI callback and cleared after the XDP redirect. There is no
+ * explicit RCU read section which protects bpf_redirect_info->map but
+ * local_bh_disable() also marks the beginning an RCU section. This
+ * makes the complete softirq callback RCU protected. Thus after
+ * following synchronize_rcu() there no bpf_redirect_info->map == map
+ * assignment.
+ */
synchronize_rcu();
/* Make sure prior __dev_map_entry_free() have completed. */
@@ -239,7 +248,7 @@ static void dev_map_free(struct bpf_map *map)
bpf_map_area_free(dtab->netdev_map);
}
- kfree(dtab);
+ bpf_map_area_free(dtab);
}
static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
@@ -324,9 +333,11 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
struct xdp_frame **frames, int n,
- struct net_device *dev)
+ struct net_device *tx_dev,
+ struct net_device *rx_dev)
{
- struct xdp_txq_info txq = { .dev = dev };
+ struct xdp_txq_info txq = { .dev = tx_dev };
+ struct xdp_rxq_info rxq = { .dev = rx_dev };
struct xdp_buff xdp;
int i, nframes = 0;
@@ -337,6 +348,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
xdp_convert_frame_to_buff(xdpf, &xdp);
xdp.txq = &txq;
+ xdp.rxq = &rxq;
act = bpf_prog_run_xdp(xdp_prog, &xdp);
switch (act) {
@@ -351,7 +363,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
bpf_warn_invalid_xdp_action(NULL, xdp_prog, act);
fallthrough;
case XDP_ABORTED:
- trace_xdp_exception(dev, xdp_prog, act);
+ trace_xdp_exception(tx_dev, xdp_prog, act);
fallthrough;
case XDP_DROP:
xdp_return_frame_rx_napi(xdpf);
@@ -379,7 +391,7 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
}
if (bq->xdp_prog) {
- to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev);
+ to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev, bq->dev_rx);
if (!to_send)
goto out;
}
@@ -408,9 +420,8 @@ out:
* driver before returning from its napi->poll() routine. See the comment above
* xdp_do_flush() in filter.c.
*/
-void __dev_flush(void)
+void __dev_flush(struct list_head *flush_list)
{
- struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
struct xdp_dev_bulk_queue *bq, *tmp;
list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
@@ -445,7 +456,6 @@ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
struct net_device *dev_rx, struct bpf_prog *xdp_prog)
{
- struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
@@ -459,6 +469,8 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
* are only ever modified together.
*/
if (!bq->dev_rx) {
+ struct list_head *flush_list = bpf_net_ctx_get_dev_flush_list();
+
bq->dev_rx = dev_rx;
bq->xdp_prog = xdp_prog;
list_add(&bq->flush_node, flush_list);
@@ -473,10 +485,14 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
{
int err;
- if (!dev->netdev_ops->ndo_xdp_xmit)
+ if (!(dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT))
+ return -EOPNOTSUPP;
+
+ if (unlikely(!(dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT_SG) &&
+ xdp_frame_has_frags(xdpf)))
return -EOPNOTSUPP;
- err = xdp_ok_fwd_dev(dev, xdpf->len);
+ err = xdp_ok_fwd_dev(dev, xdp_get_frame_len(xdpf));
if (unlikely(err))
return err;
@@ -531,11 +547,17 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
{
- if (!obj ||
- !obj->dev->netdev_ops->ndo_xdp_xmit)
+ if (!obj)
+ return false;
+
+ if (!(obj->dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT))
return false;
- if (xdp_ok_fwd_dev(obj->dev, xdpf->len))
+ if (unlikely(!(obj->dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT_SG) &&
+ xdp_frame_has_frags(xdpf)))
+ return false;
+
+ if (xdp_ok_fwd_dev(obj->dev, xdp_get_frame_len(xdpf)))
return false;
return true;
@@ -656,7 +678,7 @@ int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx,
}
int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
- struct bpf_prog *xdp_prog)
+ const struct bpf_prog *xdp_prog)
{
int err;
@@ -679,7 +701,7 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst,
struct sk_buff *skb,
- struct bpf_prog *xdp_prog)
+ const struct bpf_prog *xdp_prog)
{
struct sk_buff *nskb;
int err;
@@ -698,8 +720,8 @@ static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst,
}
int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
- struct bpf_prog *xdp_prog, struct bpf_map *map,
- bool exclude_ingress)
+ const struct bpf_prog *xdp_prog,
+ struct bpf_map *map, bool exclude_ingress)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *dst, *last_dst = NULL;
@@ -742,9 +764,6 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
for (i = 0; i < dtab->n_buckets; i++) {
head = dev_map_index_hash(dtab, i);
hlist_for_each_entry_safe(dst, next, head, index_hlist) {
- if (!dst)
- continue;
-
if (is_ifindex_excluded(excluded_devices, num_excluded,
dst->dev->ifindex))
continue;
@@ -798,26 +817,28 @@ static void __dev_map_entry_free(struct rcu_head *rcu)
kfree(dev);
}
-static int dev_map_delete_elem(struct bpf_map *map, void *key)
+static long dev_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *old_dev;
- int k = *(u32 *)key;
+ u32 k = *(u32 *)key;
if (k >= map->max_entries)
return -EINVAL;
old_dev = unrcu_pointer(xchg(&dtab->netdev_map[k], NULL));
- if (old_dev)
+ if (old_dev) {
call_rcu(&old_dev->rcu, __dev_map_entry_free);
+ atomic_dec((atomic_t *)&dtab->items);
+ }
return 0;
}
-static int dev_map_hash_delete_elem(struct bpf_map *map, void *key)
+static long dev_map_hash_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *old_dev;
- int k = *(u32 *)key;
+ u32 k = *(u32 *)key;
unsigned long flags;
int ret = -ENOENT;
@@ -844,7 +865,7 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
struct bpf_dtab_netdev *dev;
dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev),
- GFP_ATOMIC | __GFP_NOWARN,
+ GFP_NOWAIT,
dtab->map.numa_node);
if (!dev)
return ERR_PTR(-ENOMEM);
@@ -858,12 +879,12 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
BPF_PROG_TYPE_XDP, false);
if (IS_ERR(prog))
goto err_put_dev;
- if (prog->expected_attach_type != BPF_XDP_DEVMAP)
+ if (prog->expected_attach_type != BPF_XDP_DEVMAP ||
+ !bpf_prog_map_compatible(&dtab->map, prog))
goto err_put_prog;
}
dev->idx = idx;
- dev->dtab = dtab;
if (prog) {
dev->xdp_prog = prog;
dev->val.bpf_prog.id = prog->aux->id;
@@ -883,8 +904,8 @@ err_out:
return ERR_PTR(-EINVAL);
}
-static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
- void *key, void *value, u64 map_flags)
+static long __dev_map_update_elem(struct net *net, struct bpf_map *map,
+ void *key, void *value, u64 map_flags)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *dev, *old_dev;
@@ -919,19 +940,21 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
old_dev = unrcu_pointer(xchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev)));
if (old_dev)
call_rcu(&old_dev->rcu, __dev_map_entry_free);
+ else
+ atomic_inc((atomic_t *)&dtab->items);
return 0;
}
-static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long dev_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
return __dev_map_update_elem(current->nsproxy->net_ns,
map, key, value, map_flags);
}
-static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
- void *key, void *value, u64 map_flags)
+static long __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
+ void *key, void *value, u64 map_flags)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *dev, *old_dev;
@@ -983,30 +1006,45 @@ out_err:
return err;
}
-static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
return __dev_map_hash_update_elem(current->nsproxy->net_ns,
map, key, value, map_flags);
}
-static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
+static long dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
{
return __bpf_xdp_redirect_map(map, ifindex, flags,
BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
__dev_map_lookup_elem);
}
-static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
+static long dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
{
return __bpf_xdp_redirect_map(map, ifindex, flags,
BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
__dev_map_hash_lookup_elem);
}
-static int dev_map_btf_id;
+static u64 dev_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ u64 usage = sizeof(struct bpf_dtab);
+
+ if (map->map_type == BPF_MAP_TYPE_DEVMAP_HASH)
+ usage += (u64)dtab->n_buckets * sizeof(struct hlist_head);
+ else
+ usage += (u64)map->max_entries * sizeof(struct bpf_dtab_netdev *);
+ usage += atomic_read((atomic_t *)&dtab->items) *
+ (u64)sizeof(struct bpf_dtab_netdev);
+ return usage;
+}
+
+BTF_ID_LIST_SINGLE(dev_map_btf_ids, struct, bpf_dtab)
const struct bpf_map_ops dev_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = dev_map_alloc_check,
.map_alloc = dev_map_alloc,
.map_free = dev_map_free,
.map_get_next_key = dev_map_get_next_key,
@@ -1014,14 +1052,14 @@ const struct bpf_map_ops dev_map_ops = {
.map_update_elem = dev_map_update_elem,
.map_delete_elem = dev_map_delete_elem,
.map_check_btf = map_check_no_btf,
- .map_btf_name = "bpf_dtab",
- .map_btf_id = &dev_map_btf_id,
+ .map_mem_usage = dev_map_mem_usage,
+ .map_btf_id = &dev_map_btf_ids[0],
.map_redirect = dev_map_redirect,
};
-static int dev_map_hash_map_btf_id;
const struct bpf_map_ops dev_map_hash_ops = {
.map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = dev_map_alloc_check,
.map_alloc = dev_map_alloc,
.map_free = dev_map_free,
.map_get_next_key = dev_map_hash_get_next_key,
@@ -1029,8 +1067,8 @@ const struct bpf_map_ops dev_map_hash_ops = {
.map_update_elem = dev_map_hash_update_elem,
.map_delete_elem = dev_map_hash_delete_elem,
.map_check_btf = map_check_no_btf,
- .map_btf_name = "bpf_dtab",
- .map_btf_id = &dev_map_hash_map_btf_id,
+ .map_mem_usage = dev_map_mem_usage,
+ .map_btf_id = &dev_map_btf_ids[0],
.map_redirect = dev_hash_map_redirect,
};
@@ -1100,9 +1138,11 @@ static int dev_map_notification(struct notifier_block *notifier,
if (!dev || netdev != dev->dev)
continue;
odev = unrcu_pointer(cmpxchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev), NULL));
- if (dev == odev)
+ if (dev == odev) {
call_rcu(&dev->rcu,
__dev_map_entry_free);
+ atomic_dec((atomic_t *)&dtab->items);
+ }
}
}
rcu_read_unlock();
@@ -1119,15 +1159,11 @@ static struct notifier_block dev_map_notifier = {
static int __init dev_map_init(void)
{
- int cpu;
-
/* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
offsetof(struct _bpf_dtab_netdev, dev));
register_netdevice_notifier(&dev_map_notifier);
- for_each_possible_cpu(cpu)
- INIT_LIST_HEAD(&per_cpu(dev_flush_list, cpu));
return 0;
}
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index 7b4afb7d96db..f8a3c7eb451e 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -87,6 +87,17 @@ const char *const bpf_alu_string[16] = {
[BPF_END >> 4] = "endian",
};
+static const char *const bpf_alu_sign_string[16] = {
+ [BPF_DIV >> 4] = "s/=",
+ [BPF_MOD >> 4] = "s%=",
+};
+
+static const char *const bpf_movsx_string[4] = {
+ [0] = "(s8)",
+ [1] = "(s16)",
+ [3] = "(s32)",
+};
+
static const char *const bpf_atomic_alu_string[16] = {
[BPF_ADD >> 4] = "add",
[BPF_AND >> 4] = "and",
@@ -101,6 +112,12 @@ static const char *const bpf_ldst_string[] = {
[BPF_DW >> 3] = "u64",
};
+static const char *const bpf_ldsx_string[] = {
+ [BPF_W >> 3] = "s32",
+ [BPF_H >> 3] = "s16",
+ [BPF_B >> 3] = "s8",
+};
+
static const char *const bpf_jmp_string[16] = {
[BPF_JA >> 4] = "jmp",
[BPF_JEQ >> 4] = "==",
@@ -128,6 +145,44 @@ static void print_bpf_end_insn(bpf_insn_print_t verbose,
insn->imm, insn->dst_reg);
}
+static void print_bpf_bswap_insn(bpf_insn_print_t verbose,
+ void *private_data,
+ const struct bpf_insn *insn)
+{
+ verbose(private_data, "(%02x) r%d = bswap%d r%d\n",
+ insn->code, insn->dst_reg,
+ insn->imm, insn->dst_reg);
+}
+
+static bool is_sdiv_smod(const struct bpf_insn *insn)
+{
+ return (BPF_OP(insn->code) == BPF_DIV || BPF_OP(insn->code) == BPF_MOD) &&
+ insn->off == 1;
+}
+
+static bool is_movsx(const struct bpf_insn *insn)
+{
+ return BPF_OP(insn->code) == BPF_MOV &&
+ (insn->off == 8 || insn->off == 16 || insn->off == 32);
+}
+
+static bool is_addr_space_cast(const struct bpf_insn *insn)
+{
+ return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) &&
+ insn->off == BPF_ADDR_SPACE_CAST;
+}
+
+/* Special (internal-only) form of mov, used to resolve per-CPU addrs:
+ * dst_reg = src_reg + <percpu_base_off>
+ * BPF_ADDR_PERCPU is used as a special insn->off value.
+ */
+#define BPF_ADDR_PERCPU (-1)
+
+static inline bool is_mov_percpu_addr(const struct bpf_insn *insn)
+{
+ return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->off == BPF_ADDR_PERCPU;
+}
+
void print_bpf_insn(const struct bpf_insn_cbs *cbs,
const struct bpf_insn *insn,
bool allow_ptr_leaks)
@@ -138,7 +193,7 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
if (class == BPF_ALU || class == BPF_ALU64) {
if (BPF_OP(insn->code) == BPF_END) {
if (class == BPF_ALU64)
- verbose(cbs->private_data, "BUG_alu64_%02x\n", insn->code);
+ print_bpf_bswap_insn(verbose, cbs->private_data, insn);
else
print_bpf_end_insn(verbose, cbs->private_data, insn);
} else if (BPF_OP(insn->code) == BPF_NEG) {
@@ -146,18 +201,28 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
insn->code, class == BPF_ALU ? 'w' : 'r',
insn->dst_reg, class == BPF_ALU ? 'w' : 'r',
insn->dst_reg);
+ } else if (is_addr_space_cast(insn)) {
+ verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %u, %u)\n",
+ insn->code, insn->dst_reg,
+ insn->src_reg, ((u32)insn->imm) >> 16, (u16)insn->imm);
+ } else if (is_mov_percpu_addr(insn)) {
+ verbose(cbs->private_data, "(%02x) r%d = &(void __percpu *)(r%d)\n",
+ insn->code, insn->dst_reg, insn->src_reg);
} else if (BPF_SRC(insn->code) == BPF_X) {
- verbose(cbs->private_data, "(%02x) %c%d %s %c%d\n",
+ verbose(cbs->private_data, "(%02x) %c%d %s %s%c%d\n",
insn->code, class == BPF_ALU ? 'w' : 'r',
insn->dst_reg,
- bpf_alu_string[BPF_OP(insn->code) >> 4],
+ is_sdiv_smod(insn) ? bpf_alu_sign_string[BPF_OP(insn->code) >> 4]
+ : bpf_alu_string[BPF_OP(insn->code) >> 4],
+ is_movsx(insn) ? bpf_movsx_string[(insn->off >> 3) - 1] : "",
class == BPF_ALU ? 'w' : 'r',
insn->src_reg);
} else {
verbose(cbs->private_data, "(%02x) %c%d %s %d\n",
insn->code, class == BPF_ALU ? 'w' : 'r',
insn->dst_reg,
- bpf_alu_string[BPF_OP(insn->code) >> 4],
+ is_sdiv_smod(insn) ? bpf_alu_sign_string[BPF_OP(insn->code) >> 4]
+ : bpf_alu_string[BPF_OP(insn->code) >> 4],
insn->imm);
}
} else if (class == BPF_STX) {
@@ -202,6 +267,18 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
BPF_SIZE(insn->code) == BPF_DW ? "64" : "",
bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
insn->dst_reg, insn->off, insn->src_reg);
+ } else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+ insn->imm == BPF_LOAD_ACQ) {
+ verbose(cbs->private_data, "(%02x) r%d = load_acquire((%s *)(r%d %+d))\n",
+ insn->code, insn->dst_reg,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->src_reg, insn->off);
+ } else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+ insn->imm == BPF_STORE_REL) {
+ verbose(cbs->private_data, "(%02x) store_release((%s *)(r%d %+d), r%d)\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg, insn->off, insn->src_reg);
} else {
verbose(cbs->private_data, "BUG_%02x\n", insn->code);
}
@@ -218,13 +295,15 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
verbose(cbs->private_data, "BUG_st_%02x\n", insn->code);
}
} else if (class == BPF_LDX) {
- if (BPF_MODE(insn->code) != BPF_MEM) {
+ if (BPF_MODE(insn->code) != BPF_MEM && BPF_MODE(insn->code) != BPF_MEMSX) {
verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code);
return;
}
verbose(cbs->private_data, "(%02x) r%d = *(%s *)(r%d %+d)\n",
insn->code, insn->dst_reg,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ BPF_MODE(insn->code) == BPF_MEM ?
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3] :
+ bpf_ldsx_string[BPF_SIZE(insn->code) >> 3],
insn->src_reg, insn->off);
} else if (class == BPF_LD) {
if (BPF_MODE(insn->code) == BPF_ABS) {
@@ -279,6 +358,16 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
} else if (insn->code == (BPF_JMP | BPF_JA)) {
verbose(cbs->private_data, "(%02x) goto pc%+d\n",
insn->code, insn->off);
+ } else if (insn->code == (BPF_JMP | BPF_JA | BPF_X)) {
+ verbose(cbs->private_data, "(%02x) gotox r%d\n",
+ insn->code, insn->dst_reg);
+ } else if (insn->code == (BPF_JMP | BPF_JCOND) &&
+ insn->src_reg == BPF_MAY_GOTO) {
+ verbose(cbs->private_data, "(%02x) may_goto pc%+d\n",
+ insn->code, insn->off);
+ } else if (insn->code == (BPF_JMP32 | BPF_JA)) {
+ verbose(cbs->private_data, "(%02x) gotol pc%+d\n",
+ insn->code, insn->imm);
} else if (insn->code == (BPF_JMP | BPF_EXIT)) {
verbose(cbs->private_data, "(%02x) exit\n", insn->code);
} else if (BPF_SRC(insn->code) == BPF_X) {
@@ -295,7 +384,7 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
insn->code, class == BPF_JMP32 ? 'w' : 'r',
insn->dst_reg,
bpf_jmp_string[BPF_OP(insn->code) >> 4],
- insn->imm, insn->off);
+ (u32)insn->imm, insn->off);
}
} else {
verbose(cbs->private_data, "(%02x) %s\n",
diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c
index 2444bd15cc2d..b77db7413f8c 100644
--- a/kernel/bpf/dispatcher.c
+++ b/kernel/bpf/dispatcher.c
@@ -4,6 +4,7 @@
#include <linux/hash.h>
#include <linux/bpf.h>
#include <linux/filter.h>
+#include <linux/static_call.h>
/* The BPF dispatcher is a multiway branch code generator. The
* dispatcher is a mechanism to avoid the performance penalty of an
@@ -85,12 +86,12 @@ static bool bpf_dispatcher_remove_prog(struct bpf_dispatcher *d,
return false;
}
-int __weak arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs)
+int __weak arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs)
{
return -ENOTSUPP;
}
-static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image)
+static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image, void *buf)
{
s64 ips[BPF_DISPATCHER_MAX] = {}, *ipsp = &ips[0];
int i;
@@ -99,34 +100,38 @@ static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image)
if (d->progs[i].prog)
*ipsp++ = (s64)(uintptr_t)d->progs[i].prog->bpf_func;
}
- return arch_prepare_bpf_dispatcher(image, &ips[0], d->num_progs);
+ return arch_prepare_bpf_dispatcher(image, buf, &ips[0], d->num_progs);
}
static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs)
{
- void *old, *new;
- u32 noff;
- int err;
-
- if (!prev_num_progs) {
- old = NULL;
- noff = 0;
- } else {
- old = d->image + d->image_off;
+ void *new, *tmp;
+ u32 noff = 0;
+
+ if (prev_num_progs)
noff = d->image_off ^ (PAGE_SIZE / 2);
- }
new = d->num_progs ? d->image + noff : NULL;
+ tmp = d->num_progs ? d->rw_image + noff : NULL;
if (new) {
- if (bpf_dispatcher_prepare(d, new))
+ /* Prepare the dispatcher in d->rw_image. Then use
+ * bpf_arch_text_copy to update d->image, which is RO+X.
+ */
+ if (bpf_dispatcher_prepare(d, new, tmp))
+ return;
+ if (IS_ERR(bpf_arch_text_copy(new, tmp, PAGE_SIZE / 2)))
return;
}
- err = bpf_arch_text_poke(d->func, BPF_MOD_JUMP, old, new);
- if (err || !new)
- return;
+ __BPF_DISPATCHER_UPDATE(d, new ?: (void *)&bpf_dispatcher_nop_func);
+
+ /* Make sure all the callers executing the previous/old half of the
+ * image leave it, so following update call can modify it safely.
+ */
+ synchronize_rcu();
- d->image_off = noff;
+ if (new)
+ d->image_off = noff;
}
void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
@@ -140,10 +145,17 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
mutex_lock(&d->mutex);
if (!d->image) {
- d->image = bpf_jit_alloc_exec_page();
+ d->image = bpf_prog_pack_alloc(PAGE_SIZE, bpf_jit_fill_hole_with_zero);
if (!d->image)
goto out;
- bpf_image_ksym_add(d->image, &d->ksym);
+ d->rw_image = bpf_jit_alloc_exec(PAGE_SIZE);
+ if (!d->rw_image) {
+ bpf_prog_pack_free(d->image, PAGE_SIZE);
+ d->image = NULL;
+ goto out;
+ }
+ bpf_image_ksym_init(d->image, PAGE_SIZE, &d->ksym);
+ bpf_image_ksym_add(&d->ksym);
}
prev_num_progs = d->num_progs;
diff --git a/kernel/bpf/dmabuf_iter.c b/kernel/bpf/dmabuf_iter.c
new file mode 100644
index 000000000000..4dd7ef7c145c
--- /dev/null
+++ b/kernel/bpf/dmabuf_iter.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025 Google LLC */
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <linux/dma-buf.h>
+#include <linux/kernel.h>
+#include <linux/seq_file.h>
+
+static void *dmabuf_iter_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ if (*pos)
+ return NULL;
+
+ return dma_buf_iter_begin();
+}
+
+static void *dmabuf_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct dma_buf *dmabuf = v;
+
+ ++*pos;
+
+ return dma_buf_iter_next(dmabuf);
+}
+
+struct bpf_iter__dmabuf {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct dma_buf *, dmabuf);
+};
+
+static int __dmabuf_seq_show(struct seq_file *seq, void *v, bool in_stop)
+{
+ struct bpf_iter_meta meta = {
+ .seq = seq,
+ };
+ struct bpf_iter__dmabuf ctx = {
+ .meta = &meta,
+ .dmabuf = v,
+ };
+ struct bpf_prog *prog = bpf_iter_get_info(&meta, in_stop);
+
+ if (prog)
+ return bpf_iter_run_prog(prog, &ctx);
+
+ return 0;
+}
+
+static int dmabuf_iter_seq_show(struct seq_file *seq, void *v)
+{
+ return __dmabuf_seq_show(seq, v, false);
+}
+
+static void dmabuf_iter_seq_stop(struct seq_file *seq, void *v)
+{
+ struct dma_buf *dmabuf = v;
+
+ if (dmabuf)
+ dma_buf_put(dmabuf);
+}
+
+static const struct seq_operations dmabuf_iter_seq_ops = {
+ .start = dmabuf_iter_seq_start,
+ .next = dmabuf_iter_seq_next,
+ .stop = dmabuf_iter_seq_stop,
+ .show = dmabuf_iter_seq_show,
+};
+
+static void bpf_iter_dmabuf_show_fdinfo(const struct bpf_iter_aux_info *aux,
+ struct seq_file *seq)
+{
+ seq_puts(seq, "dmabuf iter\n");
+}
+
+static const struct bpf_iter_seq_info dmabuf_iter_seq_info = {
+ .seq_ops = &dmabuf_iter_seq_ops,
+ .init_seq_private = NULL,
+ .fini_seq_private = NULL,
+ .seq_priv_size = 0,
+};
+
+static struct bpf_iter_reg bpf_dmabuf_reg_info = {
+ .target = "dmabuf",
+ .feature = BPF_ITER_RESCHED,
+ .show_fdinfo = bpf_iter_dmabuf_show_fdinfo,
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__dmabuf, dmabuf),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+ .seq_info = &dmabuf_iter_seq_info,
+};
+
+DEFINE_BPF_ITER_FUNC(dmabuf, struct bpf_iter_meta *meta, struct dma_buf *dmabuf)
+BTF_ID_LIST_SINGLE(bpf_dmabuf_btf_id, struct, dma_buf)
+
+static int __init dmabuf_iter_init(void)
+{
+ bpf_dmabuf_reg_info.ctx_arg_info[0].btf_id = bpf_dmabuf_btf_id[0];
+ return bpf_iter_reg_target(&bpf_dmabuf_reg_info);
+}
+
+late_initcall(dmabuf_iter_init);
+
+struct bpf_iter_dmabuf {
+ /*
+ * opaque iterator state; having __u64 here allows to preserve correct
+ * alignment requirements in vmlinux.h, generated from BTF
+ */
+ __u64 __opaque[1];
+} __aligned(8);
+
+/* Non-opaque version of bpf_iter_dmabuf */
+struct bpf_iter_dmabuf_kern {
+ struct dma_buf *dmabuf;
+} __aligned(8);
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_iter_dmabuf_new(struct bpf_iter_dmabuf *it)
+{
+ struct bpf_iter_dmabuf_kern *kit = (void *)it;
+
+ BUILD_BUG_ON(sizeof(*kit) > sizeof(*it));
+ BUILD_BUG_ON(__alignof__(*kit) != __alignof__(*it));
+
+ kit->dmabuf = NULL;
+ return 0;
+}
+
+__bpf_kfunc struct dma_buf *bpf_iter_dmabuf_next(struct bpf_iter_dmabuf *it)
+{
+ struct bpf_iter_dmabuf_kern *kit = (void *)it;
+
+ if (kit->dmabuf)
+ kit->dmabuf = dma_buf_iter_next(kit->dmabuf);
+ else
+ kit->dmabuf = dma_buf_iter_begin();
+
+ return kit->dmabuf;
+}
+
+__bpf_kfunc void bpf_iter_dmabuf_destroy(struct bpf_iter_dmabuf *it)
+{
+ struct bpf_iter_dmabuf_kern *kit = (void *)it;
+
+ if (kit->dmabuf)
+ dma_buf_put(kit->dmabuf);
+}
+
+__bpf_kfunc_end_defs();
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index d29af9988f37..c8a9b27f8663 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -7,12 +7,16 @@
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/rculist_nulls.h>
+#include <linux/rcupdate_wait.h>
#include <linux/random.h>
#include <uapi/linux/btf.h>
#include <linux/rcupdate_trace.h>
+#include <linux/btf_ids.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
#include "map_in_map.h"
+#include <linux/bpf_mem_alloc.h>
+#include <asm/rqspinlock.h>
#define HTAB_CREATE_FLAG_MASK \
(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \
@@ -60,30 +64,22 @@
*
* As regular device interrupt handlers and soft interrupts are forced into
* thread context, the existing code which does
- * spin_lock*(); alloc(GPF_ATOMIC); spin_unlock*();
+ * spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*();
* just works.
*
* In theory the BPF locks could be converted to regular spinlocks as well,
* but the bucket locks and percpu_freelist locks can be taken from
* arbitrary contexts (perf, kprobes, tracepoints) which are required to be
- * atomic contexts even on RT. These mechanisms require preallocated maps,
- * so there is no need to invoke memory allocations within the lock held
- * sections.
- *
- * BPF maps which need dynamic allocation are only used from (forced)
- * thread context on RT and can therefore use regular spinlocks which in
- * turn allows to invoke memory allocations from the lock held section.
- *
- * On a non RT kernel this distinction is neither possible nor required.
- * spinlock maps to raw_spinlock and the extra code is optimized out by the
- * compiler.
+ * atomic contexts even on RT. Before the introduction of bpf_mem_alloc,
+ * it is only safe to use raw spinlock for preallocated hash map on a RT kernel,
+ * because there is no memory allocation within the lock held sections. However
+ * after hash map was fully converted to use bpf_mem_alloc, there will be
+ * non-synchronous memory allocation for non-preallocated hash map, so it is
+ * safe to always use raw spinlock for bucket lock.
*/
struct bucket {
struct hlist_nulls_head head;
- union {
- raw_spinlock_t raw_lock;
- spinlock_t lock;
- };
+ rqspinlock_t raw_lock;
};
#define HASHTAB_MAP_LOCK_COUNT 8
@@ -91,6 +87,8 @@ struct bucket {
struct bpf_htab {
struct bpf_map map;
+ struct bpf_mem_alloc ma;
+ struct bpf_mem_alloc pcpu_ma;
struct bucket *buckets;
void *elems;
union {
@@ -98,12 +96,15 @@ struct bpf_htab {
struct bpf_lru lru;
};
struct htab_elem *__percpu *extra_elems;
- atomic_t count; /* number of elements in this hashtable */
+ /* number of elements in non-preallocated hashtable are kept
+ * in either pcount or count
+ */
+ struct percpu_counter pcount;
+ atomic_t count;
+ bool use_percpu_counter;
u32 n_buckets; /* number of hash buckets */
u32 elem_size; /* size of each element in bytes */
u32 hashrnd;
- struct lock_class_key lockdep_key;
- int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT];
};
/* each htab element is struct htab_elem + key + value */
@@ -113,14 +114,14 @@ struct htab_elem {
struct {
void *padding;
union {
- struct bpf_htab *htab;
struct pcpu_freelist_node fnode;
struct htab_elem *batch_flink;
};
};
};
union {
- struct rcu_head rcu;
+ /* pointer to per-cpu pointer */
+ void *ptr_to_pptr;
struct bpf_lru_node lru_node;
};
u32 hash;
@@ -132,65 +133,32 @@ static inline bool htab_is_prealloc(const struct bpf_htab *htab)
return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
}
-static inline bool htab_use_raw_lock(const struct bpf_htab *htab)
-{
- return (!IS_ENABLED(CONFIG_PREEMPT_RT) || htab_is_prealloc(htab));
-}
-
static void htab_init_buckets(struct bpf_htab *htab)
{
- unsigned i;
+ unsigned int i;
for (i = 0; i < htab->n_buckets; i++) {
INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
- if (htab_use_raw_lock(htab)) {
- raw_spin_lock_init(&htab->buckets[i].raw_lock);
- lockdep_set_class(&htab->buckets[i].raw_lock,
- &htab->lockdep_key);
- } else {
- spin_lock_init(&htab->buckets[i].lock);
- lockdep_set_class(&htab->buckets[i].lock,
- &htab->lockdep_key);
- }
+ raw_res_spin_lock_init(&htab->buckets[i].raw_lock);
cond_resched();
}
}
-static inline int htab_lock_bucket(const struct bpf_htab *htab,
- struct bucket *b, u32 hash,
- unsigned long *pflags)
+static inline int htab_lock_bucket(struct bucket *b, unsigned long *pflags)
{
unsigned long flags;
+ int ret;
- hash = hash & HASHTAB_MAP_LOCK_MASK;
-
- migrate_disable();
- if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
- __this_cpu_dec(*(htab->map_locked[hash]));
- migrate_enable();
- return -EBUSY;
- }
-
- if (htab_use_raw_lock(htab))
- raw_spin_lock_irqsave(&b->raw_lock, flags);
- else
- spin_lock_irqsave(&b->lock, flags);
+ ret = raw_res_spin_lock_irqsave(&b->raw_lock, flags);
+ if (ret)
+ return ret;
*pflags = flags;
-
return 0;
}
-static inline void htab_unlock_bucket(const struct bpf_htab *htab,
- struct bucket *b, u32 hash,
- unsigned long flags)
+static inline void htab_unlock_bucket(struct bucket *b, unsigned long flags)
{
- hash = hash & HASHTAB_MAP_LOCK_MASK;
- if (htab_use_raw_lock(htab))
- raw_spin_unlock_irqrestore(&b->raw_lock, flags);
- else
- spin_unlock_irqrestore(&b->lock, flags);
- __this_cpu_dec(*(htab->map_locked[hash]));
- migrate_enable();
+ raw_res_spin_unlock_irqrestore(&b->raw_lock, flags);
}
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
@@ -207,20 +175,30 @@ static bool htab_is_percpu(const struct bpf_htab *htab)
htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}
+static inline bool is_fd_htab(const struct bpf_htab *htab)
+{
+ return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS;
+}
+
+static inline void *htab_elem_value(struct htab_elem *l, u32 key_size)
+{
+ return l->key + round_up(key_size, 8);
+}
+
static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
void __percpu *pptr)
{
- *(void __percpu **)(l->key + key_size) = pptr;
+ *(void __percpu **)htab_elem_value(l, key_size) = pptr;
}
static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
{
- return *(void __percpu **)(l->key + key_size);
+ return *(void __percpu **)htab_elem_value(l, key_size);
}
static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l)
{
- return *(void **)(l->key + roundup(map->key_size, 8));
+ return *(void **)htab_elem_value(l, map->key_size);
}
static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
@@ -228,18 +206,20 @@ static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size);
}
+/* Both percpu and fd htab support in-place update, so no need for
+ * extra elem. LRU itself can remove the least used element, so
+ * there is no need for an extra elem during map_update.
+ */
static bool htab_has_extra_elems(struct bpf_htab *htab)
{
- return !htab_is_percpu(htab) && !htab_is_lru(htab);
+ return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab);
}
-static void htab_free_prealloced_timers(struct bpf_htab *htab)
+static void htab_free_prealloced_internal_structs(struct bpf_htab *htab)
{
u32 num_entries = htab->map.max_entries;
int i;
- if (likely(!map_value_has_timer(&htab->map)))
- return;
if (htab_has_extra_elems(htab))
num_entries += num_possible_cpus();
@@ -247,9 +227,38 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab)
struct htab_elem *elem;
elem = get_htab_elem(htab, i);
- bpf_timer_cancel_and_free(elem->key +
- round_up(htab->map.key_size, 8) +
- htab->map.timer_off);
+ bpf_map_free_internal_structs(&htab->map,
+ htab_elem_value(elem, htab->map.key_size));
+ cond_resched();
+ }
+}
+
+static void htab_free_prealloced_fields(struct bpf_htab *htab)
+{
+ u32 num_entries = htab->map.max_entries;
+ int i;
+
+ if (IS_ERR_OR_NULL(htab->map.record))
+ return;
+ if (htab_has_extra_elems(htab))
+ num_entries += num_possible_cpus();
+ for (i = 0; i < num_entries; i++) {
+ struct htab_elem *elem;
+
+ elem = get_htab_elem(htab, i);
+ if (htab_is_percpu(htab)) {
+ void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
+ cond_resched();
+ }
+ } else {
+ bpf_obj_free_fields(htab->map.record,
+ htab_elem_value(elem, htab->map.key_size));
+ cond_resched();
+ }
cond_resched();
}
}
@@ -291,12 +300,9 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
struct htab_elem *l;
if (node) {
- u32 key_size = htab->map.key_size;
-
+ bpf_map_inc_elem_count(&htab->map);
l = container_of(node, struct htab_elem, lru_node);
- memcpy(l->key, key, key_size);
- check_and_init_map_value(&htab->map,
- l->key + round_up(key_size, 8));
+ memcpy(l->key, key, htab->map.key_size);
return l;
}
@@ -412,17 +418,9 @@ static int htab_map_alloc_check(union bpf_attr *attr)
bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
int numa_node = bpf_map_attr_numa_node(attr);
- BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
- offsetof(struct htab_elem, hash_node.pprev));
BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
offsetof(struct htab_elem, hash_node.pprev));
- if (lru && !bpf_capable())
- /* LRU implementation is much complicated than other
- * maps. Hence, limit to CAP_BPF.
- */
- return -EPERM;
-
if (zero_seed && !capable(CAP_SYS_ADMIN))
/* Guard against local DoS, and discourage production use. */
return -EPERM;
@@ -455,6 +453,9 @@ static int htab_map_alloc_check(union bpf_attr *attr)
* kmalloc-able later in htab_map_update_elem()
*/
return -E2BIG;
+ /* percpu map value size is bound by PCPU_MIN_UNIT_SIZE */
+ if (percpu && round_up(attr->value_size, 8) > PCPU_MIN_UNIT_SIZE)
+ return -E2BIG;
return 0;
}
@@ -463,8 +464,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
- bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
- attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
/* percpu_lru means each cpu has its own LRU list.
* it is different from BPF_MAP_TYPE_PERCPU_HASH where
* the map's value itself is percpu. percpu_lru has
@@ -473,14 +472,12 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
struct bpf_htab *htab;
- int err, i;
+ int err;
- htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT);
+ htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
if (!htab)
return ERR_PTR(-ENOMEM);
- lockdep_register_key(&htab->lockdep_key);
-
bpf_map_init_from_attr(&htab->map, attr);
if (percpu_lru) {
@@ -495,7 +492,13 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
num_possible_cpus());
}
- /* hash table size must be power of 2 */
+ /* hash table size must be power of 2; roundup_pow_of_two() can overflow
+ * into UB on 32-bit arches, so check that first
+ */
+ err = -E2BIG;
+ if (htab->map.max_entries > 1UL << 31)
+ goto free_htab;
+
htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
htab->elem_size = sizeof(struct htab_elem) +
@@ -505,10 +508,12 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
else
htab->elem_size += round_up(htab->map.value_size, 8);
- err = -E2BIG;
- /* prevent zero size kmalloc and check for u32 overflow */
- if (htab->n_buckets == 0 ||
- htab->n_buckets > U32_MAX / sizeof(struct bucket))
+ /* check for u32 overflow */
+ if (htab->n_buckets > U32_MAX / sizeof(struct bucket))
+ goto free_htab;
+
+ err = bpf_map_init_elem_count(&htab->map);
+ if (err)
goto free_htab;
err = -ENOMEM;
@@ -516,37 +521,58 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
sizeof(struct bucket),
htab->map.numa_node);
if (!htab->buckets)
- goto free_htab;
-
- for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) {
- htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map,
- sizeof(int),
- sizeof(int),
- GFP_USER);
- if (!htab->map_locked[i])
- goto free_map_locked;
- }
+ goto free_elem_count;
if (htab->map.map_flags & BPF_F_ZERO_SEED)
htab->hashrnd = 0;
else
- htab->hashrnd = get_random_int();
+ htab->hashrnd = get_random_u32();
htab_init_buckets(htab);
+/* compute_batch_value() computes batch value as num_online_cpus() * 2
+ * and __percpu_counter_compare() needs
+ * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus()
+ * for percpu_counter to be faster than atomic_t. In practice the average bpf
+ * hash map size is 10k, which means that a system with 64 cpus will fill
+ * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore
+ * define our own batch count as 32 then 10k hash map can be filled up to 80%:
+ * 10k - 8k > 32 _batch_ * 64 _cpus_
+ * and __percpu_counter_compare() will still be fast. At that point hash map
+ * collisions will dominate its performance anyway. Assume that hash map filled
+ * to 50+% isn't going to be O(1) and use the following formula to choose
+ * between percpu_counter and atomic_t.
+ */
+#define PERCPU_COUNTER_BATCH 32
+ if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH)
+ htab->use_percpu_counter = true;
+
+ if (htab->use_percpu_counter) {
+ err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL);
+ if (err)
+ goto free_map_locked;
+ }
+
if (prealloc) {
err = prealloc_init(htab);
if (err)
goto free_map_locked;
- if (!percpu && !lru) {
- /* lru itself can remove the least used element, so
- * there is no need for an extra elem during map_update.
- */
+ if (htab_has_extra_elems(htab)) {
err = alloc_extra_elems(htab);
if (err)
goto free_prealloc;
}
+ } else {
+ err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false);
+ if (err)
+ goto free_map_locked;
+ if (percpu) {
+ err = bpf_mem_alloc_init(&htab->pcpu_ma,
+ round_up(htab->map.value_size, 8), true);
+ if (err)
+ goto free_map_locked;
+ }
}
return &htab->map;
@@ -554,17 +580,22 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
free_prealloc:
prealloc_destroy(htab);
free_map_locked:
- for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
- free_percpu(htab->map_locked[i]);
+ if (htab->use_percpu_counter)
+ percpu_counter_destroy(&htab->pcount);
bpf_map_area_free(htab->buckets);
+ bpf_mem_alloc_destroy(&htab->pcpu_ma);
+ bpf_mem_alloc_destroy(&htab->ma);
+free_elem_count:
+ bpf_map_free_elem_count(&htab->map);
free_htab:
- lockdep_unregister_key(&htab->lockdep_key);
- kfree(htab);
+ bpf_map_area_free(htab);
return ERR_PTR(err);
}
static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
{
+ if (likely(key_len % 4 == 0))
+ return jhash2(key, key_len / 4, hashrnd);
return jhash(key, key_len, hashrnd);
}
@@ -626,8 +657,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
struct htab_elem *l;
u32 hash, key_size;
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
- !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
key_size = map->key_size;
@@ -645,7 +675,7 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
struct htab_elem *l = __htab_map_lookup_elem(map, key);
if (l)
- return l->key + round_up(map->key_size, 8);
+ return htab_elem_value(l, map->key_size);
return NULL;
}
@@ -684,7 +714,7 @@ static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map,
if (l) {
if (mark)
bpf_lru_node_set_ref(&l->lru_node);
- return l->key + round_up(map->key_size, 8);
+ return htab_elem_value(l, map->key_size);
}
return NULL;
@@ -725,12 +755,23 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map,
return insn - insn_buf;
}
-static void check_and_free_timer(struct bpf_htab *htab, struct htab_elem *elem)
+static void check_and_free_fields(struct bpf_htab *htab,
+ struct htab_elem *elem)
{
- if (unlikely(map_value_has_timer(&htab->map)))
- bpf_timer_cancel_and_free(elem->key +
- round_up(htab->map.key_size, 8) +
- htab->map.timer_off);
+ if (IS_ERR_OR_NULL(htab->map.record))
+ return;
+
+ if (htab_is_percpu(htab)) {
+ void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
+ } else {
+ void *map_value = htab_elem_value(elem, htab->map.key_size);
+
+ bpf_obj_free_fields(htab->map.record, map_value);
+ }
}
/* It is called from the bpf_lru_list when the LRU needs to delete
@@ -738,7 +779,7 @@ static void check_and_free_timer(struct bpf_htab *htab, struct htab_elem *elem)
*/
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
{
- struct bpf_htab *htab = (struct bpf_htab *)arg;
+ struct bpf_htab *htab = arg;
struct htab_elem *l = NULL, *tgt_l;
struct hlist_nulls_head *head;
struct hlist_nulls_node *n;
@@ -750,19 +791,21 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
b = __select_bucket(htab, tgt_l->hash);
head = &b->head;
- ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags);
+ ret = htab_lock_bucket(b, &flags);
if (ret)
return false;
hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
if (l == tgt_l) {
hlist_nulls_del_rcu(&l->hash_node);
- check_and_free_timer(htab, l);
+ bpf_map_dec_elem_count(&htab->map);
break;
}
- htab_unlock_bucket(htab, b, tgt_l->hash, flags);
+ htab_unlock_bucket(b, flags);
+ if (l == tgt_l)
+ check_and_free_fields(htab, l);
return l == tgt_l;
}
@@ -827,18 +870,11 @@ find_first_elem:
static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
- if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
- free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
- check_and_free_timer(htab, l);
- kfree(l);
-}
-
-static void htab_elem_free_rcu(struct rcu_head *head)
-{
- struct htab_elem *l = container_of(head, struct htab_elem, rcu);
- struct bpf_htab *htab = l->htab;
+ check_and_free_fields(htab, l);
- htab_elem_free(htab, l);
+ if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
+ bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
+ bpf_mem_cache_free(&htab->ma, l);
}
static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
@@ -848,37 +884,71 @@ static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
if (map->ops->map_fd_put_ptr) {
ptr = fd_htab_map_get_ptr(map, l);
- map->ops->map_fd_put_ptr(ptr);
+ map->ops->map_fd_put_ptr(map, ptr, true);
}
}
+static bool is_map_full(struct bpf_htab *htab)
+{
+ if (htab->use_percpu_counter)
+ return __percpu_counter_compare(&htab->pcount, htab->map.max_entries,
+ PERCPU_COUNTER_BATCH) >= 0;
+ return atomic_read(&htab->count) >= htab->map.max_entries;
+}
+
+static void inc_elem_count(struct bpf_htab *htab)
+{
+ bpf_map_inc_elem_count(&htab->map);
+
+ if (htab->use_percpu_counter)
+ percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
+ else
+ atomic_inc(&htab->count);
+}
+
+static void dec_elem_count(struct bpf_htab *htab)
+{
+ bpf_map_dec_elem_count(&htab->map);
+
+ if (htab->use_percpu_counter)
+ percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
+ else
+ atomic_dec(&htab->count);
+}
+
+
static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
htab_put_fd_value(htab, l);
if (htab_is_prealloc(htab)) {
- check_and_free_timer(htab, l);
- __pcpu_freelist_push(&htab->freelist, &l->fnode);
+ bpf_map_dec_elem_count(&htab->map);
+ check_and_free_fields(htab, l);
+ pcpu_freelist_push(&htab->freelist, &l->fnode);
} else {
- atomic_dec(&htab->count);
- l->htab = htab;
- call_rcu(&l->rcu, htab_elem_free_rcu);
+ dec_elem_count(htab);
+ htab_elem_free(htab, l);
}
}
static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
void *value, bool onallcpus)
{
+ void *ptr;
+
if (!onallcpus) {
/* copy true value_size bytes */
- memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
+ ptr = this_cpu_ptr(pptr);
+ copy_map_value(&htab->map, ptr, value);
+ bpf_obj_free_fields(htab->map.record, ptr);
} else {
u32 size = round_up(htab->map.value_size, 8);
int off = 0, cpu;
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
- value + off, size);
+ ptr = per_cpu_ptr(pptr, cpu);
+ copy_map_value_long(&htab->map, ptr, value + off);
+ bpf_obj_free_fields(htab->map.record, ptr);
off += size;
}
}
@@ -887,23 +957,20 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
void *value, bool onallcpus)
{
- /* When using prealloc and not setting the initial value on all cpus,
- * zero-fill element values for other cpus (just as what happens when
- * not using prealloc). Otherwise, bpf program has no way to ensure
+ /* When not setting the initial value on all cpus, zero-fill element
+ * values for other cpus. Otherwise, bpf program has no way to ensure
* known initial values for cpus other than current one
* (onallcpus=false always when coming from bpf prog).
*/
- if (htab_is_prealloc(htab) && !onallcpus) {
- u32 size = round_up(htab->map.value_size, 8);
+ if (!onallcpus) {
int current_cpu = raw_smp_processor_id();
int cpu;
for_each_possible_cpu(cpu) {
if (cpu == current_cpu)
- bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value,
- size);
- else
- memset(per_cpu_ptr(pptr, cpu), 0, size);
+ copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value);
+ else /* Since elem is preallocated, we cannot touch special fields */
+ zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu));
}
} else {
pcpu_copy_value(htab, pptr, value, onallcpus);
@@ -912,8 +979,7 @@ static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
{
- return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS &&
- BITS_PER_LONG == 64;
+ return is_fd_htab(htab) && BITS_PER_LONG == 64;
}
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
@@ -933,7 +999,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
*/
pl_new = this_cpu_ptr(htab->extra_elems);
l_new = *pl_new;
- htab_put_fd_value(htab, old_elem);
*pl_new = old_elem;
} else {
struct pcpu_freelist_node *l;
@@ -942,43 +1007,40 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
if (!l)
return ERR_PTR(-E2BIG);
l_new = container_of(l, struct htab_elem, fnode);
+ bpf_map_inc_elem_count(&htab->map);
}
} else {
- if (atomic_inc_return(&htab->count) > htab->map.max_entries)
- if (!old_elem) {
+ if (is_map_full(htab))
+ if (!old_elem)
/* when map is full and update() is replacing
* old element, it's ok to allocate, since
* old element will be freed immediately.
* Otherwise return an error
*/
- l_new = ERR_PTR(-E2BIG);
- goto dec_count;
- }
- l_new = bpf_map_kmalloc_node(&htab->map, htab->elem_size,
- GFP_ATOMIC | __GFP_NOWARN,
- htab->map.numa_node);
+ return ERR_PTR(-E2BIG);
+ inc_elem_count(htab);
+ l_new = bpf_mem_cache_alloc(&htab->ma);
if (!l_new) {
l_new = ERR_PTR(-ENOMEM);
goto dec_count;
}
- check_and_init_map_value(&htab->map,
- l_new->key + round_up(key_size, 8));
}
memcpy(l_new->key, key, key_size);
if (percpu) {
- size = round_up(size, 8);
if (prealloc) {
pptr = htab_elem_get_ptr(l_new, key_size);
} else {
/* alloc_percpu zero-fills */
- pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
- GFP_ATOMIC | __GFP_NOWARN);
- if (!pptr) {
- kfree(l_new);
+ void *ptr = bpf_mem_cache_alloc(&htab->pcpu_ma);
+
+ if (!ptr) {
+ bpf_mem_cache_free(&htab->ma, l_new);
l_new = ERR_PTR(-ENOMEM);
goto dec_count;
}
+ l_new->ptr_to_pptr = ptr;
+ pptr = *(void __percpu **)ptr;
}
pcpu_init_value(htab, pptr, value, onallcpus);
@@ -987,17 +1049,15 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
htab_elem_set_ptr(l_new, key_size, pptr);
} else if (fd_htab_map_needs_adjust(htab)) {
size = round_up(size, 8);
- memcpy(l_new->key + round_up(key_size, 8), value, size);
+ memcpy(htab_elem_value(l_new, key_size), value, size);
} else {
- copy_map_value(&htab->map,
- l_new->key + round_up(key_size, 8),
- value);
+ copy_map_value(&htab->map, htab_elem_value(l_new, key_size), value);
}
l_new->hash = hash;
return l_new;
dec_count:
- atomic_dec(&htab->count);
+ dec_elem_count(htab);
return l_new;
}
@@ -1016,11 +1076,11 @@ static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
}
/* Called from syscall or from eBPF program */
-static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct htab_elem *l_new = NULL, *l_old;
+ struct htab_elem *l_new, *l_old;
struct hlist_nulls_head *head;
unsigned long flags;
struct bucket *b;
@@ -1031,8 +1091,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
/* unknown flags */
return -EINVAL;
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
- !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
key_size = map->key_size;
@@ -1042,7 +1101,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
head = &b->head;
if (unlikely(map_flags & BPF_F_LOCK)) {
- if (unlikely(!map_value_has_spin_lock(map)))
+ if (unlikely(!btf_record_has_field(map->record, BPF_SPIN_LOCK)))
return -EINVAL;
/* find an element without taking the bucket lock */
l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
@@ -1053,7 +1112,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
if (l_old) {
/* grab the element lock and update value in place */
copy_map_value_locked(map,
- l_old->key + round_up(key_size, 8),
+ htab_elem_value(l_old, key_size),
value, false);
return 0;
}
@@ -1063,7 +1122,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
*/
}
- ret = htab_lock_bucket(htab, b, hash, &flags);
+ ret = htab_lock_bucket(b, &flags);
if (ret)
return ret;
@@ -1081,7 +1140,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
* and update element in place
*/
copy_map_value_locked(map,
- l_old->key + round_up(key_size, 8),
+ htab_elem_value(l_old, key_size),
value, false);
ret = 0;
goto err;
@@ -1101,25 +1160,31 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
hlist_nulls_add_head_rcu(&l_new->hash_node, head);
if (l_old) {
hlist_nulls_del_rcu(&l_old->hash_node);
- if (!htab_is_prealloc(htab))
- free_htab_elem(htab, l_old);
- else
- check_and_free_timer(htab, l_old);
+
+ /* l_old has already been stashed in htab->extra_elems, free
+ * its special fields before it is available for reuse.
+ */
+ if (htab_is_prealloc(htab))
+ check_and_free_fields(htab, l_old);
}
- ret = 0;
+ htab_unlock_bucket(b, flags);
+ if (l_old && !htab_is_prealloc(htab))
+ free_htab_elem(htab, l_old);
+ return 0;
err:
- htab_unlock_bucket(htab, b, hash, flags);
+ htab_unlock_bucket(b, flags);
return ret;
}
static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
{
- check_and_free_timer(htab, elem);
+ check_and_free_fields(htab, elem);
+ bpf_map_dec_elem_count(&htab->map);
bpf_lru_push_free(&htab->lru, &elem->lru_node);
}
-static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new, *l_old = NULL;
@@ -1133,8 +1198,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
/* unknown flags */
return -EINVAL;
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
- !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
key_size = map->key_size;
@@ -1151,12 +1215,11 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
l_new = prealloc_lru_pop(htab, key, hash);
if (!l_new)
return -ENOMEM;
- copy_map_value(&htab->map,
- l_new->key + round_up(map->key_size, 8), value);
+ copy_map_value(&htab->map, htab_elem_value(l_new, map->key_size), value);
- ret = htab_lock_bucket(htab, b, hash, &flags);
+ ret = htab_lock_bucket(b, &flags);
if (ret)
- return ret;
+ goto err_lock_bucket;
l_old = lookup_elem_raw(head, hash, key, key_size);
@@ -1175,8 +1238,9 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
ret = 0;
err:
- htab_unlock_bucket(htab, b, hash, flags);
+ htab_unlock_bucket(b, flags);
+err_lock_bucket:
if (ret)
htab_lru_push_free(htab, l_new);
else if (l_old)
@@ -1185,13 +1249,14 @@ err:
return ret;
}
-static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags,
- bool onallcpus)
+static long htab_map_update_elem_in_place(struct bpf_map *map, void *key,
+ void *value, u64 map_flags,
+ bool percpu, bool onallcpus)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct htab_elem *l_new = NULL, *l_old;
+ struct htab_elem *l_new, *l_old;
struct hlist_nulls_head *head;
+ void *old_map_ptr = NULL;
unsigned long flags;
struct bucket *b;
u32 key_size, hash;
@@ -1201,8 +1266,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
/* unknown flags */
return -EINVAL;
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
- !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
key_size = map->key_size;
@@ -1211,7 +1275,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
b = __select_bucket(htab, hash);
head = &b->head;
- ret = htab_lock_bucket(htab, b, hash, &flags);
+ ret = htab_lock_bucket(b, &flags);
if (ret)
return ret;
@@ -1222,27 +1286,35 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
goto err;
if (l_old) {
- /* per-cpu hash map can update value in-place */
- pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
- value, onallcpus);
+ /* Update value in-place */
+ if (percpu) {
+ pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
+ value, onallcpus);
+ } else {
+ void **inner_map_pptr = htab_elem_value(l_old, key_size);
+
+ old_map_ptr = *inner_map_pptr;
+ WRITE_ONCE(*inner_map_pptr, *(void **)value);
+ }
} else {
l_new = alloc_htab_elem(htab, key, value, key_size,
- hash, true, onallcpus, NULL);
+ hash, percpu, onallcpus, NULL);
if (IS_ERR(l_new)) {
ret = PTR_ERR(l_new);
goto err;
}
hlist_nulls_add_head_rcu(&l_new->hash_node, head);
}
- ret = 0;
err:
- htab_unlock_bucket(htab, b, hash, flags);
+ htab_unlock_bucket(b, flags);
+ if (old_map_ptr)
+ map->ops->map_fd_put_ptr(map, old_map_ptr, true);
return ret;
}
-static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags,
- bool onallcpus)
+static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags,
+ bool onallcpus)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new = NULL, *l_old;
@@ -1256,8 +1328,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
/* unknown flags */
return -EINVAL;
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
- !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
key_size = map->key_size;
@@ -1277,9 +1348,9 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
return -ENOMEM;
}
- ret = htab_lock_bucket(htab, b, hash, &flags);
+ ret = htab_lock_bucket(b, &flags);
if (ret)
- return ret;
+ goto err_lock_bucket;
l_old = lookup_elem_raw(head, hash, key, key_size);
@@ -1301,27 +1372,30 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
}
ret = 0;
err:
- htab_unlock_bucket(htab, b, hash, flags);
- if (l_new)
+ htab_unlock_bucket(b, flags);
+err_lock_bucket:
+ if (l_new) {
+ bpf_map_dec_elem_count(&htab->map);
bpf_lru_push_free(&htab->lru, &l_new->lru_node);
+ }
return ret;
}
-static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+static long htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
{
- return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
+ return htab_map_update_elem_in_place(map, key, value, map_flags, true, false);
}
-static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
{
return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
false);
}
/* Called from syscall or from eBPF program */
-static int htab_map_delete_elem(struct bpf_map *map, void *key)
+static long htab_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct hlist_nulls_head *head;
@@ -1331,8 +1405,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
u32 hash, key_size;
int ret;
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
- !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
key_size = map->key_size;
@@ -1340,24 +1413,24 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
b = __select_bucket(htab, hash);
head = &b->head;
- ret = htab_lock_bucket(htab, b, hash, &flags);
+ ret = htab_lock_bucket(b, &flags);
if (ret)
return ret;
l = lookup_elem_raw(head, hash, key, key_size);
-
- if (l) {
+ if (l)
hlist_nulls_del_rcu(&l->hash_node);
- free_htab_elem(htab, l);
- } else {
+ else
ret = -ENOENT;
- }
- htab_unlock_bucket(htab, b, hash, flags);
+ htab_unlock_bucket(b, flags);
+
+ if (l)
+ free_htab_elem(htab, l);
return ret;
}
-static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
+static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct hlist_nulls_head *head;
@@ -1367,8 +1440,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
u32 hash, key_size;
int ret;
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
- !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
key_size = map->key_size;
@@ -1376,7 +1448,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
b = __select_bucket(htab, hash);
head = &b->head;
- ret = htab_lock_bucket(htab, b, hash, &flags);
+ ret = htab_lock_bucket(b, &flags);
if (ret)
return ret;
@@ -1387,7 +1459,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
else
ret = -ENOENT;
- htab_unlock_bucket(htab, b, hash, flags);
+ htab_unlock_bucket(b, flags);
if (l)
htab_lru_push_free(htab, l);
return ret;
@@ -1397,6 +1469,9 @@ static void delete_all_elements(struct bpf_htab *htab)
{
int i;
+ /* It's called from a worker thread and migration has been disabled,
+ * therefore, it is OK to invoke bpf_mem_cache_free() directly.
+ */
for (i = 0; i < htab->n_buckets; i++) {
struct hlist_nulls_head *head = select_bucket(htab, i);
struct hlist_nulls_node *n;
@@ -1406,10 +1481,11 @@ static void delete_all_elements(struct bpf_htab *htab)
hlist_nulls_del_rcu(&l->hash_node);
htab_elem_free(htab, l);
}
+ cond_resched();
}
}
-static void htab_free_malloced_timers(struct bpf_htab *htab)
+static void htab_free_malloced_internal_structs(struct bpf_htab *htab)
{
int i;
@@ -1419,51 +1495,59 @@ static void htab_free_malloced_timers(struct bpf_htab *htab)
struct hlist_nulls_node *n;
struct htab_elem *l;
- hlist_nulls_for_each_entry(l, n, head, hash_node)
- check_and_free_timer(htab, l);
+ hlist_nulls_for_each_entry(l, n, head, hash_node) {
+ /* We only free internal structs on uref dropping to zero */
+ bpf_map_free_internal_structs(&htab->map,
+ htab_elem_value(l, htab->map.key_size));
+ }
cond_resched_rcu();
}
rcu_read_unlock();
}
-static void htab_map_free_timers(struct bpf_map *map)
+static void htab_map_free_internal_structs(struct bpf_map *map)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- if (likely(!map_value_has_timer(&htab->map)))
+ /* We only free internal structs on uref dropping to zero */
+ if (!bpf_map_has_internal_structs(map))
return;
- if (!htab_is_prealloc(htab))
- htab_free_malloced_timers(htab);
+
+ if (htab_is_prealloc(htab))
+ htab_free_prealloced_internal_structs(htab);
else
- htab_free_prealloced_timers(htab);
+ htab_free_malloced_internal_structs(htab);
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- int i;
/* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback.
* bpf_free_used_maps() is called after bpf prog is no longer executing.
* There is no need to synchronize_rcu() here to protect map elements.
*/
- /* some of free_htab_elem() callbacks for elements of this map may
- * not have executed. Wait for them.
+ /* htab no longer uses call_rcu() directly. bpf_mem_alloc does it
+ * underneath and is responsible for waiting for callbacks to finish
+ * during bpf_mem_alloc_destroy().
*/
- rcu_barrier();
- if (!htab_is_prealloc(htab))
+ if (!htab_is_prealloc(htab)) {
delete_all_elements(htab);
- else
+ } else {
+ htab_free_prealloced_fields(htab);
prealloc_destroy(htab);
+ }
+ bpf_map_free_elem_count(map);
free_percpu(htab->extra_elems);
bpf_map_area_free(htab->buckets);
- for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
- free_percpu(htab->map_locked[i]);
- lockdep_unregister_key(&htab->lockdep_key);
- kfree(htab);
+ bpf_mem_alloc_destroy(&htab->pcpu_ma);
+ bpf_mem_alloc_destroy(&htab->ma);
+ if (htab->use_percpu_counter)
+ percpu_counter_destroy(&htab->pcount);
+ bpf_map_area_free(htab);
}
static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
@@ -1482,7 +1566,7 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
seq_puts(m, ": ");
btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
rcu_read_unlock();
}
@@ -1505,48 +1589,48 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
b = __select_bucket(htab, hash);
head = &b->head;
- ret = htab_lock_bucket(htab, b, hash, &bflags);
+ ret = htab_lock_bucket(b, &bflags);
if (ret)
return ret;
l = lookup_elem_raw(head, hash, key, key_size);
if (!l) {
ret = -ENOENT;
- } else {
- if (is_percpu) {
- u32 roundup_value_size = round_up(map->value_size, 8);
- void __percpu *pptr;
- int off = 0, cpu;
+ goto out_unlock;
+ }
- pptr = htab_elem_get_ptr(l, key_size);
- for_each_possible_cpu(cpu) {
- bpf_long_memcpy(value + off,
- per_cpu_ptr(pptr, cpu),
- roundup_value_size);
- off += roundup_value_size;
- }
- } else {
- u32 roundup_key_size = round_up(map->key_size, 8);
+ if (is_percpu) {
+ u32 roundup_value_size = round_up(map->value_size, 8);
+ void __percpu *pptr;
+ int off = 0, cpu;
- if (flags & BPF_F_LOCK)
- copy_map_value_locked(map, value, l->key +
- roundup_key_size,
- true);
- else
- copy_map_value(map, value, l->key +
- roundup_key_size);
- check_and_init_map_value(map, value);
+ pptr = htab_elem_get_ptr(l, key_size);
+ for_each_possible_cpu(cpu) {
+ copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(&htab->map, value + off);
+ off += roundup_value_size;
}
+ } else {
+ void *src = htab_elem_value(l, map->key_size);
- hlist_nulls_del_rcu(&l->hash_node);
- if (!is_lru_map)
- free_htab_elem(htab, l);
+ if (flags & BPF_F_LOCK)
+ copy_map_value_locked(map, value, src, true);
+ else
+ copy_map_value(map, value, src);
+ /* Zeroing special fields in the temp buffer */
+ check_and_init_map_value(map, value);
}
+ hlist_nulls_del_rcu(&l->hash_node);
- htab_unlock_bucket(htab, b, hash, bflags);
+out_unlock:
+ htab_unlock_bucket(b, bflags);
- if (is_lru_map && l)
- htab_lru_push_free(htab, l);
+ if (l) {
+ if (is_lru_map)
+ htab_lru_push_free(htab, l);
+ else
+ free_htab_elem(htab, l);
+ }
return ret;
}
@@ -1589,12 +1673,12 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
bool is_percpu)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- u32 bucket_cnt, total, key_size, value_size, roundup_key_size;
void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
void __user *uvalues = u64_to_user_ptr(attr->batch.values);
void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
- u32 batch, max_count, size, bucket_size;
+ u32 batch, max_count, size, bucket_size, map_id;
+ u32 bucket_cnt, total, key_size, value_size;
struct htab_elem *node_to_free = NULL;
u64 elem_map_flags, map_flags;
struct hlist_nulls_head *head;
@@ -1607,7 +1691,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
elem_map_flags = attr->batch.elem_flags;
if ((elem_map_flags & ~BPF_F_LOCK) ||
- ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
+ ((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
return -EINVAL;
map_flags = attr->batch.flags;
@@ -1629,14 +1713,13 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
return -ENOENT;
key_size = htab->map.key_size;
- roundup_key_size = round_up(htab->map.key_size, 8);
value_size = htab->map.value_size;
size = round_up(value_size, 8);
if (is_percpu)
value_size = size * num_possible_cpus();
total = 0;
/* while experimenting with hash tables with sizes ranging from 10 to
- * 1000, it was observed that a bucket can have upto 5 entries.
+ * 1000, it was observed that a bucket can have up to 5 entries.
*/
bucket_size = 5;
@@ -1661,9 +1744,12 @@ again_nocopy:
head = &b->head;
/* do not grab the lock unless need it (bucket_cnt > 0). */
if (locked) {
- ret = htab_lock_bucket(htab, b, batch, &flags);
- if (ret)
- goto next_batch;
+ ret = htab_lock_bucket(b, &flags);
+ if (ret) {
+ rcu_read_unlock();
+ bpf_enable_instrumentation();
+ goto after_loop;
+ }
}
bucket_cnt = 0;
@@ -1681,7 +1767,7 @@ again_nocopy:
/* Note that since bucket_cnt > 0 here, it is implicit
* that the locked was grabbed, so release it.
*/
- htab_unlock_bucket(htab, b, batch, flags);
+ htab_unlock_bucket(b, flags);
rcu_read_unlock();
bpf_enable_instrumentation();
goto after_loop;
@@ -1692,7 +1778,7 @@ again_nocopy:
/* Note that since bucket_cnt > 0 here, it is implicit
* that the locked was grabbed, so release it.
*/
- htab_unlock_bucket(htab, b, batch, flags);
+ htab_unlock_bucket(b, flags);
rcu_read_unlock();
bpf_enable_instrumentation();
kvfree(keys);
@@ -1713,17 +1799,26 @@ again_nocopy:
pptr = htab_elem_get_ptr(l, map->key_size);
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(dst_val + off,
- per_cpu_ptr(pptr, cpu), size);
+ copy_map_value_long(&htab->map, dst_val + off, per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(&htab->map, dst_val + off);
off += size;
}
} else {
- value = l->key + roundup_key_size;
+ value = htab_elem_value(l, key_size);
+ if (is_fd_htab(htab)) {
+ struct bpf_map **inner_map = value;
+
+ /* Actual value is the id of the inner map */
+ map_id = map->ops->map_fd_sys_lookup_elem(*inner_map);
+ value = &map_id;
+ }
+
if (elem_map_flags & BPF_F_LOCK)
copy_map_value_locked(map, dst_val, value,
true);
else
copy_map_value(map, dst_val, value);
+ /* Zeroing special fields in the temp buffer */
check_and_init_map_value(map, dst_val);
}
if (do_delete) {
@@ -1733,25 +1828,29 @@ again_nocopy:
* may cause deadlock. See comments in function
* prealloc_lru_pop(). Let us do bpf_lru_push_free()
* after releasing the bucket lock.
+ *
+ * For htab of maps, htab_put_fd_value() in
+ * free_htab_elem() may acquire a spinlock with bucket
+ * lock being held and it violates the lock rule, so
+ * invoke free_htab_elem() after unlock as well.
*/
- if (is_lru_map) {
- l->batch_flink = node_to_free;
- node_to_free = l;
- } else {
- free_htab_elem(htab, l);
- }
+ l->batch_flink = node_to_free;
+ node_to_free = l;
}
dst_key += key_size;
dst_val += value_size;
}
- htab_unlock_bucket(htab, b, batch, flags);
+ htab_unlock_bucket(b, flags);
locked = false;
while (node_to_free) {
l = node_to_free;
node_to_free = node_to_free->batch_flink;
- htab_lru_push_free(htab, l);
+ if (is_lru_map)
+ htab_lru_push_free(htab, l);
+ else
+ free_htab_elem(htab, l);
}
next_batch:
@@ -1956,11 +2055,11 @@ static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem)
{
struct bpf_iter_seq_hash_map_info *info = seq->private;
- u32 roundup_key_size, roundup_value_size;
struct bpf_iter__bpf_map_elem ctx = {};
struct bpf_map *map = info->map;
struct bpf_iter_meta meta;
int ret = 0, off = 0, cpu;
+ u32 roundup_value_size;
struct bpf_prog *prog;
void __percpu *pptr;
@@ -1970,17 +2069,16 @@ static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem)
ctx.meta = &meta;
ctx.map = info->map;
if (elem) {
- roundup_key_size = round_up(map->key_size, 8);
ctx.key = elem->key;
if (!info->percpu_value_buf) {
- ctx.value = elem->key + roundup_key_size;
+ ctx.value = htab_elem_value(elem, map->key_size);
} else {
roundup_value_size = round_up(map->value_size, 8);
pptr = htab_elem_get_ptr(elem, map->key_size);
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(info->percpu_value_buf + off,
- per_cpu_ptr(pptr, cpu),
- roundup_value_size);
+ copy_map_value_long(map, info->percpu_value_buf + off,
+ per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(map, info->percpu_value_buf + off);
off += roundup_value_size;
}
ctx.value = info->percpu_value_buf;
@@ -2023,6 +2121,7 @@ static int bpf_iter_init_hash_map(void *priv_data,
seq_info->percpu_value_buf = value_buf;
}
+ bpf_map_inc_with_uref(map);
seq_info->map = map;
seq_info->htab = container_of(map, struct bpf_htab, map);
return 0;
@@ -2032,6 +2131,7 @@ static void bpf_iter_fini_hash_map(void *priv_data)
{
struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
+ bpf_map_put_with_uref(seq_info->map);
kfree(seq_info->percpu_value_buf);
}
@@ -2049,14 +2149,13 @@ static const struct bpf_iter_seq_info iter_seq_info = {
.seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info),
};
-static int bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
- void *callback_ctx, u64 flags)
+static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
+ void *callback_ctx, u64 flags)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct hlist_nulls_head *head;
struct hlist_nulls_node *n;
struct htab_elem *elem;
- u32 roundup_key_size;
int i, num_elems = 0;
void __percpu *pptr;
struct bucket *b;
@@ -2064,29 +2163,29 @@ static int bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_f
bool is_percpu;
u64 ret = 0;
+ cant_migrate();
+
if (flags != 0)
return -EINVAL;
is_percpu = htab_is_percpu(htab);
- roundup_key_size = round_up(map->key_size, 8);
- /* disable migration so percpu value prepared here will be the
- * same as the one seen by the bpf program with bpf_map_lookup_elem().
+ /* migration has been disabled, so percpu value prepared here will be
+ * the same as the one seen by the bpf program with
+ * bpf_map_lookup_elem().
*/
- if (is_percpu)
- migrate_disable();
for (i = 0; i < htab->n_buckets; i++) {
b = &htab->buckets[i];
rcu_read_lock();
head = &b->head;
- hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
+ hlist_nulls_for_each_entry_safe(elem, n, head, hash_node) {
key = elem->key;
if (is_percpu) {
/* current cpu value for percpu map */
pptr = htab_elem_get_ptr(elem, map->key_size);
val = this_cpu_ptr(pptr);
} else {
- val = elem->key + roundup_key_size;
+ val = htab_elem_value(elem, map->key_size);
}
num_elems++;
ret = callback_fn((u64)(long)map, (u64)(long)key,
@@ -2100,19 +2199,55 @@ static int bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_f
rcu_read_unlock();
}
out:
- if (is_percpu)
- migrate_enable();
return num_elems;
}
-static int htab_map_btf_id;
+static u64 htab_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ u32 value_size = round_up(htab->map.value_size, 8);
+ bool prealloc = htab_is_prealloc(htab);
+ bool percpu = htab_is_percpu(htab);
+ bool lru = htab_is_lru(htab);
+ u64 num_entries;
+ u64 usage = sizeof(struct bpf_htab);
+
+ usage += sizeof(struct bucket) * htab->n_buckets;
+ usage += sizeof(int) * num_possible_cpus() * HASHTAB_MAP_LOCK_COUNT;
+ if (prealloc) {
+ num_entries = map->max_entries;
+ if (htab_has_extra_elems(htab))
+ num_entries += num_possible_cpus();
+
+ usage += htab->elem_size * num_entries;
+
+ if (percpu)
+ usage += value_size * num_possible_cpus() * num_entries;
+ else if (!lru)
+ usage += sizeof(struct htab_elem *) * num_possible_cpus();
+ } else {
+#define LLIST_NODE_SZ sizeof(struct llist_node)
+
+ num_entries = htab->use_percpu_counter ?
+ percpu_counter_sum(&htab->pcount) :
+ atomic_read(&htab->count);
+ usage += (htab->elem_size + LLIST_NODE_SZ) * num_entries;
+ if (percpu) {
+ usage += (LLIST_NODE_SZ + sizeof(void *)) * num_entries;
+ usage += value_size * num_possible_cpus() * num_entries;
+ }
+ }
+ return usage;
+}
+
+BTF_ID_LIST_SINGLE(htab_map_btf_ids, struct, bpf_htab)
const struct bpf_map_ops htab_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
- .map_release_uref = htab_map_free_timers,
+ .map_release_uref = htab_map_free_internal_structs,
.map_lookup_elem = htab_map_lookup_elem,
.map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
.map_update_elem = htab_map_update_elem,
@@ -2121,20 +2256,19 @@ const struct bpf_map_ops htab_map_ops = {
.map_seq_show_elem = htab_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
+ .map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab),
- .map_btf_name = "bpf_htab",
- .map_btf_id = &htab_map_btf_id,
+ .map_btf_id = &htab_map_btf_ids[0],
.iter_seq_info = &iter_seq_info,
};
-static int htab_lru_map_btf_id;
const struct bpf_map_ops htab_lru_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
- .map_release_uref = htab_map_free_timers,
+ .map_release_uref = htab_map_free_internal_structs,
.map_lookup_elem = htab_lru_map_lookup_elem,
.map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
@@ -2144,9 +2278,9 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_seq_show_elem = htab_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
+ .map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab_lru),
- .map_btf_name = "bpf_htab",
- .map_btf_id = &htab_lru_map_btf_id,
+ .map_btf_id = &htab_map_btf_ids[0],
.iter_seq_info = &iter_seq_info,
};
@@ -2161,6 +2295,40 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
return NULL;
}
+/* inline bpf_map_lookup_elem() call for per-CPU hashmap */
+static int htab_percpu_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ if (!bpf_jit_supports_percpu_insn())
+ return -EOPNOTSUPP;
+
+ BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
+ (void *(*)(struct bpf_map *map, void *key))NULL));
+ *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3);
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_0,
+ offsetof(struct htab_elem, key) + roundup(map->key_size, 8));
+ *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0);
+ *insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
+
+ return insn - insn_buf;
+}
+
+static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
+{
+ struct htab_elem *l;
+
+ if (cpu >= nr_cpu_ids)
+ return NULL;
+
+ l = __htab_map_lookup_elem(map, key);
+ if (l)
+ return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
+ else
+ return NULL;
+}
+
static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
struct htab_elem *l = __htab_map_lookup_elem(map, key);
@@ -2173,6 +2341,22 @@ static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
return NULL;
}
+static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
+{
+ struct htab_elem *l;
+
+ if (cpu >= nr_cpu_ids)
+ return NULL;
+
+ l = __htab_map_lookup_elem(map, key);
+ if (l) {
+ bpf_lru_node_set_ref(&l->lru_node);
+ return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
+ }
+
+ return NULL;
+}
+
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
{
struct htab_elem *l;
@@ -2195,8 +2379,8 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
*/
pptr = htab_elem_get_ptr(l, map->key_size);
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(value + off,
- per_cpu_ptr(pptr, cpu), size);
+ copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(map, value + off);
off += size;
}
ret = 0;
@@ -2216,8 +2400,8 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
ret = __htab_lru_percpu_map_update_elem(map, key, value,
map_flags, true);
else
- ret = __htab_percpu_map_update_elem(map, key, value, map_flags,
- true);
+ ret = htab_map_update_elem_in_place(map, key, value, map_flags,
+ true, true);
rcu_read_unlock();
return ret;
@@ -2245,14 +2429,13 @@ static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,
seq_printf(m, "\tcpu%d: ", cpu);
btf_type_seq_show(map->btf, map->btf_value_type_id,
per_cpu_ptr(pptr, cpu), m);
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
}
seq_puts(m, "}\n");
rcu_read_unlock();
}
-static int htab_percpu_map_btf_id;
const struct bpf_map_ops htab_percpu_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = htab_map_alloc_check,
@@ -2260,19 +2443,20 @@ const struct bpf_map_ops htab_percpu_map_ops = {
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
.map_lookup_elem = htab_percpu_map_lookup_elem,
+ .map_gen_lookup = htab_percpu_map_gen_lookup,
.map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem,
.map_update_elem = htab_percpu_map_update_elem,
.map_delete_elem = htab_map_delete_elem,
+ .map_lookup_percpu_elem = htab_percpu_map_lookup_percpu_elem,
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
+ .map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab_percpu),
- .map_btf_name = "bpf_htab",
- .map_btf_id = &htab_percpu_map_btf_id,
+ .map_btf_id = &htab_map_btf_ids[0],
.iter_seq_info = &iter_seq_info,
};
-static int htab_lru_percpu_map_btf_id;
const struct bpf_map_ops htab_lru_percpu_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = htab_map_alloc_check,
@@ -2283,12 +2467,13 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
.map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem,
.map_update_elem = htab_lru_percpu_map_update_elem,
.map_delete_elem = htab_lru_map_delete_elem,
+ .map_lookup_percpu_elem = htab_lru_percpu_map_lookup_percpu_elem,
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
+ .map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab_lru_percpu),
- .map_btf_name = "bpf_htab",
- .map_btf_id = &htab_lru_percpu_map_btf_id,
+ .map_btf_id = &htab_map_btf_ids[0],
.iter_seq_info = &iter_seq_info,
};
@@ -2313,7 +2498,7 @@ static void fd_htab_map_free(struct bpf_map *map)
hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
void *ptr = fd_htab_map_get_ptr(map, l);
- map->ops->map_fd_put_ptr(ptr);
+ map->ops->map_fd_put_ptr(map, ptr, false);
}
}
@@ -2340,21 +2525,26 @@ int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
return ret;
}
-/* only called from syscall */
+/* Only called from syscall */
int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
void *key, void *value, u64 map_flags)
{
void *ptr;
int ret;
- u32 ufd = *(u32 *)value;
- ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
+ ptr = map->ops->map_fd_get_ptr(map, map_file, *(int *)value);
if (IS_ERR(ptr))
return PTR_ERR(ptr);
- ret = htab_map_update_elem(map, key, &ptr, map_flags);
+ /* The htab bucket lock is always held during update operations in fd
+ * htab map, and the following rcu_read_lock() is only used to avoid
+ * the WARN_ON_ONCE in htab_map_update_elem_in_place().
+ */
+ rcu_read_lock();
+ ret = htab_map_update_elem_in_place(map, key, &ptr, map_flags, false, false);
+ rcu_read_unlock();
if (ret)
- map->ops->map_fd_put_ptr(ptr);
+ map->ops->map_fd_put_ptr(map, ptr, false);
return ret;
}
@@ -2412,7 +2602,6 @@ static void htab_of_map_free(struct bpf_map *map)
fd_htab_map_free(map);
}
-static int htab_of_maps_map_btf_id;
const struct bpf_map_ops htab_of_maps_map_ops = {
.map_alloc_check = fd_htab_map_alloc_check,
.map_alloc = htab_of_map_alloc,
@@ -2425,6 +2614,7 @@ const struct bpf_map_ops htab_of_maps_map_ops = {
.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
.map_gen_lookup = htab_of_map_gen_lookup,
.map_check_btf = map_check_no_btf,
- .map_btf_name = "bpf_htab",
- .map_btf_id = &htab_of_maps_map_btf_id,
+ .map_mem_usage = htab_map_mem_usage,
+ BATCH_OPS(htab),
+ .map_btf_id = &htab_map_btf_ids[0],
};
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 01cfdf40c838..db72b96f9c8c 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2,7 +2,9 @@
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
*/
#include <linux/bpf.h>
+#include <linux/btf.h>
#include <linux/bpf-cgroup.h>
+#include <linux/cgroup.h>
#include <linux/rcupdate.h>
#include <linux/random.h>
#include <linux/smp.h>
@@ -14,8 +16,19 @@
#include <linux/ctype.h>
#include <linux/jiffies.h>
#include <linux/pid_namespace.h>
+#include <linux/poison.h>
#include <linux/proc_ns.h>
+#include <linux/sched/task.h>
#include <linux/security.h>
+#include <linux/btf_ids.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/kasan.h>
+#include <linux/bpf_verifier.h>
+#include <linux/uaccess.h>
+#include <linux/verification.h>
+#include <linux/task_work.h>
+#include <linux/irq_work.h>
+#include <linux/buildid.h>
#include "../../lib/kstrtox.h"
@@ -25,12 +38,12 @@
*
* Different map implementations will rely on rcu in map methods
* lookup/update/delete, therefore eBPF programs must run under rcu lock
- * if program is allowed to access maps, so check rcu_read_lock_held in
- * all three functions.
+ * if program is allowed to access maps, so check rcu_read_lock_held() or
+ * rcu_read_lock_trace_held() in all three functions.
*/
BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key)
{
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
return (unsigned long) map->ops->map_lookup_elem(map, key);
}
@@ -46,7 +59,7 @@ const struct bpf_func_proto bpf_map_lookup_elem_proto = {
BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key,
void *, value, u64, flags)
{
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
return map->ops->map_update_elem(map, key, value, flags);
}
@@ -63,7 +76,7 @@ const struct bpf_func_proto bpf_map_update_elem_proto = {
BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key)
{
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
return map->ops->map_delete_elem(map, key);
}
@@ -101,7 +114,7 @@ const struct bpf_func_proto bpf_map_pop_elem_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE,
+ .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE,
};
BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value)
@@ -114,7 +127,23 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE,
+ .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE,
+};
+
+BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu)
+{
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu);
+}
+
+const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto = {
+ .func = bpf_map_lookup_percpu_elem,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+ .arg3_type = ARG_ANYTHING,
};
const struct bpf_func_proto bpf_get_prandom_u32_proto = {
@@ -132,6 +161,7 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
.func = bpf_get_smp_processor_id,
.gpl_only = false,
.ret_type = RET_INTEGER,
+ .allow_fastcall = true,
};
BPF_CALL_0(bpf_get_numa_node_id)
@@ -180,6 +210,18 @@ const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto = {
.ret_type = RET_INTEGER,
};
+BPF_CALL_0(bpf_ktime_get_tai_ns)
+{
+ /* NMI safe access to clock tai */
+ return ktime_get_tai_fast_ns();
+}
+
+const struct bpf_func_proto bpf_ktime_get_tai_ns_proto = {
+ .func = bpf_ktime_get_tai_ns,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
BPF_CALL_0(bpf_get_current_pid_tgid)
{
struct task_struct *task = current;
@@ -223,13 +265,8 @@ BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size)
if (unlikely(!task))
goto err_clear;
- strncpy(buf, task->comm, size);
-
- /* Verifier guarantees that size > 0. For task->comm exceeding
- * size, guarantee that buf is %NUL-terminated. Unconditionally
- * done here to save the size test.
- */
- buf[size - 1] = 0;
+ /* Verifier guarantees that size > 0 */
+ strscpy_pad(buf, task->comm, size);
return 0;
err_clear:
memset(buf, 0, size);
@@ -257,6 +294,7 @@ static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0");
BUILD_BUG_ON(sizeof(*l) != sizeof(__u32));
BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32));
+ preempt_disable();
arch_spin_lock(l);
}
@@ -265,6 +303,7 @@ static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
arch_spinlock_t *l = (void *)lock;
arch_spin_unlock(l);
+ preempt_enable();
}
#else
@@ -299,7 +338,7 @@ static inline void __bpf_spin_lock_irqsave(struct bpf_spin_lock *lock)
__this_cpu_write(irqsave_flags, flags);
}
-notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
+NOTRACE_BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
{
__bpf_spin_lock_irqsave(lock);
return 0;
@@ -310,6 +349,7 @@ const struct bpf_func_proto bpf_spin_lock_proto = {
.gpl_only = false,
.ret_type = RET_VOID,
.arg1_type = ARG_PTR_TO_SPIN_LOCK,
+ .arg1_btf_id = BPF_PTR_POISON,
};
static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock)
@@ -321,7 +361,7 @@ static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock)
local_irq_restore(flags);
}
-notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
+NOTRACE_BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
{
__bpf_spin_unlock_irqrestore(lock);
return 0;
@@ -332,6 +372,7 @@ const struct bpf_func_proto bpf_spin_unlock_proto = {
.gpl_only = false,
.ret_type = RET_VOID,
.arg1_type = ARG_PTR_TO_SPIN_LOCK,
+ .arg1_btf_id = BPF_PTR_POISON,
};
void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
@@ -340,9 +381,9 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
struct bpf_spin_lock *lock;
if (lock_src)
- lock = src + map->spin_lock_off;
+ lock = src + map->record->spin_lock_off;
else
- lock = dst + map->spin_lock_off;
+ lock = dst + map->record->spin_lock_off;
preempt_disable();
__bpf_spin_lock_irqsave(lock);
copy_map_value(map, dst, src);
@@ -402,40 +443,7 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
};
-
-#ifdef CONFIG_CGROUP_BPF
-
-BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
-{
- /* flags argument is not used now,
- * but provides an ability to extend the API.
- * verifier checks that its value is correct.
- */
- enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
- struct bpf_cgroup_storage *storage;
- struct bpf_cg_run_ctx *ctx;
- void *ptr;
-
- /* get current cgroup storage from BPF run context */
- ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
- storage = ctx->prog_item->cgroup_storage[stype];
-
- if (stype == BPF_CGROUP_STORAGE_SHARED)
- ptr = &READ_ONCE(storage->buf)->data[0];
- else
- ptr = this_cpu_ptr(storage->percpu_buf);
-
- return (unsigned long)ptr;
-}
-
-const struct bpf_func_proto bpf_get_local_storage_proto = {
- .func = bpf_get_local_storage,
- .gpl_only = false,
- .ret_type = RET_PTR_TO_MAP_VALUE,
- .arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_ANYTHING,
-};
-#endif
+#endif /* CONFIG_CGROUPS */
#define BPF_STRTOX_BASE_MASK 0x1F
@@ -513,16 +521,15 @@ static int __bpf_strtoll(const char *buf, size_t buf_len, u64 flags,
}
BPF_CALL_4(bpf_strtol, const char *, buf, size_t, buf_len, u64, flags,
- long *, res)
+ s64 *, res)
{
long long _res;
int err;
+ *res = 0;
err = __bpf_strtoll(buf, buf_len, flags, &_res);
if (err < 0)
return err;
- if (_res != (long)_res)
- return -ERANGE;
*res = _res;
return err;
}
@@ -534,23 +541,23 @@ const struct bpf_func_proto bpf_strtol_proto = {
.arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_LONG,
+ .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
+ .arg4_size = sizeof(s64),
};
BPF_CALL_4(bpf_strtoul, const char *, buf, size_t, buf_len, u64, flags,
- unsigned long *, res)
+ u64 *, res)
{
unsigned long long _res;
bool is_negative;
int err;
+ *res = 0;
err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative);
if (err < 0)
return err;
if (is_negative)
return -EINVAL;
- if (_res != (unsigned long)_res)
- return -ERANGE;
*res = _res;
return err;
}
@@ -562,20 +569,20 @@ const struct bpf_func_proto bpf_strtoul_proto = {
.arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_LONG,
+ .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
+ .arg4_size = sizeof(u64),
};
-#endif
BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2)
{
return strncmp(s1, s2, s1_sz);
}
-const struct bpf_func_proto bpf_strncmp_proto = {
+static const struct bpf_func_proto bpf_strncmp_proto = {
.func = bpf_strncmp,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_MEM,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_PTR_TO_CONST_STR,
};
@@ -665,18 +672,53 @@ BPF_CALL_3(bpf_copy_from_user, void *, dst, u32, size,
const struct bpf_func_proto bpf_copy_from_user_proto = {
.func = bpf_copy_from_user,
.gpl_only = false,
+ .might_sleep = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
};
+BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size,
+ const void __user *, user_ptr, struct task_struct *, tsk, u64, flags)
+{
+ int ret;
+
+ /* flags is not used yet */
+ if (unlikely(flags))
+ return -EINVAL;
+
+ if (unlikely(!size))
+ return 0;
+
+ ret = access_process_vm(tsk, (unsigned long)user_ptr, dst, size, 0);
+ if (ret == size)
+ return 0;
+
+ memset(dst, 0, size);
+ /* Return -EFAULT for partial read */
+ return ret < 0 ? ret : -EFAULT;
+}
+
+const struct bpf_func_proto bpf_copy_from_user_task_proto = {
+ .func = bpf_copy_from_user_task,
+ .gpl_only = true,
+ .might_sleep = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_BTF_ID,
+ .arg4_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+ .arg5_type = ARG_ANYTHING
+};
+
BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu)
{
if (cpu >= nr_cpu_ids)
return (unsigned long)NULL;
- return (unsigned long)per_cpu_ptr((const void __percpu *)ptr, cpu);
+ return (unsigned long)per_cpu_ptr((const void __percpu *)(const uintptr_t)ptr, cpu);
}
const struct bpf_func_proto bpf_per_cpu_ptr_proto = {
@@ -689,7 +731,7 @@ const struct bpf_func_proto bpf_per_cpu_ptr_proto = {
BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr)
{
- return (unsigned long)this_cpu_ptr((const void __percpu *)percpu_ptr);
+ return (unsigned long)this_cpu_ptr((const void __percpu *)(const uintptr_t)percpu_ptr);
}
const struct bpf_func_proto bpf_this_cpu_ptr_proto = {
@@ -722,22 +764,14 @@ static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
return -EINVAL;
}
-/* Per-cpu temp buffers used by printf-like helpers to store the bprintf binary
- * arguments representation.
- */
-#define MAX_BPRINTF_BUF_LEN 512
-
/* Support executing three nested bprintf helper calls on a given CPU */
#define MAX_BPRINTF_NEST_LEVEL 3
-struct bpf_bprintf_buffers {
- char tmp_bufs[MAX_BPRINTF_NEST_LEVEL][MAX_BPRINTF_BUF_LEN];
-};
-static DEFINE_PER_CPU(struct bpf_bprintf_buffers, bpf_bprintf_bufs);
+
+static DEFINE_PER_CPU(struct bpf_bprintf_buffers[MAX_BPRINTF_NEST_LEVEL], bpf_bprintf_bufs);
static DEFINE_PER_CPU(int, bpf_bprintf_nest_level);
-static int try_get_fmt_tmp_buf(char **tmp_buf)
+int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs)
{
- struct bpf_bprintf_buffers *bufs;
int nest_level;
preempt_disable();
@@ -747,18 +781,24 @@ static int try_get_fmt_tmp_buf(char **tmp_buf)
preempt_enable();
return -EBUSY;
}
- bufs = this_cpu_ptr(&bpf_bprintf_bufs);
- *tmp_buf = bufs->tmp_bufs[nest_level - 1];
+ *bufs = this_cpu_ptr(&bpf_bprintf_bufs[nest_level - 1]);
return 0;
}
-void bpf_bprintf_cleanup(void)
+void bpf_put_buffers(void)
{
- if (this_cpu_read(bpf_bprintf_nest_level)) {
- this_cpu_dec(bpf_bprintf_nest_level);
- preempt_enable();
- }
+ if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0))
+ return;
+ this_cpu_dec(bpf_bprintf_nest_level);
+ preempt_enable();
+}
+
+void bpf_bprintf_cleanup(struct bpf_bprintf_data *data)
+{
+ if (!data->bin_args && !data->buf)
+ return;
+ bpf_put_buffers();
}
/*
@@ -767,18 +807,20 @@ void bpf_bprintf_cleanup(void)
* Returns a negative value if fmt is an invalid format string or 0 otherwise.
*
* This can be used in two ways:
- * - Format string verification only: when bin_args is NULL
+ * - Format string verification only: when data->get_bin_args is false
* - Arguments preparation: in addition to the above verification, it writes in
- * bin_args a binary representation of arguments usable by bstr_printf where
- * pointers from BPF have been sanitized.
+ * data->bin_args a binary representation of arguments usable by bstr_printf
+ * where pointers from BPF have been sanitized.
*
* In argument preparation mode, if 0 is returned, safe temporary buffers are
* allocated and bpf_bprintf_cleanup should be called to free them after use.
*/
-int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
- u32 **bin_args, u32 num_args)
+int bpf_bprintf_prepare(const char *fmt, u32 fmt_size, const u64 *raw_args,
+ u32 num_args, struct bpf_bprintf_data *data)
{
+ bool get_buffers = (data->get_bin_args && num_args) || data->get_buf;
char *unsafe_ptr = NULL, *tmp_buf = NULL, *tmp_buf_end, *fmt_end;
+ struct bpf_bprintf_buffers *buffers = NULL;
size_t sizeof_cur_arg, sizeof_cur_ip;
int err, i, num_spec = 0;
u64 cur_arg;
@@ -789,14 +831,19 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
return -EINVAL;
fmt_size = fmt_end - fmt;
- if (bin_args) {
- if (num_args && try_get_fmt_tmp_buf(&tmp_buf))
- return -EBUSY;
+ if (get_buffers && bpf_try_get_buffers(&buffers))
+ return -EBUSY;
- tmp_buf_end = tmp_buf + MAX_BPRINTF_BUF_LEN;
- *bin_args = (u32 *)tmp_buf;
+ if (data->get_bin_args) {
+ if (num_args)
+ tmp_buf = buffers->bin_args;
+ tmp_buf_end = tmp_buf + MAX_BPRINTF_BIN_ARGS;
+ data->bin_args = (u32 *)tmp_buf;
}
+ if (data->get_buf)
+ data->buf = buffers->buf;
+
for (i = 0; i < fmt_size; i++) {
if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) {
err = -EINVAL;
@@ -834,6 +881,13 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
if (fmt[i] == 'p') {
sizeof_cur_arg = sizeof(long);
+ if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) ||
+ ispunct(fmt[i + 1])) {
+ if (tmp_buf)
+ cur_arg = raw_args[num_spec];
+ goto nocopy_fmt;
+ }
+
if ((fmt[i + 1] == 'k' || fmt[i + 1] == 'u') &&
fmt[i + 2] == 's') {
fmt_ptype = fmt[i + 1];
@@ -841,11 +895,9 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
goto fmt_str;
}
- if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) ||
- ispunct(fmt[i + 1]) || fmt[i + 1] == 'K' ||
+ if (fmt[i + 1] == 'K' ||
fmt[i + 1] == 'x' || fmt[i + 1] == 's' ||
fmt[i + 1] == 'S') {
- /* just kernel pointers */
if (tmp_buf)
cur_arg = raw_args[num_spec];
i++;
@@ -990,31 +1042,33 @@ nocopy_fmt:
err = 0;
out:
if (err)
- bpf_bprintf_cleanup();
+ bpf_bprintf_cleanup(data);
return err;
}
BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt,
- const void *, data, u32, data_len)
+ const void *, args, u32, data_len)
{
+ struct bpf_bprintf_data data = {
+ .get_bin_args = true,
+ };
int err, num_args;
- u32 *bin_args;
if (data_len % 8 || data_len > MAX_BPRINTF_VARARGS * 8 ||
- (data_len && !data))
+ (data_len && !args))
return -EINVAL;
num_args = data_len / 8;
/* ARG_PTR_TO_CONST_STR guarantees that fmt is zero-terminated so we
* can safely give an unbounded size.
*/
- err = bpf_bprintf_prepare(fmt, UINT_MAX, data, &bin_args, num_args);
+ err = bpf_bprintf_prepare(fmt, UINT_MAX, args, num_args, &data);
if (err < 0)
return err;
- err = bstr_printf(str, str_size, fmt, bin_args);
+ err = bstr_printf(str, str_size, fmt, data.bin_args);
- bpf_bprintf_cleanup();
+ bpf_bprintf_cleanup(&data);
return err + 1;
}
@@ -1030,11 +1084,34 @@ const struct bpf_func_proto bpf_snprintf_proto = {
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
+static void *map_key_from_value(struct bpf_map *map, void *value, u32 *arr_idx)
+{
+ if (map->map_type == BPF_MAP_TYPE_ARRAY) {
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+ *arr_idx = ((char *)value - array->value) / array->elem_size;
+ return arr_idx;
+ }
+ return (void *)value - round_up(map->key_size, 8);
+}
+
+struct bpf_async_cb {
+ struct bpf_map *map;
+ struct bpf_prog *prog;
+ void __rcu *callback_fn;
+ void *value;
+ union {
+ struct rcu_head rcu;
+ struct work_struct delete_work;
+ };
+ u64 flags;
+};
+
/* BPF map elements can contain 'struct bpf_timer'.
* Such map owns all of its BPF timers.
* 'struct bpf_timer' is allocated as part of map element allocation
* and it's zero initialized.
- * That space is used to keep 'struct bpf_timer_kern'.
+ * That space is used to keep 'struct bpf_async_kern'.
* bpf_timer_init() allocates 'struct bpf_hrtimer', inits hrtimer, and
* remembers 'struct bpf_map *' pointer it's part of.
* bpf_timer_set_callback() increments prog refcnt and assign bpf callback_fn.
@@ -1047,35 +1124,49 @@ const struct bpf_func_proto bpf_snprintf_proto = {
* freeing the timers when inner map is replaced or deleted by user space.
*/
struct bpf_hrtimer {
+ struct bpf_async_cb cb;
struct hrtimer timer;
- struct bpf_map *map;
- struct bpf_prog *prog;
- void __rcu *callback_fn;
- void *value;
+ atomic_t cancelling;
};
-/* the actual struct hidden inside uapi struct bpf_timer */
-struct bpf_timer_kern {
- struct bpf_hrtimer *timer;
+struct bpf_work {
+ struct bpf_async_cb cb;
+ struct work_struct work;
+ struct work_struct delete_work;
+};
+
+/* the actual struct hidden inside uapi struct bpf_timer and bpf_wq */
+struct bpf_async_kern {
+ union {
+ struct bpf_async_cb *cb;
+ struct bpf_hrtimer *timer;
+ struct bpf_work *work;
+ };
/* bpf_spin_lock is used here instead of spinlock_t to make
- * sure that it always fits into space resereved by struct bpf_timer
+ * sure that it always fits into space reserved by struct bpf_timer
* regardless of LOCKDEP and spinlock debug flags.
*/
struct bpf_spin_lock lock;
} __attribute__((aligned(8)));
+enum bpf_async_type {
+ BPF_ASYNC_TYPE_TIMER = 0,
+ BPF_ASYNC_TYPE_WQ,
+};
+
static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
{
struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer);
- struct bpf_map *map = t->map;
- void *value = t->value;
+ struct bpf_map *map = t->cb.map;
+ void *value = t->cb.value;
bpf_callback_t callback_fn;
void *key;
u32 idx;
- callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held());
+ BTF_TYPE_EMIT(struct bpf_timer);
+ callback_fn = rcu_dereference_check(t->cb.callback_fn, rcu_read_lock_bh_held());
if (!callback_fn)
goto out;
@@ -1086,15 +1177,8 @@ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
* bpf_map_delete_elem() on the same timer.
*/
this_cpu_write(hrtimer_running, t);
- if (map->map_type == BPF_MAP_TYPE_ARRAY) {
- struct bpf_array *array = container_of(map, struct bpf_array, map);
- /* compute the key */
- idx = ((char *)value - array->value) / array->elem_size;
- key = &idx;
- } else { /* hash or lru */
- key = value - round_up(map->key_size, 8);
- }
+ key = map_key_from_value(map, value, &idx);
callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0);
/* The verifier checked that return value is zero. */
@@ -1104,57 +1188,163 @@ out:
return HRTIMER_NORESTART;
}
-BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map,
- u64, flags)
+static void bpf_wq_work(struct work_struct *work)
{
- clockid_t clockid = flags & (MAX_CLOCKS - 1);
+ struct bpf_work *w = container_of(work, struct bpf_work, work);
+ struct bpf_async_cb *cb = &w->cb;
+ struct bpf_map *map = cb->map;
+ bpf_callback_t callback_fn;
+ void *value = cb->value;
+ void *key;
+ u32 idx;
+
+ BTF_TYPE_EMIT(struct bpf_wq);
+
+ callback_fn = READ_ONCE(cb->callback_fn);
+ if (!callback_fn)
+ return;
+
+ key = map_key_from_value(map, value, &idx);
+
+ rcu_read_lock_trace();
+ migrate_disable();
+
+ callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0);
+
+ migrate_enable();
+ rcu_read_unlock_trace();
+}
+
+static void bpf_async_cb_rcu_free(struct rcu_head *rcu)
+{
+ struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu);
+
+ kfree_nolock(cb);
+}
+
+static void bpf_wq_delete_work(struct work_struct *work)
+{
+ struct bpf_work *w = container_of(work, struct bpf_work, delete_work);
+
+ cancel_work_sync(&w->work);
+
+ call_rcu(&w->cb.rcu, bpf_async_cb_rcu_free);
+}
+
+static void bpf_timer_delete_work(struct work_struct *work)
+{
+ struct bpf_hrtimer *t = container_of(work, struct bpf_hrtimer, cb.delete_work);
+
+ /* Cancel the timer and wait for callback to complete if it was running.
+ * If hrtimer_cancel() can be safely called it's safe to call
+ * call_rcu() right after for both preallocated and non-preallocated
+ * maps. The async->cb = NULL was already done and no code path can see
+ * address 't' anymore. Timer if armed for existing bpf_hrtimer before
+ * bpf_timer_cancel_and_free will have been cancelled.
+ */
+ hrtimer_cancel(&t->timer);
+ call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free);
+}
+
+static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
+ enum bpf_async_type type)
+{
+ struct bpf_async_cb *cb;
struct bpf_hrtimer *t;
+ struct bpf_work *w;
+ clockid_t clockid;
+ size_t size;
int ret = 0;
- BUILD_BUG_ON(MAX_CLOCKS != 16);
- BUILD_BUG_ON(sizeof(struct bpf_timer_kern) > sizeof(struct bpf_timer));
- BUILD_BUG_ON(__alignof__(struct bpf_timer_kern) != __alignof__(struct bpf_timer));
-
if (in_nmi())
return -EOPNOTSUPP;
- if (flags >= MAX_CLOCKS ||
- /* similar to timerfd except _ALARM variants are not supported */
- (clockid != CLOCK_MONOTONIC &&
- clockid != CLOCK_REALTIME &&
- clockid != CLOCK_BOOTTIME))
+ switch (type) {
+ case BPF_ASYNC_TYPE_TIMER:
+ size = sizeof(struct bpf_hrtimer);
+ break;
+ case BPF_ASYNC_TYPE_WQ:
+ size = sizeof(struct bpf_work);
+ break;
+ default:
return -EINVAL;
- __bpf_spin_lock_irqsave(&timer->lock);
- t = timer->timer;
+ }
+
+ __bpf_spin_lock_irqsave(&async->lock);
+ t = async->timer;
if (t) {
ret = -EBUSY;
goto out;
}
+
+ cb = bpf_map_kmalloc_nolock(map, size, 0, map->numa_node);
+ if (!cb) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ switch (type) {
+ case BPF_ASYNC_TYPE_TIMER:
+ clockid = flags & (MAX_CLOCKS - 1);
+ t = (struct bpf_hrtimer *)cb;
+
+ atomic_set(&t->cancelling, 0);
+ INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work);
+ hrtimer_setup(&t->timer, bpf_timer_cb, clockid, HRTIMER_MODE_REL_SOFT);
+ cb->value = (void *)async - map->record->timer_off;
+ break;
+ case BPF_ASYNC_TYPE_WQ:
+ w = (struct bpf_work *)cb;
+
+ INIT_WORK(&w->work, bpf_wq_work);
+ INIT_WORK(&w->delete_work, bpf_wq_delete_work);
+ cb->value = (void *)async - map->record->wq_off;
+ break;
+ }
+ cb->map = map;
+ cb->prog = NULL;
+ cb->flags = flags;
+ rcu_assign_pointer(cb->callback_fn, NULL);
+
+ WRITE_ONCE(async->cb, cb);
+ /* Guarantee the order between async->cb and map->usercnt. So
+ * when there are concurrent uref release and bpf timer init, either
+ * bpf_timer_cancel_and_free() called by uref release reads a no-NULL
+ * timer or atomic64_read() below returns a zero usercnt.
+ */
+ smp_mb();
if (!atomic64_read(&map->usercnt)) {
/* maps with timers must be either held by user space
* or pinned in bpffs.
*/
+ WRITE_ONCE(async->cb, NULL);
+ kfree_nolock(cb);
ret = -EPERM;
- goto out;
- }
- /* allocate hrtimer via map_kmalloc to use memcg accounting */
- t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, map->numa_node);
- if (!t) {
- ret = -ENOMEM;
- goto out;
}
- t->value = (void *)timer - map->timer_off;
- t->map = map;
- t->prog = NULL;
- rcu_assign_pointer(t->callback_fn, NULL);
- hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
- t->timer.function = bpf_timer_cb;
- timer->timer = t;
out:
- __bpf_spin_unlock_irqrestore(&timer->lock);
+ __bpf_spin_unlock_irqrestore(&async->lock);
return ret;
}
+BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map,
+ u64, flags)
+{
+ clock_t clockid = flags & (MAX_CLOCKS - 1);
+
+ BUILD_BUG_ON(MAX_CLOCKS != 16);
+ BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_timer));
+ BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_timer));
+
+ if (flags >= MAX_CLOCKS ||
+ /* similar to timerfd except _ALARM variants are not supported */
+ (clockid != CLOCK_MONOTONIC &&
+ clockid != CLOCK_REALTIME &&
+ clockid != CLOCK_BOOTTIME))
+ return -EINVAL;
+
+ return __bpf_async_init(timer, map, flags, BPF_ASYNC_TYPE_TIMER);
+}
+
static const struct bpf_func_proto bpf_timer_init_proto = {
.func = bpf_timer_init,
.gpl_only = true,
@@ -1164,22 +1354,23 @@ static const struct bpf_func_proto bpf_timer_init_proto = {
.arg3_type = ARG_ANYTHING,
};
-BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callback_fn,
- struct bpf_prog_aux *, aux)
+static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn,
+ struct bpf_prog_aux *aux, unsigned int flags,
+ enum bpf_async_type type)
{
struct bpf_prog *prev, *prog = aux->prog;
- struct bpf_hrtimer *t;
+ struct bpf_async_cb *cb;
int ret = 0;
if (in_nmi())
return -EOPNOTSUPP;
- __bpf_spin_lock_irqsave(&timer->lock);
- t = timer->timer;
- if (!t) {
+ __bpf_spin_lock_irqsave(&async->lock);
+ cb = async->cb;
+ if (!cb) {
ret = -EINVAL;
goto out;
}
- if (!atomic64_read(&t->map->usercnt)) {
+ if (!atomic64_read(&cb->map->usercnt)) {
/* maps with timers must be either held by user space
* or pinned in bpffs. Otherwise timer might still be
* running even when bpf prog is detached and user space
@@ -1188,7 +1379,7 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callb
ret = -EPERM;
goto out;
}
- prev = t->prog;
+ prev = cb->prog;
if (prev != prog) {
/* Bump prog refcnt once. Every bpf_timer_set_callback()
* can pick different callback_fn-s within the same prog.
@@ -1201,14 +1392,20 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callb
if (prev)
/* Drop prev prog refcnt when swapping with new prog */
bpf_prog_put(prev);
- t->prog = prog;
+ cb->prog = prog;
}
- rcu_assign_pointer(t->callback_fn, callback_fn);
+ rcu_assign_pointer(cb->callback_fn, callback_fn);
out:
- __bpf_spin_unlock_irqrestore(&timer->lock);
+ __bpf_spin_unlock_irqrestore(&async->lock);
return ret;
}
+BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn,
+ struct bpf_prog_aux *, aux)
+{
+ return __bpf_async_set_callback(timer, callback_fn, aux, 0, BPF_ASYNC_TYPE_TIMER);
+}
+
static const struct bpf_func_proto bpf_timer_set_callback_proto = {
.func = bpf_timer_set_callback,
.gpl_only = true,
@@ -1217,22 +1414,32 @@ static const struct bpf_func_proto bpf_timer_set_callback_proto = {
.arg2_type = ARG_PTR_TO_FUNC,
};
-BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, flags)
+BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, flags)
{
struct bpf_hrtimer *t;
int ret = 0;
+ enum hrtimer_mode mode;
if (in_nmi())
return -EOPNOTSUPP;
- if (flags)
+ if (flags & ~(BPF_F_TIMER_ABS | BPF_F_TIMER_CPU_PIN))
return -EINVAL;
__bpf_spin_lock_irqsave(&timer->lock);
t = timer->timer;
- if (!t || !t->prog) {
+ if (!t || !t->cb.prog) {
ret = -EINVAL;
goto out;
}
- hrtimer_start(&t->timer, ns_to_ktime(nsecs), HRTIMER_MODE_REL_SOFT);
+
+ if (flags & BPF_F_TIMER_ABS)
+ mode = HRTIMER_MODE_ABS_SOFT;
+ else
+ mode = HRTIMER_MODE_REL_SOFT;
+
+ if (flags & BPF_F_TIMER_CPU_PIN)
+ mode |= HRTIMER_MODE_PINNED;
+
+ hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode);
out:
__bpf_spin_unlock_irqrestore(&timer->lock);
return ret;
@@ -1247,45 +1454,77 @@ static const struct bpf_func_proto bpf_timer_start_proto = {
.arg3_type = ARG_ANYTHING,
};
-static void drop_prog_refcnt(struct bpf_hrtimer *t)
+static void drop_prog_refcnt(struct bpf_async_cb *async)
{
- struct bpf_prog *prog = t->prog;
+ struct bpf_prog *prog = async->prog;
if (prog) {
bpf_prog_put(prog);
- t->prog = NULL;
- rcu_assign_pointer(t->callback_fn, NULL);
+ async->prog = NULL;
+ rcu_assign_pointer(async->callback_fn, NULL);
}
}
-BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer)
+BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
{
- struct bpf_hrtimer *t;
+ struct bpf_hrtimer *t, *cur_t;
+ bool inc = false;
int ret = 0;
if (in_nmi())
return -EOPNOTSUPP;
+ rcu_read_lock();
__bpf_spin_lock_irqsave(&timer->lock);
t = timer->timer;
if (!t) {
ret = -EINVAL;
goto out;
}
- if (this_cpu_read(hrtimer_running) == t) {
+
+ cur_t = this_cpu_read(hrtimer_running);
+ if (cur_t == t) {
/* If bpf callback_fn is trying to bpf_timer_cancel()
* its own timer the hrtimer_cancel() will deadlock
- * since it waits for callback_fn to finish
+ * since it waits for callback_fn to finish.
*/
ret = -EDEADLK;
goto out;
}
- drop_prog_refcnt(t);
+
+ /* Only account in-flight cancellations when invoked from a timer
+ * callback, since we want to avoid waiting only if other _callbacks_
+ * are waiting on us, to avoid introducing lockups. Non-callback paths
+ * are ok, since nobody would synchronously wait for their completion.
+ */
+ if (!cur_t)
+ goto drop;
+ atomic_inc(&t->cancelling);
+ /* Need full barrier after relaxed atomic_inc */
+ smp_mb__after_atomic();
+ inc = true;
+ if (atomic_read(&cur_t->cancelling)) {
+ /* We're cancelling timer t, while some other timer callback is
+ * attempting to cancel us. In such a case, it might be possible
+ * that timer t belongs to the other callback, or some other
+ * callback waiting upon it (creating transitive dependencies
+ * upon us), and we will enter a deadlock if we continue
+ * cancelling and waiting for it synchronously, since it might
+ * do the same. Bail!
+ */
+ ret = -EDEADLK;
+ goto out;
+ }
+drop:
+ drop_prog_refcnt(&t->cb);
out:
__bpf_spin_unlock_irqrestore(&timer->lock);
/* Cancel the timer and wait for associated callback to finish
* if it was running.
*/
ret = ret ?: hrtimer_cancel(&t->timer);
+ if (inc)
+ atomic_dec(&t->cancelling);
+ rcu_read_unlock();
return ret;
}
@@ -1296,53 +1535,430 @@ static const struct bpf_func_proto bpf_timer_cancel_proto = {
.arg1_type = ARG_PTR_TO_TIMER,
};
-/* This function is called by map_delete/update_elem for individual element and
- * by ops->map_release_uref when the user space reference to a map reaches zero.
- */
-void bpf_timer_cancel_and_free(void *val)
+static struct bpf_async_cb *__bpf_async_cancel_and_free(struct bpf_async_kern *async)
{
- struct bpf_timer_kern *timer = val;
- struct bpf_hrtimer *t;
+ struct bpf_async_cb *cb;
- /* Performance optimization: read timer->timer without lock first. */
- if (!READ_ONCE(timer->timer))
- return;
+ /* Performance optimization: read async->cb without lock first. */
+ if (!READ_ONCE(async->cb))
+ return NULL;
- __bpf_spin_lock_irqsave(&timer->lock);
+ __bpf_spin_lock_irqsave(&async->lock);
/* re-read it under lock */
- t = timer->timer;
- if (!t)
+ cb = async->cb;
+ if (!cb)
goto out;
- drop_prog_refcnt(t);
+ drop_prog_refcnt(cb);
/* The subsequent bpf_timer_start/cancel() helpers won't be able to use
* this timer, since it won't be initialized.
*/
- timer->timer = NULL;
+ WRITE_ONCE(async->cb, NULL);
out:
- __bpf_spin_unlock_irqrestore(&timer->lock);
+ __bpf_spin_unlock_irqrestore(&async->lock);
+ return cb;
+}
+
+/* This function is called by map_delete/update_elem for individual element and
+ * by ops->map_release_uref when the user space reference to a map reaches zero.
+ */
+void bpf_timer_cancel_and_free(void *val)
+{
+ struct bpf_hrtimer *t;
+
+ t = (struct bpf_hrtimer *)__bpf_async_cancel_and_free(val);
+
if (!t)
return;
- /* Cancel the timer and wait for callback to complete if it was running.
- * If hrtimer_cancel() can be safely called it's safe to call kfree(t)
- * right after for both preallocated and non-preallocated maps.
- * The timer->timer = NULL was already done and no code path can
- * see address 't' anymore.
- *
- * Check that bpf_map_delete/update_elem() wasn't called from timer
- * callback_fn. In such case don't call hrtimer_cancel() (since it will
- * deadlock) and don't call hrtimer_try_to_cancel() (since it will just
- * return -1). Though callback_fn is still running on this cpu it's
+ /* We check that bpf_map_delete/update_elem() was called from timer
+ * callback_fn. In such case we don't call hrtimer_cancel() (since it
+ * will deadlock) and don't call hrtimer_try_to_cancel() (since it will
+ * just return -1). Though callback_fn is still running on this cpu it's
* safe to do kfree(t) because bpf_timer_cb() read everything it needed
* from 't'. The bpf subprog callback_fn won't be able to access 't',
- * since timer->timer = NULL was already done. The timer will be
+ * since async->cb = NULL was already done. The timer will be
* effectively cancelled because bpf_timer_cb() will return
* HRTIMER_NORESTART.
+ *
+ * However, it is possible the timer callback_fn calling us armed the
+ * timer _before_ calling us, such that failing to cancel it here will
+ * cause it to possibly use struct hrtimer after freeing bpf_hrtimer.
+ * Therefore, we _need_ to cancel any outstanding timers before we do
+ * call_rcu, even though no more timers can be armed.
+ *
+ * Moreover, we need to schedule work even if timer does not belong to
+ * the calling callback_fn, as on two different CPUs, we can end up in a
+ * situation where both sides run in parallel, try to cancel one
+ * another, and we end up waiting on both sides in hrtimer_cancel
+ * without making forward progress, since timer1 depends on time2
+ * callback to finish, and vice versa.
+ *
+ * CPU 1 (timer1_cb) CPU 2 (timer2_cb)
+ * bpf_timer_cancel_and_free(timer2) bpf_timer_cancel_and_free(timer1)
+ *
+ * To avoid these issues, punt to workqueue context when we are in a
+ * timer callback.
+ */
+ if (this_cpu_read(hrtimer_running)) {
+ queue_work(system_dfl_wq, &t->cb.delete_work);
+ return;
+ }
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ /* If the timer is running on other CPU, also use a kworker to
+ * wait for the completion of the timer instead of trying to
+ * acquire a sleepable lock in hrtimer_cancel() to wait for its
+ * completion.
+ */
+ if (hrtimer_try_to_cancel(&t->timer) >= 0)
+ call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free);
+ else
+ queue_work(system_dfl_wq, &t->cb.delete_work);
+ } else {
+ bpf_timer_delete_work(&t->cb.delete_work);
+ }
+}
+
+/* This function is called by map_delete/update_elem for individual element and
+ * by ops->map_release_uref when the user space reference to a map reaches zero.
+ */
+void bpf_wq_cancel_and_free(void *val)
+{
+ struct bpf_work *work;
+
+ BTF_TYPE_EMIT(struct bpf_wq);
+
+ work = (struct bpf_work *)__bpf_async_cancel_and_free(val);
+ if (!work)
+ return;
+ /* Trigger cancel of the sleepable work, but *do not* wait for
+ * it to finish if it was running as we might not be in a
+ * sleepable context.
+ * kfree will be called once the work has finished.
*/
- if (this_cpu_read(hrtimer_running) != t)
- hrtimer_cancel(&t->timer);
- kfree(t);
+ schedule_work(&work->delete_work);
+}
+
+BPF_CALL_2(bpf_kptr_xchg, void *, dst, void *, ptr)
+{
+ unsigned long *kptr = dst;
+
+ /* This helper may be inlined by verifier. */
+ return xchg(kptr, (unsigned long)ptr);
+}
+
+/* Unlike other PTR_TO_BTF_ID helpers the btf_id in bpf_kptr_xchg()
+ * helper is determined dynamically by the verifier. Use BPF_PTR_POISON to
+ * denote type that verifier will determine.
+ */
+static const struct bpf_func_proto bpf_kptr_xchg_proto = {
+ .func = bpf_kptr_xchg,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
+ .ret_btf_id = BPF_PTR_POISON,
+ .arg1_type = ARG_KPTR_XCHG_DEST,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL | OBJ_RELEASE,
+ .arg2_btf_id = BPF_PTR_POISON,
+};
+
+struct bpf_dynptr_file_impl {
+ struct freader freader;
+ /* 64 bit offset and size overriding 32 bit ones in bpf_dynptr_kern */
+ u64 offset;
+ u64 size;
+};
+
+/* Since the upper 8 bits of dynptr->size is reserved, the
+ * maximum supported size is 2^24 - 1.
+ */
+#define DYNPTR_MAX_SIZE ((1UL << 24) - 1)
+#define DYNPTR_TYPE_SHIFT 28
+#define DYNPTR_SIZE_MASK 0xFFFFFF
+#define DYNPTR_RDONLY_BIT BIT(31)
+
+bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr)
+{
+ return ptr->size & DYNPTR_RDONLY_BIT;
+}
+
+void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr)
+{
+ ptr->size |= DYNPTR_RDONLY_BIT;
+}
+
+static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type)
+{
+ ptr->size |= type << DYNPTR_TYPE_SHIFT;
+}
+
+static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *ptr)
+{
+ return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT;
+}
+
+u64 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr)
+{
+ if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) {
+ struct bpf_dynptr_file_impl *df = ptr->data;
+
+ return df->size;
+ }
+
+ return ptr->size & DYNPTR_SIZE_MASK;
+}
+
+static void bpf_dynptr_advance_offset(struct bpf_dynptr_kern *ptr, u64 off)
+{
+ if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) {
+ struct bpf_dynptr_file_impl *df = ptr->data;
+
+ df->offset += off;
+ return;
+ }
+ ptr->offset += off;
+}
+
+static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u64 new_size)
+{
+ u32 metadata = ptr->size & ~DYNPTR_SIZE_MASK;
+
+ if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) {
+ struct bpf_dynptr_file_impl *df = ptr->data;
+
+ df->size = new_size;
+ return;
+ }
+ ptr->size = (u32)new_size | metadata;
+}
+
+int bpf_dynptr_check_size(u64 size)
+{
+ return size > DYNPTR_MAX_SIZE ? -E2BIG : 0;
+}
+
+static int bpf_file_fetch_bytes(struct bpf_dynptr_file_impl *df, u64 offset, void *buf, u64 len)
+{
+ const void *ptr;
+
+ if (!buf)
+ return -EINVAL;
+
+ df->freader.buf = buf;
+ df->freader.buf_sz = len;
+ ptr = freader_fetch(&df->freader, offset + df->offset, len);
+ if (!ptr)
+ return df->freader.err;
+
+ if (ptr != buf) /* Force copying into the buffer */
+ memcpy(buf, ptr, len);
+
+ return 0;
+}
+
+void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
+ enum bpf_dynptr_type type, u32 offset, u32 size)
+{
+ ptr->data = data;
+ ptr->offset = offset;
+ ptr->size = size;
+ bpf_dynptr_set_type(ptr, type);
+}
+
+void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
+{
+ memset(ptr, 0, sizeof(*ptr));
}
+BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u64, size, u64, flags, struct bpf_dynptr_kern *, ptr)
+{
+ int err;
+
+ BTF_TYPE_EMIT(struct bpf_dynptr);
+
+ err = bpf_dynptr_check_size(size);
+ if (err)
+ goto error;
+
+ /* flags is currently unsupported */
+ if (flags) {
+ err = -EINVAL;
+ goto error;
+ }
+
+ bpf_dynptr_init(ptr, data, BPF_DYNPTR_TYPE_LOCAL, 0, size);
+
+ return 0;
+
+error:
+ bpf_dynptr_set_null(ptr);
+ return err;
+}
+
+static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
+ .func = bpf_dynptr_from_mem,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE,
+};
+
+static int __bpf_dynptr_read(void *dst, u64 len, const struct bpf_dynptr_kern *src,
+ u64 offset, u64 flags)
+{
+ enum bpf_dynptr_type type;
+ int err;
+
+ if (!src->data || flags)
+ return -EINVAL;
+
+ err = bpf_dynptr_check_off_len(src, offset, len);
+ if (err)
+ return err;
+
+ type = bpf_dynptr_get_type(src);
+
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ /* Source and destination may possibly overlap, hence use memmove to
+ * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
+ * pointing to overlapping PTR_TO_MAP_VALUE regions.
+ */
+ memmove(dst, src->data + src->offset + offset, len);
+ return 0;
+ case BPF_DYNPTR_TYPE_SKB:
+ return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len);
+ case BPF_DYNPTR_TYPE_XDP:
+ return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len);
+ case BPF_DYNPTR_TYPE_SKB_META:
+ memmove(dst, bpf_skb_meta_pointer(src->data, src->offset + offset), len);
+ return 0;
+ case BPF_DYNPTR_TYPE_FILE:
+ return bpf_file_fetch_bytes(src->data, offset, dst, len);
+ default:
+ WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type);
+ return -EFAULT;
+ }
+}
+
+BPF_CALL_5(bpf_dynptr_read, void *, dst, u64, len, const struct bpf_dynptr_kern *, src,
+ u64, offset, u64, flags)
+{
+ return __bpf_dynptr_read(dst, len, src, offset, flags);
+}
+
+static const struct bpf_func_proto bpf_dynptr_read_proto = {
+ .func = bpf_dynptr_read,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u64 offset, void *src,
+ u64 len, u64 flags)
+{
+ enum bpf_dynptr_type type;
+ int err;
+
+ if (!dst->data || __bpf_dynptr_is_rdonly(dst))
+ return -EINVAL;
+
+ err = bpf_dynptr_check_off_len(dst, offset, len);
+ if (err)
+ return err;
+
+ type = bpf_dynptr_get_type(dst);
+
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ if (flags)
+ return -EINVAL;
+ /* Source and destination may possibly overlap, hence use memmove to
+ * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
+ * pointing to overlapping PTR_TO_MAP_VALUE regions.
+ */
+ memmove(dst->data + dst->offset + offset, src, len);
+ return 0;
+ case BPF_DYNPTR_TYPE_SKB:
+ return __bpf_skb_store_bytes(dst->data, dst->offset + offset, src, len,
+ flags);
+ case BPF_DYNPTR_TYPE_XDP:
+ if (flags)
+ return -EINVAL;
+ return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len);
+ case BPF_DYNPTR_TYPE_SKB_META:
+ return __bpf_skb_meta_store_bytes(dst->data, dst->offset + offset, src,
+ len, flags);
+ default:
+ WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type);
+ return -EFAULT;
+ }
+}
+
+BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u64, offset, void *, src,
+ u64, len, u64, flags)
+{
+ return __bpf_dynptr_write(dst, offset, src, len, flags);
+}
+
+static const struct bpf_func_proto bpf_dynptr_write_proto = {
+ .func = bpf_dynptr_write,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg4_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u64, offset, u64, len)
+{
+ enum bpf_dynptr_type type;
+ int err;
+
+ if (!ptr->data)
+ return 0;
+
+ err = bpf_dynptr_check_off_len(ptr, offset, len);
+ if (err)
+ return 0;
+
+ if (__bpf_dynptr_is_rdonly(ptr))
+ return 0;
+
+ type = bpf_dynptr_get_type(ptr);
+
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ return (unsigned long)(ptr->data + ptr->offset + offset);
+ case BPF_DYNPTR_TYPE_SKB:
+ case BPF_DYNPTR_TYPE_XDP:
+ case BPF_DYNPTR_TYPE_SKB_META:
+ /* skb and xdp dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */
+ return 0;
+ default:
+ WARN_ONCE(true, "bpf_dynptr_data: unknown dynptr type %d\n", type);
+ return 0;
+ }
+}
+
+static const struct bpf_func_proto bpf_dynptr_data_proto = {
+ .func = bpf_dynptr_data,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL,
+ .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_CONST_ALLOC_SIZE_OR_ZERO,
+};
+
const struct bpf_func_proto bpf_get_current_task_proto __weak;
const struct bpf_func_proto bpf_get_current_task_btf_proto __weak;
const struct bpf_func_proto bpf_probe_read_user_proto __weak;
@@ -1350,9 +1966,15 @@ const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
const struct bpf_func_proto bpf_probe_read_kernel_proto __weak;
const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak;
const struct bpf_func_proto bpf_task_pt_regs_proto __weak;
+const struct bpf_func_proto bpf_perf_event_read_proto __weak;
+const struct bpf_func_proto bpf_send_signal_proto __weak;
+const struct bpf_func_proto bpf_send_signal_thread_proto __weak;
+const struct bpf_func_proto bpf_get_task_stack_sleepable_proto __weak;
+const struct bpf_func_proto bpf_get_task_stack_proto __weak;
+const struct bpf_func_proto bpf_get_branch_snapshot_proto __weak;
const struct bpf_func_proto *
-bpf_base_func_proto(enum bpf_func_id func_id)
+bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_map_lookup_elem:
@@ -1367,6 +1989,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_map_pop_elem_proto;
case BPF_FUNC_map_peek_elem:
return &bpf_map_peek_elem_proto;
+ case BPF_FUNC_map_lookup_percpu_elem:
+ return &bpf_map_lookup_percpu_elem_proto;
case BPF_FUNC_get_prandom_u32:
return &bpf_get_prandom_u32_proto;
case BPF_FUNC_get_smp_processor_id:
@@ -1379,6 +2003,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_ktime_get_ns_proto;
case BPF_FUNC_ktime_get_boot_ns:
return &bpf_ktime_get_boot_ns_proto;
+ case BPF_FUNC_ktime_get_tai_ns:
+ return &bpf_ktime_get_tai_ns_proto;
case BPF_FUNC_ringbuf_output:
return &bpf_ringbuf_output_proto;
case BPF_FUNC_ringbuf_reserve:
@@ -1389,17 +2015,23 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_ringbuf_discard_proto;
case BPF_FUNC_ringbuf_query:
return &bpf_ringbuf_query_proto;
- case BPF_FUNC_for_each_map_elem:
- return &bpf_for_each_map_elem_proto;
- case BPF_FUNC_loop:
- return &bpf_loop_proto;
case BPF_FUNC_strncmp:
return &bpf_strncmp_proto;
+ case BPF_FUNC_strtol:
+ return &bpf_strtol_proto;
+ case BPF_FUNC_strtoul:
+ return &bpf_strtoul_proto;
+ case BPF_FUNC_get_current_pid_tgid:
+ return &bpf_get_current_pid_tgid_proto;
+ case BPF_FUNC_get_ns_current_pid_tgid:
+ return &bpf_get_ns_current_pid_tgid_proto;
+ case BPF_FUNC_get_current_uid_gid:
+ return &bpf_get_current_uid_gid_proto;
default:
break;
}
- if (!bpf_capable())
+ if (!bpf_token_capable(prog->aux->token, CAP_BPF))
return NULL;
switch (func_id) {
@@ -1421,11 +2053,57 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_timer_start_proto;
case BPF_FUNC_timer_cancel:
return &bpf_timer_cancel_proto;
+ case BPF_FUNC_kptr_xchg:
+ return &bpf_kptr_xchg_proto;
+ case BPF_FUNC_for_each_map_elem:
+ return &bpf_for_each_map_elem_proto;
+ case BPF_FUNC_loop:
+ return &bpf_loop_proto;
+ case BPF_FUNC_user_ringbuf_drain:
+ return &bpf_user_ringbuf_drain_proto;
+ case BPF_FUNC_ringbuf_reserve_dynptr:
+ return &bpf_ringbuf_reserve_dynptr_proto;
+ case BPF_FUNC_ringbuf_submit_dynptr:
+ return &bpf_ringbuf_submit_dynptr_proto;
+ case BPF_FUNC_ringbuf_discard_dynptr:
+ return &bpf_ringbuf_discard_dynptr_proto;
+ case BPF_FUNC_dynptr_from_mem:
+ return &bpf_dynptr_from_mem_proto;
+ case BPF_FUNC_dynptr_read:
+ return &bpf_dynptr_read_proto;
+ case BPF_FUNC_dynptr_write:
+ return &bpf_dynptr_write_proto;
+ case BPF_FUNC_dynptr_data:
+ return &bpf_dynptr_data_proto;
+#ifdef CONFIG_CGROUPS
+ case BPF_FUNC_cgrp_storage_get:
+ return &bpf_cgrp_storage_get_proto;
+ case BPF_FUNC_cgrp_storage_delete:
+ return &bpf_cgrp_storage_delete_proto;
+ case BPF_FUNC_get_current_cgroup_id:
+ return &bpf_get_current_cgroup_id_proto;
+ case BPF_FUNC_get_current_ancestor_cgroup_id:
+ return &bpf_get_current_ancestor_cgroup_id_proto;
+ case BPF_FUNC_current_task_under_cgroup:
+ return &bpf_current_task_under_cgroup_proto;
+#endif
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_curr_proto;
+#endif
+ case BPF_FUNC_task_storage_get:
+ if (bpf_prog_check_recur(prog))
+ return &bpf_task_storage_get_recur_proto;
+ return &bpf_task_storage_get_proto;
+ case BPF_FUNC_task_storage_delete:
+ if (bpf_prog_check_recur(prog))
+ return &bpf_task_storage_delete_recur_proto;
+ return &bpf_task_storage_delete_proto;
default:
break;
}
- if (!perfmon_capable())
+ if (!bpf_token_capable(prog->aux->token, CAP_PERFMON))
return NULL;
switch (func_id) {
@@ -1435,6 +2113,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_get_current_task_proto;
case BPF_FUNC_get_current_task_btf:
return &bpf_get_current_task_btf_proto;
+ case BPF_FUNC_get_current_comm:
+ return &bpf_get_current_comm_proto;
case BPF_FUNC_probe_read_user:
return &bpf_probe_read_user_proto;
case BPF_FUNC_probe_read_kernel:
@@ -1445,6 +2125,10 @@ bpf_base_func_proto(enum bpf_func_id func_id)
case BPF_FUNC_probe_read_kernel_str:
return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_kernel_str_proto;
+ case BPF_FUNC_copy_from_user:
+ return &bpf_copy_from_user_proto;
+ case BPF_FUNC_copy_from_user_task:
+ return &bpf_copy_from_user_task_proto;
case BPF_FUNC_snprintf_btf:
return &bpf_snprintf_btf_proto;
case BPF_FUNC_snprintf:
@@ -1453,7 +2137,2474 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_task_pt_regs_proto;
case BPF_FUNC_trace_vprintk:
return bpf_get_trace_vprintk_proto();
+ case BPF_FUNC_perf_event_read_value:
+ return bpf_get_perf_event_read_value_proto();
+ case BPF_FUNC_perf_event_read:
+ return &bpf_perf_event_read_proto;
+ case BPF_FUNC_send_signal:
+ return &bpf_send_signal_proto;
+ case BPF_FUNC_send_signal_thread:
+ return &bpf_send_signal_thread_proto;
+ case BPF_FUNC_get_task_stack:
+ return prog->sleepable ? &bpf_get_task_stack_sleepable_proto
+ : &bpf_get_task_stack_proto;
+ case BPF_FUNC_get_branch_snapshot:
+ return &bpf_get_branch_snapshot_proto;
+ case BPF_FUNC_find_vma:
+ return &bpf_find_vma_proto;
+ default:
+ return NULL;
+ }
+}
+EXPORT_SYMBOL_GPL(bpf_base_func_proto);
+
+void bpf_list_head_free(const struct btf_field *field, void *list_head,
+ struct bpf_spin_lock *spin_lock)
+{
+ struct list_head *head = list_head, *orig_head = list_head;
+
+ BUILD_BUG_ON(sizeof(struct list_head) > sizeof(struct bpf_list_head));
+ BUILD_BUG_ON(__alignof__(struct list_head) > __alignof__(struct bpf_list_head));
+
+ /* Do the actual list draining outside the lock to not hold the lock for
+ * too long, and also prevent deadlocks if tracing programs end up
+ * executing on entry/exit of functions called inside the critical
+ * section, and end up doing map ops that call bpf_list_head_free for
+ * the same map value again.
+ */
+ __bpf_spin_lock_irqsave(spin_lock);
+ if (!head->next || list_empty(head))
+ goto unlock;
+ head = head->next;
+unlock:
+ INIT_LIST_HEAD(orig_head);
+ __bpf_spin_unlock_irqrestore(spin_lock);
+
+ while (head != orig_head) {
+ void *obj = head;
+
+ obj -= field->graph_root.node_offset;
+ head = head->next;
+ /* The contained type can also have resources, including a
+ * bpf_list_head which needs to be freed.
+ */
+ __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
+ }
+}
+
+/* Like rbtree_postorder_for_each_entry_safe, but 'pos' and 'n' are
+ * 'rb_node *', so field name of rb_node within containing struct is not
+ * needed.
+ *
+ * Since bpf_rb_tree's node type has a corresponding struct btf_field with
+ * graph_root.node_offset, it's not necessary to know field name
+ * or type of node struct
+ */
+#define bpf_rbtree_postorder_for_each_entry_safe(pos, n, root) \
+ for (pos = rb_first_postorder(root); \
+ pos && ({ n = rb_next_postorder(pos); 1; }); \
+ pos = n)
+
+void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
+ struct bpf_spin_lock *spin_lock)
+{
+ struct rb_root_cached orig_root, *root = rb_root;
+ struct rb_node *pos, *n;
+ void *obj;
+
+ BUILD_BUG_ON(sizeof(struct rb_root_cached) > sizeof(struct bpf_rb_root));
+ BUILD_BUG_ON(__alignof__(struct rb_root_cached) > __alignof__(struct bpf_rb_root));
+
+ __bpf_spin_lock_irqsave(spin_lock);
+ orig_root = *root;
+ *root = RB_ROOT_CACHED;
+ __bpf_spin_unlock_irqrestore(spin_lock);
+
+ bpf_rbtree_postorder_for_each_entry_safe(pos, n, &orig_root.rb_root) {
+ obj = pos;
+ obj -= field->graph_root.node_offset;
+
+
+ __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
+ }
+}
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
+{
+ struct btf_struct_meta *meta = meta__ign;
+ u64 size = local_type_id__k;
+ void *p;
+
+ p = bpf_mem_alloc(&bpf_global_ma, size);
+ if (!p)
+ return NULL;
+ if (meta)
+ bpf_obj_init(meta->record, p);
+ return p;
+}
+
+__bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign)
+{
+ u64 size = local_type_id__k;
+
+ /* The verifier has ensured that meta__ign must be NULL */
+ return bpf_mem_alloc(&bpf_global_percpu_ma, size);
+}
+
+/* Must be called under migrate_disable(), as required by bpf_mem_free */
+void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu)
+{
+ struct bpf_mem_alloc *ma;
+
+ if (rec && rec->refcount_off >= 0 &&
+ !refcount_dec_and_test((refcount_t *)(p + rec->refcount_off))) {
+ /* Object is refcounted and refcount_dec didn't result in 0
+ * refcount. Return without freeing the object
+ */
+ return;
+ }
+
+ if (rec)
+ bpf_obj_free_fields(rec, p);
+
+ if (percpu)
+ ma = &bpf_global_percpu_ma;
+ else
+ ma = &bpf_global_ma;
+ bpf_mem_free_rcu(ma, p);
+}
+
+__bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
+{
+ struct btf_struct_meta *meta = meta__ign;
+ void *p = p__alloc;
+
+ __bpf_obj_drop_impl(p, meta ? meta->record : NULL, false);
+}
+
+__bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign)
+{
+ /* The verifier has ensured that meta__ign must be NULL */
+ bpf_mem_free_rcu(&bpf_global_percpu_ma, p__alloc);
+}
+
+__bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign)
+{
+ struct btf_struct_meta *meta = meta__ign;
+ struct bpf_refcount *ref;
+
+ /* Could just cast directly to refcount_t *, but need some code using
+ * bpf_refcount type so that it is emitted in vmlinux BTF
+ */
+ ref = (struct bpf_refcount *)(p__refcounted_kptr + meta->record->refcount_off);
+ if (!refcount_inc_not_zero((refcount_t *)ref))
+ return NULL;
+
+ /* Verifier strips KF_RET_NULL if input is owned ref, see is_kfunc_ret_null
+ * in verifier.c
+ */
+ return (void *)p__refcounted_kptr;
+}
+
+static int __bpf_list_add(struct bpf_list_node_kern *node,
+ struct bpf_list_head *head,
+ bool tail, struct btf_record *rec, u64 off)
+{
+ struct list_head *n = &node->list_head, *h = (void *)head;
+
+ /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
+ * called on its fields, so init here
+ */
+ if (unlikely(!h->next))
+ INIT_LIST_HEAD(h);
+
+ /* node->owner != NULL implies !list_empty(n), no need to separately
+ * check the latter
+ */
+ if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
+ /* Only called from BPF prog, no need to migrate_disable */
+ __bpf_obj_drop_impl((void *)n - off, rec, false);
+ return -EINVAL;
+ }
+
+ tail ? list_add_tail(n, h) : list_add(n, h);
+ WRITE_ONCE(node->owner, head);
+
+ return 0;
+}
+
+__bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head,
+ struct bpf_list_node *node,
+ void *meta__ign, u64 off)
+{
+ struct bpf_list_node_kern *n = (void *)node;
+ struct btf_struct_meta *meta = meta__ign;
+
+ return __bpf_list_add(n, head, false, meta ? meta->record : NULL, off);
+}
+
+__bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head,
+ struct bpf_list_node *node,
+ void *meta__ign, u64 off)
+{
+ struct bpf_list_node_kern *n = (void *)node;
+ struct btf_struct_meta *meta = meta__ign;
+
+ return __bpf_list_add(n, head, true, meta ? meta->record : NULL, off);
+}
+
+static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail)
+{
+ struct list_head *n, *h = (void *)head;
+ struct bpf_list_node_kern *node;
+
+ /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
+ * called on its fields, so init here
+ */
+ if (unlikely(!h->next))
+ INIT_LIST_HEAD(h);
+ if (list_empty(h))
+ return NULL;
+
+ n = tail ? h->prev : h->next;
+ node = container_of(n, struct bpf_list_node_kern, list_head);
+ if (WARN_ON_ONCE(READ_ONCE(node->owner) != head))
+ return NULL;
+
+ list_del_init(n);
+ WRITE_ONCE(node->owner, NULL);
+ return (struct bpf_list_node *)n;
+}
+
+__bpf_kfunc struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head)
+{
+ return __bpf_list_del(head, false);
+}
+
+__bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head)
+{
+ return __bpf_list_del(head, true);
+}
+
+__bpf_kfunc struct bpf_list_node *bpf_list_front(struct bpf_list_head *head)
+{
+ struct list_head *h = (struct list_head *)head;
+
+ if (list_empty(h) || unlikely(!h->next))
+ return NULL;
+
+ return (struct bpf_list_node *)h->next;
+}
+
+__bpf_kfunc struct bpf_list_node *bpf_list_back(struct bpf_list_head *head)
+{
+ struct list_head *h = (struct list_head *)head;
+
+ if (list_empty(h) || unlikely(!h->next))
+ return NULL;
+
+ return (struct bpf_list_node *)h->prev;
+}
+
+__bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
+ struct bpf_rb_node *node)
+{
+ struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node;
+ struct rb_root_cached *r = (struct rb_root_cached *)root;
+ struct rb_node *n = &node_internal->rb_node;
+
+ /* node_internal->owner != root implies either RB_EMPTY_NODE(n) or
+ * n is owned by some other tree. No need to check RB_EMPTY_NODE(n)
+ */
+ if (READ_ONCE(node_internal->owner) != root)
+ return NULL;
+
+ rb_erase_cached(n, r);
+ RB_CLEAR_NODE(n);
+ WRITE_ONCE(node_internal->owner, NULL);
+ return (struct bpf_rb_node *)n;
+}
+
+/* Need to copy rbtree_add_cached's logic here because our 'less' is a BPF
+ * program
+ */
+static int __bpf_rbtree_add(struct bpf_rb_root *root,
+ struct bpf_rb_node_kern *node,
+ void *less, struct btf_record *rec, u64 off)
+{
+ struct rb_node **link = &((struct rb_root_cached *)root)->rb_root.rb_node;
+ struct rb_node *parent = NULL, *n = &node->rb_node;
+ bpf_callback_t cb = (bpf_callback_t)less;
+ bool leftmost = true;
+
+ /* node->owner != NULL implies !RB_EMPTY_NODE(n), no need to separately
+ * check the latter
+ */
+ if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
+ /* Only called from BPF prog, no need to migrate_disable */
+ __bpf_obj_drop_impl((void *)n - off, rec, false);
+ return -EINVAL;
+ }
+
+ while (*link) {
+ parent = *link;
+ if (cb((uintptr_t)node, (uintptr_t)parent, 0, 0, 0)) {
+ link = &parent->rb_left;
+ } else {
+ link = &parent->rb_right;
+ leftmost = false;
+ }
+ }
+
+ rb_link_node(n, parent, link);
+ rb_insert_color_cached(n, (struct rb_root_cached *)root, leftmost);
+ WRITE_ONCE(node->owner, root);
+ return 0;
+}
+
+__bpf_kfunc int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
+ bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b),
+ void *meta__ign, u64 off)
+{
+ struct btf_struct_meta *meta = meta__ign;
+ struct bpf_rb_node_kern *n = (void *)node;
+
+ return __bpf_rbtree_add(root, n, (void *)less, meta ? meta->record : NULL, off);
+}
+
+__bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root)
+{
+ struct rb_root_cached *r = (struct rb_root_cached *)root;
+
+ return (struct bpf_rb_node *)rb_first_cached(r);
+}
+
+__bpf_kfunc struct bpf_rb_node *bpf_rbtree_root(struct bpf_rb_root *root)
+{
+ struct rb_root_cached *r = (struct rb_root_cached *)root;
+
+ return (struct bpf_rb_node *)r->rb_root.rb_node;
+}
+
+__bpf_kfunc struct bpf_rb_node *bpf_rbtree_left(struct bpf_rb_root *root, struct bpf_rb_node *node)
+{
+ struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node;
+
+ if (READ_ONCE(node_internal->owner) != root)
+ return NULL;
+
+ return (struct bpf_rb_node *)node_internal->rb_node.rb_left;
+}
+
+__bpf_kfunc struct bpf_rb_node *bpf_rbtree_right(struct bpf_rb_root *root, struct bpf_rb_node *node)
+{
+ struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node;
+
+ if (READ_ONCE(node_internal->owner) != root)
+ return NULL;
+
+ return (struct bpf_rb_node *)node_internal->rb_node.rb_right;
+}
+
+/**
+ * bpf_task_acquire - Acquire a reference to a task. A task acquired by this
+ * kfunc which is not stored in a map as a kptr, must be released by calling
+ * bpf_task_release().
+ * @p: The task on which a reference is being acquired.
+ */
+__bpf_kfunc struct task_struct *bpf_task_acquire(struct task_struct *p)
+{
+ if (refcount_inc_not_zero(&p->rcu_users))
+ return p;
+ return NULL;
+}
+
+/**
+ * bpf_task_release - Release the reference acquired on a task.
+ * @p: The task on which a reference is being released.
+ */
+__bpf_kfunc void bpf_task_release(struct task_struct *p)
+{
+ put_task_struct_rcu_user(p);
+}
+
+__bpf_kfunc void bpf_task_release_dtor(void *p)
+{
+ put_task_struct_rcu_user(p);
+}
+CFI_NOSEAL(bpf_task_release_dtor);
+
+#ifdef CONFIG_CGROUPS
+/**
+ * bpf_cgroup_acquire - Acquire a reference to a cgroup. A cgroup acquired by
+ * this kfunc which is not stored in a map as a kptr, must be released by
+ * calling bpf_cgroup_release().
+ * @cgrp: The cgroup on which a reference is being acquired.
+ */
+__bpf_kfunc struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp)
+{
+ return cgroup_tryget(cgrp) ? cgrp : NULL;
+}
+
+/**
+ * bpf_cgroup_release - Release the reference acquired on a cgroup.
+ * If this kfunc is invoked in an RCU read region, the cgroup is guaranteed to
+ * not be freed until the current grace period has ended, even if its refcount
+ * drops to 0.
+ * @cgrp: The cgroup on which a reference is being released.
+ */
+__bpf_kfunc void bpf_cgroup_release(struct cgroup *cgrp)
+{
+ cgroup_put(cgrp);
+}
+
+__bpf_kfunc void bpf_cgroup_release_dtor(void *cgrp)
+{
+ cgroup_put(cgrp);
+}
+CFI_NOSEAL(bpf_cgroup_release_dtor);
+
+/**
+ * bpf_cgroup_ancestor - Perform a lookup on an entry in a cgroup's ancestor
+ * array. A cgroup returned by this kfunc which is not subsequently stored in a
+ * map, must be released by calling bpf_cgroup_release().
+ * @cgrp: The cgroup for which we're performing a lookup.
+ * @level: The level of ancestor to look up.
+ */
+__bpf_kfunc struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level)
+{
+ struct cgroup *ancestor;
+
+ if (level > cgrp->level || level < 0)
+ return NULL;
+
+ /* cgrp's refcnt could be 0 here, but ancestors can still be accessed */
+ ancestor = cgrp->ancestors[level];
+ if (!cgroup_tryget(ancestor))
+ return NULL;
+ return ancestor;
+}
+
+/**
+ * bpf_cgroup_from_id - Find a cgroup from its ID. A cgroup returned by this
+ * kfunc which is not subsequently stored in a map, must be released by calling
+ * bpf_cgroup_release().
+ * @cgid: cgroup id.
+ */
+__bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid)
+{
+ struct cgroup *cgrp;
+
+ cgrp = __cgroup_get_from_id(cgid);
+ if (IS_ERR(cgrp))
+ return NULL;
+ return cgrp;
+}
+
+/**
+ * bpf_task_under_cgroup - wrap task_under_cgroup_hierarchy() as a kfunc, test
+ * task's membership of cgroup ancestry.
+ * @task: the task to be tested
+ * @ancestor: possible ancestor of @task's cgroup
+ *
+ * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor.
+ * It follows all the same rules as cgroup_is_descendant, and only applies
+ * to the default hierarchy.
+ */
+__bpf_kfunc long bpf_task_under_cgroup(struct task_struct *task,
+ struct cgroup *ancestor)
+{
+ long ret;
+
+ rcu_read_lock();
+ ret = task_under_cgroup_hierarchy(task, ancestor);
+ rcu_read_unlock();
+ return ret;
+}
+
+BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct cgroup *cgrp;
+
+ if (unlikely(idx >= array->map.max_entries))
+ return -E2BIG;
+
+ cgrp = READ_ONCE(array->ptrs[idx]);
+ if (unlikely(!cgrp))
+ return -EAGAIN;
+
+ return task_under_cgroup_hierarchy(current, cgrp);
+}
+
+const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
+ .func = bpf_current_task_under_cgroup,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+};
+
+/**
+ * bpf_task_get_cgroup1 - Acquires the associated cgroup of a task within a
+ * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its
+ * hierarchy ID.
+ * @task: The target task
+ * @hierarchy_id: The ID of a cgroup1 hierarchy
+ *
+ * On success, the cgroup is returen. On failure, NULL is returned.
+ */
+__bpf_kfunc struct cgroup *
+bpf_task_get_cgroup1(struct task_struct *task, int hierarchy_id)
+{
+ struct cgroup *cgrp = task_get_cgroup1(task, hierarchy_id);
+
+ if (IS_ERR(cgrp))
+ return NULL;
+ return cgrp;
+}
+#endif /* CONFIG_CGROUPS */
+
+/**
+ * bpf_task_from_pid - Find a struct task_struct from its pid by looking it up
+ * in the root pid namespace idr. If a task is returned, it must either be
+ * stored in a map, or released with bpf_task_release().
+ * @pid: The pid of the task being looked up.
+ */
+__bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
+{
+ struct task_struct *p;
+
+ rcu_read_lock();
+ p = find_task_by_pid_ns(pid, &init_pid_ns);
+ if (p)
+ p = bpf_task_acquire(p);
+ rcu_read_unlock();
+
+ return p;
+}
+
+/**
+ * bpf_task_from_vpid - Find a struct task_struct from its vpid by looking it up
+ * in the pid namespace of the current task. If a task is returned, it must
+ * either be stored in a map, or released with bpf_task_release().
+ * @vpid: The vpid of the task being looked up.
+ */
+__bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid)
+{
+ struct task_struct *p;
+
+ rcu_read_lock();
+ p = find_task_by_vpid(vpid);
+ if (p)
+ p = bpf_task_acquire(p);
+ rcu_read_unlock();
+
+ return p;
+}
+
+/**
+ * bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data.
+ * @p: The dynptr whose data slice to retrieve
+ * @offset: Offset into the dynptr
+ * @buffer__opt: User-provided buffer to copy contents into. May be NULL
+ * @buffer__szk: Size (in bytes) of the buffer if present. This is the
+ * length of the requested slice. This must be a constant.
+ *
+ * For non-skb and non-xdp type dynptrs, there is no difference between
+ * bpf_dynptr_slice and bpf_dynptr_data.
+ *
+ * If buffer__opt is NULL, the call will fail if buffer_opt was needed.
+ *
+ * If the intention is to write to the data slice, please use
+ * bpf_dynptr_slice_rdwr.
+ *
+ * The user must check that the returned pointer is not null before using it.
+ *
+ * Please note that in the case of skb and xdp dynptrs, bpf_dynptr_slice
+ * does not change the underlying packet data pointers, so a call to
+ * bpf_dynptr_slice will not invalidate any ctx->data/data_end pointers in
+ * the bpf program.
+ *
+ * Return: NULL if the call failed (eg invalid dynptr), pointer to a read-only
+ * data slice (can be either direct pointer to the data or a pointer to the user
+ * provided buffer, with its contents containing the data, if unable to obtain
+ * direct pointer)
+ */
+__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset,
+ void *buffer__opt, u64 buffer__szk)
+{
+ const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+ enum bpf_dynptr_type type;
+ u64 len = buffer__szk;
+ int err;
+
+ if (!ptr->data)
+ return NULL;
+
+ err = bpf_dynptr_check_off_len(ptr, offset, len);
+ if (err)
+ return NULL;
+
+ type = bpf_dynptr_get_type(ptr);
+
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ return ptr->data + ptr->offset + offset;
+ case BPF_DYNPTR_TYPE_SKB:
+ if (buffer__opt)
+ return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer__opt);
+ else
+ return skb_pointer_if_linear(ptr->data, ptr->offset + offset, len);
+ case BPF_DYNPTR_TYPE_XDP:
+ {
+ void *xdp_ptr = bpf_xdp_pointer(ptr->data, ptr->offset + offset, len);
+ if (!IS_ERR_OR_NULL(xdp_ptr))
+ return xdp_ptr;
+
+ if (!buffer__opt)
+ return NULL;
+ bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__opt, len, false);
+ return buffer__opt;
+ }
+ case BPF_DYNPTR_TYPE_SKB_META:
+ return bpf_skb_meta_pointer(ptr->data, ptr->offset + offset);
+ case BPF_DYNPTR_TYPE_FILE:
+ err = bpf_file_fetch_bytes(ptr->data, offset, buffer__opt, buffer__szk);
+ return err ? NULL : buffer__opt;
default:
+ WARN_ONCE(true, "unknown dynptr type %d\n", type);
+ return NULL;
+ }
+}
+
+/**
+ * bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data.
+ * @p: The dynptr whose data slice to retrieve
+ * @offset: Offset into the dynptr
+ * @buffer__opt: User-provided buffer to copy contents into. May be NULL
+ * @buffer__szk: Size (in bytes) of the buffer if present. This is the
+ * length of the requested slice. This must be a constant.
+ *
+ * For non-skb and non-xdp type dynptrs, there is no difference between
+ * bpf_dynptr_slice and bpf_dynptr_data.
+ *
+ * If buffer__opt is NULL, the call will fail if buffer_opt was needed.
+ *
+ * The returned pointer is writable and may point to either directly the dynptr
+ * data at the requested offset or to the buffer if unable to obtain a direct
+ * data pointer to (example: the requested slice is to the paged area of an skb
+ * packet). In the case where the returned pointer is to the buffer, the user
+ * is responsible for persisting writes through calling bpf_dynptr_write(). This
+ * usually looks something like this pattern:
+ *
+ * struct eth_hdr *eth = bpf_dynptr_slice_rdwr(&dynptr, 0, buffer, sizeof(buffer));
+ * if (!eth)
+ * return TC_ACT_SHOT;
+ *
+ * // mutate eth header //
+ *
+ * if (eth == buffer)
+ * bpf_dynptr_write(&ptr, 0, buffer, sizeof(buffer), 0);
+ *
+ * Please note that, as in the example above, the user must check that the
+ * returned pointer is not null before using it.
+ *
+ * Please also note that in the case of skb and xdp dynptrs, bpf_dynptr_slice_rdwr
+ * does not change the underlying packet data pointers, so a call to
+ * bpf_dynptr_slice_rdwr will not invalidate any ctx->data/data_end pointers in
+ * the bpf program.
+ *
+ * Return: NULL if the call failed (eg invalid dynptr), pointer to a
+ * data slice (can be either direct pointer to the data or a pointer to the user
+ * provided buffer, with its contents containing the data, if unable to obtain
+ * direct pointer)
+ */
+__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset,
+ void *buffer__opt, u64 buffer__szk)
+{
+ const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+
+ if (!ptr->data || __bpf_dynptr_is_rdonly(ptr))
+ return NULL;
+
+ /* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice.
+ *
+ * For skb-type dynptrs, it is safe to write into the returned pointer
+ * if the bpf program allows skb data writes. There are two possibilities
+ * that may occur when calling bpf_dynptr_slice_rdwr:
+ *
+ * 1) The requested slice is in the head of the skb. In this case, the
+ * returned pointer is directly to skb data, and if the skb is cloned, the
+ * verifier will have uncloned it (see bpf_unclone_prologue()) already.
+ * The pointer can be directly written into.
+ *
+ * 2) Some portion of the requested slice is in the paged buffer area.
+ * In this case, the requested data will be copied out into the buffer
+ * and the returned pointer will be a pointer to the buffer. The skb
+ * will not be pulled. To persist the write, the user will need to call
+ * bpf_dynptr_write(), which will pull the skb and commit the write.
+ *
+ * Similarly for xdp programs, if the requested slice is not across xdp
+ * fragments, then a direct pointer will be returned, otherwise the data
+ * will be copied out into the buffer and the user will need to call
+ * bpf_dynptr_write() to commit changes.
+ */
+ return bpf_dynptr_slice(p, offset, buffer__opt, buffer__szk);
+}
+
+__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u64 start, u64 end)
+{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+ u64 size;
+
+ if (!ptr->data || start > end)
+ return -EINVAL;
+
+ size = __bpf_dynptr_size(ptr);
+
+ if (start > size || end > size)
+ return -ERANGE;
+
+ bpf_dynptr_advance_offset(ptr, start);
+ bpf_dynptr_set_size(ptr, end - start);
+
+ return 0;
+}
+
+__bpf_kfunc bool bpf_dynptr_is_null(const struct bpf_dynptr *p)
+{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+
+ return !ptr->data;
+}
+
+__bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p)
+{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+
+ if (!ptr->data)
+ return false;
+
+ return __bpf_dynptr_is_rdonly(ptr);
+}
+
+__bpf_kfunc u64 bpf_dynptr_size(const struct bpf_dynptr *p)
+{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+
+ if (!ptr->data)
+ return -EINVAL;
+
+ return __bpf_dynptr_size(ptr);
+}
+
+__bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p,
+ struct bpf_dynptr *clone__uninit)
+{
+ struct bpf_dynptr_kern *clone = (struct bpf_dynptr_kern *)clone__uninit;
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+
+ if (!ptr->data) {
+ bpf_dynptr_set_null(clone);
+ return -EINVAL;
+ }
+
+ *clone = *ptr;
+
+ return 0;
+}
+
+/**
+ * bpf_dynptr_copy() - Copy data from one dynptr to another.
+ * @dst_ptr: Destination dynptr - where data should be copied to
+ * @dst_off: Offset into the destination dynptr
+ * @src_ptr: Source dynptr - where data should be copied from
+ * @src_off: Offset into the source dynptr
+ * @size: Length of the data to copy from source to destination
+ *
+ * Copies data from source dynptr to destination dynptr.
+ * Returns 0 on success; negative error, otherwise.
+ */
+__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u64 dst_off,
+ struct bpf_dynptr *src_ptr, u64 src_off, u64 size)
+{
+ struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr;
+ struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr;
+ void *src_slice, *dst_slice;
+ char buf[256];
+ u64 off;
+
+ src_slice = bpf_dynptr_slice(src_ptr, src_off, NULL, size);
+ dst_slice = bpf_dynptr_slice_rdwr(dst_ptr, dst_off, NULL, size);
+
+ if (src_slice && dst_slice) {
+ memmove(dst_slice, src_slice, size);
+ return 0;
+ }
+
+ if (src_slice)
+ return __bpf_dynptr_write(dst, dst_off, src_slice, size, 0);
+
+ if (dst_slice)
+ return __bpf_dynptr_read(dst_slice, size, src, src_off, 0);
+
+ if (bpf_dynptr_check_off_len(dst, dst_off, size) ||
+ bpf_dynptr_check_off_len(src, src_off, size))
+ return -E2BIG;
+
+ off = 0;
+ while (off < size) {
+ u64 chunk_sz = min_t(u64, sizeof(buf), size - off);
+ int err;
+
+ err = __bpf_dynptr_read(buf, chunk_sz, src, src_off + off, 0);
+ if (err)
+ return err;
+ err = __bpf_dynptr_write(dst, dst_off + off, buf, chunk_sz, 0);
+ if (err)
+ return err;
+
+ off += chunk_sz;
+ }
+ return 0;
+}
+
+/**
+ * bpf_dynptr_memset() - Fill dynptr memory with a constant byte.
+ * @p: Destination dynptr - where data will be filled
+ * @offset: Offset into the dynptr to start filling from
+ * @size: Number of bytes to fill
+ * @val: Constant byte to fill the memory with
+ *
+ * Fills the @size bytes of the memory area pointed to by @p
+ * at @offset with the constant byte @val.
+ * Returns 0 on success; negative error, otherwise.
+ */
+__bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u64 offset, u64 size, u8 val)
+{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+ u64 chunk_sz, write_off;
+ char buf[256];
+ void* slice;
+ int err;
+
+ slice = bpf_dynptr_slice_rdwr(p, offset, NULL, size);
+ if (likely(slice)) {
+ memset(slice, val, size);
+ return 0;
+ }
+
+ if (__bpf_dynptr_is_rdonly(ptr))
+ return -EINVAL;
+
+ err = bpf_dynptr_check_off_len(ptr, offset, size);
+ if (err)
+ return err;
+
+ /* Non-linear data under the dynptr, write from a local buffer */
+ chunk_sz = min_t(u64, sizeof(buf), size);
+ memset(buf, val, chunk_sz);
+
+ for (write_off = 0; write_off < size; write_off += chunk_sz) {
+ chunk_sz = min_t(u64, sizeof(buf), size - write_off);
+ err = __bpf_dynptr_write(ptr, offset + write_off, buf, chunk_sz, 0);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+__bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj)
+{
+ return obj;
+}
+
+__bpf_kfunc void *bpf_rdonly_cast(const void *obj__ign, u32 btf_id__k)
+{
+ return (void *)obj__ign;
+}
+
+__bpf_kfunc void bpf_rcu_read_lock(void)
+{
+ rcu_read_lock();
+}
+
+__bpf_kfunc void bpf_rcu_read_unlock(void)
+{
+ rcu_read_unlock();
+}
+
+struct bpf_throw_ctx {
+ struct bpf_prog_aux *aux;
+ u64 sp;
+ u64 bp;
+ int cnt;
+};
+
+static bool bpf_stack_walker(void *cookie, u64 ip, u64 sp, u64 bp)
+{
+ struct bpf_throw_ctx *ctx = cookie;
+ struct bpf_prog *prog;
+
+ /*
+ * The RCU read lock is held to safely traverse the latch tree, but we
+ * don't need its protection when accessing the prog, since it has an
+ * active stack frame on the current stack trace, and won't disappear.
+ */
+ rcu_read_lock();
+ prog = bpf_prog_ksym_find(ip);
+ rcu_read_unlock();
+ if (!prog)
+ return !ctx->cnt;
+ ctx->cnt++;
+ if (bpf_is_subprog(prog))
+ return true;
+ ctx->aux = prog->aux;
+ ctx->sp = sp;
+ ctx->bp = bp;
+ return false;
+}
+
+__bpf_kfunc void bpf_throw(u64 cookie)
+{
+ struct bpf_throw_ctx ctx = {};
+
+ arch_bpf_stack_walk(bpf_stack_walker, &ctx);
+ WARN_ON_ONCE(!ctx.aux);
+ if (ctx.aux)
+ WARN_ON_ONCE(!ctx.aux->exception_boundary);
+ WARN_ON_ONCE(!ctx.bp);
+ WARN_ON_ONCE(!ctx.cnt);
+ /* Prevent KASAN false positives for CONFIG_KASAN_STACK by unpoisoning
+ * deeper stack depths than ctx.sp as we do not return from bpf_throw,
+ * which skips compiler generated instrumentation to do the same.
+ */
+ kasan_unpoison_task_stack_below((void *)(long)ctx.sp);
+ ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp, 0, 0);
+ WARN(1, "A call to BPF exception callback should never return\n");
+}
+
+__bpf_kfunc int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags)
+{
+ struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
+ struct bpf_map *map = p__map;
+
+ BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_wq));
+ BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_wq));
+
+ if (flags)
+ return -EINVAL;
+
+ return __bpf_async_init(async, map, flags, BPF_ASYNC_TYPE_WQ);
+}
+
+__bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags)
+{
+ struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
+ struct bpf_work *w;
+
+ if (in_nmi())
+ return -EOPNOTSUPP;
+ if (flags)
+ return -EINVAL;
+ w = READ_ONCE(async->work);
+ if (!w || !READ_ONCE(w->cb.prog))
+ return -EINVAL;
+
+ schedule_work(&w->work);
+ return 0;
+}
+
+__bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq,
+ int (callback_fn)(void *map, int *key, void *value),
+ unsigned int flags,
+ void *aux__prog)
+{
+ struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__prog;
+ struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
+
+ if (flags)
+ return -EINVAL;
+
+ return __bpf_async_set_callback(async, callback_fn, aux, flags, BPF_ASYNC_TYPE_WQ);
+}
+
+__bpf_kfunc void bpf_preempt_disable(void)
+{
+ preempt_disable();
+}
+
+__bpf_kfunc void bpf_preempt_enable(void)
+{
+ preempt_enable();
+}
+
+struct bpf_iter_bits {
+ __u64 __opaque[2];
+} __aligned(8);
+
+#define BITS_ITER_NR_WORDS_MAX 511
+
+struct bpf_iter_bits_kern {
+ union {
+ __u64 *bits;
+ __u64 bits_copy;
+ };
+ int nr_bits;
+ int bit;
+} __aligned(8);
+
+/* On 64-bit hosts, unsigned long and u64 have the same size, so passing
+ * a u64 pointer and an unsigned long pointer to find_next_bit() will
+ * return the same result, as both point to the same 8-byte area.
+ *
+ * For 32-bit little-endian hosts, using a u64 pointer or unsigned long
+ * pointer also makes no difference. This is because the first iterated
+ * unsigned long is composed of bits 0-31 of the u64 and the second unsigned
+ * long is composed of bits 32-63 of the u64.
+ *
+ * However, for 32-bit big-endian hosts, this is not the case. The first
+ * iterated unsigned long will be bits 32-63 of the u64, so swap these two
+ * ulong values within the u64.
+ */
+static void swap_ulong_in_u64(u64 *bits, unsigned int nr)
+{
+#if (BITS_PER_LONG == 32) && defined(__BIG_ENDIAN)
+ unsigned int i;
+
+ for (i = 0; i < nr; i++)
+ bits[i] = (bits[i] >> 32) | ((u64)(u32)bits[i] << 32);
+#endif
+}
+
+/**
+ * bpf_iter_bits_new() - Initialize a new bits iterator for a given memory area
+ * @it: The new bpf_iter_bits to be created
+ * @unsafe_ptr__ign: A pointer pointing to a memory area to be iterated over
+ * @nr_words: The size of the specified memory area, measured in 8-byte units.
+ * The maximum value of @nr_words is @BITS_ITER_NR_WORDS_MAX. This limit may be
+ * further reduced by the BPF memory allocator implementation.
+ *
+ * This function initializes a new bpf_iter_bits structure for iterating over
+ * a memory area which is specified by the @unsafe_ptr__ign and @nr_words. It
+ * copies the data of the memory area to the newly created bpf_iter_bits @it for
+ * subsequent iteration operations.
+ *
+ * On success, 0 is returned. On failure, ERR is returned.
+ */
+__bpf_kfunc int
+bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_words)
+{
+ struct bpf_iter_bits_kern *kit = (void *)it;
+ u32 nr_bytes = nr_words * sizeof(u64);
+ u32 nr_bits = BYTES_TO_BITS(nr_bytes);
+ int err;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_bits_kern) != sizeof(struct bpf_iter_bits));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_bits_kern) !=
+ __alignof__(struct bpf_iter_bits));
+
+ kit->nr_bits = 0;
+ kit->bits_copy = 0;
+ kit->bit = -1;
+
+ if (!unsafe_ptr__ign || !nr_words)
+ return -EINVAL;
+ if (nr_words > BITS_ITER_NR_WORDS_MAX)
+ return -E2BIG;
+
+ /* Optimization for u64 mask */
+ if (nr_bits == 64) {
+ err = bpf_probe_read_kernel_common(&kit->bits_copy, nr_bytes, unsafe_ptr__ign);
+ if (err)
+ return -EFAULT;
+
+ swap_ulong_in_u64(&kit->bits_copy, nr_words);
+
+ kit->nr_bits = nr_bits;
+ return 0;
+ }
+
+ if (bpf_mem_alloc_check_size(false, nr_bytes))
+ return -E2BIG;
+
+ /* Fallback to memalloc */
+ kit->bits = bpf_mem_alloc(&bpf_global_ma, nr_bytes);
+ if (!kit->bits)
+ return -ENOMEM;
+
+ err = bpf_probe_read_kernel_common(kit->bits, nr_bytes, unsafe_ptr__ign);
+ if (err) {
+ bpf_mem_free(&bpf_global_ma, kit->bits);
+ return err;
+ }
+
+ swap_ulong_in_u64(kit->bits, nr_words);
+
+ kit->nr_bits = nr_bits;
+ return 0;
+}
+
+/**
+ * bpf_iter_bits_next() - Get the next bit in a bpf_iter_bits
+ * @it: The bpf_iter_bits to be checked
+ *
+ * This function returns a pointer to a number representing the value of the
+ * next bit in the bits.
+ *
+ * If there are no further bits available, it returns NULL.
+ */
+__bpf_kfunc int *bpf_iter_bits_next(struct bpf_iter_bits *it)
+{
+ struct bpf_iter_bits_kern *kit = (void *)it;
+ int bit = kit->bit, nr_bits = kit->nr_bits;
+ const void *bits;
+
+ if (!nr_bits || bit >= nr_bits)
+ return NULL;
+
+ bits = nr_bits == 64 ? &kit->bits_copy : kit->bits;
+ bit = find_next_bit(bits, nr_bits, bit + 1);
+ if (bit >= nr_bits) {
+ kit->bit = bit;
+ return NULL;
+ }
+
+ kit->bit = bit;
+ return &kit->bit;
+}
+
+/**
+ * bpf_iter_bits_destroy() - Destroy a bpf_iter_bits
+ * @it: The bpf_iter_bits to be destroyed
+ *
+ * Destroy the resource associated with the bpf_iter_bits.
+ */
+__bpf_kfunc void bpf_iter_bits_destroy(struct bpf_iter_bits *it)
+{
+ struct bpf_iter_bits_kern *kit = (void *)it;
+
+ if (kit->nr_bits <= 64)
+ return;
+ bpf_mem_free(&bpf_global_ma, kit->bits);
+}
+
+/**
+ * bpf_copy_from_user_str() - Copy a string from an unsafe user address
+ * @dst: Destination address, in kernel space. This buffer must be
+ * at least @dst__sz bytes long.
+ * @dst__sz: Maximum number of bytes to copy, includes the trailing NUL.
+ * @unsafe_ptr__ign: Source address, in user space.
+ * @flags: The only supported flag is BPF_F_PAD_ZEROS
+ *
+ * Copies a NUL-terminated string from userspace to BPF space. If user string is
+ * too long this will still ensure zero termination in the dst buffer unless
+ * buffer size is 0.
+ *
+ * If BPF_F_PAD_ZEROS flag is set, memset the tail of @dst to 0 on success and
+ * memset all of @dst on failure.
+ */
+__bpf_kfunc int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void __user *unsafe_ptr__ign, u64 flags)
+{
+ int ret;
+
+ if (unlikely(flags & ~BPF_F_PAD_ZEROS))
+ return -EINVAL;
+
+ if (unlikely(!dst__sz))
+ return 0;
+
+ ret = strncpy_from_user(dst, unsafe_ptr__ign, dst__sz - 1);
+ if (ret < 0) {
+ if (flags & BPF_F_PAD_ZEROS)
+ memset((char *)dst, 0, dst__sz);
+
+ return ret;
+ }
+
+ if (flags & BPF_F_PAD_ZEROS)
+ memset((char *)dst + ret, 0, dst__sz - ret);
+ else
+ ((char *)dst)[ret] = '\0';
+
+ return ret + 1;
+}
+
+/**
+ * bpf_copy_from_user_task_str() - Copy a string from an task's address space
+ * @dst: Destination address, in kernel space. This buffer must be
+ * at least @dst__sz bytes long.
+ * @dst__sz: Maximum number of bytes to copy, includes the trailing NUL.
+ * @unsafe_ptr__ign: Source address in the task's address space.
+ * @tsk: The task whose address space will be used
+ * @flags: The only supported flag is BPF_F_PAD_ZEROS
+ *
+ * Copies a NUL terminated string from a task's address space to @dst__sz
+ * buffer. If user string is too long this will still ensure zero termination
+ * in the @dst__sz buffer unless buffer size is 0.
+ *
+ * If BPF_F_PAD_ZEROS flag is set, memset the tail of @dst__sz to 0 on success
+ * and memset all of @dst__sz on failure.
+ *
+ * Return: The number of copied bytes on success including the NUL terminator.
+ * A negative error code on failure.
+ */
+__bpf_kfunc int bpf_copy_from_user_task_str(void *dst, u32 dst__sz,
+ const void __user *unsafe_ptr__ign,
+ struct task_struct *tsk, u64 flags)
+{
+ int ret;
+
+ if (unlikely(flags & ~BPF_F_PAD_ZEROS))
+ return -EINVAL;
+
+ if (unlikely(dst__sz == 0))
+ return 0;
+
+ ret = copy_remote_vm_str(tsk, (unsigned long)unsafe_ptr__ign, dst, dst__sz, 0);
+ if (ret < 0) {
+ if (flags & BPF_F_PAD_ZEROS)
+ memset(dst, 0, dst__sz);
+ return ret;
+ }
+
+ if (flags & BPF_F_PAD_ZEROS)
+ memset(dst + ret, 0, dst__sz - ret);
+
+ return ret + 1;
+}
+
+/* Keep unsinged long in prototype so that kfunc is usable when emitted to
+ * vmlinux.h in BPF programs directly, but note that while in BPF prog, the
+ * unsigned long always points to 8-byte region on stack, the kernel may only
+ * read and write the 4-bytes on 32-bit.
+ */
+__bpf_kfunc void bpf_local_irq_save(unsigned long *flags__irq_flag)
+{
+ local_irq_save(*flags__irq_flag);
+}
+
+__bpf_kfunc void bpf_local_irq_restore(unsigned long *flags__irq_flag)
+{
+ local_irq_restore(*flags__irq_flag);
+}
+
+__bpf_kfunc void __bpf_trap(void)
+{
+}
+
+/*
+ * Kfuncs for string operations.
+ *
+ * Since strings are not necessarily %NUL-terminated, we cannot directly call
+ * in-kernel implementations. Instead, we open-code the implementations using
+ * __get_kernel_nofault instead of plain dereference to make them safe.
+ */
+
+static int __bpf_strcasecmp(const char *s1, const char *s2, bool ignore_case)
+{
+ char c1, c2;
+ int i;
+
+ if (!copy_from_kernel_nofault_allowed(s1, 1) ||
+ !copy_from_kernel_nofault_allowed(s2, 1)) {
+ return -ERANGE;
+ }
+
+ guard(pagefault)();
+ for (i = 0; i < XATTR_SIZE_MAX; i++) {
+ __get_kernel_nofault(&c1, s1, char, err_out);
+ __get_kernel_nofault(&c2, s2, char, err_out);
+ if (ignore_case) {
+ c1 = tolower(c1);
+ c2 = tolower(c2);
+ }
+ if (c1 != c2)
+ return c1 < c2 ? -1 : 1;
+ if (c1 == '\0')
+ return 0;
+ s1++;
+ s2++;
+ }
+ return -E2BIG;
+err_out:
+ return -EFAULT;
+}
+
+/**
+ * bpf_strcmp - Compare two strings
+ * @s1__ign: One string
+ * @s2__ign: Another string
+ *
+ * Return:
+ * * %0 - Strings are equal
+ * * %-1 - @s1__ign is smaller
+ * * %1 - @s2__ign is smaller
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of strings is too large
+ * * %-ERANGE - One of strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign)
+{
+ return __bpf_strcasecmp(s1__ign, s2__ign, false);
+}
+
+/**
+ * bpf_strcasecmp - Compare two strings, ignoring the case of the characters
+ * @s1__ign: One string
+ * @s2__ign: Another string
+ *
+ * Return:
+ * * %0 - Strings are equal
+ * * %-1 - @s1__ign is smaller
+ * * %1 - @s2__ign is smaller
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of strings is too large
+ * * %-ERANGE - One of strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strcasecmp(const char *s1__ign, const char *s2__ign)
+{
+ return __bpf_strcasecmp(s1__ign, s2__ign, true);
+}
+
+/**
+ * bpf_strnchr - Find a character in a length limited string
+ * @s__ign: The string to be searched
+ * @count: The number of characters to be searched
+ * @c: The character to search for
+ *
+ * Note that the %NUL-terminator is considered part of the string, and can
+ * be searched for.
+ *
+ * Return:
+ * * >=0 - Index of the first occurrence of @c within @s__ign
+ * * %-ENOENT - @c not found in the first @count characters of @s__ign
+ * * %-EFAULT - Cannot read @s__ign
+ * * %-E2BIG - @s__ign is too large
+ * * %-ERANGE - @s__ign is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strnchr(const char *s__ign, size_t count, char c)
+{
+ char sc;
+ int i;
+
+ if (!copy_from_kernel_nofault_allowed(s__ign, 1))
+ return -ERANGE;
+
+ guard(pagefault)();
+ for (i = 0; i < count && i < XATTR_SIZE_MAX; i++) {
+ __get_kernel_nofault(&sc, s__ign, char, err_out);
+ if (sc == c)
+ return i;
+ if (sc == '\0')
+ return -ENOENT;
+ s__ign++;
+ }
+ return i == XATTR_SIZE_MAX ? -E2BIG : -ENOENT;
+err_out:
+ return -EFAULT;
+}
+
+/**
+ * bpf_strchr - Find the first occurrence of a character in a string
+ * @s__ign: The string to be searched
+ * @c: The character to search for
+ *
+ * Note that the %NUL-terminator is considered part of the string, and can
+ * be searched for.
+ *
+ * Return:
+ * * >=0 - The index of the first occurrence of @c within @s__ign
+ * * %-ENOENT - @c not found in @s__ign
+ * * %-EFAULT - Cannot read @s__ign
+ * * %-E2BIG - @s__ign is too large
+ * * %-ERANGE - @s__ign is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strchr(const char *s__ign, char c)
+{
+ return bpf_strnchr(s__ign, XATTR_SIZE_MAX, c);
+}
+
+/**
+ * bpf_strchrnul - Find and return a character in a string, or end of string
+ * @s__ign: The string to be searched
+ * @c: The character to search for
+ *
+ * Return:
+ * * >=0 - Index of the first occurrence of @c within @s__ign or index of
+ * the null byte at the end of @s__ign when @c is not found
+ * * %-EFAULT - Cannot read @s__ign
+ * * %-E2BIG - @s__ign is too large
+ * * %-ERANGE - @s__ign is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strchrnul(const char *s__ign, char c)
+{
+ char sc;
+ int i;
+
+ if (!copy_from_kernel_nofault_allowed(s__ign, 1))
+ return -ERANGE;
+
+ guard(pagefault)();
+ for (i = 0; i < XATTR_SIZE_MAX; i++) {
+ __get_kernel_nofault(&sc, s__ign, char, err_out);
+ if (sc == '\0' || sc == c)
+ return i;
+ s__ign++;
+ }
+ return -E2BIG;
+err_out:
+ return -EFAULT;
+}
+
+/**
+ * bpf_strrchr - Find the last occurrence of a character in a string
+ * @s__ign: The string to be searched
+ * @c: The character to search for
+ *
+ * Return:
+ * * >=0 - Index of the last occurrence of @c within @s__ign
+ * * %-ENOENT - @c not found in @s__ign
+ * * %-EFAULT - Cannot read @s__ign
+ * * %-E2BIG - @s__ign is too large
+ * * %-ERANGE - @s__ign is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strrchr(const char *s__ign, int c)
+{
+ char sc;
+ int i, last = -ENOENT;
+
+ if (!copy_from_kernel_nofault_allowed(s__ign, 1))
+ return -ERANGE;
+
+ guard(pagefault)();
+ for (i = 0; i < XATTR_SIZE_MAX; i++) {
+ __get_kernel_nofault(&sc, s__ign, char, err_out);
+ if (sc == c)
+ last = i;
+ if (sc == '\0')
+ return last;
+ s__ign++;
+ }
+ return -E2BIG;
+err_out:
+ return -EFAULT;
+}
+
+/**
+ * bpf_strnlen - Calculate the length of a length-limited string
+ * @s__ign: The string
+ * @count: The maximum number of characters to count
+ *
+ * Return:
+ * * >=0 - The length of @s__ign
+ * * %-EFAULT - Cannot read @s__ign
+ * * %-E2BIG - @s__ign is too large
+ * * %-ERANGE - @s__ign is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strnlen(const char *s__ign, size_t count)
+{
+ char c;
+ int i;
+
+ if (!copy_from_kernel_nofault_allowed(s__ign, 1))
+ return -ERANGE;
+
+ guard(pagefault)();
+ for (i = 0; i < count && i < XATTR_SIZE_MAX; i++) {
+ __get_kernel_nofault(&c, s__ign, char, err_out);
+ if (c == '\0')
+ return i;
+ s__ign++;
+ }
+ return i == XATTR_SIZE_MAX ? -E2BIG : i;
+err_out:
+ return -EFAULT;
+}
+
+/**
+ * bpf_strlen - Calculate the length of a string
+ * @s__ign: The string
+ *
+ * Return:
+ * * >=0 - The length of @s__ign
+ * * %-EFAULT - Cannot read @s__ign
+ * * %-E2BIG - @s__ign is too large
+ * * %-ERANGE - @s__ign is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strlen(const char *s__ign)
+{
+ return bpf_strnlen(s__ign, XATTR_SIZE_MAX);
+}
+
+/**
+ * bpf_strspn - Calculate the length of the initial substring of @s__ign which
+ * only contains letters in @accept__ign
+ * @s__ign: The string to be searched
+ * @accept__ign: The string to search for
+ *
+ * Return:
+ * * >=0 - The length of the initial substring of @s__ign which only
+ * contains letters from @accept__ign
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of the strings is too large
+ * * %-ERANGE - One of the strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strspn(const char *s__ign, const char *accept__ign)
+{
+ char cs, ca;
+ int i, j;
+
+ if (!copy_from_kernel_nofault_allowed(s__ign, 1) ||
+ !copy_from_kernel_nofault_allowed(accept__ign, 1)) {
+ return -ERANGE;
+ }
+
+ guard(pagefault)();
+ for (i = 0; i < XATTR_SIZE_MAX; i++) {
+ __get_kernel_nofault(&cs, s__ign, char, err_out);
+ if (cs == '\0')
+ return i;
+ for (j = 0; j < XATTR_SIZE_MAX; j++) {
+ __get_kernel_nofault(&ca, accept__ign + j, char, err_out);
+ if (cs == ca || ca == '\0')
+ break;
+ }
+ if (j == XATTR_SIZE_MAX)
+ return -E2BIG;
+ if (ca == '\0')
+ return i;
+ s__ign++;
+ }
+ return -E2BIG;
+err_out:
+ return -EFAULT;
+}
+
+/**
+ * bpf_strcspn - Calculate the length of the initial substring of @s__ign which
+ * does not contain letters in @reject__ign
+ * @s__ign: The string to be searched
+ * @reject__ign: The string to search for
+ *
+ * Return:
+ * * >=0 - The length of the initial substring of @s__ign which does not
+ * contain letters from @reject__ign
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of the strings is too large
+ * * %-ERANGE - One of the strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strcspn(const char *s__ign, const char *reject__ign)
+{
+ char cs, cr;
+ int i, j;
+
+ if (!copy_from_kernel_nofault_allowed(s__ign, 1) ||
+ !copy_from_kernel_nofault_allowed(reject__ign, 1)) {
+ return -ERANGE;
+ }
+
+ guard(pagefault)();
+ for (i = 0; i < XATTR_SIZE_MAX; i++) {
+ __get_kernel_nofault(&cs, s__ign, char, err_out);
+ if (cs == '\0')
+ return i;
+ for (j = 0; j < XATTR_SIZE_MAX; j++) {
+ __get_kernel_nofault(&cr, reject__ign + j, char, err_out);
+ if (cs == cr || cr == '\0')
+ break;
+ }
+ if (j == XATTR_SIZE_MAX)
+ return -E2BIG;
+ if (cr != '\0')
+ return i;
+ s__ign++;
+ }
+ return -E2BIG;
+err_out:
+ return -EFAULT;
+}
+
+static int __bpf_strnstr(const char *s1, const char *s2, size_t len,
+ bool ignore_case)
+{
+ char c1, c2;
+ int i, j;
+
+ if (!copy_from_kernel_nofault_allowed(s1, 1) ||
+ !copy_from_kernel_nofault_allowed(s2, 1)) {
+ return -ERANGE;
+ }
+
+ guard(pagefault)();
+ for (i = 0; i < XATTR_SIZE_MAX; i++) {
+ for (j = 0; i + j <= len && j < XATTR_SIZE_MAX; j++) {
+ __get_kernel_nofault(&c2, s2 + j, char, err_out);
+ if (c2 == '\0')
+ return i;
+ /*
+ * We allow reading an extra byte from s2 (note the
+ * `i + j <= len` above) to cover the case when s2 is
+ * a suffix of the first len chars of s1.
+ */
+ if (i + j == len)
+ break;
+ __get_kernel_nofault(&c1, s1 + j, char, err_out);
+
+ if (ignore_case) {
+ c1 = tolower(c1);
+ c2 = tolower(c2);
+ }
+
+ if (c1 == '\0')
+ return -ENOENT;
+ if (c1 != c2)
+ break;
+ }
+ if (j == XATTR_SIZE_MAX)
+ return -E2BIG;
+ if (i + j == len)
+ return -ENOENT;
+ s1++;
+ }
+ return -E2BIG;
+err_out:
+ return -EFAULT;
+}
+
+/**
+ * bpf_strstr - Find the first substring in a string
+ * @s1__ign: The string to be searched
+ * @s2__ign: The string to search for
+ *
+ * Return:
+ * * >=0 - Index of the first character of the first occurrence of @s2__ign
+ * within @s1__ign
+ * * %-ENOENT - @s2__ign is not a substring of @s1__ign
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of the strings is too large
+ * * %-ERANGE - One of the strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign)
+{
+ return __bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX, false);
+}
+
+/**
+ * bpf_strcasestr - Find the first substring in a string, ignoring the case of
+ * the characters
+ * @s1__ign: The string to be searched
+ * @s2__ign: The string to search for
+ *
+ * Return:
+ * * >=0 - Index of the first character of the first occurrence of @s2__ign
+ * within @s1__ign
+ * * %-ENOENT - @s2__ign is not a substring of @s1__ign
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of the strings is too large
+ * * %-ERANGE - One of the strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strcasestr(const char *s1__ign, const char *s2__ign)
+{
+ return __bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX, true);
+}
+
+/**
+ * bpf_strnstr - Find the first substring in a length-limited string
+ * @s1__ign: The string to be searched
+ * @s2__ign: The string to search for
+ * @len: the maximum number of characters to search
+ *
+ * Return:
+ * * >=0 - Index of the first character of the first occurrence of @s2__ign
+ * within the first @len characters of @s1__ign
+ * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of the strings is too large
+ * * %-ERANGE - One of the strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign,
+ size_t len)
+{
+ return __bpf_strnstr(s1__ign, s2__ign, len, false);
+}
+
+/**
+ * bpf_strncasestr - Find the first substring in a length-limited string,
+ * ignoring the case of the characters
+ * @s1__ign: The string to be searched
+ * @s2__ign: The string to search for
+ * @len: the maximum number of characters to search
+ *
+ * Return:
+ * * >=0 - Index of the first character of the first occurrence of @s2__ign
+ * within the first @len characters of @s1__ign
+ * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of the strings is too large
+ * * %-ERANGE - One of the strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strncasestr(const char *s1__ign, const char *s2__ign,
+ size_t len)
+{
+ return __bpf_strnstr(s1__ign, s2__ign, len, true);
+}
+
+#ifdef CONFIG_KEYS
+/**
+ * bpf_lookup_user_key - lookup a key by its serial
+ * @serial: key handle serial number
+ * @flags: lookup-specific flags
+ *
+ * Search a key with a given *serial* and the provided *flags*.
+ * If found, increment the reference count of the key by one, and
+ * return it in the bpf_key structure.
+ *
+ * The bpf_key structure must be passed to bpf_key_put() when done
+ * with it, so that the key reference count is decremented and the
+ * bpf_key structure is freed.
+ *
+ * Permission checks are deferred to the time the key is used by
+ * one of the available key-specific kfuncs.
+ *
+ * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested
+ * special keyring (e.g. session keyring), if it doesn't yet exist.
+ * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting
+ * for the key construction, and to retrieve uninstantiated keys (keys
+ * without data attached to them).
+ *
+ * Return: a bpf_key pointer with a valid key pointer if the key is found, a
+ * NULL pointer otherwise.
+ */
+__bpf_kfunc struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags)
+{
+ key_ref_t key_ref;
+ struct bpf_key *bkey;
+
+ if (flags & ~KEY_LOOKUP_ALL)
+ return NULL;
+
+ /*
+ * Permission check is deferred until the key is used, as the
+ * intent of the caller is unknown here.
+ */
+ key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK);
+ if (IS_ERR(key_ref))
+ return NULL;
+
+ bkey = kmalloc(sizeof(*bkey), GFP_KERNEL);
+ if (!bkey) {
+ key_put(key_ref_to_ptr(key_ref));
return NULL;
}
+
+ bkey->key = key_ref_to_ptr(key_ref);
+ bkey->has_ref = true;
+
+ return bkey;
+}
+
+/**
+ * bpf_lookup_system_key - lookup a key by a system-defined ID
+ * @id: key ID
+ *
+ * Obtain a bpf_key structure with a key pointer set to the passed key ID.
+ * The key pointer is marked as invalid, to prevent bpf_key_put() from
+ * attempting to decrement the key reference count on that pointer. The key
+ * pointer set in such way is currently understood only by
+ * verify_pkcs7_signature().
+ *
+ * Set *id* to one of the values defined in include/linux/verification.h:
+ * 0 for the primary keyring (immutable keyring of system keys);
+ * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring
+ * (where keys can be added only if they are vouched for by existing keys
+ * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform
+ * keyring (primarily used by the integrity subsystem to verify a kexec'ed
+ * kerned image and, possibly, the initramfs signature).
+ *
+ * Return: a bpf_key pointer with an invalid key pointer set from the
+ * pre-determined ID on success, a NULL pointer otherwise
+ */
+__bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id)
+{
+ struct bpf_key *bkey;
+
+ if (system_keyring_id_check(id) < 0)
+ return NULL;
+
+ bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC);
+ if (!bkey)
+ return NULL;
+
+ bkey->key = (struct key *)(unsigned long)id;
+ bkey->has_ref = false;
+
+ return bkey;
+}
+
+/**
+ * bpf_key_put - decrement key reference count if key is valid and free bpf_key
+ * @bkey: bpf_key structure
+ *
+ * Decrement the reference count of the key inside *bkey*, if the pointer
+ * is valid, and free *bkey*.
+ */
+__bpf_kfunc void bpf_key_put(struct bpf_key *bkey)
+{
+ if (bkey->has_ref)
+ key_put(bkey->key);
+
+ kfree(bkey);
+}
+
+/**
+ * bpf_verify_pkcs7_signature - verify a PKCS#7 signature
+ * @data_p: data to verify
+ * @sig_p: signature of the data
+ * @trusted_keyring: keyring with keys trusted for signature verification
+ *
+ * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr*
+ * with keys in a keyring referenced by *trusted_keyring*.
+ *
+ * Return: 0 on success, a negative value on error.
+ */
+__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p,
+ struct bpf_dynptr *sig_p,
+ struct bpf_key *trusted_keyring)
+{
+#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
+ struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p;
+ struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p;
+ const void *data, *sig;
+ u32 data_len, sig_len;
+ int ret;
+
+ if (trusted_keyring->has_ref) {
+ /*
+ * Do the permission check deferred in bpf_lookup_user_key().
+ * See bpf_lookup_user_key() for more details.
+ *
+ * A call to key_task_permission() here would be redundant, as
+ * it is already done by keyring_search() called by
+ * find_asymmetric_key().
+ */
+ ret = key_validate(trusted_keyring->key);
+ if (ret < 0)
+ return ret;
+ }
+
+ data_len = __bpf_dynptr_size(data_ptr);
+ data = __bpf_dynptr_data(data_ptr, data_len);
+ sig_len = __bpf_dynptr_size(sig_ptr);
+ sig = __bpf_dynptr_data(sig_ptr, sig_len);
+
+ return verify_pkcs7_signature(data, data_len, sig, sig_len,
+ trusted_keyring->key,
+ VERIFYING_BPF_SIGNATURE, NULL,
+ NULL);
+#else
+ return -EOPNOTSUPP;
+#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */
+}
+#endif /* CONFIG_KEYS */
+
+typedef int (*bpf_task_work_callback_t)(struct bpf_map *map, void *key, void *value);
+
+enum bpf_task_work_state {
+ /* bpf_task_work is ready to be used */
+ BPF_TW_STANDBY = 0,
+ /* irq work scheduling in progress */
+ BPF_TW_PENDING,
+ /* task work scheduling in progress */
+ BPF_TW_SCHEDULING,
+ /* task work is scheduled successfully */
+ BPF_TW_SCHEDULED,
+ /* callback is running */
+ BPF_TW_RUNNING,
+ /* associated BPF map value is deleted */
+ BPF_TW_FREED,
+};
+
+struct bpf_task_work_ctx {
+ enum bpf_task_work_state state;
+ refcount_t refcnt;
+ struct callback_head work;
+ struct irq_work irq_work;
+ /* bpf_prog that schedules task work */
+ struct bpf_prog *prog;
+ /* task for which callback is scheduled */
+ struct task_struct *task;
+ /* the map and map value associated with this context */
+ struct bpf_map *map;
+ void *map_val;
+ enum task_work_notify_mode mode;
+ bpf_task_work_callback_t callback_fn;
+ struct rcu_head rcu;
+} __aligned(8);
+
+/* Actual type for struct bpf_task_work */
+struct bpf_task_work_kern {
+ struct bpf_task_work_ctx *ctx;
+};
+
+static void bpf_task_work_ctx_reset(struct bpf_task_work_ctx *ctx)
+{
+ if (ctx->prog) {
+ bpf_prog_put(ctx->prog);
+ ctx->prog = NULL;
+ }
+ if (ctx->task) {
+ bpf_task_release(ctx->task);
+ ctx->task = NULL;
+ }
+}
+
+static bool bpf_task_work_ctx_tryget(struct bpf_task_work_ctx *ctx)
+{
+ return refcount_inc_not_zero(&ctx->refcnt);
+}
+
+static void bpf_task_work_ctx_put(struct bpf_task_work_ctx *ctx)
+{
+ if (!refcount_dec_and_test(&ctx->refcnt))
+ return;
+
+ bpf_task_work_ctx_reset(ctx);
+
+ /* bpf_mem_free expects migration to be disabled */
+ migrate_disable();
+ bpf_mem_free(&bpf_global_ma, ctx);
+ migrate_enable();
+}
+
+static void bpf_task_work_cancel(struct bpf_task_work_ctx *ctx)
+{
+ /*
+ * Scheduled task_work callback holds ctx ref, so if we successfully
+ * cancelled, we put that ref on callback's behalf. If we couldn't
+ * cancel, callback will inevitably run or has already completed
+ * running, and it would have taken care of its ctx ref itself.
+ */
+ if (task_work_cancel(ctx->task, &ctx->work))
+ bpf_task_work_ctx_put(ctx);
+}
+
+static void bpf_task_work_callback(struct callback_head *cb)
+{
+ struct bpf_task_work_ctx *ctx = container_of(cb, struct bpf_task_work_ctx, work);
+ enum bpf_task_work_state state;
+ u32 idx;
+ void *key;
+
+ /* Read lock is needed to protect ctx and map key/value access */
+ guard(rcu_tasks_trace)();
+ /*
+ * This callback may start running before bpf_task_work_irq() switched to
+ * SCHEDULED state, so handle both transition variants SCHEDULING|SCHEDULED -> RUNNING.
+ */
+ state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_RUNNING);
+ if (state == BPF_TW_SCHEDULED)
+ state = cmpxchg(&ctx->state, BPF_TW_SCHEDULED, BPF_TW_RUNNING);
+ if (state == BPF_TW_FREED) {
+ bpf_task_work_ctx_put(ctx);
+ return;
+ }
+
+ key = (void *)map_key_from_value(ctx->map, ctx->map_val, &idx);
+
+ migrate_disable();
+ ctx->callback_fn(ctx->map, key, ctx->map_val);
+ migrate_enable();
+
+ bpf_task_work_ctx_reset(ctx);
+ (void)cmpxchg(&ctx->state, BPF_TW_RUNNING, BPF_TW_STANDBY);
+
+ bpf_task_work_ctx_put(ctx);
+}
+
+static void bpf_task_work_irq(struct irq_work *irq_work)
+{
+ struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work);
+ enum bpf_task_work_state state;
+ int err;
+
+ guard(rcu_tasks_trace)();
+
+ if (cmpxchg(&ctx->state, BPF_TW_PENDING, BPF_TW_SCHEDULING) != BPF_TW_PENDING) {
+ bpf_task_work_ctx_put(ctx);
+ return;
+ }
+
+ err = task_work_add(ctx->task, &ctx->work, ctx->mode);
+ if (err) {
+ bpf_task_work_ctx_reset(ctx);
+ /*
+ * try to switch back to STANDBY for another task_work reuse, but we might have
+ * gone to FREED already, which is fine as we already cleaned up after ourselves
+ */
+ (void)cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_STANDBY);
+ bpf_task_work_ctx_put(ctx);
+ return;
+ }
+
+ /*
+ * It's technically possible for just scheduled task_work callback to
+ * complete running by now, going SCHEDULING -> RUNNING and then
+ * dropping its ctx refcount. Instead of capturing extra ref just to
+ * protected below ctx->state access, we rely on RCU protection to
+ * perform below SCHEDULING -> SCHEDULED attempt.
+ */
+ state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_SCHEDULED);
+ if (state == BPF_TW_FREED)
+ bpf_task_work_cancel(ctx); /* clean up if we switched into FREED state */
+}
+
+static struct bpf_task_work_ctx *bpf_task_work_fetch_ctx(struct bpf_task_work *tw,
+ struct bpf_map *map)
+{
+ struct bpf_task_work_kern *twk = (void *)tw;
+ struct bpf_task_work_ctx *ctx, *old_ctx;
+
+ ctx = READ_ONCE(twk->ctx);
+ if (ctx)
+ return ctx;
+
+ ctx = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_task_work_ctx));
+ if (!ctx)
+ return ERR_PTR(-ENOMEM);
+
+ memset(ctx, 0, sizeof(*ctx));
+ refcount_set(&ctx->refcnt, 1); /* map's own ref */
+ ctx->state = BPF_TW_STANDBY;
+
+ old_ctx = cmpxchg(&twk->ctx, NULL, ctx);
+ if (old_ctx) {
+ /*
+ * tw->ctx is set by concurrent BPF program, release allocated
+ * memory and try to reuse already set context.
+ */
+ bpf_mem_free(&bpf_global_ma, ctx);
+ return old_ctx;
+ }
+
+ return ctx; /* Success */
+}
+
+static struct bpf_task_work_ctx *bpf_task_work_acquire_ctx(struct bpf_task_work *tw,
+ struct bpf_map *map)
+{
+ struct bpf_task_work_ctx *ctx;
+
+ ctx = bpf_task_work_fetch_ctx(tw, map);
+ if (IS_ERR(ctx))
+ return ctx;
+
+ /* try to get ref for task_work callback to hold */
+ if (!bpf_task_work_ctx_tryget(ctx))
+ return ERR_PTR(-EBUSY);
+
+ if (cmpxchg(&ctx->state, BPF_TW_STANDBY, BPF_TW_PENDING) != BPF_TW_STANDBY) {
+ /* lost acquiring race or map_release_uref() stole it from us, put ref and bail */
+ bpf_task_work_ctx_put(ctx);
+ return ERR_PTR(-EBUSY);
+ }
+
+ /*
+ * If no process or bpffs is holding a reference to the map, no new callbacks should be
+ * scheduled. This does not address any race or correctness issue, but rather is a policy
+ * choice: dropping user references should stop everything.
+ */
+ if (!atomic64_read(&map->usercnt)) {
+ /* drop ref we just got for task_work callback itself */
+ bpf_task_work_ctx_put(ctx);
+ /* transfer map's ref into cancel_and_free() */
+ bpf_task_work_cancel_and_free(tw);
+ return ERR_PTR(-EBUSY);
+ }
+
+ return ctx;
+}
+
+static int bpf_task_work_schedule(struct task_struct *task, struct bpf_task_work *tw,
+ struct bpf_map *map, bpf_task_work_callback_t callback_fn,
+ struct bpf_prog_aux *aux, enum task_work_notify_mode mode)
+{
+ struct bpf_prog *prog;
+ struct bpf_task_work_ctx *ctx;
+ int err;
+
+ BTF_TYPE_EMIT(struct bpf_task_work);
+
+ prog = bpf_prog_inc_not_zero(aux->prog);
+ if (IS_ERR(prog))
+ return -EBADF;
+ task = bpf_task_acquire(task);
+ if (!task) {
+ err = -EBADF;
+ goto release_prog;
+ }
+
+ ctx = bpf_task_work_acquire_ctx(tw, map);
+ if (IS_ERR(ctx)) {
+ err = PTR_ERR(ctx);
+ goto release_all;
+ }
+
+ ctx->task = task;
+ ctx->callback_fn = callback_fn;
+ ctx->prog = prog;
+ ctx->mode = mode;
+ ctx->map = map;
+ ctx->map_val = (void *)tw - map->record->task_work_off;
+ init_task_work(&ctx->work, bpf_task_work_callback);
+ init_irq_work(&ctx->irq_work, bpf_task_work_irq);
+
+ irq_work_queue(&ctx->irq_work);
+ return 0;
+
+release_all:
+ bpf_task_release(task);
+release_prog:
+ bpf_prog_put(prog);
+ return err;
+}
+
+/**
+ * bpf_task_work_schedule_signal_impl - Schedule BPF callback using task_work_add with TWA_SIGNAL
+ * mode
+ * @task: Task struct for which callback should be scheduled
+ * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping
+ * @map__map: bpf_map that embeds struct bpf_task_work in the values
+ * @callback: pointer to BPF subprogram to call
+ * @aux__prog: user should pass NULL
+ *
+ * Return: 0 if task work has been scheduled successfully, negative error code otherwise
+ */
+__bpf_kfunc int bpf_task_work_schedule_signal_impl(struct task_struct *task,
+ struct bpf_task_work *tw, void *map__map,
+ bpf_task_work_callback_t callback,
+ void *aux__prog)
+{
+ return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL);
+}
+
+/**
+ * bpf_task_work_schedule_resume_impl - Schedule BPF callback using task_work_add with TWA_RESUME
+ * mode
+ * @task: Task struct for which callback should be scheduled
+ * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping
+ * @map__map: bpf_map that embeds struct bpf_task_work in the values
+ * @callback: pointer to BPF subprogram to call
+ * @aux__prog: user should pass NULL
+ *
+ * Return: 0 if task work has been scheduled successfully, negative error code otherwise
+ */
+__bpf_kfunc int bpf_task_work_schedule_resume_impl(struct task_struct *task,
+ struct bpf_task_work *tw, void *map__map,
+ bpf_task_work_callback_t callback,
+ void *aux__prog)
+{
+ return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME);
+}
+
+static int make_file_dynptr(struct file *file, u32 flags, bool may_sleep,
+ struct bpf_dynptr_kern *ptr)
+{
+ struct bpf_dynptr_file_impl *state;
+
+ /* flags is currently unsupported */
+ if (flags) {
+ bpf_dynptr_set_null(ptr);
+ return -EINVAL;
+ }
+
+ state = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_dynptr_file_impl));
+ if (!state) {
+ bpf_dynptr_set_null(ptr);
+ return -ENOMEM;
+ }
+ state->offset = 0;
+ state->size = U64_MAX; /* Don't restrict size, as file may change anyways */
+ freader_init_from_file(&state->freader, NULL, 0, file, may_sleep);
+ bpf_dynptr_init(ptr, state, BPF_DYNPTR_TYPE_FILE, 0, 0);
+ bpf_dynptr_set_rdonly(ptr);
+ return 0;
+}
+
+__bpf_kfunc int bpf_dynptr_from_file(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit)
+{
+ return make_file_dynptr(file, flags, false, (struct bpf_dynptr_kern *)ptr__uninit);
+}
+
+int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit)
+{
+ return make_file_dynptr(file, flags, true, (struct bpf_dynptr_kern *)ptr__uninit);
+}
+
+__bpf_kfunc int bpf_dynptr_file_discard(struct bpf_dynptr *dynptr)
+{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)dynptr;
+ struct bpf_dynptr_file_impl *df = ptr->data;
+
+ if (!df)
+ return 0;
+
+ freader_cleanup(&df->freader);
+ bpf_mem_free(&bpf_global_ma, df);
+ bpf_dynptr_set_null(ptr);
+ return 0;
+}
+
+__bpf_kfunc_end_defs();
+
+static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work)
+{
+ struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work);
+
+ bpf_task_work_cancel(ctx); /* this might put task_work callback's ref */
+ bpf_task_work_ctx_put(ctx); /* and here we put map's own ref that was transferred to us */
+}
+
+void bpf_task_work_cancel_and_free(void *val)
+{
+ struct bpf_task_work_kern *twk = val;
+ struct bpf_task_work_ctx *ctx;
+ enum bpf_task_work_state state;
+
+ ctx = xchg(&twk->ctx, NULL);
+ if (!ctx)
+ return;
+
+ state = xchg(&ctx->state, BPF_TW_FREED);
+ if (state == BPF_TW_SCHEDULED) {
+ /* run in irq_work to avoid locks in NMI */
+ init_irq_work(&ctx->irq_work, bpf_task_work_cancel_scheduled);
+ irq_work_queue(&ctx->irq_work);
+ return;
+ }
+
+ bpf_task_work_ctx_put(ctx); /* put bpf map's ref */
+}
+
+BTF_KFUNCS_START(generic_btf_ids)
+#ifdef CONFIG_CRASH_DUMP
+BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
+#endif
+BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_percpu_obj_drop_impl, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL | KF_RCU)
+BTF_ID_FLAGS(func, bpf_list_push_front_impl)
+BTF_ID_FLAGS(func, bpf_list_push_back_impl)
+BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_list_front, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_list_back, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_rbtree_add_impl)
+BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_rbtree_root, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_rbtree_left, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_rbtree_right, KF_RET_NULL)
+
+#ifdef CONFIG_CGROUPS
+BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU)
+BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
+#endif
+BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_task_from_vpid, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_throw)
+#ifdef CONFIG_BPF_EVENTS
+BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS)
+#endif
+#ifdef CONFIG_KEYS
+BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE)
+#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
+BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE)
+#endif
+#endif
+BTF_KFUNCS_END(generic_btf_ids)
+
+static const struct btf_kfunc_id_set generic_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &generic_btf_ids,
+};
+
+
+BTF_ID_LIST(generic_dtor_ids)
+BTF_ID(struct, task_struct)
+BTF_ID(func, bpf_task_release_dtor)
+#ifdef CONFIG_CGROUPS
+BTF_ID(struct, cgroup)
+BTF_ID(func, bpf_cgroup_release_dtor)
+#endif
+
+BTF_KFUNCS_START(common_btf_ids)
+BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx, KF_FASTCALL)
+BTF_ID_FLAGS(func, bpf_rdonly_cast, KF_FASTCALL)
+BTF_ID_FLAGS(func, bpf_rcu_read_lock)
+BTF_ID_FLAGS(func, bpf_rcu_read_unlock)
+BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
+BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW | KF_RCU)
+BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
+#ifdef CONFIG_CGROUPS
+BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED)
+BTF_ID_FLAGS(func, bpf_iter_css_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY)
+#endif
+BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED)
+BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, bpf_dynptr_adjust)
+BTF_ID_FLAGS(func, bpf_dynptr_is_null)
+BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
+BTF_ID_FLAGS(func, bpf_dynptr_size)
+BTF_ID_FLAGS(func, bpf_dynptr_clone)
+BTF_ID_FLAGS(func, bpf_dynptr_copy)
+BTF_ID_FLAGS(func, bpf_dynptr_memset)
+#ifdef CONFIG_NET
+BTF_ID_FLAGS(func, bpf_modify_return_test_tp)
+#endif
+BTF_ID_FLAGS(func, bpf_wq_init)
+BTF_ID_FLAGS(func, bpf_wq_set_callback_impl)
+BTF_ID_FLAGS(func, bpf_wq_start)
+BTF_ID_FLAGS(func, bpf_preempt_disable)
+BTF_ID_FLAGS(func, bpf_preempt_enable)
+BTF_ID_FLAGS(func, bpf_iter_bits_new, KF_ITER_NEW)
+BTF_ID_FLAGS(func, bpf_iter_bits_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_bits_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, bpf_copy_from_user_str, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_copy_from_user_task_str, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_get_kmem_cache)
+BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_local_irq_save)
+BTF_ID_FLAGS(func, bpf_local_irq_restore)
+#ifdef CONFIG_BPF_EVENTS
+BTF_ID_FLAGS(func, bpf_probe_read_user_dynptr)
+BTF_ID_FLAGS(func, bpf_probe_read_kernel_dynptr)
+BTF_ID_FLAGS(func, bpf_probe_read_user_str_dynptr)
+BTF_ID_FLAGS(func, bpf_probe_read_kernel_str_dynptr)
+BTF_ID_FLAGS(func, bpf_copy_from_user_dynptr, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_copy_from_user_str_dynptr, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
+#endif
+#ifdef CONFIG_DMA_SHARED_BUFFER
+BTF_ID_FLAGS(func, bpf_iter_dmabuf_new, KF_ITER_NEW | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_iter_dmabuf_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_iter_dmabuf_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
+#endif
+BTF_ID_FLAGS(func, __bpf_trap)
+BTF_ID_FLAGS(func, bpf_strcmp);
+BTF_ID_FLAGS(func, bpf_strcasecmp);
+BTF_ID_FLAGS(func, bpf_strchr);
+BTF_ID_FLAGS(func, bpf_strchrnul);
+BTF_ID_FLAGS(func, bpf_strnchr);
+BTF_ID_FLAGS(func, bpf_strrchr);
+BTF_ID_FLAGS(func, bpf_strlen);
+BTF_ID_FLAGS(func, bpf_strnlen);
+BTF_ID_FLAGS(func, bpf_strspn);
+BTF_ID_FLAGS(func, bpf_strcspn);
+BTF_ID_FLAGS(func, bpf_strstr);
+BTF_ID_FLAGS(func, bpf_strcasestr);
+BTF_ID_FLAGS(func, bpf_strnstr);
+BTF_ID_FLAGS(func, bpf_strncasestr);
+#if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS)
+BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU)
+#endif
+BTF_ID_FLAGS(func, bpf_stream_vprintk_impl, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_dynptr_from_file, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_dynptr_file_discard)
+BTF_KFUNCS_END(common_btf_ids)
+
+static const struct btf_kfunc_id_set common_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &common_btf_ids,
+};
+
+static int __init kfunc_init(void)
+{
+ int ret;
+ const struct btf_id_dtor_kfunc generic_dtors[] = {
+ {
+ .btf_id = generic_dtor_ids[0],
+ .kfunc_btf_id = generic_dtor_ids[1]
+ },
+#ifdef CONFIG_CGROUPS
+ {
+ .btf_id = generic_dtor_ids[2],
+ .kfunc_btf_id = generic_dtor_ids[3]
+ },
+#endif
+ };
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &generic_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &generic_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &generic_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &generic_kfunc_set);
+ ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors,
+ ARRAY_SIZE(generic_dtors),
+ THIS_MODULE);
+ return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set);
+}
+
+late_initcall(kfunc_init);
+
+/* Get a pointer to dynptr data up to len bytes for read only access. If
+ * the dynptr doesn't have continuous data up to len bytes, return NULL.
+ */
+const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u64 len)
+{
+ const struct bpf_dynptr *p = (struct bpf_dynptr *)ptr;
+
+ return bpf_dynptr_slice(p, 0, NULL, len);
+}
+
+/* Get a pointer to dynptr data up to len bytes for read write access. If
+ * the dynptr doesn't have continuous data up to len bytes, or the dynptr
+ * is read only, return NULL.
+ */
+void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u64 len)
+{
+ if (__bpf_dynptr_is_rdonly(ptr))
+ return NULL;
+ return (void *)__bpf_dynptr_data(ptr, len);
+}
+
+void bpf_map_free_internal_structs(struct bpf_map *map, void *val)
+{
+ if (btf_record_has_field(map->record, BPF_TIMER))
+ bpf_obj_free_timer(map->record, val);
+ if (btf_record_has_field(map->record, BPF_WORKQUEUE))
+ bpf_obj_free_workqueue(map->record, val);
+ if (btf_record_has_field(map->record, BPF_TASK_WORK))
+ bpf_obj_free_task_work(map->record, val);
}
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 5a8d9f7467bf..9f866a010dad 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -20,6 +20,7 @@
#include <linux/filter.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
+#include <linux/kstrtox.h>
#include "preload/bpf_preload.h"
enum bpf_type {
@@ -98,9 +99,9 @@ static const struct inode_operations bpf_prog_iops = { };
static const struct inode_operations bpf_map_iops = { };
static const struct inode_operations bpf_link_iops = { };
-static struct inode *bpf_get_inode(struct super_block *sb,
- const struct inode *dir,
- umode_t mode)
+struct inode *bpf_get_inode(struct super_block *sb,
+ const struct inode *dir,
+ umode_t mode)
{
struct inode *inode;
@@ -118,11 +119,9 @@ static struct inode *bpf_get_inode(struct super_block *sb,
return ERR_PTR(-ENOSPC);
inode->i_ino = get_next_ino();
- inode->i_atime = current_time(inode);
- inode->i_mtime = inode->i_atime;
- inode->i_ctime = inode->i_atime;
+ simple_inode_init_ts(inode);
- inode_init_owner(&init_user_ns, inode, dir, mode);
+ inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
return inode;
}
@@ -145,21 +144,19 @@ static int bpf_inode_type(const struct inode *inode, enum bpf_type *type)
static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode,
struct inode *dir)
{
- d_instantiate(dentry, inode);
- dget(dentry);
+ d_make_persistent(dentry, inode);
- dir->i_mtime = current_time(dir);
- dir->i_ctime = dir->i_mtime;
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
}
-static int bpf_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct inode *inode;
inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR);
if (IS_ERR(inode))
- return PTR_ERR(inode);
+ return ERR_CAST(inode);
inode->i_op = &bpf_dir_iops;
inode->i_fop = &simple_dir_operations;
@@ -168,7 +165,7 @@ static int bpf_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
inc_nlink(dir);
bpf_dentry_finalize(dentry, inode, dir);
- return 0;
+ return NULL;
}
struct map_iter {
@@ -382,7 +379,7 @@ bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags)
return simple_lookup(dir, dentry, flags);
}
-static int bpf_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+static int bpf_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, const char *target)
{
char *link = kstrdup(target, GFP_USER | __GFP_NOWARN);
@@ -422,20 +419,16 @@ static int bpf_iter_link_pin_kernel(struct dentry *parent,
struct dentry *dentry;
int ret;
- inode_lock(parent->d_inode);
- dentry = lookup_one_len(name, parent, strlen(name));
- if (IS_ERR(dentry)) {
- inode_unlock(parent->d_inode);
+ dentry = simple_start_creating(parent, name);
+ if (IS_ERR(dentry))
return PTR_ERR(dentry);
- }
ret = bpf_mkobj_ops(dentry, mode, link, &bpf_link_iops,
&bpf_iter_fops);
- dput(dentry);
- inode_unlock(parent->d_inode);
+ simple_done_creating(dentry);
return ret;
}
-static int bpf_obj_do_pin(const char __user *pathname, void *raw,
+static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw,
enum bpf_type type)
{
struct dentry *dentry;
@@ -444,22 +437,21 @@ static int bpf_obj_do_pin(const char __user *pathname, void *raw,
umode_t mode;
int ret;
- dentry = user_path_create(AT_FDCWD, pathname, &path, 0);
+ dentry = start_creating_user_path(path_fd, pathname, &path, 0);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
-
- ret = security_path_mknod(&path, dentry, mode, 0);
- if (ret)
- goto out;
-
dir = d_inode(path.dentry);
if (dir->i_op != &bpf_dir_iops) {
ret = -EPERM;
goto out;
}
+ mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
+ ret = security_path_mknod(&path, dentry, mode, 0);
+ if (ret)
+ goto out;
+
switch (type) {
case BPF_TYPE_PROG:
ret = vfs_mkobj(dentry, mode, bpf_mkprog, raw);
@@ -474,11 +466,11 @@ static int bpf_obj_do_pin(const char __user *pathname, void *raw,
ret = -EPERM;
}
out:
- done_path_create(&path, dentry);
+ end_creating_path(&path, dentry);
return ret;
}
-int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
+int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname)
{
enum bpf_type type;
void *raw;
@@ -488,14 +480,14 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
if (IS_ERR(raw))
return PTR_ERR(raw);
- ret = bpf_obj_do_pin(pathname, raw, type);
+ ret = bpf_obj_do_pin(path_fd, pathname, raw, type);
if (ret != 0)
bpf_any_put(raw, type);
return ret;
}
-static void *bpf_obj_do_get(const char __user *pathname,
+static void *bpf_obj_do_get(int path_fd, const char __user *pathname,
enum bpf_type *type, int flags)
{
struct inode *inode;
@@ -503,7 +495,7 @@ static void *bpf_obj_do_get(const char __user *pathname,
void *raw;
int ret;
- ret = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW, &path);
+ ret = user_path_at(path_fd, pathname, LOOKUP_FOLLOW, &path);
if (ret)
return ERR_PTR(ret);
@@ -527,7 +519,7 @@ out:
return ERR_PTR(ret);
}
-int bpf_obj_get_user(const char __user *pathname, int flags)
+int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags)
{
enum bpf_type type = BPF_TYPE_UNSPEC;
int f_flags;
@@ -538,7 +530,7 @@ int bpf_obj_get_user(const char __user *pathname, int flags)
if (f_flags < 0)
return f_flags;
- raw = bpf_obj_do_get(pathname, &type, f_flags);
+ raw = bpf_obj_do_get(path_fd, pathname, &type, f_flags);
if (IS_ERR(raw))
return PTR_ERR(raw);
@@ -559,7 +551,7 @@ int bpf_obj_get_user(const char __user *pathname, int flags)
static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type)
{
struct bpf_prog *prog;
- int ret = inode_permission(&init_user_ns, inode, MAY_READ);
+ int ret = inode_permission(&nop_mnt_idmap, inode, MAY_READ);
if (ret)
return ERR_PTR(ret);
@@ -598,19 +590,187 @@ struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type typ
}
EXPORT_SYMBOL(bpf_prog_get_type_path);
+struct bpffs_btf_enums {
+ const struct btf *btf;
+ const struct btf_type *cmd_t;
+ const struct btf_type *map_t;
+ const struct btf_type *prog_t;
+ const struct btf_type *attach_t;
+};
+
+static int find_bpffs_btf_enums(struct bpffs_btf_enums *info)
+{
+ const struct btf *btf;
+ const struct btf_type *t;
+ const char *name;
+ int i, n;
+
+ memset(info, 0, sizeof(*info));
+
+ btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+ if (!btf)
+ return -ENOENT;
+
+ info->btf = btf;
+
+ for (i = 1, n = btf_nr_types(btf); i < n; i++) {
+ t = btf_type_by_id(btf, i);
+ if (!btf_type_is_enum(t))
+ continue;
+
+ name = btf_name_by_offset(btf, t->name_off);
+ if (!name)
+ continue;
+
+ if (strcmp(name, "bpf_cmd") == 0)
+ info->cmd_t = t;
+ else if (strcmp(name, "bpf_map_type") == 0)
+ info->map_t = t;
+ else if (strcmp(name, "bpf_prog_type") == 0)
+ info->prog_t = t;
+ else if (strcmp(name, "bpf_attach_type") == 0)
+ info->attach_t = t;
+ else
+ continue;
+
+ if (info->cmd_t && info->map_t && info->prog_t && info->attach_t)
+ return 0;
+ }
+
+ return -ESRCH;
+}
+
+static bool find_btf_enum_const(const struct btf *btf, const struct btf_type *enum_t,
+ const char *prefix, const char *str, int *value)
+{
+ const struct btf_enum *e;
+ const char *name;
+ int i, n, pfx_len = strlen(prefix);
+
+ *value = 0;
+
+ if (!btf || !enum_t)
+ return false;
+
+ for (i = 0, n = btf_vlen(enum_t); i < n; i++) {
+ e = &btf_enum(enum_t)[i];
+
+ name = btf_name_by_offset(btf, e->name_off);
+ if (!name || strncasecmp(name, prefix, pfx_len) != 0)
+ continue;
+
+ /* match symbolic name case insensitive and ignoring prefix */
+ if (strcasecmp(name + pfx_len, str) == 0) {
+ *value = e->val;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static void seq_print_delegate_opts(struct seq_file *m,
+ const char *opt_name,
+ const struct btf *btf,
+ const struct btf_type *enum_t,
+ const char *prefix,
+ u64 delegate_msk, u64 any_msk)
+{
+ const struct btf_enum *e;
+ bool first = true;
+ const char *name;
+ u64 msk;
+ int i, n, pfx_len = strlen(prefix);
+
+ delegate_msk &= any_msk; /* clear unknown bits */
+
+ if (delegate_msk == 0)
+ return;
+
+ seq_printf(m, ",%s", opt_name);
+ if (delegate_msk == any_msk) {
+ seq_printf(m, "=any");
+ return;
+ }
+
+ if (btf && enum_t) {
+ for (i = 0, n = btf_vlen(enum_t); i < n; i++) {
+ e = &btf_enum(enum_t)[i];
+ name = btf_name_by_offset(btf, e->name_off);
+ if (!name || strncasecmp(name, prefix, pfx_len) != 0)
+ continue;
+ msk = 1ULL << e->val;
+ if (delegate_msk & msk) {
+ /* emit lower-case name without prefix */
+ seq_putc(m, first ? '=' : ':');
+ name += pfx_len;
+ while (*name) {
+ seq_putc(m, tolower(*name));
+ name++;
+ }
+
+ delegate_msk &= ~msk;
+ first = false;
+ }
+ }
+ }
+ if (delegate_msk)
+ seq_printf(m, "%c0x%llx", first ? '=' : ':', delegate_msk);
+}
+
/*
* Display the mount options in /proc/mounts.
*/
static int bpf_show_options(struct seq_file *m, struct dentry *root)
{
- umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX;
-
+ struct inode *inode = d_inode(root);
+ umode_t mode = inode->i_mode & S_IALLUGO & ~S_ISVTX;
+ struct bpf_mount_opts *opts = root->d_sb->s_fs_info;
+ u64 mask;
+
+ if (!uid_eq(inode->i_uid, GLOBAL_ROOT_UID))
+ seq_printf(m, ",uid=%u",
+ from_kuid_munged(&init_user_ns, inode->i_uid));
+ if (!gid_eq(inode->i_gid, GLOBAL_ROOT_GID))
+ seq_printf(m, ",gid=%u",
+ from_kgid_munged(&init_user_ns, inode->i_gid));
if (mode != S_IRWXUGO)
seq_printf(m, ",mode=%o", mode);
+
+ if (opts->delegate_cmds || opts->delegate_maps ||
+ opts->delegate_progs || opts->delegate_attachs) {
+ struct bpffs_btf_enums info;
+
+ /* ignore errors, fallback to hex */
+ (void)find_bpffs_btf_enums(&info);
+
+ mask = (1ULL << __MAX_BPF_CMD) - 1;
+ seq_print_delegate_opts(m, "delegate_cmds",
+ info.btf, info.cmd_t, "BPF_",
+ opts->delegate_cmds, mask);
+
+ mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1;
+ seq_print_delegate_opts(m, "delegate_maps",
+ info.btf, info.map_t, "BPF_MAP_TYPE_",
+ opts->delegate_maps, mask);
+
+ mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1;
+ seq_print_delegate_opts(m, "delegate_progs",
+ info.btf, info.prog_t, "BPF_PROG_TYPE_",
+ opts->delegate_progs, mask);
+
+ mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1;
+ seq_print_delegate_opts(m, "delegate_attachs",
+ info.btf, info.attach_t, "BPF_",
+ opts->delegate_attachs, mask);
+ }
+
return 0;
}
-static void bpf_free_inode(struct inode *inode)
+static void bpf_destroy_inode(struct inode *inode)
{
enum bpf_type type;
@@ -621,31 +781,41 @@ static void bpf_free_inode(struct inode *inode)
free_inode_nonrcu(inode);
}
-static const struct super_operations bpf_super_ops = {
+const struct super_operations bpf_super_ops = {
.statfs = simple_statfs,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.show_options = bpf_show_options,
- .free_inode = bpf_free_inode,
+ .destroy_inode = bpf_destroy_inode,
};
enum {
+ OPT_UID,
+ OPT_GID,
OPT_MODE,
+ OPT_DELEGATE_CMDS,
+ OPT_DELEGATE_MAPS,
+ OPT_DELEGATE_PROGS,
+ OPT_DELEGATE_ATTACHS,
};
static const struct fs_parameter_spec bpf_fs_parameters[] = {
+ fsparam_u32 ("uid", OPT_UID),
+ fsparam_u32 ("gid", OPT_GID),
fsparam_u32oct ("mode", OPT_MODE),
+ fsparam_string ("delegate_cmds", OPT_DELEGATE_CMDS),
+ fsparam_string ("delegate_maps", OPT_DELEGATE_MAPS),
+ fsparam_string ("delegate_progs", OPT_DELEGATE_PROGS),
+ fsparam_string ("delegate_attachs", OPT_DELEGATE_ATTACHS),
{}
};
-struct bpf_mount_opts {
- umode_t mode;
-};
-
static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- struct bpf_mount_opts *opts = fc->fs_private;
+ struct bpf_mount_opts *opts = fc->s_fs_info;
struct fs_parse_result result;
- int opt;
+ kuid_t uid;
+ kgid_t gid;
+ int opt, err;
opt = fs_parse(fc, bpf_fs_parameters, param, &result);
if (opt < 0) {
@@ -666,12 +836,104 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
}
switch (opt) {
+ case OPT_UID:
+ uid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(uid))
+ goto bad_value;
+
+ /*
+ * The requested uid must be representable in the
+ * filesystem's idmapping.
+ */
+ if (!kuid_has_mapping(fc->user_ns, uid))
+ goto bad_value;
+
+ opts->uid = uid;
+ break;
+ case OPT_GID:
+ gid = make_kgid(current_user_ns(), result.uint_32);
+ if (!gid_valid(gid))
+ goto bad_value;
+
+ /*
+ * The requested gid must be representable in the
+ * filesystem's idmapping.
+ */
+ if (!kgid_has_mapping(fc->user_ns, gid))
+ goto bad_value;
+
+ opts->gid = gid;
+ break;
case OPT_MODE:
opts->mode = result.uint_32 & S_IALLUGO;
break;
+ case OPT_DELEGATE_CMDS:
+ case OPT_DELEGATE_MAPS:
+ case OPT_DELEGATE_PROGS:
+ case OPT_DELEGATE_ATTACHS: {
+ struct bpffs_btf_enums info;
+ const struct btf_type *enum_t;
+ const char *enum_pfx;
+ u64 *delegate_msk, msk = 0;
+ char *p, *str;
+ int val;
+
+ /* ignore errors, fallback to hex */
+ (void)find_bpffs_btf_enums(&info);
+
+ switch (opt) {
+ case OPT_DELEGATE_CMDS:
+ delegate_msk = &opts->delegate_cmds;
+ enum_t = info.cmd_t;
+ enum_pfx = "BPF_";
+ break;
+ case OPT_DELEGATE_MAPS:
+ delegate_msk = &opts->delegate_maps;
+ enum_t = info.map_t;
+ enum_pfx = "BPF_MAP_TYPE_";
+ break;
+ case OPT_DELEGATE_PROGS:
+ delegate_msk = &opts->delegate_progs;
+ enum_t = info.prog_t;
+ enum_pfx = "BPF_PROG_TYPE_";
+ break;
+ case OPT_DELEGATE_ATTACHS:
+ delegate_msk = &opts->delegate_attachs;
+ enum_t = info.attach_t;
+ enum_pfx = "BPF_";
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ str = param->string;
+ while ((p = strsep(&str, ":"))) {
+ if (strcmp(p, "any") == 0) {
+ msk |= ~0ULL;
+ } else if (find_btf_enum_const(info.btf, enum_t, enum_pfx, p, &val)) {
+ msk |= 1ULL << val;
+ } else {
+ err = kstrtou64(p, 0, &msk);
+ if (err)
+ return err;
+ }
+ }
+
+ /* Setting delegation mount options requires privileges */
+ if (msk && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ *delegate_msk |= msk;
+ break;
+ }
+ default:
+ /* ignore unknown mount options */
+ break;
}
return 0;
+bad_value:
+ return invalfc(fc, "Bad value for '%s'", param->key);
}
struct bpf_preload_ops *bpf_preload_ops;
@@ -710,11 +972,10 @@ static DEFINE_MUTEX(bpf_preload_lock);
static int populate_bpffs(struct dentry *parent)
{
struct bpf_preload_info objs[BPF_PRELOAD_LINKS] = {};
- struct bpf_link *links[BPF_PRELOAD_LINKS] = {};
int err = 0, i;
/* grab the mutex to make sure the kernel interactions with bpf_preload
- * UMD are serialized
+ * are serialized
*/
mutex_lock(&bpf_preload_lock);
@@ -722,50 +983,36 @@ static int populate_bpffs(struct dentry *parent)
if (!bpf_preload_mod_get())
goto out;
- if (!bpf_preload_ops->info.tgid) {
- /* preload() will start UMD that will load BPF iterator programs */
- err = bpf_preload_ops->preload(objs);
- if (err)
+ err = bpf_preload_ops->preload(objs);
+ if (err)
+ goto out_put;
+ for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
+ bpf_link_inc(objs[i].link);
+ err = bpf_iter_link_pin_kernel(parent,
+ objs[i].link_name, objs[i].link);
+ if (err) {
+ bpf_link_put(objs[i].link);
goto out_put;
- for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
- links[i] = bpf_link_by_id(objs[i].link_id);
- if (IS_ERR(links[i])) {
- err = PTR_ERR(links[i]);
- goto out_put;
- }
}
- for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
- err = bpf_iter_link_pin_kernel(parent,
- objs[i].link_name, links[i]);
- if (err)
- goto out_put;
- /* do not unlink successfully pinned links even
- * if later link fails to pin
- */
- links[i] = NULL;
- }
- /* finish() will tell UMD process to exit */
- err = bpf_preload_ops->finish();
- if (err)
- goto out_put;
}
out_put:
bpf_preload_mod_put();
out:
mutex_unlock(&bpf_preload_lock);
- for (i = 0; i < BPF_PRELOAD_LINKS && err; i++)
- if (!IS_ERR_OR_NULL(links[i]))
- bpf_link_put(links[i]);
return err;
}
static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)
{
static const struct tree_descr bpf_rfiles[] = { { "" } };
- struct bpf_mount_opts *opts = fc->fs_private;
+ struct bpf_mount_opts *opts = sb->s_fs_info;
struct inode *inode;
int ret;
+ /* Mounting an instance of BPF FS requires privileges */
+ if (fc->user_ns != &init_user_ns && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles);
if (ret)
return ret;
@@ -773,6 +1020,8 @@ static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_op = &bpf_super_ops;
inode = sb->s_root->d_inode;
+ inode->i_uid = opts->uid;
+ inode->i_gid = opts->gid;
inode->i_op = &bpf_dir_iops;
inode->i_mode &= ~S_IALLUGO;
populate_bpffs(sb->s_root);
@@ -787,7 +1036,7 @@ static int bpf_get_tree(struct fs_context *fc)
static void bpf_free_fc(struct fs_context *fc)
{
- kfree(fc->fs_private);
+ kfree(fc->s_fs_info);
}
static const struct fs_context_operations bpf_context_ops = {
@@ -808,18 +1057,35 @@ static int bpf_init_fs_context(struct fs_context *fc)
return -ENOMEM;
opts->mode = S_IRWXUGO;
+ opts->uid = current_fsuid();
+ opts->gid = current_fsgid();
- fc->fs_private = opts;
+ /* start out with no BPF token delegation enabled */
+ opts->delegate_cmds = 0;
+ opts->delegate_maps = 0;
+ opts->delegate_progs = 0;
+ opts->delegate_attachs = 0;
+
+ fc->s_fs_info = opts;
fc->ops = &bpf_context_ops;
return 0;
}
+static void bpf_kill_super(struct super_block *sb)
+{
+ struct bpf_mount_opts *opts = sb->s_fs_info;
+
+ kill_anon_super(sb);
+ kfree(opts);
+}
+
static struct file_system_type bpf_fs_type = {
.owner = THIS_MODULE,
.name = "bpf",
.init_fs_context = bpf_init_fs_context,
.parameters = bpf_fs_parameters,
- .kill_sb = kill_litter_super,
+ .kill_sb = bpf_kill_super,
+ .fs_flags = FS_USERNS_MOUNT,
};
static int __init bpf_init(void)
diff --git a/kernel/bpf/kmem_cache_iter.c b/kernel/bpf/kmem_cache_iter.c
new file mode 100644
index 000000000000..3ae2158d767f
--- /dev/null
+++ b/kernel/bpf/kmem_cache_iter.c
@@ -0,0 +1,238 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024 Google */
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/seq_file.h>
+
+#include "../../mm/slab.h" /* kmem_cache, slab_caches and slab_mutex */
+
+/* open-coded version */
+struct bpf_iter_kmem_cache {
+ __u64 __opaque[1];
+} __attribute__((aligned(8)));
+
+struct bpf_iter_kmem_cache_kern {
+ struct kmem_cache *pos;
+} __attribute__((aligned(8)));
+
+#define KMEM_CACHE_POS_START ((void *)1L)
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_iter_kmem_cache_new(struct bpf_iter_kmem_cache *it)
+{
+ struct bpf_iter_kmem_cache_kern *kit = (void *)it;
+
+ BUILD_BUG_ON(sizeof(*kit) > sizeof(*it));
+ BUILD_BUG_ON(__alignof__(*kit) != __alignof__(*it));
+
+ kit->pos = KMEM_CACHE_POS_START;
+ return 0;
+}
+
+__bpf_kfunc struct kmem_cache *bpf_iter_kmem_cache_next(struct bpf_iter_kmem_cache *it)
+{
+ struct bpf_iter_kmem_cache_kern *kit = (void *)it;
+ struct kmem_cache *prev = kit->pos;
+ struct kmem_cache *next;
+ bool destroy = false;
+
+ if (!prev)
+ return NULL;
+
+ mutex_lock(&slab_mutex);
+
+ if (list_empty(&slab_caches)) {
+ mutex_unlock(&slab_mutex);
+ return NULL;
+ }
+
+ if (prev == KMEM_CACHE_POS_START)
+ next = list_first_entry(&slab_caches, struct kmem_cache, list);
+ else if (list_last_entry(&slab_caches, struct kmem_cache, list) == prev)
+ next = NULL;
+ else
+ next = list_next_entry(prev, list);
+
+ /* boot_caches have negative refcount, don't touch them */
+ if (next && next->refcount > 0)
+ next->refcount++;
+
+ /* Skip kmem_cache_destroy() for active entries */
+ if (prev && prev != KMEM_CACHE_POS_START) {
+ if (prev->refcount > 1)
+ prev->refcount--;
+ else if (prev->refcount == 1)
+ destroy = true;
+ }
+
+ mutex_unlock(&slab_mutex);
+
+ if (destroy)
+ kmem_cache_destroy(prev);
+
+ kit->pos = next;
+ return next;
+}
+
+__bpf_kfunc void bpf_iter_kmem_cache_destroy(struct bpf_iter_kmem_cache *it)
+{
+ struct bpf_iter_kmem_cache_kern *kit = (void *)it;
+ struct kmem_cache *s = kit->pos;
+ bool destroy = false;
+
+ if (s == NULL || s == KMEM_CACHE_POS_START)
+ return;
+
+ mutex_lock(&slab_mutex);
+
+ /* Skip kmem_cache_destroy() for active entries */
+ if (s->refcount > 1)
+ s->refcount--;
+ else if (s->refcount == 1)
+ destroy = true;
+
+ mutex_unlock(&slab_mutex);
+
+ if (destroy)
+ kmem_cache_destroy(s);
+}
+
+__bpf_kfunc_end_defs();
+
+struct bpf_iter__kmem_cache {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct kmem_cache *, s);
+};
+
+union kmem_cache_iter_priv {
+ struct bpf_iter_kmem_cache it;
+ struct bpf_iter_kmem_cache_kern kit;
+};
+
+static void *kmem_cache_iter_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ loff_t cnt = 0;
+ bool found = false;
+ struct kmem_cache *s;
+ union kmem_cache_iter_priv *p = seq->private;
+
+ mutex_lock(&slab_mutex);
+
+ /* Find an entry at the given position in the slab_caches list instead
+ * of keeping a reference (of the last visited entry, if any) out of
+ * slab_mutex. It might miss something if one is deleted in the middle
+ * while it releases the lock. But it should be rare and there's not
+ * much we can do about it.
+ */
+ list_for_each_entry(s, &slab_caches, list) {
+ if (cnt == *pos) {
+ /* Make sure this entry remains in the list by getting
+ * a new reference count. Note that boot_cache entries
+ * have a negative refcount, so don't touch them.
+ */
+ if (s->refcount > 0)
+ s->refcount++;
+ found = true;
+ break;
+ }
+ cnt++;
+ }
+ mutex_unlock(&slab_mutex);
+
+ if (!found)
+ s = NULL;
+
+ p->kit.pos = s;
+ return s;
+}
+
+static void kmem_cache_iter_seq_stop(struct seq_file *seq, void *v)
+{
+ struct bpf_iter_meta meta;
+ struct bpf_iter__kmem_cache ctx = {
+ .meta = &meta,
+ .s = v,
+ };
+ union kmem_cache_iter_priv *p = seq->private;
+ struct bpf_prog *prog;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, true);
+ if (prog && !ctx.s)
+ bpf_iter_run_prog(prog, &ctx);
+
+ bpf_iter_kmem_cache_destroy(&p->it);
+}
+
+static void *kmem_cache_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ union kmem_cache_iter_priv *p = seq->private;
+
+ ++*pos;
+
+ return bpf_iter_kmem_cache_next(&p->it);
+}
+
+static int kmem_cache_iter_seq_show(struct seq_file *seq, void *v)
+{
+ struct bpf_iter_meta meta;
+ struct bpf_iter__kmem_cache ctx = {
+ .meta = &meta,
+ .s = v,
+ };
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, false);
+ if (prog)
+ ret = bpf_iter_run_prog(prog, &ctx);
+
+ return ret;
+}
+
+static const struct seq_operations kmem_cache_iter_seq_ops = {
+ .start = kmem_cache_iter_seq_start,
+ .next = kmem_cache_iter_seq_next,
+ .stop = kmem_cache_iter_seq_stop,
+ .show = kmem_cache_iter_seq_show,
+};
+
+BTF_ID_LIST_GLOBAL_SINGLE(bpf_kmem_cache_btf_id, struct, kmem_cache)
+
+static const struct bpf_iter_seq_info kmem_cache_iter_seq_info = {
+ .seq_ops = &kmem_cache_iter_seq_ops,
+ .seq_priv_size = sizeof(union kmem_cache_iter_priv),
+};
+
+static void bpf_iter_kmem_cache_show_fdinfo(const struct bpf_iter_aux_info *aux,
+ struct seq_file *seq)
+{
+ seq_puts(seq, "kmem_cache iter\n");
+}
+
+DEFINE_BPF_ITER_FUNC(kmem_cache, struct bpf_iter_meta *meta,
+ struct kmem_cache *s)
+
+static struct bpf_iter_reg bpf_kmem_cache_reg_info = {
+ .target = "kmem_cache",
+ .feature = BPF_ITER_RESCHED,
+ .show_fdinfo = bpf_iter_kmem_cache_show_fdinfo,
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__kmem_cache, s),
+ PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
+ },
+ .seq_info = &kmem_cache_iter_seq_info,
+};
+
+static int __init bpf_kmem_cache_iter_init(void)
+{
+ bpf_kmem_cache_reg_info.ctx_arg_info[0].btf_id = bpf_kmem_cache_btf_id[0];
+ return bpf_iter_reg_target(&bpf_kmem_cache_reg_info);
+}
+
+late_initcall(bpf_kmem_cache_iter_init);
diff --git a/kernel/bpf/link_iter.c b/kernel/bpf/link_iter.c
new file mode 100644
index 000000000000..8158e9c1af7b
--- /dev/null
+++ b/kernel/bpf/link_iter.c
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2022 Red Hat, Inc. */
+#include <linux/bpf.h>
+#include <linux/fs.h>
+#include <linux/filter.h>
+#include <linux/kernel.h>
+#include <linux/btf_ids.h>
+
+struct bpf_iter_seq_link_info {
+ u32 link_id;
+};
+
+static void *bpf_link_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct bpf_iter_seq_link_info *info = seq->private;
+ struct bpf_link *link;
+
+ link = bpf_link_get_curr_or_next(&info->link_id);
+ if (!link)
+ return NULL;
+
+ if (*pos == 0)
+ ++*pos;
+ return link;
+}
+
+static void *bpf_link_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_iter_seq_link_info *info = seq->private;
+
+ ++*pos;
+ ++info->link_id;
+ bpf_link_put((struct bpf_link *)v);
+ return bpf_link_get_curr_or_next(&info->link_id);
+}
+
+struct bpf_iter__bpf_link {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct bpf_link *, link);
+};
+
+DEFINE_BPF_ITER_FUNC(bpf_link, struct bpf_iter_meta *meta, struct bpf_link *link)
+
+static int __bpf_link_seq_show(struct seq_file *seq, void *v, bool in_stop)
+{
+ struct bpf_iter__bpf_link ctx;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ ctx.meta = &meta;
+ ctx.link = v;
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (prog)
+ ret = bpf_iter_run_prog(prog, &ctx);
+
+ return ret;
+}
+
+static int bpf_link_seq_show(struct seq_file *seq, void *v)
+{
+ return __bpf_link_seq_show(seq, v, false);
+}
+
+static void bpf_link_seq_stop(struct seq_file *seq, void *v)
+{
+ if (!v)
+ (void)__bpf_link_seq_show(seq, v, true);
+ else
+ bpf_link_put((struct bpf_link *)v);
+}
+
+static const struct seq_operations bpf_link_seq_ops = {
+ .start = bpf_link_seq_start,
+ .next = bpf_link_seq_next,
+ .stop = bpf_link_seq_stop,
+ .show = bpf_link_seq_show,
+};
+
+BTF_ID_LIST_SINGLE(btf_bpf_link_id, struct, bpf_link)
+
+static const struct bpf_iter_seq_info bpf_link_seq_info = {
+ .seq_ops = &bpf_link_seq_ops,
+ .init_seq_private = NULL,
+ .fini_seq_private = NULL,
+ .seq_priv_size = sizeof(struct bpf_iter_seq_link_info),
+};
+
+static struct bpf_iter_reg bpf_link_reg_info = {
+ .target = "bpf_link",
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__bpf_link, link),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+ .seq_info = &bpf_link_seq_info,
+};
+
+static int __init bpf_link_iter_init(void)
+{
+ bpf_link_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_link_id;
+ return bpf_iter_reg_target(&bpf_link_reg_info);
+}
+
+late_initcall(bpf_link_iter_init);
diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c
new file mode 100644
index 000000000000..60db5d655495
--- /dev/null
+++ b/kernel/bpf/liveness.c
@@ -0,0 +1,753 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+
+#include <linux/bpf_verifier.h>
+#include <linux/hashtable.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+
+/*
+ * This file implements live stack slots analysis. After accumulating
+ * stack usage data, the analysis answers queries about whether a
+ * particular stack slot may be read by an instruction or any of it's
+ * successors. This data is consumed by the verifier states caching
+ * mechanism to decide which stack slots are important when looking for a
+ * visited state corresponding to the current state.
+ *
+ * The analysis is call chain sensitive, meaning that data is collected
+ * and queried for tuples (call chain, subprogram instruction index).
+ * Such sensitivity allows identifying if some subprogram call always
+ * leads to writes in the caller's stack.
+ *
+ * The basic idea is as follows:
+ * - As the verifier accumulates a set of visited states, the analysis instance
+ * accumulates a conservative estimate of stack slots that can be read
+ * or must be written for each visited tuple (call chain, instruction index).
+ * - If several states happen to visit the same instruction with the same
+ * call chain, stack usage information for the corresponding tuple is joined:
+ * - "may_read" set represents a union of all possibly read slots
+ * (any slot in "may_read" set might be read at or after the instruction);
+ * - "must_write" set represents an intersection of all possibly written slots
+ * (any slot in "must_write" set is guaranteed to be written by the instruction).
+ * - The analysis is split into two phases:
+ * - read and write marks accumulation;
+ * - read and write marks propagation.
+ * - The propagation phase is a textbook live variable data flow analysis:
+ *
+ * state[cc, i].live_after = U [state[cc, s].live_before for s in bpf_insn_successors(i)]
+ * state[cc, i].live_before =
+ * (state[cc, i].live_after / state[cc, i].must_write) U state[i].may_read
+ *
+ * Where:
+ * - `U` stands for set union
+ * - `/` stands for set difference;
+ * - `cc` stands for a call chain;
+ * - `i` and `s` are instruction indexes;
+ *
+ * The above equations are computed for each call chain and instruction
+ * index until state stops changing.
+ * - Additionally, in order to transfer "must_write" information from a
+ * subprogram to call instructions invoking this subprogram,
+ * the "must_write_acc" set is tracked for each (cc, i) tuple.
+ * A set of stack slots that are guaranteed to be written by this
+ * instruction or any of its successors (within the subprogram).
+ * The equation for "must_write_acc" propagation looks as follows:
+ *
+ * state[cc, i].must_write_acc =
+ * ∩ [state[cc, s].must_write_acc for s in bpf_insn_successors(i)]
+ * U state[cc, i].must_write
+ *
+ * (An intersection of all "must_write_acc" for instruction successors
+ * plus all "must_write" slots for the instruction itself).
+ * - After the propagation phase completes for a subprogram, information from
+ * (cc, 0) tuple (subprogram entry) is transferred to the caller's call chain:
+ * - "must_write_acc" set is intersected with the call site's "must_write" set;
+ * - "may_read" set is added to the call site's "may_read" set.
+ * - Any live stack queries must be taken after the propagation phase.
+ * - Accumulation and propagation phases can be entered multiple times,
+ * at any point in time:
+ * - "may_read" set only grows;
+ * - "must_write" set only shrinks;
+ * - for each visited verifier state with zero branches, all relevant
+ * read and write marks are already recorded by the analysis instance.
+ *
+ * Technically, the analysis is facilitated by the following data structures:
+ * - Call chain: for given verifier state, the call chain is a tuple of call
+ * instruction indexes leading to the current subprogram plus the subprogram
+ * entry point index.
+ * - Function instance: for a given call chain, for each instruction in
+ * the current subprogram, a mapping between instruction index and a
+ * set of "may_read", "must_write" and other marks accumulated for this
+ * instruction.
+ * - A hash table mapping call chains to function instances.
+ */
+
+struct callchain {
+ u32 callsites[MAX_CALL_FRAMES]; /* instruction pointer for each frame */
+ /* cached subprog_info[*].start for functions owning the frames:
+ * - sp_starts[curframe] used to get insn relative index within current function;
+ * - sp_starts[0..current-1] used for fast callchain_frame_up().
+ */
+ u32 sp_starts[MAX_CALL_FRAMES];
+ u32 curframe; /* depth of callsites and sp_starts arrays */
+};
+
+struct per_frame_masks {
+ u64 may_read; /* stack slots that may be read by this instruction */
+ u64 must_write; /* stack slots written by this instruction */
+ u64 must_write_acc; /* stack slots written by this instruction and its successors */
+ u64 live_before; /* stack slots that may be read by this insn and its successors */
+};
+
+/*
+ * A function instance created for a specific callchain.
+ * Encapsulates read and write marks for each instruction in the function.
+ * Marks are tracked for each frame in the callchain.
+ */
+struct func_instance {
+ struct hlist_node hl_node;
+ struct callchain callchain;
+ u32 insn_cnt; /* cached number of insns in the function */
+ bool updated;
+ bool must_write_dropped;
+ /* Per frame, per instruction masks, frames allocated lazily. */
+ struct per_frame_masks *frames[MAX_CALL_FRAMES];
+ /* For each instruction a flag telling if "must_write" had been initialized for it. */
+ bool *must_write_set;
+};
+
+struct live_stack_query {
+ struct func_instance *instances[MAX_CALL_FRAMES]; /* valid in range [0..curframe] */
+ u32 curframe;
+ u32 insn_idx;
+};
+
+struct bpf_liveness {
+ DECLARE_HASHTABLE(func_instances, 8); /* maps callchain to func_instance */
+ struct live_stack_query live_stack_query; /* cache to avoid repetitive ht lookups */
+ /* Cached instance corresponding to env->cur_state, avoids per-instruction ht lookup */
+ struct func_instance *cur_instance;
+ /*
+ * Below fields are used to accumulate stack write marks for instruction at
+ * @write_insn_idx before submitting the marks to @cur_instance.
+ */
+ u64 write_masks_acc[MAX_CALL_FRAMES];
+ u32 write_insn_idx;
+};
+
+/* Compute callchain corresponding to state @st at depth @frameno */
+static void compute_callchain(struct bpf_verifier_env *env, struct bpf_verifier_state *st,
+ struct callchain *callchain, u32 frameno)
+{
+ struct bpf_subprog_info *subprog_info = env->subprog_info;
+ u32 i;
+
+ memset(callchain, 0, sizeof(*callchain));
+ for (i = 0; i <= frameno; i++) {
+ callchain->sp_starts[i] = subprog_info[st->frame[i]->subprogno].start;
+ if (i < st->curframe)
+ callchain->callsites[i] = st->frame[i + 1]->callsite;
+ }
+ callchain->curframe = frameno;
+ callchain->callsites[callchain->curframe] = callchain->sp_starts[callchain->curframe];
+}
+
+static u32 hash_callchain(struct callchain *callchain)
+{
+ return jhash2(callchain->callsites, callchain->curframe, 0);
+}
+
+static bool same_callsites(struct callchain *a, struct callchain *b)
+{
+ int i;
+
+ if (a->curframe != b->curframe)
+ return false;
+ for (i = a->curframe; i >= 0; i--)
+ if (a->callsites[i] != b->callsites[i])
+ return false;
+ return true;
+}
+
+/*
+ * Find existing or allocate new function instance corresponding to @callchain.
+ * Instances are accumulated in env->liveness->func_instances and persist
+ * until the end of the verification process.
+ */
+static struct func_instance *__lookup_instance(struct bpf_verifier_env *env,
+ struct callchain *callchain)
+{
+ struct bpf_liveness *liveness = env->liveness;
+ struct bpf_subprog_info *subprog;
+ struct func_instance *result;
+ u32 subprog_sz, size, key;
+
+ key = hash_callchain(callchain);
+ hash_for_each_possible(liveness->func_instances, result, hl_node, key)
+ if (same_callsites(&result->callchain, callchain))
+ return result;
+
+ subprog = bpf_find_containing_subprog(env, callchain->sp_starts[callchain->curframe]);
+ subprog_sz = (subprog + 1)->start - subprog->start;
+ size = sizeof(struct func_instance);
+ result = kvzalloc(size, GFP_KERNEL_ACCOUNT);
+ if (!result)
+ return ERR_PTR(-ENOMEM);
+ result->must_write_set = kvcalloc(subprog_sz, sizeof(*result->must_write_set),
+ GFP_KERNEL_ACCOUNT);
+ if (!result->must_write_set) {
+ kvfree(result);
+ return ERR_PTR(-ENOMEM);
+ }
+ memcpy(&result->callchain, callchain, sizeof(*callchain));
+ result->insn_cnt = subprog_sz;
+ hash_add(liveness->func_instances, &result->hl_node, key);
+ return result;
+}
+
+static struct func_instance *lookup_instance(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st,
+ u32 frameno)
+{
+ struct callchain callchain;
+
+ compute_callchain(env, st, &callchain, frameno);
+ return __lookup_instance(env, &callchain);
+}
+
+int bpf_stack_liveness_init(struct bpf_verifier_env *env)
+{
+ env->liveness = kvzalloc(sizeof(*env->liveness), GFP_KERNEL_ACCOUNT);
+ if (!env->liveness)
+ return -ENOMEM;
+ hash_init(env->liveness->func_instances);
+ return 0;
+}
+
+void bpf_stack_liveness_free(struct bpf_verifier_env *env)
+{
+ struct func_instance *instance;
+ struct hlist_node *tmp;
+ int bkt, i;
+
+ if (!env->liveness)
+ return;
+ hash_for_each_safe(env->liveness->func_instances, bkt, tmp, instance, hl_node) {
+ for (i = 0; i <= instance->callchain.curframe; i++)
+ kvfree(instance->frames[i]);
+ kvfree(instance->must_write_set);
+ kvfree(instance);
+ }
+ kvfree(env->liveness);
+}
+
+/*
+ * Convert absolute instruction index @insn_idx to an index relative
+ * to start of the function corresponding to @instance.
+ */
+static int relative_idx(struct func_instance *instance, u32 insn_idx)
+{
+ return insn_idx - instance->callchain.sp_starts[instance->callchain.curframe];
+}
+
+static struct per_frame_masks *get_frame_masks(struct func_instance *instance,
+ u32 frame, u32 insn_idx)
+{
+ if (!instance->frames[frame])
+ return NULL;
+
+ return &instance->frames[frame][relative_idx(instance, insn_idx)];
+}
+
+static struct per_frame_masks *alloc_frame_masks(struct bpf_verifier_env *env,
+ struct func_instance *instance,
+ u32 frame, u32 insn_idx)
+{
+ struct per_frame_masks *arr;
+
+ if (!instance->frames[frame]) {
+ arr = kvcalloc(instance->insn_cnt, sizeof(*arr), GFP_KERNEL_ACCOUNT);
+ instance->frames[frame] = arr;
+ if (!arr)
+ return ERR_PTR(-ENOMEM);
+ }
+ return get_frame_masks(instance, frame, insn_idx);
+}
+
+void bpf_reset_live_stack_callchain(struct bpf_verifier_env *env)
+{
+ env->liveness->cur_instance = NULL;
+}
+
+/* If @env->liveness->cur_instance is null, set it to instance corresponding to @env->cur_state. */
+static int ensure_cur_instance(struct bpf_verifier_env *env)
+{
+ struct bpf_liveness *liveness = env->liveness;
+ struct func_instance *instance;
+
+ if (liveness->cur_instance)
+ return 0;
+
+ instance = lookup_instance(env, env->cur_state, env->cur_state->curframe);
+ if (IS_ERR(instance))
+ return PTR_ERR(instance);
+
+ liveness->cur_instance = instance;
+ return 0;
+}
+
+/* Accumulate may_read masks for @frame at @insn_idx */
+static int mark_stack_read(struct bpf_verifier_env *env,
+ struct func_instance *instance, u32 frame, u32 insn_idx, u64 mask)
+{
+ struct per_frame_masks *masks;
+ u64 new_may_read;
+
+ masks = alloc_frame_masks(env, instance, frame, insn_idx);
+ if (IS_ERR(masks))
+ return PTR_ERR(masks);
+ new_may_read = masks->may_read | mask;
+ if (new_may_read != masks->may_read &&
+ ((new_may_read | masks->live_before) != masks->live_before))
+ instance->updated = true;
+ masks->may_read |= mask;
+ return 0;
+}
+
+int bpf_mark_stack_read(struct bpf_verifier_env *env, u32 frame, u32 insn_idx, u64 mask)
+{
+ int err;
+
+ err = ensure_cur_instance(env);
+ err = err ?: mark_stack_read(env, env->liveness->cur_instance, frame, insn_idx, mask);
+ return err;
+}
+
+static void reset_stack_write_marks(struct bpf_verifier_env *env,
+ struct func_instance *instance, u32 insn_idx)
+{
+ struct bpf_liveness *liveness = env->liveness;
+ int i;
+
+ liveness->write_insn_idx = insn_idx;
+ for (i = 0; i <= instance->callchain.curframe; i++)
+ liveness->write_masks_acc[i] = 0;
+}
+
+int bpf_reset_stack_write_marks(struct bpf_verifier_env *env, u32 insn_idx)
+{
+ struct bpf_liveness *liveness = env->liveness;
+ int err;
+
+ err = ensure_cur_instance(env);
+ if (err)
+ return err;
+
+ reset_stack_write_marks(env, liveness->cur_instance, insn_idx);
+ return 0;
+}
+
+void bpf_mark_stack_write(struct bpf_verifier_env *env, u32 frame, u64 mask)
+{
+ env->liveness->write_masks_acc[frame] |= mask;
+}
+
+static int commit_stack_write_marks(struct bpf_verifier_env *env,
+ struct func_instance *instance)
+{
+ struct bpf_liveness *liveness = env->liveness;
+ u32 idx, frame, curframe, old_must_write;
+ struct per_frame_masks *masks;
+ u64 mask;
+
+ if (!instance)
+ return 0;
+
+ curframe = instance->callchain.curframe;
+ idx = relative_idx(instance, liveness->write_insn_idx);
+ for (frame = 0; frame <= curframe; frame++) {
+ mask = liveness->write_masks_acc[frame];
+ /* avoid allocating frames for zero masks */
+ if (mask == 0 && !instance->must_write_set[idx])
+ continue;
+ masks = alloc_frame_masks(env, instance, frame, liveness->write_insn_idx);
+ if (IS_ERR(masks))
+ return PTR_ERR(masks);
+ old_must_write = masks->must_write;
+ /*
+ * If instruction at this callchain is seen for a first time, set must_write equal
+ * to @mask. Otherwise take intersection with the previous value.
+ */
+ if (instance->must_write_set[idx])
+ mask &= old_must_write;
+ if (old_must_write != mask) {
+ masks->must_write = mask;
+ instance->updated = true;
+ }
+ if (old_must_write & ~mask)
+ instance->must_write_dropped = true;
+ }
+ instance->must_write_set[idx] = true;
+ liveness->write_insn_idx = 0;
+ return 0;
+}
+
+/*
+ * Merge stack writes marks in @env->liveness->write_masks_acc
+ * with information already in @env->liveness->cur_instance.
+ */
+int bpf_commit_stack_write_marks(struct bpf_verifier_env *env)
+{
+ return commit_stack_write_marks(env, env->liveness->cur_instance);
+}
+
+static char *fmt_callchain(struct bpf_verifier_env *env, struct callchain *callchain)
+{
+ char *buf_end = env->tmp_str_buf + sizeof(env->tmp_str_buf);
+ char *buf = env->tmp_str_buf;
+ int i;
+
+ buf += snprintf(buf, buf_end - buf, "(");
+ for (i = 0; i <= callchain->curframe; i++)
+ buf += snprintf(buf, buf_end - buf, "%s%d", i ? "," : "", callchain->callsites[i]);
+ snprintf(buf, buf_end - buf, ")");
+ return env->tmp_str_buf;
+}
+
+static void log_mask_change(struct bpf_verifier_env *env, struct callchain *callchain,
+ char *pfx, u32 frame, u32 insn_idx, u64 old, u64 new)
+{
+ u64 changed_bits = old ^ new;
+ u64 new_ones = new & changed_bits;
+ u64 new_zeros = ~new & changed_bits;
+
+ if (!changed_bits)
+ return;
+ bpf_log(&env->log, "%s frame %d insn %d ", fmt_callchain(env, callchain), frame, insn_idx);
+ if (new_ones) {
+ bpf_fmt_stack_mask(env->tmp_str_buf, sizeof(env->tmp_str_buf), new_ones);
+ bpf_log(&env->log, "+%s %s ", pfx, env->tmp_str_buf);
+ }
+ if (new_zeros) {
+ bpf_fmt_stack_mask(env->tmp_str_buf, sizeof(env->tmp_str_buf), new_zeros);
+ bpf_log(&env->log, "-%s %s", pfx, env->tmp_str_buf);
+ }
+ bpf_log(&env->log, "\n");
+}
+
+int bpf_jmp_offset(struct bpf_insn *insn)
+{
+ u8 code = insn->code;
+
+ if (code == (BPF_JMP32 | BPF_JA))
+ return insn->imm;
+ return insn->off;
+}
+
+__diag_push();
+__diag_ignore_all("-Woverride-init", "Allow field initialization overrides for opcode_info_tbl");
+
+/*
+ * Returns an array of instructions succ, with succ->items[0], ...,
+ * succ->items[n-1] with successor instructions, where n=succ->cnt
+ */
+inline struct bpf_iarray *
+bpf_insn_successors(struct bpf_verifier_env *env, u32 idx)
+{
+ static const struct opcode_info {
+ bool can_jump;
+ bool can_fallthrough;
+ } opcode_info_tbl[256] = {
+ [0 ... 255] = {.can_jump = false, .can_fallthrough = true},
+ #define _J(code, ...) \
+ [BPF_JMP | code] = __VA_ARGS__, \
+ [BPF_JMP32 | code] = __VA_ARGS__
+
+ _J(BPF_EXIT, {.can_jump = false, .can_fallthrough = false}),
+ _J(BPF_JA, {.can_jump = true, .can_fallthrough = false}),
+ _J(BPF_JEQ, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JNE, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JLT, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JLE, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JGT, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JGE, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JSGT, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JSGE, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JSLT, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JSLE, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JCOND, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JSET, {.can_jump = true, .can_fallthrough = true}),
+ #undef _J
+ };
+ struct bpf_prog *prog = env->prog;
+ struct bpf_insn *insn = &prog->insnsi[idx];
+ const struct opcode_info *opcode_info;
+ struct bpf_iarray *succ, *jt;
+ int insn_sz;
+
+ jt = env->insn_aux_data[idx].jt;
+ if (unlikely(jt))
+ return jt;
+
+ /* pre-allocated array of size up to 2; reset cnt, as it may have been used already */
+ succ = env->succ;
+ succ->cnt = 0;
+
+ opcode_info = &opcode_info_tbl[BPF_CLASS(insn->code) | BPF_OP(insn->code)];
+ insn_sz = bpf_is_ldimm64(insn) ? 2 : 1;
+ if (opcode_info->can_fallthrough)
+ succ->items[succ->cnt++] = idx + insn_sz;
+
+ if (opcode_info->can_jump)
+ succ->items[succ->cnt++] = idx + bpf_jmp_offset(insn) + 1;
+
+ return succ;
+}
+
+__diag_pop();
+
+static struct func_instance *get_outer_instance(struct bpf_verifier_env *env,
+ struct func_instance *instance)
+{
+ struct callchain callchain = instance->callchain;
+
+ /* Adjust @callchain to represent callchain one frame up */
+ callchain.callsites[callchain.curframe] = 0;
+ callchain.sp_starts[callchain.curframe] = 0;
+ callchain.curframe--;
+ callchain.callsites[callchain.curframe] = callchain.sp_starts[callchain.curframe];
+ return __lookup_instance(env, &callchain);
+}
+
+static u32 callchain_subprog_start(struct callchain *callchain)
+{
+ return callchain->sp_starts[callchain->curframe];
+}
+
+/*
+ * Transfer @may_read and @must_write_acc marks from the first instruction of @instance,
+ * to the call instruction in function instance calling @instance.
+ */
+static int propagate_to_outer_instance(struct bpf_verifier_env *env,
+ struct func_instance *instance)
+{
+ struct callchain *callchain = &instance->callchain;
+ u32 this_subprog_start, callsite, frame;
+ struct func_instance *outer_instance;
+ struct per_frame_masks *insn;
+ int err;
+
+ this_subprog_start = callchain_subprog_start(callchain);
+ outer_instance = get_outer_instance(env, instance);
+ if (IS_ERR(outer_instance))
+ return PTR_ERR(outer_instance);
+ callsite = callchain->callsites[callchain->curframe - 1];
+
+ reset_stack_write_marks(env, outer_instance, callsite);
+ for (frame = 0; frame < callchain->curframe; frame++) {
+ insn = get_frame_masks(instance, frame, this_subprog_start);
+ if (!insn)
+ continue;
+ bpf_mark_stack_write(env, frame, insn->must_write_acc);
+ err = mark_stack_read(env, outer_instance, frame, callsite, insn->live_before);
+ if (err)
+ return err;
+ }
+ commit_stack_write_marks(env, outer_instance);
+ return 0;
+}
+
+static inline bool update_insn(struct bpf_verifier_env *env,
+ struct func_instance *instance, u32 frame, u32 insn_idx)
+{
+ struct bpf_insn_aux_data *aux = env->insn_aux_data;
+ u64 new_before, new_after, must_write_acc;
+ struct per_frame_masks *insn, *succ_insn;
+ struct bpf_iarray *succ;
+ u32 s;
+ bool changed;
+
+ succ = bpf_insn_successors(env, insn_idx);
+ if (succ->cnt == 0)
+ return false;
+
+ changed = false;
+ insn = get_frame_masks(instance, frame, insn_idx);
+ new_before = 0;
+ new_after = 0;
+ /*
+ * New "must_write_acc" is an intersection of all "must_write_acc"
+ * of successors plus all "must_write" slots of instruction itself.
+ */
+ must_write_acc = U64_MAX;
+ for (s = 0; s < succ->cnt; ++s) {
+ succ_insn = get_frame_masks(instance, frame, succ->items[s]);
+ new_after |= succ_insn->live_before;
+ must_write_acc &= succ_insn->must_write_acc;
+ }
+ must_write_acc |= insn->must_write;
+ /*
+ * New "live_before" is a union of all "live_before" of successors
+ * minus slots written by instruction plus slots read by instruction.
+ */
+ new_before = (new_after & ~insn->must_write) | insn->may_read;
+ changed |= new_before != insn->live_before;
+ changed |= must_write_acc != insn->must_write_acc;
+ if (unlikely(env->log.level & BPF_LOG_LEVEL2) &&
+ (insn->may_read || insn->must_write ||
+ insn_idx == callchain_subprog_start(&instance->callchain) ||
+ aux[insn_idx].prune_point)) {
+ log_mask_change(env, &instance->callchain, "live",
+ frame, insn_idx, insn->live_before, new_before);
+ log_mask_change(env, &instance->callchain, "written",
+ frame, insn_idx, insn->must_write_acc, must_write_acc);
+ }
+ insn->live_before = new_before;
+ insn->must_write_acc = must_write_acc;
+ return changed;
+}
+
+/* Fixed-point computation of @live_before and @must_write_acc marks */
+static int update_instance(struct bpf_verifier_env *env, struct func_instance *instance)
+{
+ u32 i, frame, po_start, po_end, cnt, this_subprog_start;
+ struct callchain *callchain = &instance->callchain;
+ int *insn_postorder = env->cfg.insn_postorder;
+ struct bpf_subprog_info *subprog;
+ struct per_frame_masks *insn;
+ bool changed;
+ int err;
+
+ this_subprog_start = callchain_subprog_start(callchain);
+ /*
+ * If must_write marks were updated must_write_acc needs to be reset
+ * (to account for the case when new must_write sets became smaller).
+ */
+ if (instance->must_write_dropped) {
+ for (frame = 0; frame <= callchain->curframe; frame++) {
+ if (!instance->frames[frame])
+ continue;
+
+ for (i = 0; i < instance->insn_cnt; i++) {
+ insn = get_frame_masks(instance, frame, this_subprog_start + i);
+ insn->must_write_acc = 0;
+ }
+ }
+ }
+
+ subprog = bpf_find_containing_subprog(env, this_subprog_start);
+ po_start = subprog->postorder_start;
+ po_end = (subprog + 1)->postorder_start;
+ cnt = 0;
+ /* repeat until fixed point is reached */
+ do {
+ cnt++;
+ changed = false;
+ for (frame = 0; frame <= instance->callchain.curframe; frame++) {
+ if (!instance->frames[frame])
+ continue;
+
+ for (i = po_start; i < po_end; i++)
+ changed |= update_insn(env, instance, frame, insn_postorder[i]);
+ }
+ } while (changed);
+
+ if (env->log.level & BPF_LOG_LEVEL2)
+ bpf_log(&env->log, "%s live stack update done in %d iterations\n",
+ fmt_callchain(env, callchain), cnt);
+
+ /* transfer marks accumulated for outer frames to outer func instance (caller) */
+ if (callchain->curframe > 0) {
+ err = propagate_to_outer_instance(env, instance);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+/*
+ * Prepare all callchains within @env->cur_state for querying.
+ * This function should be called after each verifier.c:pop_stack()
+ * and whenever verifier.c:do_check_insn() processes subprogram exit.
+ * This would guarantee that visited verifier states with zero branches
+ * have their bpf_mark_stack_{read,write}() effects propagated in
+ * @env->liveness.
+ */
+int bpf_update_live_stack(struct bpf_verifier_env *env)
+{
+ struct func_instance *instance;
+ int err, frame;
+
+ bpf_reset_live_stack_callchain(env);
+ for (frame = env->cur_state->curframe; frame >= 0; --frame) {
+ instance = lookup_instance(env, env->cur_state, frame);
+ if (IS_ERR(instance))
+ return PTR_ERR(instance);
+
+ if (instance->updated) {
+ err = update_instance(env, instance);
+ if (err)
+ return err;
+ instance->updated = false;
+ instance->must_write_dropped = false;
+ }
+ }
+ return 0;
+}
+
+static bool is_live_before(struct func_instance *instance, u32 insn_idx, u32 frameno, u32 spi)
+{
+ struct per_frame_masks *masks;
+
+ masks = get_frame_masks(instance, frameno, insn_idx);
+ return masks && (masks->live_before & BIT(spi));
+}
+
+int bpf_live_stack_query_init(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+ struct live_stack_query *q = &env->liveness->live_stack_query;
+ struct func_instance *instance;
+ u32 frame;
+
+ memset(q, 0, sizeof(*q));
+ for (frame = 0; frame <= st->curframe; frame++) {
+ instance = lookup_instance(env, st, frame);
+ if (IS_ERR(instance))
+ return PTR_ERR(instance);
+ q->instances[frame] = instance;
+ }
+ q->curframe = st->curframe;
+ q->insn_idx = st->insn_idx;
+ return 0;
+}
+
+bool bpf_stack_slot_alive(struct bpf_verifier_env *env, u32 frameno, u32 spi)
+{
+ /*
+ * Slot is alive if it is read before q->st->insn_idx in current func instance,
+ * or if for some outer func instance:
+ * - alive before callsite if callsite calls callback, otherwise
+ * - alive after callsite
+ */
+ struct live_stack_query *q = &env->liveness->live_stack_query;
+ struct func_instance *instance, *curframe_instance;
+ u32 i, callsite;
+ bool alive;
+
+ curframe_instance = q->instances[q->curframe];
+ if (is_live_before(curframe_instance, q->insn_idx, frameno, spi))
+ return true;
+
+ for (i = frameno; i < q->curframe; i++) {
+ callsite = curframe_instance->callchain.callsites[i];
+ instance = q->instances[i];
+ alive = bpf_calls_callback(env, callsite)
+ ? is_live_before(instance, callsite, frameno, spi)
+ : is_live_before(instance, callsite + 1, frameno, spi);
+ if (alive)
+ return true;
+ }
+
+ return false;
+}
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 23f7f9d08a62..c93a756e035c 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -1,4 +1,4 @@
-//SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf-cgroup.h>
#include <linux/bpf.h>
#include <linux/bpf_local_storage.h>
@@ -9,6 +9,7 @@
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <uapi/linux/btf.h>
+#include <linux/btf_ids.h>
#ifdef CONFIG_CGROUP_BPF
@@ -140,8 +141,8 @@ static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *key)
return &READ_ONCE(storage->buf)->data[0];
}
-static int cgroup_storage_update_elem(struct bpf_map *map, void *key,
- void *value, u64 flags)
+static long cgroup_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
{
struct bpf_cgroup_storage *storage;
struct bpf_storage_buffer *new;
@@ -150,7 +151,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key,
return -EINVAL;
if (unlikely((flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)))
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
return -EINVAL;
storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map,
@@ -164,7 +165,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key,
}
new = bpf_map_kmalloc_node(map, struct_size(new, data, map->value_size),
- __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN,
+ __GFP_ZERO | GFP_NOWAIT,
map->numa_node);
if (!new)
return -ENOMEM;
@@ -312,8 +313,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
/* max_entries is not used and enforced to be 0 */
return ERR_PTR(-EINVAL);
- map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map),
- __GFP_ZERO | GFP_USER | __GFP_ACCOUNT, numa_node);
+ map = bpf_map_area_alloc(sizeof(struct bpf_cgroup_storage_map), numa_node);
if (!map)
return ERR_PTR(-ENOMEM);
@@ -333,22 +333,22 @@ static void cgroup_storage_map_free(struct bpf_map *_map)
struct list_head *storages = &map->list;
struct bpf_cgroup_storage *storage, *stmp;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
list_for_each_entry_safe(storage, stmp, storages, list_map) {
bpf_cgroup_storage_unlink(storage);
bpf_cgroup_storage_free(storage);
}
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
WARN_ON(!RB_EMPTY_ROOT(&map->root));
WARN_ON(!list_empty(&map->list));
- kfree(map);
+ bpf_map_area_free(map);
}
-static int cgroup_storage_delete_elem(struct bpf_map *map, void *key)
+static long cgroup_storage_delete_elem(struct bpf_map *map, void *key)
{
return -EINVAL;
}
@@ -394,17 +394,10 @@ static int cgroup_storage_check_btf(const struct bpf_map *map,
if (!btf_member_is_reg_int(btf, key_type, m, offset, size))
return -EINVAL;
} else {
- u32 int_data;
-
/*
* Key is expected to be u64, which stores the cgroup_inode_id
*/
-
- if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
- return -EINVAL;
-
- int_data = *(u32 *)(key_type + 1);
- if (BTF_INT_BITS(int_data) != 64 || BTF_INT_OFFSET(int_data))
+ if (!btf_type_is_i64(key_type))
return -EINVAL;
}
@@ -431,7 +424,7 @@ static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key,
seq_puts(m, ": ");
btf_type_seq_show(map->btf, map->btf_value_type_id,
&READ_ONCE(storage->buf)->data[0], m);
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
} else {
seq_puts(m, ": {\n");
for_each_possible_cpu(cpu) {
@@ -439,14 +432,21 @@ static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key,
btf_type_seq_show(map->btf, map->btf_value_type_id,
per_cpu_ptr(storage->percpu_buf, cpu),
m);
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
}
seq_puts(m, "}\n");
}
rcu_read_unlock();
}
-static int cgroup_storage_map_btf_id;
+static u64 cgroup_storage_map_usage(const struct bpf_map *map)
+{
+ /* Currently the dynamically allocated elements are not counted. */
+ return sizeof(struct bpf_cgroup_storage_map);
+}
+
+BTF_ID_LIST_SINGLE(cgroup_storage_map_btf_ids, struct,
+ bpf_cgroup_storage_map)
const struct bpf_map_ops cgroup_storage_map_ops = {
.map_alloc = cgroup_storage_map_alloc,
.map_free = cgroup_storage_map_free,
@@ -456,8 +456,8 @@ const struct bpf_map_ops cgroup_storage_map_ops = {
.map_delete_elem = cgroup_storage_delete_elem,
.map_check_btf = cgroup_storage_check_btf,
.map_seq_show_elem = cgroup_storage_seq_show_elem,
- .map_btf_name = "bpf_cgroup_storage_map",
- .map_btf_id = &cgroup_storage_map_btf_id,
+ .map_mem_usage = cgroup_storage_map_usage,
+ .map_btf_id = &cgroup_storage_map_btf_ids[0],
};
int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map)
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
new file mode 100644
index 000000000000..a0c3b35de2ce
--- /dev/null
+++ b/kernel/bpf/log.c
@@ -0,0 +1,865 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
+ */
+#include <uapi/linux/btf.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+#include <linux/math64.h>
+#include <linux/string.h>
+
+#define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args)
+
+static bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log)
+{
+ /* ubuf and len_total should both be specified (or not) together */
+ if (!!log->ubuf != !!log->len_total)
+ return false;
+ /* log buf without log_level is meaningless */
+ if (log->ubuf && log->level == 0)
+ return false;
+ if (log->level & ~BPF_LOG_MASK)
+ return false;
+ if (log->len_total > UINT_MAX >> 2)
+ return false;
+ return true;
+}
+
+int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level,
+ char __user *log_buf, u32 log_size)
+{
+ log->level = log_level;
+ log->ubuf = log_buf;
+ log->len_total = log_size;
+
+ /* log attributes have to be sane */
+ if (!bpf_verifier_log_attr_valid(log))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void bpf_vlog_update_len_max(struct bpf_verifier_log *log, u32 add_len)
+{
+ /* add_len includes terminal \0, so no need for +1. */
+ u64 len = log->end_pos + add_len;
+
+ /* log->len_max could be larger than our current len due to
+ * bpf_vlog_reset() calls, so we maintain the max of any length at any
+ * previous point
+ */
+ if (len > UINT_MAX)
+ log->len_max = UINT_MAX;
+ else if (len > log->len_max)
+ log->len_max = len;
+}
+
+void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
+ va_list args)
+{
+ u64 cur_pos;
+ u32 new_n, n;
+
+ n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
+
+ if (log->level == BPF_LOG_KERNEL) {
+ bool newline = n > 0 && log->kbuf[n - 1] == '\n';
+
+ pr_err("BPF: %s%s", log->kbuf, newline ? "" : "\n");
+ return;
+ }
+
+ n += 1; /* include terminating zero */
+ bpf_vlog_update_len_max(log, n);
+
+ if (log->level & BPF_LOG_FIXED) {
+ /* check if we have at least something to put into user buf */
+ new_n = 0;
+ if (log->end_pos < log->len_total) {
+ new_n = min_t(u32, log->len_total - log->end_pos, n);
+ log->kbuf[new_n - 1] = '\0';
+ }
+
+ cur_pos = log->end_pos;
+ log->end_pos += n - 1; /* don't count terminating '\0' */
+
+ if (log->ubuf && new_n &&
+ copy_to_user(log->ubuf + cur_pos, log->kbuf, new_n))
+ goto fail;
+ } else {
+ u64 new_end, new_start;
+ u32 buf_start, buf_end;
+
+ new_end = log->end_pos + n;
+ if (new_end - log->start_pos >= log->len_total)
+ new_start = new_end - log->len_total;
+ else
+ new_start = log->start_pos;
+
+ log->start_pos = new_start;
+ log->end_pos = new_end - 1; /* don't count terminating '\0' */
+
+ if (!log->ubuf)
+ return;
+
+ new_n = min(n, log->len_total);
+ cur_pos = new_end - new_n;
+ div_u64_rem(cur_pos, log->len_total, &buf_start);
+ div_u64_rem(new_end, log->len_total, &buf_end);
+ /* new_end and buf_end are exclusive indices, so if buf_end is
+ * exactly zero, then it actually points right to the end of
+ * ubuf and there is no wrap around
+ */
+ if (buf_end == 0)
+ buf_end = log->len_total;
+
+ /* if buf_start > buf_end, we wrapped around;
+ * if buf_start == buf_end, then we fill ubuf completely; we
+ * can't have buf_start == buf_end to mean that there is
+ * nothing to write, because we always write at least
+ * something, even if terminal '\0'
+ */
+ if (buf_start < buf_end) {
+ /* message fits within contiguous chunk of ubuf */
+ if (copy_to_user(log->ubuf + buf_start,
+ log->kbuf + n - new_n,
+ buf_end - buf_start))
+ goto fail;
+ } else {
+ /* message wraps around the end of ubuf, copy in two chunks */
+ if (copy_to_user(log->ubuf + buf_start,
+ log->kbuf + n - new_n,
+ log->len_total - buf_start))
+ goto fail;
+ if (copy_to_user(log->ubuf,
+ log->kbuf + n - buf_end,
+ buf_end))
+ goto fail;
+ }
+ }
+
+ return;
+fail:
+ log->ubuf = NULL;
+}
+
+void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos)
+{
+ char zero = 0;
+ u32 pos;
+
+ if (WARN_ON_ONCE(new_pos > log->end_pos))
+ return;
+
+ if (!bpf_verifier_log_needed(log) || log->level == BPF_LOG_KERNEL)
+ return;
+
+ /* if position to which we reset is beyond current log window,
+ * then we didn't preserve any useful content and should adjust
+ * start_pos to end up with an empty log (start_pos == end_pos)
+ */
+ log->end_pos = new_pos;
+ if (log->end_pos < log->start_pos)
+ log->start_pos = log->end_pos;
+
+ if (!log->ubuf)
+ return;
+
+ if (log->level & BPF_LOG_FIXED)
+ pos = log->end_pos + 1;
+ else
+ div_u64_rem(new_pos, log->len_total, &pos);
+
+ if (pos < log->len_total && put_user(zero, log->ubuf + pos))
+ log->ubuf = NULL;
+}
+
+static void bpf_vlog_reverse_kbuf(char *buf, int len)
+{
+ int i, j;
+
+ for (i = 0, j = len - 1; i < j; i++, j--)
+ swap(buf[i], buf[j]);
+}
+
+static int bpf_vlog_reverse_ubuf(struct bpf_verifier_log *log, int start, int end)
+{
+ /* we split log->kbuf into two equal parts for both ends of array */
+ int n = sizeof(log->kbuf) / 2, nn;
+ char *lbuf = log->kbuf, *rbuf = log->kbuf + n;
+
+ /* Read ubuf's section [start, end) two chunks at a time, from left
+ * and right side; within each chunk, swap all the bytes; after that
+ * reverse the order of lbuf and rbuf and write result back to ubuf.
+ * This way we'll end up with swapped contents of specified
+ * [start, end) ubuf segment.
+ */
+ while (end - start > 1) {
+ nn = min(n, (end - start ) / 2);
+
+ if (copy_from_user(lbuf, log->ubuf + start, nn))
+ return -EFAULT;
+ if (copy_from_user(rbuf, log->ubuf + end - nn, nn))
+ return -EFAULT;
+
+ bpf_vlog_reverse_kbuf(lbuf, nn);
+ bpf_vlog_reverse_kbuf(rbuf, nn);
+
+ /* we write lbuf to the right end of ubuf, while rbuf to the
+ * left one to end up with properly reversed overall ubuf
+ */
+ if (copy_to_user(log->ubuf + start, rbuf, nn))
+ return -EFAULT;
+ if (copy_to_user(log->ubuf + end - nn, lbuf, nn))
+ return -EFAULT;
+
+ start += nn;
+ end -= nn;
+ }
+
+ return 0;
+}
+
+int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual)
+{
+ u32 sublen;
+ int err;
+
+ *log_size_actual = 0;
+ if (!log || log->level == 0 || log->level == BPF_LOG_KERNEL)
+ return 0;
+
+ if (!log->ubuf)
+ goto skip_log_rotate;
+ /* If we never truncated log, there is nothing to move around. */
+ if (log->start_pos == 0)
+ goto skip_log_rotate;
+
+ /* Otherwise we need to rotate log contents to make it start from the
+ * buffer beginning and be a continuous zero-terminated string. Note
+ * that if log->start_pos != 0 then we definitely filled up entire log
+ * buffer with no gaps, and we just need to shift buffer contents to
+ * the left by (log->start_pos % log->len_total) bytes.
+ *
+ * Unfortunately, user buffer could be huge and we don't want to
+ * allocate temporary kernel memory of the same size just to shift
+ * contents in a straightforward fashion. Instead, we'll be clever and
+ * do in-place array rotation. This is a leetcode-style problem, which
+ * could be solved by three rotations.
+ *
+ * Let's say we have log buffer that has to be shifted left by 7 bytes
+ * (spaces and vertical bar is just for demonstrative purposes):
+ * E F G H I J K | A B C D
+ *
+ * First, we reverse entire array:
+ * D C B A | K J I H G F E
+ *
+ * Then we rotate first 4 bytes (DCBA) and separately last 7 bytes
+ * (KJIHGFE), resulting in a properly rotated array:
+ * A B C D | E F G H I J K
+ *
+ * We'll utilize log->kbuf to read user memory chunk by chunk, swap
+ * bytes, and write them back. Doing it byte-by-byte would be
+ * unnecessarily inefficient. Altogether we are going to read and
+ * write each byte twice, for total 4 memory copies between kernel and
+ * user space.
+ */
+
+ /* length of the chopped off part that will be the beginning;
+ * len(ABCD) in the example above
+ */
+ div_u64_rem(log->start_pos, log->len_total, &sublen);
+ sublen = log->len_total - sublen;
+
+ err = bpf_vlog_reverse_ubuf(log, 0, log->len_total);
+ err = err ?: bpf_vlog_reverse_ubuf(log, 0, sublen);
+ err = err ?: bpf_vlog_reverse_ubuf(log, sublen, log->len_total);
+ if (err)
+ log->ubuf = NULL;
+
+skip_log_rotate:
+ *log_size_actual = log->len_max;
+
+ /* properly initialized log has either both ubuf!=NULL and len_total>0
+ * or ubuf==NULL and len_total==0, so if this condition doesn't hold,
+ * we got a fault somewhere along the way, so report it back
+ */
+ if (!!log->ubuf != !!log->len_total)
+ return -EFAULT;
+
+ /* did truncation actually happen? */
+ if (log->ubuf && log->len_max > log->len_total)
+ return -ENOSPC;
+
+ return 0;
+}
+
+/* log_level controls verbosity level of eBPF verifier.
+ * bpf_verifier_log_write() is used to dump the verification trace to the log,
+ * so the user can figure out what's wrong with the program
+ */
+__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
+ const char *fmt, ...)
+{
+ va_list args;
+
+ if (!bpf_verifier_log_needed(&env->log))
+ return;
+
+ va_start(args, fmt);
+ bpf_verifier_vlog(&env->log, fmt, args);
+ va_end(args);
+}
+EXPORT_SYMBOL_GPL(bpf_verifier_log_write);
+
+__printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
+ const char *fmt, ...)
+{
+ va_list args;
+
+ if (!bpf_verifier_log_needed(log))
+ return;
+
+ va_start(args, fmt);
+ bpf_verifier_vlog(log, fmt, args);
+ va_end(args);
+}
+EXPORT_SYMBOL_GPL(bpf_log);
+
+static const struct bpf_line_info *
+find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
+{
+ const struct bpf_line_info *linfo;
+ const struct bpf_prog *prog;
+ u32 nr_linfo;
+ int l, r, m;
+
+ prog = env->prog;
+ nr_linfo = prog->aux->nr_linfo;
+
+ if (!nr_linfo || insn_off >= prog->len)
+ return NULL;
+
+ linfo = prog->aux->linfo;
+ /* Loop invariant: linfo[l].insn_off <= insns_off.
+ * linfo[0].insn_off == 0 which always satisfies above condition.
+ * Binary search is searching for rightmost linfo entry that satisfies
+ * the above invariant, giving us the desired record that covers given
+ * instruction offset.
+ */
+ l = 0;
+ r = nr_linfo - 1;
+ while (l < r) {
+ /* (r - l + 1) / 2 means we break a tie to the right, so if:
+ * l=1, r=2, linfo[l].insn_off <= insn_off, linfo[r].insn_off > insn_off,
+ * then m=2, we see that linfo[m].insn_off > insn_off, and so
+ * r becomes 1 and we exit the loop with correct l==1.
+ * If the tie was broken to the left, m=1 would end us up in
+ * an endless loop where l and m stay at 1 and r stays at 2.
+ */
+ m = l + (r - l + 1) / 2;
+ if (linfo[m].insn_off <= insn_off)
+ l = m;
+ else
+ r = m - 1;
+ }
+
+ return &linfo[l];
+}
+
+static const char *ltrim(const char *s)
+{
+ while (isspace(*s))
+ s++;
+
+ return s;
+}
+
+__printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env,
+ u32 insn_off,
+ const char *prefix_fmt, ...)
+{
+ const struct bpf_line_info *linfo, *prev_linfo;
+ const struct btf *btf;
+ const char *s, *fname;
+
+ if (!bpf_verifier_log_needed(&env->log))
+ return;
+
+ prev_linfo = env->prev_linfo;
+ linfo = find_linfo(env, insn_off);
+ if (!linfo || linfo == prev_linfo)
+ return;
+
+ /* It often happens that two separate linfo records point to the same
+ * source code line, but have differing column numbers. Given verifier
+ * log doesn't emit column information, from user perspective we just
+ * end up emitting the same source code line twice unnecessarily.
+ * So instead check that previous and current linfo record point to
+ * the same file (file_name_offs match) and the same line number, and
+ * avoid emitting duplicated source code line in such case.
+ */
+ if (prev_linfo && linfo->file_name_off == prev_linfo->file_name_off &&
+ BPF_LINE_INFO_LINE_NUM(linfo->line_col) == BPF_LINE_INFO_LINE_NUM(prev_linfo->line_col))
+ return;
+
+ if (prefix_fmt) {
+ va_list args;
+
+ va_start(args, prefix_fmt);
+ bpf_verifier_vlog(&env->log, prefix_fmt, args);
+ va_end(args);
+ }
+
+ btf = env->prog->aux->btf;
+ s = ltrim(btf_name_by_offset(btf, linfo->line_off));
+ verbose(env, "%s", s); /* source code line */
+
+ s = btf_name_by_offset(btf, linfo->file_name_off);
+ /* leave only file name */
+ fname = strrchr(s, '/');
+ fname = fname ? fname + 1 : s;
+ verbose(env, " @ %s:%u\n", fname, BPF_LINE_INFO_LINE_NUM(linfo->line_col));
+
+ env->prev_linfo = linfo;
+}
+
+static const char *btf_type_name(const struct btf *btf, u32 id)
+{
+ return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
+}
+
+/* string representation of 'enum bpf_reg_type'
+ *
+ * Note that reg_type_str() can not appear more than once in a single verbose()
+ * statement.
+ */
+const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type)
+{
+ char postfix[16] = {0}, prefix[64] = {0};
+ static const char * const str[] = {
+ [NOT_INIT] = "?",
+ [SCALAR_VALUE] = "scalar",
+ [PTR_TO_CTX] = "ctx",
+ [CONST_PTR_TO_MAP] = "map_ptr",
+ [PTR_TO_MAP_VALUE] = "map_value",
+ [PTR_TO_STACK] = "fp",
+ [PTR_TO_PACKET] = "pkt",
+ [PTR_TO_PACKET_META] = "pkt_meta",
+ [PTR_TO_PACKET_END] = "pkt_end",
+ [PTR_TO_FLOW_KEYS] = "flow_keys",
+ [PTR_TO_SOCKET] = "sock",
+ [PTR_TO_SOCK_COMMON] = "sock_common",
+ [PTR_TO_TCP_SOCK] = "tcp_sock",
+ [PTR_TO_TP_BUFFER] = "tp_buffer",
+ [PTR_TO_XDP_SOCK] = "xdp_sock",
+ [PTR_TO_BTF_ID] = "ptr_",
+ [PTR_TO_MEM] = "mem",
+ [PTR_TO_ARENA] = "arena",
+ [PTR_TO_BUF] = "buf",
+ [PTR_TO_FUNC] = "func",
+ [PTR_TO_INSN] = "insn",
+ [PTR_TO_MAP_KEY] = "map_key",
+ [CONST_PTR_TO_DYNPTR] = "dynptr_ptr",
+ };
+
+ if (type & PTR_MAYBE_NULL) {
+ if (base_type(type) == PTR_TO_BTF_ID)
+ strscpy(postfix, "or_null_");
+ else
+ strscpy(postfix, "_or_null");
+ }
+
+ snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s",
+ type & MEM_RDONLY ? "rdonly_" : "",
+ type & MEM_RINGBUF ? "ringbuf_" : "",
+ type & MEM_USER ? "user_" : "",
+ type & MEM_PERCPU ? "percpu_" : "",
+ type & MEM_RCU ? "rcu_" : "",
+ type & PTR_UNTRUSTED ? "untrusted_" : "",
+ type & PTR_TRUSTED ? "trusted_" : ""
+ );
+
+ snprintf(env->tmp_str_buf, TMP_STR_BUF_LEN, "%s%s%s",
+ prefix, str[base_type(type)], postfix);
+ return env->tmp_str_buf;
+}
+
+const char *dynptr_type_str(enum bpf_dynptr_type type)
+{
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ return "local";
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ return "ringbuf";
+ case BPF_DYNPTR_TYPE_SKB:
+ return "skb";
+ case BPF_DYNPTR_TYPE_XDP:
+ return "xdp";
+ case BPF_DYNPTR_TYPE_SKB_META:
+ return "skb_meta";
+ case BPF_DYNPTR_TYPE_FILE:
+ return "file";
+ case BPF_DYNPTR_TYPE_INVALID:
+ return "<invalid>";
+ default:
+ WARN_ONCE(1, "unknown dynptr type %d\n", type);
+ return "<unknown>";
+ }
+}
+
+const char *iter_type_str(const struct btf *btf, u32 btf_id)
+{
+ if (!btf || btf_id == 0)
+ return "<invalid>";
+
+ /* we already validated that type is valid and has conforming name */
+ return btf_type_name(btf, btf_id) + sizeof(ITER_PREFIX) - 1;
+}
+
+const char *iter_state_str(enum bpf_iter_state state)
+{
+ switch (state) {
+ case BPF_ITER_STATE_ACTIVE:
+ return "active";
+ case BPF_ITER_STATE_DRAINED:
+ return "drained";
+ case BPF_ITER_STATE_INVALID:
+ return "<invalid>";
+ default:
+ WARN_ONCE(1, "unknown iter state %d\n", state);
+ return "<unknown>";
+ }
+}
+
+static char slot_type_char[] = {
+ [STACK_INVALID] = '?',
+ [STACK_SPILL] = 'r',
+ [STACK_MISC] = 'm',
+ [STACK_ZERO] = '0',
+ [STACK_DYNPTR] = 'd',
+ [STACK_ITER] = 'i',
+ [STACK_IRQ_FLAG] = 'f'
+};
+
+#define UNUM_MAX_DECIMAL U16_MAX
+#define SNUM_MAX_DECIMAL S16_MAX
+#define SNUM_MIN_DECIMAL S16_MIN
+
+static bool is_unum_decimal(u64 num)
+{
+ return num <= UNUM_MAX_DECIMAL;
+}
+
+static bool is_snum_decimal(s64 num)
+{
+ return num >= SNUM_MIN_DECIMAL && num <= SNUM_MAX_DECIMAL;
+}
+
+static void verbose_unum(struct bpf_verifier_env *env, u64 num)
+{
+ if (is_unum_decimal(num))
+ verbose(env, "%llu", num);
+ else
+ verbose(env, "%#llx", num);
+}
+
+static void verbose_snum(struct bpf_verifier_env *env, s64 num)
+{
+ if (is_snum_decimal(num))
+ verbose(env, "%lld", num);
+ else
+ verbose(env, "%#llx", num);
+}
+
+int tnum_strn(char *str, size_t size, struct tnum a)
+{
+ /* print as a constant, if tnum is fully known */
+ if (a.mask == 0) {
+ if (is_unum_decimal(a.value))
+ return snprintf(str, size, "%llu", a.value);
+ else
+ return snprintf(str, size, "%#llx", a.value);
+ }
+ return snprintf(str, size, "(%#llx; %#llx)", a.value, a.mask);
+}
+EXPORT_SYMBOL_GPL(tnum_strn);
+
+static void print_scalar_ranges(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
+ const char **sep)
+{
+ /* For signed ranges, we want to unify 64-bit and 32-bit values in the
+ * output as much as possible, but there is a bit of a complication.
+ * If we choose to print values as decimals, this is natural to do,
+ * because negative 64-bit and 32-bit values >= -S32_MIN have the same
+ * representation due to sign extension. But if we choose to print
+ * them in hex format (see is_snum_decimal()), then sign extension is
+ * misleading.
+ * E.g., smin=-2 and smin32=-2 are exactly the same in decimal, but in
+ * hex they will be smin=0xfffffffffffffffe and smin32=0xfffffffe, two
+ * very different numbers.
+ * So we avoid sign extension if we choose to print values in hex.
+ */
+ struct {
+ const char *name;
+ u64 val;
+ bool omit;
+ } minmaxs[] = {
+ {"smin", reg->smin_value, reg->smin_value == S64_MIN},
+ {"smax", reg->smax_value, reg->smax_value == S64_MAX},
+ {"umin", reg->umin_value, reg->umin_value == 0},
+ {"umax", reg->umax_value, reg->umax_value == U64_MAX},
+ {"smin32",
+ is_snum_decimal((s64)reg->s32_min_value)
+ ? (s64)reg->s32_min_value
+ : (u32)reg->s32_min_value, reg->s32_min_value == S32_MIN},
+ {"smax32",
+ is_snum_decimal((s64)reg->s32_max_value)
+ ? (s64)reg->s32_max_value
+ : (u32)reg->s32_max_value, reg->s32_max_value == S32_MAX},
+ {"umin32", reg->u32_min_value, reg->u32_min_value == 0},
+ {"umax32", reg->u32_max_value, reg->u32_max_value == U32_MAX},
+ }, *m1, *m2, *mend = &minmaxs[ARRAY_SIZE(minmaxs)];
+ bool neg1, neg2;
+
+ for (m1 = &minmaxs[0]; m1 < mend; m1++) {
+ if (m1->omit)
+ continue;
+
+ neg1 = m1->name[0] == 's' && (s64)m1->val < 0;
+
+ verbose(env, "%s%s=", *sep, m1->name);
+ *sep = ",";
+
+ for (m2 = m1 + 2; m2 < mend; m2 += 2) {
+ if (m2->omit || m2->val != m1->val)
+ continue;
+ /* don't mix negatives with positives */
+ neg2 = m2->name[0] == 's' && (s64)m2->val < 0;
+ if (neg2 != neg1)
+ continue;
+ m2->omit = true;
+ verbose(env, "%s=", m2->name);
+ }
+
+ if (m1->name[0] == 's')
+ verbose_snum(env, m1->val);
+ else
+ verbose_unum(env, m1->val);
+ }
+}
+
+static bool type_is_map_ptr(enum bpf_reg_type t) {
+ switch (base_type(t)) {
+ case CONST_PTR_TO_MAP:
+ case PTR_TO_MAP_KEY:
+ case PTR_TO_MAP_VALUE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/*
+ * _a stands for append, was shortened to avoid multiline statements below.
+ * This macro is used to output a comma separated list of attributes.
+ */
+#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, ##__VA_ARGS__); sep = ","; })
+
+static void print_reg_state(struct bpf_verifier_env *env,
+ const struct bpf_func_state *state,
+ const struct bpf_reg_state *reg)
+{
+ enum bpf_reg_type t;
+ const char *sep = "";
+
+ t = reg->type;
+ if (t == SCALAR_VALUE && reg->precise)
+ verbose(env, "P");
+ if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) {
+ verbose_snum(env, reg->var_off.value);
+ return;
+ }
+
+ verbose(env, "%s", reg_type_str(env, t));
+ if (t == PTR_TO_ARENA)
+ return;
+ if (t == PTR_TO_STACK) {
+ if (state->frameno != reg->frameno)
+ verbose(env, "[%d]", reg->frameno);
+ if (tnum_is_const(reg->var_off)) {
+ verbose_snum(env, reg->var_off.value + reg->off);
+ return;
+ }
+ }
+ if (base_type(t) == PTR_TO_BTF_ID)
+ verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id));
+ verbose(env, "(");
+ if (reg->id)
+ verbose_a("id=%d", reg->id & ~BPF_ADD_CONST);
+ if (reg->id & BPF_ADD_CONST)
+ verbose(env, "%+d", reg->off);
+ if (reg->ref_obj_id)
+ verbose_a("ref_obj_id=%d", reg->ref_obj_id);
+ if (type_is_non_owning_ref(reg->type))
+ verbose_a("%s", "non_own_ref");
+ if (type_is_map_ptr(t)) {
+ if (reg->map_ptr->name[0])
+ verbose_a("map=%s", reg->map_ptr->name);
+ verbose_a("ks=%d,vs=%d",
+ reg->map_ptr->key_size,
+ reg->map_ptr->value_size);
+ }
+ if (t != SCALAR_VALUE && reg->off) {
+ verbose_a("off=");
+ verbose_snum(env, reg->off);
+ }
+ if (type_is_pkt_pointer(t)) {
+ verbose_a("r=");
+ verbose_unum(env, reg->range);
+ }
+ if (base_type(t) == PTR_TO_MEM) {
+ verbose_a("sz=");
+ verbose_unum(env, reg->mem_size);
+ }
+ if (t == CONST_PTR_TO_DYNPTR)
+ verbose_a("type=%s", dynptr_type_str(reg->dynptr.type));
+ if (tnum_is_const(reg->var_off)) {
+ /* a pointer register with fixed offset */
+ if (reg->var_off.value) {
+ verbose_a("imm=");
+ verbose_snum(env, reg->var_off.value);
+ }
+ } else {
+ print_scalar_ranges(env, reg, &sep);
+ if (!tnum_is_unknown(reg->var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose_a("var_off=%s", tn_buf);
+ }
+ }
+ verbose(env, ")");
+}
+
+void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate,
+ u32 frameno, bool print_all)
+{
+ const struct bpf_func_state *state = vstate->frame[frameno];
+ const struct bpf_reg_state *reg;
+ int i;
+
+ if (state->frameno)
+ verbose(env, " frame%d:", state->frameno);
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ reg = &state->regs[i];
+ if (reg->type == NOT_INIT)
+ continue;
+ if (!print_all && !reg_scratched(env, i))
+ continue;
+ verbose(env, " R%d", i);
+ verbose(env, "=");
+ print_reg_state(env, state, reg);
+ }
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ char types_buf[BPF_REG_SIZE + 1];
+ const char *sep = "";
+ bool valid = false;
+ u8 slot_type;
+ int j;
+
+ if (!print_all && !stack_slot_scratched(env, i))
+ continue;
+
+ for (j = 0; j < BPF_REG_SIZE; j++) {
+ slot_type = state->stack[i].slot_type[j];
+ if (slot_type != STACK_INVALID)
+ valid = true;
+ types_buf[j] = slot_type_char[slot_type];
+ }
+ types_buf[BPF_REG_SIZE] = 0;
+ if (!valid)
+ continue;
+
+ reg = &state->stack[i].spilled_ptr;
+ switch (state->stack[i].slot_type[BPF_REG_SIZE - 1]) {
+ case STACK_SPILL:
+ /* print MISC/ZERO/INVALID slots above subreg spill */
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ if (state->stack[i].slot_type[j] == STACK_SPILL)
+ break;
+ types_buf[j] = '\0';
+
+ verbose(env, " fp%d=%s", (-i - 1) * BPF_REG_SIZE, types_buf);
+ print_reg_state(env, state, reg);
+ break;
+ case STACK_DYNPTR:
+ /* skip to main dynptr slot */
+ i += BPF_DYNPTR_NR_SLOTS - 1;
+ reg = &state->stack[i].spilled_ptr;
+
+ verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+ verbose(env, "=dynptr_%s(", dynptr_type_str(reg->dynptr.type));
+ if (reg->id)
+ verbose_a("id=%d", reg->id);
+ if (reg->ref_obj_id)
+ verbose_a("ref_id=%d", reg->ref_obj_id);
+ if (reg->dynptr_id)
+ verbose_a("dynptr_id=%d", reg->dynptr_id);
+ verbose(env, ")");
+ break;
+ case STACK_ITER:
+ /* only main slot has ref_obj_id set; skip others */
+ if (!reg->ref_obj_id)
+ continue;
+
+ verbose(env, " fp%d=iter_%s(ref_id=%d,state=%s,depth=%u)",
+ (-i - 1) * BPF_REG_SIZE,
+ iter_type_str(reg->iter.btf, reg->iter.btf_id),
+ reg->ref_obj_id, iter_state_str(reg->iter.state),
+ reg->iter.depth);
+ break;
+ case STACK_MISC:
+ case STACK_ZERO:
+ default:
+ verbose(env, " fp%d=%s", (-i - 1) * BPF_REG_SIZE, types_buf);
+ break;
+ }
+ }
+ if (vstate->acquired_refs && vstate->refs[0].id) {
+ verbose(env, " refs=%d", vstate->refs[0].id);
+ for (i = 1; i < vstate->acquired_refs; i++)
+ if (vstate->refs[i].id)
+ verbose(env, ",%d", vstate->refs[i].id);
+ }
+ if (state->in_callback_fn)
+ verbose(env, " cb");
+ if (state->in_async_callback_fn)
+ verbose(env, " async_cb");
+ verbose(env, "\n");
+ if (!print_all)
+ mark_verifier_state_clean(env);
+}
+
+static inline u32 vlog_alignment(u32 pos)
+{
+ return round_up(max(pos + BPF_LOG_MIN_ALIGNMENT / 2, BPF_LOG_ALIGNMENT),
+ BPF_LOG_MIN_ALIGNMENT) - pos - 1;
+}
+
+void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate,
+ u32 frameno)
+{
+ if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) {
+ /* remove new line character */
+ bpf_vlog_reset(&env->log, env->prev_log_pos - 1);
+ verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_pos), ' ');
+ } else {
+ verbose(env, "%d:", env->insn_idx);
+ }
+ print_verifier_state(env, vstate, frameno, false);
+}
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 5763cc7ac4f1..be66d7e520e0 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -14,6 +14,9 @@
#include <linux/vmalloc.h>
#include <net/ipv6.h>
#include <uapi/linux/btf.h>
+#include <linux/btf_ids.h>
+#include <asm/rqspinlock.h>
+#include <linux/bpf_mem_alloc.h>
/* Intermediate node */
#define LPM_TREE_NODE_FLAG_IM BIT(0)
@@ -21,7 +24,6 @@
struct lpm_trie_node;
struct lpm_trie_node {
- struct rcu_head rcu;
struct lpm_trie_node __rcu *child[2];
u32 prefixlen;
u32 flags;
@@ -31,10 +33,11 @@ struct lpm_trie_node {
struct lpm_trie {
struct bpf_map map;
struct lpm_trie_node __rcu *root;
+ struct bpf_mem_alloc ma;
size_t n_entries;
size_t max_prefixlen;
size_t data_size;
- spinlock_t lock;
+ rqspinlock_t lock;
};
/* This trie implements a longest prefix match algorithm that can be used to
@@ -154,22 +157,23 @@ static inline int extract_bit(const u8 *data, size_t index)
}
/**
- * longest_prefix_match() - determine the longest prefix
+ * __longest_prefix_match() - determine the longest prefix
* @trie: The trie to get internal sizes from
* @node: The node to operate on
* @key: The key to compare to @node
*
* Determine the longest prefix of @node that matches the bits in @key.
*/
-static size_t longest_prefix_match(const struct lpm_trie *trie,
- const struct lpm_trie_node *node,
- const struct bpf_lpm_trie_key *key)
+static __always_inline
+size_t __longest_prefix_match(const struct lpm_trie *trie,
+ const struct lpm_trie_node *node,
+ const struct bpf_lpm_trie_key_u8 *key)
{
u32 limit = min(node->prefixlen, key->prefixlen);
u32 prefixlen = 0, i = 0;
BUILD_BUG_ON(offsetof(struct lpm_trie_node, data) % sizeof(u32));
- BUILD_BUG_ON(offsetof(struct bpf_lpm_trie_key, data) % sizeof(u32));
+ BUILD_BUG_ON(offsetof(struct bpf_lpm_trie_key_u8, data) % sizeof(u32));
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && defined(CONFIG_64BIT)
@@ -223,12 +227,22 @@ static size_t longest_prefix_match(const struct lpm_trie *trie,
return prefixlen;
}
+static size_t longest_prefix_match(const struct lpm_trie *trie,
+ const struct lpm_trie_node *node,
+ const struct bpf_lpm_trie_key_u8 *key)
+{
+ return __longest_prefix_match(trie, node, key);
+}
+
/* Called from syscall or from eBPF program */
static void *trie_lookup_elem(struct bpf_map *map, void *_key)
{
struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
struct lpm_trie_node *node, *found = NULL;
- struct bpf_lpm_trie_key *key = _key;
+ struct bpf_lpm_trie_key_u8 *key = _key;
+
+ if (key->prefixlen > trie->max_prefixlen)
+ return NULL;
/* Start walking the trie from the root node ... */
@@ -241,7 +255,7 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
* If it's the maximum possible prefix for this trie, we have
* an exact match and can return it directly.
*/
- matchlen = longest_prefix_match(trie, node, key);
+ matchlen = __longest_prefix_match(trie, node, key);
if (matchlen == trie->max_prefixlen) {
found = node;
break;
@@ -275,17 +289,13 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
return found->data + trie->data_size;
}
-static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
+static struct lpm_trie_node *lpm_trie_node_alloc(struct lpm_trie *trie,
const void *value)
{
struct lpm_trie_node *node;
- size_t size = sizeof(struct lpm_trie_node) + trie->data_size;
- if (value)
- size += trie->map.value_size;
+ node = bpf_mem_cache_alloc(&trie->ma);
- node = bpf_map_kmalloc_node(&trie->map, size, GFP_ATOMIC | __GFP_NOWARN,
- trie->map.numa_node);
if (!node)
return NULL;
@@ -298,14 +308,25 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
return node;
}
+static int trie_check_add_elem(struct lpm_trie *trie, u64 flags)
+{
+ if (flags == BPF_EXIST)
+ return -ENOENT;
+ if (trie->n_entries == trie->map.max_entries)
+ return -ENOSPC;
+ trie->n_entries++;
+ return 0;
+}
+
/* Called from syscall or from eBPF program */
-static int trie_update_elem(struct bpf_map *map,
- void *_key, void *value, u64 flags)
+static long trie_update_elem(struct bpf_map *map,
+ void *_key, void *value, u64 flags)
{
struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
- struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL;
+ struct lpm_trie_node *node, *im_node, *new_node;
+ struct lpm_trie_node *free_node = NULL;
struct lpm_trie_node __rcu **slot;
- struct bpf_lpm_trie_key *key = _key;
+ struct bpf_lpm_trie_key_u8 *key = _key;
unsigned long irq_flags;
unsigned int next_bit;
size_t matchlen = 0;
@@ -317,22 +338,14 @@ static int trie_update_elem(struct bpf_map *map,
if (key->prefixlen > trie->max_prefixlen)
return -EINVAL;
- spin_lock_irqsave(&trie->lock, irq_flags);
-
/* Allocate and fill a new node */
-
- if (trie->n_entries == trie->map.max_entries) {
- ret = -ENOSPC;
- goto out;
- }
-
new_node = lpm_trie_node_alloc(trie, value);
- if (!new_node) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!new_node)
+ return -ENOMEM;
- trie->n_entries++;
+ ret = raw_res_spin_lock_irqsave(&trie->lock, irq_flags);
+ if (ret)
+ goto out_free;
new_node->prefixlen = key->prefixlen;
RCU_INIT_POINTER(new_node->child[0], NULL);
@@ -346,13 +359,11 @@ static int trie_update_elem(struct bpf_map *map,
*/
slot = &trie->root;
- while ((node = rcu_dereference_protected(*slot,
- lockdep_is_held(&trie->lock)))) {
+ while ((node = rcu_dereference(*slot))) {
matchlen = longest_prefix_match(trie, node, key);
if (node->prefixlen != matchlen ||
- node->prefixlen == key->prefixlen ||
- node->prefixlen == trie->max_prefixlen)
+ node->prefixlen == key->prefixlen)
break;
next_bit = extract_bit(key->data, node->prefixlen);
@@ -363,6 +374,10 @@ static int trie_update_elem(struct bpf_map *map,
* simply assign the @new_node to that slot and be done.
*/
if (!node) {
+ ret = trie_check_add_elem(trie, flags);
+ if (ret)
+ goto out;
+
rcu_assign_pointer(*slot, new_node);
goto out;
}
@@ -371,18 +386,30 @@ static int trie_update_elem(struct bpf_map *map,
* which already has the correct data array set.
*/
if (node->prefixlen == matchlen) {
+ if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) {
+ if (flags == BPF_NOEXIST) {
+ ret = -EEXIST;
+ goto out;
+ }
+ } else {
+ ret = trie_check_add_elem(trie, flags);
+ if (ret)
+ goto out;
+ }
+
new_node->child[0] = node->child[0];
new_node->child[1] = node->child[1];
- if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
- trie->n_entries--;
-
rcu_assign_pointer(*slot, new_node);
- kfree_rcu(node, rcu);
+ free_node = node;
goto out;
}
+ ret = trie_check_add_elem(trie, flags);
+ if (ret)
+ goto out;
+
/* If the new node matches the prefix completely, it must be inserted
* as an ancestor. Simply insert it between @node and *@slot.
*/
@@ -395,6 +422,7 @@ static int trie_update_elem(struct bpf_map *map,
im_node = lpm_trie_node_alloc(trie, NULL);
if (!im_node) {
+ trie->n_entries--;
ret = -ENOMEM;
goto out;
}
@@ -416,24 +444,21 @@ static int trie_update_elem(struct bpf_map *map,
rcu_assign_pointer(*slot, im_node);
out:
- if (ret) {
- if (new_node)
- trie->n_entries--;
-
- kfree(new_node);
- kfree(im_node);
- }
-
- spin_unlock_irqrestore(&trie->lock, irq_flags);
+ raw_res_spin_unlock_irqrestore(&trie->lock, irq_flags);
+out_free:
+ if (ret)
+ bpf_mem_cache_free(&trie->ma, new_node);
+ bpf_mem_cache_free_rcu(&trie->ma, free_node);
return ret;
}
/* Called from syscall or from eBPF program */
-static int trie_delete_elem(struct bpf_map *map, void *_key)
+static long trie_delete_elem(struct bpf_map *map, void *_key)
{
struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
- struct bpf_lpm_trie_key *key = _key;
+ struct lpm_trie_node *free_node = NULL, *free_parent = NULL;
+ struct bpf_lpm_trie_key_u8 *key = _key;
struct lpm_trie_node __rcu **trim, **trim2;
struct lpm_trie_node *node, *parent;
unsigned long irq_flags;
@@ -444,7 +469,9 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
if (key->prefixlen > trie->max_prefixlen)
return -EINVAL;
- spin_lock_irqsave(&trie->lock, irq_flags);
+ ret = raw_res_spin_lock_irqsave(&trie->lock, irq_flags);
+ if (ret)
+ return ret;
/* Walk the tree looking for an exact key/length match and keeping
* track of the path we traverse. We will need to know the node
@@ -455,8 +482,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
trim = &trie->root;
trim2 = trim;
parent = NULL;
- while ((node = rcu_dereference_protected(
- *trim, lockdep_is_held(&trie->lock)))) {
+ while ((node = rcu_dereference(*trim))) {
matchlen = longest_prefix_match(trie, node, key);
if (node->prefixlen != matchlen ||
@@ -502,8 +528,8 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
else
rcu_assign_pointer(
*trim2, rcu_access_pointer(parent->child[0]));
- kfree_rcu(parent, rcu);
- kfree_rcu(node, rcu);
+ free_parent = parent;
+ free_node = node;
goto out;
}
@@ -517,10 +543,13 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
rcu_assign_pointer(*trim, rcu_access_pointer(node->child[1]));
else
RCU_INIT_POINTER(*trim, NULL);
- kfree_rcu(node, rcu);
+ free_node = node;
out:
- spin_unlock_irqrestore(&trie->lock, irq_flags);
+ raw_res_spin_unlock_irqrestore(&trie->lock, irq_flags);
+
+ bpf_mem_cache_free_rcu(&trie->ma, free_parent);
+ bpf_mem_cache_free_rcu(&trie->ma, free_node);
return ret;
}
@@ -532,7 +561,7 @@ out:
sizeof(struct lpm_trie_node))
#define LPM_VAL_SIZE_MIN 1
-#define LPM_KEY_SIZE(X) (sizeof(struct bpf_lpm_trie_key) + (X))
+#define LPM_KEY_SIZE(X) (sizeof(struct bpf_lpm_trie_key_u8) + (X))
#define LPM_KEY_SIZE_MAX LPM_KEY_SIZE(LPM_DATA_SIZE_MAX)
#define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN)
@@ -542,9 +571,8 @@ out:
static struct bpf_map *trie_alloc(union bpf_attr *attr)
{
struct lpm_trie *trie;
-
- if (!bpf_capable())
- return ERR_PTR(-EPERM);
+ size_t leaf_size;
+ int err;
/* check sanity of attributes */
if (attr->max_entries == 0 ||
@@ -557,19 +585,29 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
attr->value_size > LPM_VAL_SIZE_MAX)
return ERR_PTR(-EINVAL);
- trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
+ trie = bpf_map_area_alloc(sizeof(*trie), NUMA_NO_NODE);
if (!trie)
return ERR_PTR(-ENOMEM);
/* copy mandatory map attributes */
bpf_map_init_from_attr(&trie->map, attr);
trie->data_size = attr->key_size -
- offsetof(struct bpf_lpm_trie_key, data);
+ offsetof(struct bpf_lpm_trie_key_u8, data);
trie->max_prefixlen = trie->data_size * 8;
- spin_lock_init(&trie->lock);
+ raw_res_spin_lock_init(&trie->lock);
+ /* Allocate intermediate and leaf nodes from the same allocator */
+ leaf_size = sizeof(struct lpm_trie_node) + trie->data_size +
+ trie->map.value_size;
+ err = bpf_mem_alloc_init(&trie->ma, leaf_size, false);
+ if (err)
+ goto free_out;
return &trie->map;
+
+free_out:
+ bpf_map_area_free(trie);
+ return ERR_PTR(err);
}
static void trie_free(struct bpf_map *map)
@@ -601,25 +639,29 @@ static void trie_free(struct bpf_map *map)
continue;
}
- kfree(node);
+ /* No bpf program may access the map, so freeing the
+ * node without waiting for the extra RCU GP.
+ */
+ bpf_mem_cache_raw_free(node);
RCU_INIT_POINTER(*slot, NULL);
break;
}
}
out:
- kfree(trie);
+ bpf_mem_alloc_destroy(&trie->ma);
+ bpf_map_area_free(trie);
}
static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
{
struct lpm_trie_node *node, *next_node = NULL, *parent, *search_root;
struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
- struct bpf_lpm_trie_key *key = _key, *next_key = _next_key;
+ struct bpf_lpm_trie_key_u8 *key = _key, *next_key = _next_key;
struct lpm_trie_node **node_stack = NULL;
int err = 0, stack_ptr = -1;
unsigned int next_bit;
- size_t matchlen;
+ size_t matchlen = 0;
/* The get_next_key follows postorder. For the 4 node example in
* the top of this file, the trie_get_next_key() returns the following
@@ -641,7 +683,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
if (!key || key->prefixlen > trie->max_prefixlen)
goto find_leftmost;
- node_stack = kmalloc_array(trie->max_prefixlen,
+ node_stack = kmalloc_array(trie->max_prefixlen + 1,
sizeof(struct lpm_trie_node *),
GFP_ATOMIC | __GFP_NOWARN);
if (!node_stack)
@@ -658,7 +700,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
next_bit = extract_bit(key->data, node->prefixlen);
node = rcu_dereference(node->child[next_bit]);
}
- if (!node || node->prefixlen != key->prefixlen ||
+ if (!node || node->prefixlen != matchlen ||
(node->flags & LPM_TREE_NODE_FLAG_IM))
goto find_leftmost;
@@ -702,7 +744,7 @@ find_leftmost:
}
do_copy:
next_key->prefixlen = next_node->prefixlen;
- memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key, data),
+ memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key_u8, data),
next_node->data, trie->data_size);
free_stack:
kfree(node_stack);
@@ -714,12 +756,22 @@ static int trie_check_btf(const struct bpf_map *map,
const struct btf_type *key_type,
const struct btf_type *value_type)
{
- /* Keys must have struct bpf_lpm_trie_key embedded. */
+ /* Keys must have struct bpf_lpm_trie_key_u8 embedded. */
return BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ?
-EINVAL : 0;
}
-static int trie_map_btf_id;
+static u64 trie_mem_usage(const struct bpf_map *map)
+{
+ struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+ u64 elem_size;
+
+ elem_size = sizeof(struct lpm_trie_node) + trie->data_size +
+ trie->map.value_size;
+ return elem_size * READ_ONCE(trie->n_entries);
+}
+
+BTF_ID_LIST_SINGLE(trie_map_btf_ids, struct, lpm_trie)
const struct bpf_map_ops trie_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc = trie_alloc,
@@ -732,6 +784,6 @@ const struct bpf_map_ops trie_map_ops = {
.map_update_batch = generic_map_update_batch,
.map_delete_batch = generic_map_delete_batch,
.map_check_btf = trie_check_btf,
- .map_btf_name = "lpm_trie",
- .map_btf_id = &trie_map_btf_id,
+ .map_mem_usage = trie_mem_usage,
+ .map_btf_id = &trie_map_btf_ids[0],
};
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 5cd8f5277279..645bd30bc9a9 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -11,47 +11,49 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
{
struct bpf_map *inner_map, *inner_map_meta;
u32 inner_map_meta_size;
- struct fd f;
+ CLASS(fd, f)(inner_map_ufd);
- f = fdget(inner_map_ufd);
inner_map = __bpf_map_get(f);
if (IS_ERR(inner_map))
return inner_map;
/* Does not support >1 level map-in-map */
- if (inner_map->inner_map_meta) {
- fdput(f);
+ if (inner_map->inner_map_meta)
return ERR_PTR(-EINVAL);
- }
- if (!inner_map->ops->map_meta_equal) {
- fdput(f);
+ if (!inner_map->ops->map_meta_equal)
return ERR_PTR(-ENOTSUPP);
- }
-
- if (map_value_has_spin_lock(inner_map)) {
- fdput(f);
- return ERR_PTR(-ENOTSUPP);
- }
inner_map_meta_size = sizeof(*inner_map_meta);
/* In some cases verifier needs to access beyond just base map. */
- if (inner_map->ops == &array_map_ops)
+ if (inner_map->ops == &array_map_ops || inner_map->ops == &percpu_array_map_ops)
inner_map_meta_size = sizeof(struct bpf_array);
inner_map_meta = kzalloc(inner_map_meta_size, GFP_USER);
- if (!inner_map_meta) {
- fdput(f);
+ if (!inner_map_meta)
return ERR_PTR(-ENOMEM);
- }
inner_map_meta->map_type = inner_map->map_type;
inner_map_meta->key_size = inner_map->key_size;
inner_map_meta->value_size = inner_map->value_size;
inner_map_meta->map_flags = inner_map->map_flags;
inner_map_meta->max_entries = inner_map->max_entries;
- inner_map_meta->spin_lock_off = inner_map->spin_lock_off;
- inner_map_meta->timer_off = inner_map->timer_off;
+
+ inner_map_meta->record = btf_record_dup(inner_map->record);
+ if (IS_ERR(inner_map_meta->record)) {
+ /* btf_record_dup returns NULL or valid pointer in case of
+ * invalid/empty/valid, but ERR_PTR in case of errors. During
+ * equality NULL or IS_ERR is equivalent.
+ */
+ struct bpf_map *ret = ERR_CAST(inner_map_meta->record);
+ kfree(inner_map_meta);
+ return ret;
+ }
+ /* Note: We must use the same BTF, as we also used btf_record_dup above
+ * which relies on BTF being same for both maps, as some members like
+ * record->fields.list_head have pointers like value_rec pointing into
+ * inner_map->btf.
+ */
if (inner_map->btf) {
btf_get(inner_map->btf);
inner_map_meta->btf = inner_map->btf;
@@ -59,18 +61,21 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
/* Misc members not needed in bpf_map_meta_equal() check. */
inner_map_meta->ops = inner_map->ops;
- if (inner_map->ops == &array_map_ops) {
+ if (inner_map->ops == &array_map_ops || inner_map->ops == &percpu_array_map_ops) {
+ struct bpf_array *inner_array_meta =
+ container_of(inner_map_meta, struct bpf_array, map);
+ struct bpf_array *inner_array = container_of(inner_map, struct bpf_array, map);
+
+ inner_array_meta->index_mask = inner_array->index_mask;
+ inner_array_meta->elem_size = inner_array->elem_size;
inner_map_meta->bypass_spec_v1 = inner_map->bypass_spec_v1;
- container_of(inner_map_meta, struct bpf_array, map)->index_mask =
- container_of(inner_map, struct bpf_array, map)->index_mask;
}
-
- fdput(f);
return inner_map_meta;
}
void bpf_map_meta_free(struct bpf_map *map_meta)
{
+ bpf_map_free_record(map_meta);
btf_put(map_meta->btf);
kfree(map_meta);
}
@@ -82,8 +87,8 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0,
return meta0->map_type == meta1->map_type &&
meta0->key_size == meta1->key_size &&
meta0->value_size == meta1->value_size &&
- meta0->timer_off == meta1->timer_off &&
- meta0->map_flags == meta1->map_flags;
+ meta0->map_flags == meta1->map_flags &&
+ btf_record_equal(meta0->record, meta1->record);
}
void *bpf_map_fd_get_ptr(struct bpf_map *map,
@@ -91,9 +96,8 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map,
int ufd)
{
struct bpf_map *inner_map, *inner_map_meta;
- struct fd f;
+ CLASS(fd, f)(ufd);
- f = fdget(ufd);
inner_map = __bpf_map_get(f);
if (IS_ERR(inner_map))
return inner_map;
@@ -104,16 +108,24 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map,
else
inner_map = ERR_PTR(-EINVAL);
- fdput(f);
return inner_map;
}
-void bpf_map_fd_put_ptr(void *ptr)
+void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
{
- /* ptr->ops->map_free() has to go through one
- * rcu grace period by itself.
+ struct bpf_map *inner_map = ptr;
+
+ /* Defer the freeing of inner map according to the sleepable attribute
+ * of bpf program which owns the outer map, so unnecessary waiting for
+ * RCU tasks trace grace period can be avoided.
*/
- bpf_map_put(ptr);
+ if (need_defer) {
+ if (atomic64_read(&map->sleepable_refcnt))
+ WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true);
+ else
+ WRITE_ONCE(inner_map->free_after_rcu_gp, true);
+ }
+ bpf_map_put(inner_map);
}
u32 bpf_map_fd_sys_lookup_elem(void *ptr)
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h
index bcb7534afb3c..7d61602354de 100644
--- a/kernel/bpf/map_in_map.h
+++ b/kernel/bpf/map_in_map.h
@@ -13,7 +13,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd);
void bpf_map_meta_free(struct bpf_map *map_meta);
void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file,
int ufd);
-void bpf_map_fd_put_ptr(void *ptr);
+void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer);
u32 bpf_map_fd_sys_lookup_elem(void *ptr);
#endif
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
index b0fa190b0979..9575314f40a6 100644
--- a/kernel/bpf/map_iter.c
+++ b/kernel/bpf/map_iter.c
@@ -78,8 +78,7 @@ static const struct seq_operations bpf_map_seq_ops = {
.show = bpf_map_seq_show,
};
-BTF_ID_LIST(btf_bpf_map_id)
-BTF_ID(struct, bpf_map)
+BTF_ID_LIST_GLOBAL_SINGLE(btf_bpf_map_id, struct, bpf_map)
static const struct bpf_iter_seq_info bpf_map_seq_info = {
.seq_ops = &bpf_map_seq_ops,
@@ -93,7 +92,7 @@ static struct bpf_iter_reg bpf_map_reg_info = {
.ctx_arg_info_size = 1,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__bpf_map, map),
- PTR_TO_BTF_ID_OR_NULL },
+ PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
},
.seq_info = &bpf_map_seq_info,
};
@@ -193,3 +192,38 @@ static int __init bpf_map_iter_init(void)
}
late_initcall(bpf_map_iter_init);
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc s64 bpf_map_sum_elem_count(const struct bpf_map *map)
+{
+ s64 *pcount;
+ s64 ret = 0;
+ int cpu;
+
+ if (!map || !map->elem_count)
+ return 0;
+
+ for_each_possible_cpu(cpu) {
+ pcount = per_cpu_ptr(map->elem_count, cpu);
+ ret += READ_ONCE(*pcount);
+ }
+ return ret;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(bpf_map_iter_kfunc_ids)
+BTF_ID_FLAGS(func, bpf_map_sum_elem_count, KF_TRUSTED_ARGS)
+BTF_KFUNCS_END(bpf_map_iter_kfunc_ids)
+
+static const struct btf_kfunc_id_set bpf_map_iter_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &bpf_map_iter_kfunc_ids,
+};
+
+static int init_subsystem(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &bpf_map_iter_kfunc_set);
+}
+late_initcall(init_subsystem);
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
new file mode 100644
index 000000000000..bd45dda9dc35
--- /dev/null
+++ b/kernel/bpf/memalloc.c
@@ -0,0 +1,1016 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+#include <linux/mm.h>
+#include <linux/llist.h>
+#include <linux/bpf.h>
+#include <linux/irq_work.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/memcontrol.h>
+#include <asm/local.h>
+
+/* Any context (including NMI) BPF specific memory allocator.
+ *
+ * Tracing BPF programs can attach to kprobe and fentry. Hence they
+ * run in unknown context where calling plain kmalloc() might not be safe.
+ *
+ * Front-end kmalloc() with per-cpu per-bucket cache of free elements.
+ * Refill this cache asynchronously from irq_work.
+ *
+ * CPU_0 buckets
+ * 16 32 64 96 128 196 256 512 1024 2048 4096
+ * ...
+ * CPU_N buckets
+ * 16 32 64 96 128 196 256 512 1024 2048 4096
+ *
+ * The buckets are prefilled at the start.
+ * BPF programs always run with migration disabled.
+ * It's safe to allocate from cache of the current cpu with irqs disabled.
+ * Free-ing is always done into bucket of the current cpu as well.
+ * irq_work trims extra free elements from buckets with kfree
+ * and refills them with kmalloc, so global kmalloc logic takes care
+ * of freeing objects allocated by one cpu and freed on another.
+ *
+ * Every allocated objected is padded with extra 8 bytes that contains
+ * struct llist_node.
+ */
+#define LLIST_NODE_SZ sizeof(struct llist_node)
+
+#define BPF_MEM_ALLOC_SIZE_MAX 4096
+
+/* similar to kmalloc, but sizeof == 8 bucket is gone */
+static u8 size_index[24] __ro_after_init = {
+ 3, /* 8 */
+ 3, /* 16 */
+ 4, /* 24 */
+ 4, /* 32 */
+ 5, /* 40 */
+ 5, /* 48 */
+ 5, /* 56 */
+ 5, /* 64 */
+ 1, /* 72 */
+ 1, /* 80 */
+ 1, /* 88 */
+ 1, /* 96 */
+ 6, /* 104 */
+ 6, /* 112 */
+ 6, /* 120 */
+ 6, /* 128 */
+ 2, /* 136 */
+ 2, /* 144 */
+ 2, /* 152 */
+ 2, /* 160 */
+ 2, /* 168 */
+ 2, /* 176 */
+ 2, /* 184 */
+ 2 /* 192 */
+};
+
+static int bpf_mem_cache_idx(size_t size)
+{
+ if (!size || size > BPF_MEM_ALLOC_SIZE_MAX)
+ return -1;
+
+ if (size <= 192)
+ return size_index[(size - 1) / 8] - 1;
+
+ return fls(size - 1) - 2;
+}
+
+#define NUM_CACHES 11
+
+struct bpf_mem_cache {
+ /* per-cpu list of free objects of size 'unit_size'.
+ * All accesses are done with interrupts disabled and 'active' counter
+ * protection with __llist_add() and __llist_del_first().
+ */
+ struct llist_head free_llist;
+ local_t active;
+
+ /* Operations on the free_list from unit_alloc/unit_free/bpf_mem_refill
+ * are sequenced by per-cpu 'active' counter. But unit_free() cannot
+ * fail. When 'active' is busy the unit_free() will add an object to
+ * free_llist_extra.
+ */
+ struct llist_head free_llist_extra;
+
+ struct irq_work refill_work;
+ struct obj_cgroup *objcg;
+ int unit_size;
+ /* count of objects in free_llist */
+ int free_cnt;
+ int low_watermark, high_watermark, batch;
+ int percpu_size;
+ bool draining;
+ struct bpf_mem_cache *tgt;
+
+ /* list of objects to be freed after RCU GP */
+ struct llist_head free_by_rcu;
+ struct llist_node *free_by_rcu_tail;
+ struct llist_head waiting_for_gp;
+ struct llist_node *waiting_for_gp_tail;
+ struct rcu_head rcu;
+ atomic_t call_rcu_in_progress;
+ struct llist_head free_llist_extra_rcu;
+
+ /* list of objects to be freed after RCU tasks trace GP */
+ struct llist_head free_by_rcu_ttrace;
+ struct llist_head waiting_for_gp_ttrace;
+ struct rcu_head rcu_ttrace;
+ atomic_t call_rcu_ttrace_in_progress;
+};
+
+struct bpf_mem_caches {
+ struct bpf_mem_cache cache[NUM_CACHES];
+};
+
+static const u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
+
+static struct llist_node notrace *__llist_del_first(struct llist_head *head)
+{
+ struct llist_node *entry, *next;
+
+ entry = head->first;
+ if (!entry)
+ return NULL;
+ next = entry->next;
+ head->first = next;
+ return entry;
+}
+
+static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags)
+{
+ if (c->percpu_size) {
+ void __percpu **obj = kmalloc_node(c->percpu_size, flags, node);
+ void __percpu *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags);
+
+ if (!obj || !pptr) {
+ free_percpu(pptr);
+ kfree(obj);
+ return NULL;
+ }
+ obj[1] = pptr;
+ return obj;
+ }
+
+ return kmalloc_node(c->unit_size, flags | __GFP_ZERO, node);
+}
+
+static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
+{
+#ifdef CONFIG_MEMCG
+ if (c->objcg)
+ return get_mem_cgroup_from_objcg(c->objcg);
+ return root_mem_cgroup;
+#else
+ return NULL;
+#endif
+}
+
+static void inc_active(struct bpf_mem_cache *c, unsigned long *flags)
+{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ /* In RT irq_work runs in per-cpu kthread, so disable
+ * interrupts to avoid preemption and interrupts and
+ * reduce the chance of bpf prog executing on this cpu
+ * when active counter is busy.
+ */
+ local_irq_save(*flags);
+ /* alloc_bulk runs from irq_work which will not preempt a bpf
+ * program that does unit_alloc/unit_free since IRQs are
+ * disabled there. There is no race to increment 'active'
+ * counter. It protects free_llist from corruption in case NMI
+ * bpf prog preempted this loop.
+ */
+ WARN_ON_ONCE(local_inc_return(&c->active) != 1);
+}
+
+static void dec_active(struct bpf_mem_cache *c, unsigned long *flags)
+{
+ local_dec(&c->active);
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_restore(*flags);
+}
+
+static void add_obj_to_free_list(struct bpf_mem_cache *c, void *obj)
+{
+ unsigned long flags;
+
+ inc_active(c, &flags);
+ __llist_add(obj, &c->free_llist);
+ c->free_cnt++;
+ dec_active(c, &flags);
+}
+
+/* Mostly runs from irq_work except __init phase. */
+static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic)
+{
+ struct mem_cgroup *memcg = NULL, *old_memcg;
+ gfp_t gfp;
+ void *obj;
+ int i;
+
+ gfp = __GFP_NOWARN | __GFP_ACCOUNT;
+ gfp |= atomic ? GFP_NOWAIT : GFP_KERNEL;
+
+ for (i = 0; i < cnt; i++) {
+ /*
+ * For every 'c' llist_del_first(&c->free_by_rcu_ttrace); is
+ * done only by one CPU == current CPU. Other CPUs might
+ * llist_add() and llist_del_all() in parallel.
+ */
+ obj = llist_del_first(&c->free_by_rcu_ttrace);
+ if (!obj)
+ break;
+ add_obj_to_free_list(c, obj);
+ }
+ if (i >= cnt)
+ return;
+
+ for (; i < cnt; i++) {
+ obj = llist_del_first(&c->waiting_for_gp_ttrace);
+ if (!obj)
+ break;
+ add_obj_to_free_list(c, obj);
+ }
+ if (i >= cnt)
+ return;
+
+ memcg = get_memcg(c);
+ old_memcg = set_active_memcg(memcg);
+ for (; i < cnt; i++) {
+ /* Allocate, but don't deplete atomic reserves that typical
+ * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
+ * will allocate from the current numa node which is what we
+ * want here.
+ */
+ obj = __alloc(c, node, gfp);
+ if (!obj)
+ break;
+ add_obj_to_free_list(c, obj);
+ }
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+}
+
+static void free_one(void *obj, bool percpu)
+{
+ if (percpu)
+ free_percpu(((void __percpu **)obj)[1]);
+
+ kfree(obj);
+}
+
+static int free_all(struct llist_node *llnode, bool percpu)
+{
+ struct llist_node *pos, *t;
+ int cnt = 0;
+
+ llist_for_each_safe(pos, t, llnode) {
+ free_one(pos, percpu);
+ cnt++;
+ }
+ return cnt;
+}
+
+static void __free_rcu(struct rcu_head *head)
+{
+ struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace);
+
+ free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size);
+ atomic_set(&c->call_rcu_ttrace_in_progress, 0);
+}
+
+static void __free_rcu_tasks_trace(struct rcu_head *head)
+{
+ /* If RCU Tasks Trace grace period implies RCU grace period,
+ * there is no need to invoke call_rcu().
+ */
+ if (rcu_trace_implies_rcu_gp())
+ __free_rcu(head);
+ else
+ call_rcu(head, __free_rcu);
+}
+
+static void enque_to_free(struct bpf_mem_cache *c, void *obj)
+{
+ struct llist_node *llnode = obj;
+
+ /* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work.
+ * Nothing races to add to free_by_rcu_ttrace list.
+ */
+ llist_add(llnode, &c->free_by_rcu_ttrace);
+}
+
+static void do_call_rcu_ttrace(struct bpf_mem_cache *c)
+{
+ struct llist_node *llnode, *t;
+
+ if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) {
+ if (unlikely(READ_ONCE(c->draining))) {
+ llnode = llist_del_all(&c->free_by_rcu_ttrace);
+ free_all(llnode, !!c->percpu_size);
+ }
+ return;
+ }
+
+ WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace));
+ llist_for_each_safe(llnode, t, llist_del_all(&c->free_by_rcu_ttrace))
+ llist_add(llnode, &c->waiting_for_gp_ttrace);
+
+ if (unlikely(READ_ONCE(c->draining))) {
+ __free_rcu(&c->rcu_ttrace);
+ return;
+ }
+
+ /* Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
+ * If RCU Tasks Trace grace period implies RCU grace period, free
+ * these elements directly, else use call_rcu() to wait for normal
+ * progs to finish and finally do free_one() on each element.
+ */
+ call_rcu_tasks_trace(&c->rcu_ttrace, __free_rcu_tasks_trace);
+}
+
+static void free_bulk(struct bpf_mem_cache *c)
+{
+ struct bpf_mem_cache *tgt = c->tgt;
+ struct llist_node *llnode, *t;
+ unsigned long flags;
+ int cnt;
+
+ WARN_ON_ONCE(tgt->unit_size != c->unit_size);
+ WARN_ON_ONCE(tgt->percpu_size != c->percpu_size);
+
+ do {
+ inc_active(c, &flags);
+ llnode = __llist_del_first(&c->free_llist);
+ if (llnode)
+ cnt = --c->free_cnt;
+ else
+ cnt = 0;
+ dec_active(c, &flags);
+ if (llnode)
+ enque_to_free(tgt, llnode);
+ } while (cnt > (c->high_watermark + c->low_watermark) / 2);
+
+ /* and drain free_llist_extra */
+ llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
+ enque_to_free(tgt, llnode);
+ do_call_rcu_ttrace(tgt);
+}
+
+static void __free_by_rcu(struct rcu_head *head)
+{
+ struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
+ struct bpf_mem_cache *tgt = c->tgt;
+ struct llist_node *llnode;
+
+ WARN_ON_ONCE(tgt->unit_size != c->unit_size);
+ WARN_ON_ONCE(tgt->percpu_size != c->percpu_size);
+
+ llnode = llist_del_all(&c->waiting_for_gp);
+ if (!llnode)
+ goto out;
+
+ llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace);
+
+ /* Objects went through regular RCU GP. Send them to RCU tasks trace */
+ do_call_rcu_ttrace(tgt);
+out:
+ atomic_set(&c->call_rcu_in_progress, 0);
+}
+
+static void check_free_by_rcu(struct bpf_mem_cache *c)
+{
+ struct llist_node *llnode, *t;
+ unsigned long flags;
+
+ /* drain free_llist_extra_rcu */
+ if (unlikely(!llist_empty(&c->free_llist_extra_rcu))) {
+ inc_active(c, &flags);
+ llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu))
+ if (__llist_add(llnode, &c->free_by_rcu))
+ c->free_by_rcu_tail = llnode;
+ dec_active(c, &flags);
+ }
+
+ if (llist_empty(&c->free_by_rcu))
+ return;
+
+ if (atomic_xchg(&c->call_rcu_in_progress, 1)) {
+ /*
+ * Instead of kmalloc-ing new rcu_head and triggering 10k
+ * call_rcu() to hit rcutree.qhimark and force RCU to notice
+ * the overload just ask RCU to hurry up. There could be many
+ * objects in free_by_rcu list.
+ * This hint reduces memory consumption for an artificial
+ * benchmark from 2 Gbyte to 150 Mbyte.
+ */
+ rcu_request_urgent_qs_task(current);
+ return;
+ }
+
+ WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
+
+ inc_active(c, &flags);
+ WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu));
+ c->waiting_for_gp_tail = c->free_by_rcu_tail;
+ dec_active(c, &flags);
+
+ if (unlikely(READ_ONCE(c->draining))) {
+ free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
+ atomic_set(&c->call_rcu_in_progress, 0);
+ } else {
+ call_rcu_hurry(&c->rcu, __free_by_rcu);
+ }
+}
+
+static void bpf_mem_refill(struct irq_work *work)
+{
+ struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work);
+ int cnt;
+
+ /* Racy access to free_cnt. It doesn't need to be 100% accurate */
+ cnt = c->free_cnt;
+ if (cnt < c->low_watermark)
+ /* irq_work runs on this cpu and kmalloc will allocate
+ * from the current numa node which is what we want here.
+ */
+ alloc_bulk(c, c->batch, NUMA_NO_NODE, true);
+ else if (cnt > c->high_watermark)
+ free_bulk(c);
+
+ check_free_by_rcu(c);
+}
+
+static void notrace irq_work_raise(struct bpf_mem_cache *c)
+{
+ irq_work_queue(&c->refill_work);
+}
+
+/* For typical bpf map case that uses bpf_mem_cache_alloc and single bucket
+ * the freelist cache will be elem_size * 64 (or less) on each cpu.
+ *
+ * For bpf programs that don't have statically known allocation sizes and
+ * assuming (low_mark + high_mark) / 2 as an average number of elements per
+ * bucket and all buckets are used the total amount of memory in freelists
+ * on each cpu will be:
+ * 64*16 + 64*32 + 64*64 + 64*96 + 64*128 + 64*196 + 64*256 + 32*512 + 16*1024 + 8*2048 + 4*4096
+ * == ~ 116 Kbyte using below heuristic.
+ * Initialized, but unused bpf allocator (not bpf map specific one) will
+ * consume ~ 11 Kbyte per cpu.
+ * Typical case will be between 11K and 116K closer to 11K.
+ * bpf progs can and should share bpf_mem_cache when possible.
+ *
+ * Percpu allocation is typically rare. To avoid potential unnecessary large
+ * memory consumption, set low_mark = 1 and high_mark = 3, resulting in c->batch = 1.
+ */
+static void init_refill_work(struct bpf_mem_cache *c)
+{
+ init_irq_work(&c->refill_work, bpf_mem_refill);
+ if (c->percpu_size) {
+ c->low_watermark = 1;
+ c->high_watermark = 3;
+ } else if (c->unit_size <= 256) {
+ c->low_watermark = 32;
+ c->high_watermark = 96;
+ } else {
+ /* When page_size == 4k, order-0 cache will have low_mark == 2
+ * and high_mark == 6 with batch alloc of 3 individual pages at
+ * a time.
+ * 8k allocs and above low == 1, high == 3, batch == 1.
+ */
+ c->low_watermark = max(32 * 256 / c->unit_size, 1);
+ c->high_watermark = max(96 * 256 / c->unit_size, 3);
+ }
+ c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1);
+}
+
+static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
+{
+ int cnt = 1;
+
+ /* To avoid consuming memory, for non-percpu allocation, assume that
+ * 1st run of bpf prog won't be doing more than 4 map_update_elem from
+ * irq disabled region if unit size is less than or equal to 256.
+ * For all other cases, let us just do one allocation.
+ */
+ if (!c->percpu_size && c->unit_size <= 256)
+ cnt = 4;
+ alloc_bulk(c, cnt, cpu_to_node(cpu), false);
+}
+
+/* When size != 0 bpf_mem_cache for each cpu.
+ * This is typical bpf hash map use case when all elements have equal size.
+ *
+ * When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on
+ * kmalloc/kfree. Max allocation size is 4096 in this case.
+ * This is bpf_dynptr and bpf_kptr use case.
+ */
+int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
+{
+ struct bpf_mem_caches *cc; struct bpf_mem_caches __percpu *pcc;
+ struct bpf_mem_cache *c; struct bpf_mem_cache __percpu *pc;
+ struct obj_cgroup *objcg = NULL;
+ int cpu, i, unit_size, percpu_size = 0;
+
+ if (percpu && size == 0)
+ return -EINVAL;
+
+ /* room for llist_node and per-cpu pointer */
+ if (percpu)
+ percpu_size = LLIST_NODE_SZ + sizeof(void *);
+ ma->percpu = percpu;
+
+ if (size) {
+ pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
+ if (!pc)
+ return -ENOMEM;
+
+ if (!percpu)
+ size += LLIST_NODE_SZ; /* room for llist_node */
+ unit_size = size;
+
+#ifdef CONFIG_MEMCG
+ if (memcg_bpf_enabled())
+ objcg = get_obj_cgroup_from_current();
+#endif
+ ma->objcg = objcg;
+
+ for_each_possible_cpu(cpu) {
+ c = per_cpu_ptr(pc, cpu);
+ c->unit_size = unit_size;
+ c->objcg = objcg;
+ c->percpu_size = percpu_size;
+ c->tgt = c;
+ init_refill_work(c);
+ prefill_mem_cache(c, cpu);
+ }
+ ma->cache = pc;
+ return 0;
+ }
+
+ pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
+ if (!pcc)
+ return -ENOMEM;
+#ifdef CONFIG_MEMCG
+ objcg = get_obj_cgroup_from_current();
+#endif
+ ma->objcg = objcg;
+ for_each_possible_cpu(cpu) {
+ cc = per_cpu_ptr(pcc, cpu);
+ for (i = 0; i < NUM_CACHES; i++) {
+ c = &cc->cache[i];
+ c->unit_size = sizes[i];
+ c->objcg = objcg;
+ c->percpu_size = percpu_size;
+ c->tgt = c;
+
+ init_refill_work(c);
+ prefill_mem_cache(c, cpu);
+ }
+ }
+
+ ma->caches = pcc;
+ return 0;
+}
+
+int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg)
+{
+ struct bpf_mem_caches __percpu *pcc;
+
+ pcc = __alloc_percpu_gfp(sizeof(struct bpf_mem_caches), 8, GFP_KERNEL);
+ if (!pcc)
+ return -ENOMEM;
+
+ ma->caches = pcc;
+ ma->objcg = objcg;
+ ma->percpu = true;
+ return 0;
+}
+
+int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size)
+{
+ struct bpf_mem_caches *cc; struct bpf_mem_caches __percpu *pcc;
+ int cpu, i, unit_size, percpu_size;
+ struct obj_cgroup *objcg;
+ struct bpf_mem_cache *c;
+
+ i = bpf_mem_cache_idx(size);
+ if (i < 0)
+ return -EINVAL;
+
+ /* room for llist_node and per-cpu pointer */
+ percpu_size = LLIST_NODE_SZ + sizeof(void *);
+
+ unit_size = sizes[i];
+ objcg = ma->objcg;
+ pcc = ma->caches;
+
+ for_each_possible_cpu(cpu) {
+ cc = per_cpu_ptr(pcc, cpu);
+ c = &cc->cache[i];
+ if (c->unit_size)
+ break;
+
+ c->unit_size = unit_size;
+ c->objcg = objcg;
+ c->percpu_size = percpu_size;
+ c->tgt = c;
+
+ init_refill_work(c);
+ prefill_mem_cache(c, cpu);
+ }
+
+ return 0;
+}
+
+static void drain_mem_cache(struct bpf_mem_cache *c)
+{
+ bool percpu = !!c->percpu_size;
+
+ /* No progs are using this bpf_mem_cache, but htab_map_free() called
+ * bpf_mem_cache_free() for all remaining elements and they can be in
+ * free_by_rcu_ttrace or in waiting_for_gp_ttrace lists, so drain those lists now.
+ *
+ * Except for waiting_for_gp_ttrace list, there are no concurrent operations
+ * on these lists, so it is safe to use __llist_del_all().
+ */
+ free_all(llist_del_all(&c->free_by_rcu_ttrace), percpu);
+ free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
+ free_all(__llist_del_all(&c->free_llist), percpu);
+ free_all(__llist_del_all(&c->free_llist_extra), percpu);
+ free_all(__llist_del_all(&c->free_by_rcu), percpu);
+ free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu);
+ free_all(llist_del_all(&c->waiting_for_gp), percpu);
+}
+
+static void check_mem_cache(struct bpf_mem_cache *c)
+{
+ WARN_ON_ONCE(!llist_empty(&c->free_by_rcu_ttrace));
+ WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace));
+ WARN_ON_ONCE(!llist_empty(&c->free_llist));
+ WARN_ON_ONCE(!llist_empty(&c->free_llist_extra));
+ WARN_ON_ONCE(!llist_empty(&c->free_by_rcu));
+ WARN_ON_ONCE(!llist_empty(&c->free_llist_extra_rcu));
+ WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
+}
+
+static void check_leaked_objs(struct bpf_mem_alloc *ma)
+{
+ struct bpf_mem_caches *cc;
+ struct bpf_mem_cache *c;
+ int cpu, i;
+
+ if (ma->cache) {
+ for_each_possible_cpu(cpu) {
+ c = per_cpu_ptr(ma->cache, cpu);
+ check_mem_cache(c);
+ }
+ }
+ if (ma->caches) {
+ for_each_possible_cpu(cpu) {
+ cc = per_cpu_ptr(ma->caches, cpu);
+ for (i = 0; i < NUM_CACHES; i++) {
+ c = &cc->cache[i];
+ check_mem_cache(c);
+ }
+ }
+ }
+}
+
+static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
+{
+ check_leaked_objs(ma);
+ free_percpu(ma->cache);
+ free_percpu(ma->caches);
+ ma->cache = NULL;
+ ma->caches = NULL;
+}
+
+static void free_mem_alloc(struct bpf_mem_alloc *ma)
+{
+ /* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
+ * might still execute. Wait for them.
+ *
+ * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
+ * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
+ * to wait for the pending __free_rcu_tasks_trace() and __free_rcu(),
+ * so if call_rcu(head, __free_rcu) is skipped due to
+ * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
+ * using rcu_trace_implies_rcu_gp() as well.
+ */
+ rcu_barrier(); /* wait for __free_by_rcu */
+ rcu_barrier_tasks_trace(); /* wait for __free_rcu */
+ if (!rcu_trace_implies_rcu_gp())
+ rcu_barrier();
+ free_mem_alloc_no_barrier(ma);
+}
+
+static void free_mem_alloc_deferred(struct work_struct *work)
+{
+ struct bpf_mem_alloc *ma = container_of(work, struct bpf_mem_alloc, work);
+
+ free_mem_alloc(ma);
+ kfree(ma);
+}
+
+static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress)
+{
+ struct bpf_mem_alloc *copy;
+
+ if (!rcu_in_progress) {
+ /* Fast path. No callbacks are pending, hence no need to do
+ * rcu_barrier-s.
+ */
+ free_mem_alloc_no_barrier(ma);
+ return;
+ }
+
+ copy = kmemdup(ma, sizeof(*ma), GFP_KERNEL);
+ if (!copy) {
+ /* Slow path with inline barrier-s */
+ free_mem_alloc(ma);
+ return;
+ }
+
+ /* Defer barriers into worker to let the rest of map memory to be freed */
+ memset(ma, 0, sizeof(*ma));
+ INIT_WORK(&copy->work, free_mem_alloc_deferred);
+ queue_work(system_dfl_wq, &copy->work);
+}
+
+void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
+{
+ struct bpf_mem_caches *cc;
+ struct bpf_mem_cache *c;
+ int cpu, i, rcu_in_progress;
+
+ if (ma->cache) {
+ rcu_in_progress = 0;
+ for_each_possible_cpu(cpu) {
+ c = per_cpu_ptr(ma->cache, cpu);
+ WRITE_ONCE(c->draining, true);
+ irq_work_sync(&c->refill_work);
+ drain_mem_cache(c);
+ rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
+ rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
+ }
+ obj_cgroup_put(ma->objcg);
+ destroy_mem_alloc(ma, rcu_in_progress);
+ }
+ if (ma->caches) {
+ rcu_in_progress = 0;
+ for_each_possible_cpu(cpu) {
+ cc = per_cpu_ptr(ma->caches, cpu);
+ for (i = 0; i < NUM_CACHES; i++) {
+ c = &cc->cache[i];
+ WRITE_ONCE(c->draining, true);
+ irq_work_sync(&c->refill_work);
+ drain_mem_cache(c);
+ rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
+ rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
+ }
+ }
+ obj_cgroup_put(ma->objcg);
+ destroy_mem_alloc(ma, rcu_in_progress);
+ }
+}
+
+/* notrace is necessary here and in other functions to make sure
+ * bpf programs cannot attach to them and cause llist corruptions.
+ */
+static void notrace *unit_alloc(struct bpf_mem_cache *c)
+{
+ struct llist_node *llnode = NULL;
+ unsigned long flags;
+ int cnt = 0;
+
+ /* Disable irqs to prevent the following race for majority of prog types:
+ * prog_A
+ * bpf_mem_alloc
+ * preemption or irq -> prog_B
+ * bpf_mem_alloc
+ *
+ * but prog_B could be a perf_event NMI prog.
+ * Use per-cpu 'active' counter to order free_list access between
+ * unit_alloc/unit_free/bpf_mem_refill.
+ */
+ local_irq_save(flags);
+ if (local_inc_return(&c->active) == 1) {
+ llnode = __llist_del_first(&c->free_llist);
+ if (llnode) {
+ cnt = --c->free_cnt;
+ *(struct bpf_mem_cache **)llnode = c;
+ }
+ }
+ local_dec(&c->active);
+
+ WARN_ON(cnt < 0);
+
+ if (cnt < c->low_watermark)
+ irq_work_raise(c);
+ /* Enable IRQ after the enqueue of irq work completes, so irq work
+ * will run after IRQ is enabled and free_llist may be refilled by
+ * irq work before other task preempts current task.
+ */
+ local_irq_restore(flags);
+
+ return llnode;
+}
+
+/* Though 'ptr' object could have been allocated on a different cpu
+ * add it to the free_llist of the current cpu.
+ * Let kfree() logic deal with it when it's later called from irq_work.
+ */
+static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
+{
+ struct llist_node *llnode = ptr - LLIST_NODE_SZ;
+ unsigned long flags;
+ int cnt = 0;
+
+ BUILD_BUG_ON(LLIST_NODE_SZ > 8);
+
+ /*
+ * Remember bpf_mem_cache that allocated this object.
+ * The hint is not accurate.
+ */
+ c->tgt = *(struct bpf_mem_cache **)llnode;
+
+ local_irq_save(flags);
+ if (local_inc_return(&c->active) == 1) {
+ __llist_add(llnode, &c->free_llist);
+ cnt = ++c->free_cnt;
+ } else {
+ /* unit_free() cannot fail. Therefore add an object to atomic
+ * llist. free_bulk() will drain it. Though free_llist_extra is
+ * a per-cpu list we have to use atomic llist_add here, since
+ * it also can be interrupted by bpf nmi prog that does another
+ * unit_free() into the same free_llist_extra.
+ */
+ llist_add(llnode, &c->free_llist_extra);
+ }
+ local_dec(&c->active);
+
+ if (cnt > c->high_watermark)
+ /* free few objects from current cpu into global kmalloc pool */
+ irq_work_raise(c);
+ /* Enable IRQ after irq_work_raise() completes, otherwise when current
+ * task is preempted by task which does unit_alloc(), unit_alloc() may
+ * return NULL unexpectedly because irq work is already pending but can
+ * not been triggered and free_llist can not be refilled timely.
+ */
+ local_irq_restore(flags);
+}
+
+static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
+{
+ struct llist_node *llnode = ptr - LLIST_NODE_SZ;
+ unsigned long flags;
+
+ c->tgt = *(struct bpf_mem_cache **)llnode;
+
+ local_irq_save(flags);
+ if (local_inc_return(&c->active) == 1) {
+ if (__llist_add(llnode, &c->free_by_rcu))
+ c->free_by_rcu_tail = llnode;
+ } else {
+ llist_add(llnode, &c->free_llist_extra_rcu);
+ }
+ local_dec(&c->active);
+
+ if (!atomic_read(&c->call_rcu_in_progress))
+ irq_work_raise(c);
+ local_irq_restore(flags);
+}
+
+/* Called from BPF program or from sys_bpf syscall.
+ * In both cases migration is disabled.
+ */
+void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
+{
+ int idx;
+ void *ret;
+
+ if (!size)
+ return NULL;
+
+ if (!ma->percpu)
+ size += LLIST_NODE_SZ;
+ idx = bpf_mem_cache_idx(size);
+ if (idx < 0)
+ return NULL;
+
+ ret = unit_alloc(this_cpu_ptr(ma->caches)->cache + idx);
+ return !ret ? NULL : ret + LLIST_NODE_SZ;
+}
+
+void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
+{
+ struct bpf_mem_cache *c;
+ int idx;
+
+ if (!ptr)
+ return;
+
+ c = *(void **)(ptr - LLIST_NODE_SZ);
+ idx = bpf_mem_cache_idx(c->unit_size);
+ if (WARN_ON_ONCE(idx < 0))
+ return;
+
+ unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
+}
+
+void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
+{
+ struct bpf_mem_cache *c;
+ int idx;
+
+ if (!ptr)
+ return;
+
+ c = *(void **)(ptr - LLIST_NODE_SZ);
+ idx = bpf_mem_cache_idx(c->unit_size);
+ if (WARN_ON_ONCE(idx < 0))
+ return;
+
+ unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr);
+}
+
+void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
+{
+ void *ret;
+
+ ret = unit_alloc(this_cpu_ptr(ma->cache));
+ return !ret ? NULL : ret + LLIST_NODE_SZ;
+}
+
+void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
+{
+ if (!ptr)
+ return;
+
+ unit_free(this_cpu_ptr(ma->cache), ptr);
+}
+
+void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
+{
+ if (!ptr)
+ return;
+
+ unit_free_rcu(this_cpu_ptr(ma->cache), ptr);
+}
+
+/* Directly does a kfree() without putting 'ptr' back to the free_llist
+ * for reuse and without waiting for a rcu_tasks_trace gp.
+ * The caller must first go through the rcu_tasks_trace gp for 'ptr'
+ * before calling bpf_mem_cache_raw_free().
+ * It could be used when the rcu_tasks_trace callback does not have
+ * a hold on the original bpf_mem_alloc object that allocated the
+ * 'ptr'. This should only be used in the uncommon code path.
+ * Otherwise, the bpf_mem_alloc's free_llist cannot be refilled
+ * and may affect performance.
+ */
+void bpf_mem_cache_raw_free(void *ptr)
+{
+ if (!ptr)
+ return;
+
+ kfree(ptr - LLIST_NODE_SZ);
+}
+
+/* When flags == GFP_KERNEL, it signals that the caller will not cause
+ * deadlock when using kmalloc. bpf_mem_cache_alloc_flags() will use
+ * kmalloc if the free_llist is empty.
+ */
+void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
+{
+ struct bpf_mem_cache *c;
+ void *ret;
+
+ c = this_cpu_ptr(ma->cache);
+
+ ret = unit_alloc(c);
+ if (!ret && flags == GFP_KERNEL) {
+ struct mem_cgroup *memcg, *old_memcg;
+
+ memcg = get_memcg(c);
+ old_memcg = set_active_memcg(memcg);
+ ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT);
+ if (ret)
+ *(struct bpf_mem_cache **)ret = c;
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+ }
+
+ return !ret ? NULL : ret + LLIST_NODE_SZ;
+}
+
+int bpf_mem_alloc_check_size(bool percpu, size_t size)
+{
+ /* The size of percpu allocation doesn't have LLIST_NODE_SZ overhead */
+ if ((percpu && size > BPF_MEM_ALLOC_SIZE_MAX) ||
+ (!percpu && size > BPF_MEM_ALLOC_SIZE_MAX - LLIST_NODE_SZ))
+ return -E2BIG;
+
+ return 0;
+}
diff --git a/kernel/bpf/mprog.c b/kernel/bpf/mprog.c
new file mode 100644
index 000000000000..1394168062e8
--- /dev/null
+++ b/kernel/bpf/mprog.c
@@ -0,0 +1,452 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Isovalent */
+
+#include <linux/bpf.h>
+#include <linux/bpf_mprog.h>
+
+static int bpf_mprog_link(struct bpf_tuple *tuple,
+ u32 id_or_fd, u32 flags,
+ enum bpf_prog_type type)
+{
+ struct bpf_link *link = ERR_PTR(-EINVAL);
+ bool id = flags & BPF_F_ID;
+
+ if (id)
+ link = bpf_link_by_id(id_or_fd);
+ else if (id_or_fd)
+ link = bpf_link_get_from_fd(id_or_fd);
+ if (IS_ERR(link))
+ return PTR_ERR(link);
+ if (type && link->prog->type != type) {
+ bpf_link_put(link);
+ return -EINVAL;
+ }
+
+ tuple->link = link;
+ tuple->prog = link->prog;
+ return 0;
+}
+
+static int bpf_mprog_prog(struct bpf_tuple *tuple,
+ u32 id_or_fd, u32 flags,
+ enum bpf_prog_type type)
+{
+ struct bpf_prog *prog = ERR_PTR(-EINVAL);
+ bool id = flags & BPF_F_ID;
+
+ if (id)
+ prog = bpf_prog_by_id(id_or_fd);
+ else if (id_or_fd)
+ prog = bpf_prog_get(id_or_fd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+ if (type && prog->type != type) {
+ bpf_prog_put(prog);
+ return -EINVAL;
+ }
+
+ tuple->link = NULL;
+ tuple->prog = prog;
+ return 0;
+}
+
+static int bpf_mprog_tuple_relative(struct bpf_tuple *tuple,
+ u32 id_or_fd, u32 flags,
+ enum bpf_prog_type type)
+{
+ bool link = flags & BPF_F_LINK;
+ bool id = flags & BPF_F_ID;
+
+ memset(tuple, 0, sizeof(*tuple));
+ if (link)
+ return bpf_mprog_link(tuple, id_or_fd, flags, type);
+ /* If no relevant flag is set and no id_or_fd was passed, then
+ * tuple link/prog is just NULLed. This is the case when before/
+ * after selects first/last position without passing fd.
+ */
+ if (!id && !id_or_fd)
+ return 0;
+ return bpf_mprog_prog(tuple, id_or_fd, flags, type);
+}
+
+static void bpf_mprog_tuple_put(struct bpf_tuple *tuple)
+{
+ if (tuple->link)
+ bpf_link_put(tuple->link);
+ else if (tuple->prog)
+ bpf_prog_put(tuple->prog);
+}
+
+/* The bpf_mprog_{replace,delete}() operate on exact idx position with the
+ * one exception that for deletion we support delete from front/back. In
+ * case of front idx is -1, in case of back idx is bpf_mprog_total(entry).
+ * Adjustment to first and last entry is trivial. The bpf_mprog_insert()
+ * we have to deal with the following cases:
+ *
+ * idx + before:
+ *
+ * Insert P4 before P3: idx for old array is 1, idx for new array is 2,
+ * hence we adjust target idx for the new array, so that memmove copies
+ * P1 and P2 to the new entry, and we insert P4 into idx 2. Inserting
+ * before P1 would have old idx -1 and new idx 0.
+ *
+ * +--+--+--+ +--+--+--+--+ +--+--+--+--+
+ * |P1|P2|P3| ==> |P1|P2| |P3| ==> |P1|P2|P4|P3|
+ * +--+--+--+ +--+--+--+--+ +--+--+--+--+
+ *
+ * idx + after:
+ *
+ * Insert P4 after P2: idx for old array is 2, idx for new array is 2.
+ * Again, memmove copies P1 and P2 to the new entry, and we insert P4
+ * into idx 2. Inserting after P3 would have both old/new idx at 4 aka
+ * bpf_mprog_total(entry).
+ *
+ * +--+--+--+ +--+--+--+--+ +--+--+--+--+
+ * |P1|P2|P3| ==> |P1|P2| |P3| ==> |P1|P2|P4|P3|
+ * +--+--+--+ +--+--+--+--+ +--+--+--+--+
+ */
+static int bpf_mprog_replace(struct bpf_mprog_entry *entry,
+ struct bpf_mprog_entry **entry_new,
+ struct bpf_tuple *ntuple, int idx)
+{
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+ struct bpf_prog *oprog;
+
+ bpf_mprog_read(entry, idx, &fp, &cp);
+ oprog = READ_ONCE(fp->prog);
+ bpf_mprog_write(fp, cp, ntuple);
+ if (!ntuple->link) {
+ WARN_ON_ONCE(cp->link);
+ bpf_prog_put(oprog);
+ }
+ *entry_new = entry;
+ return 0;
+}
+
+static int bpf_mprog_insert(struct bpf_mprog_entry *entry,
+ struct bpf_mprog_entry **entry_new,
+ struct bpf_tuple *ntuple, int idx, u32 flags)
+{
+ int total = bpf_mprog_total(entry);
+ struct bpf_mprog_entry *peer;
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+
+ peer = bpf_mprog_peer(entry);
+ bpf_mprog_entry_copy(peer, entry);
+ if (idx == total)
+ goto insert;
+ else if (flags & BPF_F_BEFORE)
+ idx += 1;
+ bpf_mprog_entry_grow(peer, idx);
+insert:
+ bpf_mprog_read(peer, idx, &fp, &cp);
+ bpf_mprog_write(fp, cp, ntuple);
+ bpf_mprog_inc(peer);
+ *entry_new = peer;
+ return 0;
+}
+
+static int bpf_mprog_delete(struct bpf_mprog_entry *entry,
+ struct bpf_mprog_entry **entry_new,
+ struct bpf_tuple *dtuple, int idx)
+{
+ int total = bpf_mprog_total(entry);
+ struct bpf_mprog_entry *peer;
+
+ peer = bpf_mprog_peer(entry);
+ bpf_mprog_entry_copy(peer, entry);
+ if (idx == -1)
+ idx = 0;
+ else if (idx == total)
+ idx = total - 1;
+ bpf_mprog_entry_shrink(peer, idx);
+ bpf_mprog_dec(peer);
+ bpf_mprog_mark_for_release(peer, dtuple);
+ *entry_new = peer;
+ return 0;
+}
+
+/* In bpf_mprog_pos_*() we evaluate the target position for the BPF
+ * program/link that needs to be replaced, inserted or deleted for
+ * each "rule" independently. If all rules agree on that position
+ * or existing element, then enact replacement, addition or deletion.
+ * If this is not the case, then the request cannot be satisfied and
+ * we bail out with an error.
+ */
+static int bpf_mprog_pos_exact(struct bpf_mprog_entry *entry,
+ struct bpf_tuple *tuple)
+{
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+ int i;
+
+ for (i = 0; i < bpf_mprog_total(entry); i++) {
+ bpf_mprog_read(entry, i, &fp, &cp);
+ if (tuple->prog == READ_ONCE(fp->prog))
+ return tuple->link == cp->link ? i : -EBUSY;
+ }
+ return -ENOENT;
+}
+
+static int bpf_mprog_pos_before(struct bpf_mprog_entry *entry,
+ struct bpf_tuple *tuple)
+{
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+ int i;
+
+ for (i = 0; i < bpf_mprog_total(entry); i++) {
+ bpf_mprog_read(entry, i, &fp, &cp);
+ if (tuple->prog == READ_ONCE(fp->prog) &&
+ (!tuple->link || tuple->link == cp->link))
+ return i - 1;
+ }
+ return tuple->prog ? -ENOENT : -1;
+}
+
+static int bpf_mprog_pos_after(struct bpf_mprog_entry *entry,
+ struct bpf_tuple *tuple)
+{
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+ int i;
+
+ for (i = 0; i < bpf_mprog_total(entry); i++) {
+ bpf_mprog_read(entry, i, &fp, &cp);
+ if (tuple->prog == READ_ONCE(fp->prog) &&
+ (!tuple->link || tuple->link == cp->link))
+ return i + 1;
+ }
+ return tuple->prog ? -ENOENT : bpf_mprog_total(entry);
+}
+
+int bpf_mprog_attach(struct bpf_mprog_entry *entry,
+ struct bpf_mprog_entry **entry_new,
+ struct bpf_prog *prog_new, struct bpf_link *link,
+ struct bpf_prog *prog_old,
+ u32 flags, u32 id_or_fd, u64 revision)
+{
+ struct bpf_tuple rtuple, ntuple = {
+ .prog = prog_new,
+ .link = link,
+ }, otuple = {
+ .prog = prog_old,
+ .link = link,
+ };
+ int ret, idx = -ERANGE, tidx;
+
+ if (revision && revision != bpf_mprog_revision(entry))
+ return -ESTALE;
+ if (bpf_mprog_exists(entry, prog_new))
+ return -EEXIST;
+ ret = bpf_mprog_tuple_relative(&rtuple, id_or_fd,
+ flags & ~BPF_F_REPLACE,
+ prog_new->type);
+ if (ret)
+ return ret;
+ if (flags & BPF_F_REPLACE) {
+ tidx = bpf_mprog_pos_exact(entry, &otuple);
+ if (tidx < 0) {
+ ret = tidx;
+ goto out;
+ }
+ idx = tidx;
+ } else if (bpf_mprog_total(entry) == bpf_mprog_max()) {
+ ret = -ERANGE;
+ goto out;
+ }
+ if (flags & BPF_F_BEFORE) {
+ tidx = bpf_mprog_pos_before(entry, &rtuple);
+ if (tidx < -1 || (idx >= -1 && tidx != idx)) {
+ ret = tidx < -1 ? tidx : -ERANGE;
+ goto out;
+ }
+ idx = tidx;
+ }
+ if (flags & BPF_F_AFTER) {
+ tidx = bpf_mprog_pos_after(entry, &rtuple);
+ if (tidx < -1 || (idx >= -1 && tidx != idx)) {
+ ret = tidx < 0 ? tidx : -ERANGE;
+ goto out;
+ }
+ idx = tidx;
+ }
+ if (idx < -1) {
+ if (rtuple.prog || flags) {
+ ret = -EINVAL;
+ goto out;
+ }
+ idx = bpf_mprog_total(entry);
+ flags = BPF_F_AFTER;
+ }
+ if (idx >= bpf_mprog_max()) {
+ ret = -ERANGE;
+ goto out;
+ }
+ if (flags & BPF_F_REPLACE)
+ ret = bpf_mprog_replace(entry, entry_new, &ntuple, idx);
+ else
+ ret = bpf_mprog_insert(entry, entry_new, &ntuple, idx, flags);
+out:
+ bpf_mprog_tuple_put(&rtuple);
+ return ret;
+}
+
+static int bpf_mprog_fetch(struct bpf_mprog_entry *entry,
+ struct bpf_tuple *tuple, int idx)
+{
+ int total = bpf_mprog_total(entry);
+ struct bpf_mprog_cp *cp;
+ struct bpf_mprog_fp *fp;
+ struct bpf_prog *prog;
+ struct bpf_link *link;
+
+ if (idx == -1)
+ idx = 0;
+ else if (idx == total)
+ idx = total - 1;
+ bpf_mprog_read(entry, idx, &fp, &cp);
+ prog = READ_ONCE(fp->prog);
+ link = cp->link;
+ /* The deletion request can either be without filled tuple in which
+ * case it gets populated here based on idx, or with filled tuple
+ * where the only thing we end up doing is the WARN_ON_ONCE() assert.
+ * If we hit a BPF link at the given index, it must not be removed
+ * from opts path.
+ */
+ if (link && !tuple->link)
+ return -EBUSY;
+ WARN_ON_ONCE(tuple->prog && tuple->prog != prog);
+ WARN_ON_ONCE(tuple->link && tuple->link != link);
+ tuple->prog = prog;
+ tuple->link = link;
+ return 0;
+}
+
+int bpf_mprog_detach(struct bpf_mprog_entry *entry,
+ struct bpf_mprog_entry **entry_new,
+ struct bpf_prog *prog, struct bpf_link *link,
+ u32 flags, u32 id_or_fd, u64 revision)
+{
+ struct bpf_tuple rtuple, dtuple = {
+ .prog = prog,
+ .link = link,
+ };
+ int ret, idx = -ERANGE, tidx;
+
+ if (flags & BPF_F_REPLACE)
+ return -EINVAL;
+ if (revision && revision != bpf_mprog_revision(entry))
+ return -ESTALE;
+ if (!bpf_mprog_total(entry))
+ return -ENOENT;
+ ret = bpf_mprog_tuple_relative(&rtuple, id_or_fd, flags,
+ prog ? prog->type :
+ BPF_PROG_TYPE_UNSPEC);
+ if (ret)
+ return ret;
+ if (dtuple.prog) {
+ tidx = bpf_mprog_pos_exact(entry, &dtuple);
+ if (tidx < 0) {
+ ret = tidx;
+ goto out;
+ }
+ idx = tidx;
+ }
+ if (flags & BPF_F_BEFORE) {
+ tidx = bpf_mprog_pos_before(entry, &rtuple);
+ if (tidx < -1 || (idx >= -1 && tidx != idx)) {
+ ret = tidx < -1 ? tidx : -ERANGE;
+ goto out;
+ }
+ idx = tidx;
+ }
+ if (flags & BPF_F_AFTER) {
+ tidx = bpf_mprog_pos_after(entry, &rtuple);
+ if (tidx < -1 || (idx >= -1 && tidx != idx)) {
+ ret = tidx < 0 ? tidx : -ERANGE;
+ goto out;
+ }
+ idx = tidx;
+ }
+ if (idx < -1) {
+ if (rtuple.prog || flags) {
+ ret = -EINVAL;
+ goto out;
+ }
+ idx = bpf_mprog_total(entry);
+ flags = BPF_F_AFTER;
+ }
+ if (idx >= bpf_mprog_max()) {
+ ret = -ERANGE;
+ goto out;
+ }
+ ret = bpf_mprog_fetch(entry, &dtuple, idx);
+ if (ret)
+ goto out;
+ ret = bpf_mprog_delete(entry, entry_new, &dtuple, idx);
+out:
+ bpf_mprog_tuple_put(&rtuple);
+ return ret;
+}
+
+int bpf_mprog_query(const union bpf_attr *attr, union bpf_attr __user *uattr,
+ struct bpf_mprog_entry *entry)
+{
+ u32 __user *uprog_flags, *ulink_flags;
+ u32 __user *uprog_id, *ulink_id;
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+ struct bpf_prog *prog;
+ const u32 flags = 0;
+ u32 id, count = 0;
+ u64 revision = 1;
+ int i, ret = 0;
+
+ if (attr->query.query_flags || attr->query.attach_flags)
+ return -EINVAL;
+ if (entry) {
+ revision = bpf_mprog_revision(entry);
+ count = bpf_mprog_total(entry);
+ }
+ if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
+ return -EFAULT;
+ if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision)))
+ return -EFAULT;
+ if (copy_to_user(&uattr->query.count, &count, sizeof(count)))
+ return -EFAULT;
+ uprog_id = u64_to_user_ptr(attr->query.prog_ids);
+ uprog_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
+ ulink_id = u64_to_user_ptr(attr->query.link_ids);
+ ulink_flags = u64_to_user_ptr(attr->query.link_attach_flags);
+ if (attr->query.count == 0 || !uprog_id || !count)
+ return 0;
+ if (attr->query.count < count) {
+ count = attr->query.count;
+ ret = -ENOSPC;
+ }
+ for (i = 0; i < bpf_mprog_max(); i++) {
+ bpf_mprog_read(entry, i, &fp, &cp);
+ prog = READ_ONCE(fp->prog);
+ if (!prog)
+ break;
+ id = prog->aux->id;
+ if (copy_to_user(uprog_id + i, &id, sizeof(id)))
+ return -EFAULT;
+ if (uprog_flags &&
+ copy_to_user(uprog_flags + i, &flags, sizeof(flags)))
+ return -EFAULT;
+ id = cp->link ? cp->link->id : 0;
+ if (ulink_id &&
+ copy_to_user(ulink_id + i, &id, sizeof(id)))
+ return -EFAULT;
+ if (ulink_flags &&
+ copy_to_user(ulink_flags + i, &flags, sizeof(flags)))
+ return -EFAULT;
+ if (i + 1 == count)
+ break;
+ }
+ return ret;
+}
diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c
index 868cc2c43899..8e88201c98bf 100644
--- a/kernel/bpf/net_namespace.c
+++ b/kernel/bpf/net_namespace.c
@@ -11,8 +11,6 @@
struct bpf_netns_link {
struct bpf_link link;
- enum bpf_attach_type type;
- enum netns_bpf_attach_type netns_type;
/* We don't hold a ref to net in order to auto-detach the link
* when netns is going away. Instead we rely on pernet
@@ -21,6 +19,7 @@ struct bpf_netns_link {
*/
struct net *net;
struct list_head node; /* node in list of links attached to net */
+ enum netns_bpf_attach_type netns_type;
};
/* Protects updates to netns_bpf */
@@ -216,7 +215,7 @@ static int bpf_netns_link_fill_info(const struct bpf_link *link,
mutex_unlock(&netns_bpf_mutex);
info->netns.netns_ino = inum;
- info->netns.attach_type = net_link->type;
+ info->netns.attach_type = link->attach_type;
return 0;
}
@@ -230,7 +229,7 @@ static void bpf_netns_link_show_fdinfo(const struct bpf_link *link,
"netns_ino:\t%u\n"
"attach_type:\t%u\n",
info.netns.netns_ino,
- info.netns.attach_type);
+ link->attach_type);
}
static const struct bpf_link_ops bpf_netns_link_ops = {
@@ -501,9 +500,8 @@ int netns_bpf_link_create(const union bpf_attr *attr, struct bpf_prog *prog)
goto out_put_net;
}
bpf_link_init(&net_link->link, BPF_LINK_TYPE_NETNS,
- &bpf_netns_link_ops, prog);
+ &bpf_netns_link_ops, prog, type);
net_link->net = net;
- net_link->type = type;
net_link->netns_type = netns_type;
err = bpf_link_prime(&net_link->link, &link_primer);
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index bd09290e3648..42ae8d595c2c 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -25,6 +25,8 @@
#include <linux/rhashtable.h>
#include <linux/rtnetlink.h>
#include <linux/rwsem.h>
+#include <net/netdev_lock.h>
+#include <net/xdp.h>
/* Protects offdevs, members of bpf_offload_netdev and offload members
* of all progs.
@@ -41,7 +43,7 @@ struct bpf_offload_dev {
struct bpf_offload_netdev {
struct rhash_head l;
struct net_device *netdev;
- struct bpf_offload_dev *offdev;
+ struct bpf_offload_dev *offdev; /* NULL when bound-only */
struct list_head progs;
struct list_head maps;
struct list_head offdev_netdevs;
@@ -56,7 +58,6 @@ static const struct rhashtable_params offdevs_params = {
};
static struct rhashtable offdevs;
-static bool offdevs_inited;
static int bpf_dev_offload_check(struct net_device *netdev)
{
@@ -72,58 +73,227 @@ bpf_offload_find_netdev(struct net_device *netdev)
{
lockdep_assert_held(&bpf_devs_lock);
- if (!offdevs_inited)
- return NULL;
return rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params);
}
-int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
+static int __bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev,
+ struct net_device *netdev)
{
struct bpf_offload_netdev *ondev;
- struct bpf_prog_offload *offload;
int err;
- if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS &&
- attr->prog_type != BPF_PROG_TYPE_XDP)
- return -EINVAL;
+ ondev = kzalloc(sizeof(*ondev), GFP_KERNEL);
+ if (!ondev)
+ return -ENOMEM;
- if (attr->prog_flags)
- return -EINVAL;
+ ondev->netdev = netdev;
+ ondev->offdev = offdev;
+ INIT_LIST_HEAD(&ondev->progs);
+ INIT_LIST_HEAD(&ondev->maps);
+
+ err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params);
+ if (err) {
+ netdev_warn(netdev, "failed to register for BPF offload\n");
+ goto err_free;
+ }
+
+ if (offdev)
+ list_add(&ondev->offdev_netdevs, &offdev->netdevs);
+ return 0;
+
+err_free:
+ kfree(ondev);
+ return err;
+}
+
+static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
+{
+ struct bpf_prog_offload *offload = prog->aux->offload;
+
+ if (offload->dev_state)
+ offload->offdev->ops->destroy(prog);
+
+ list_del_init(&offload->offloads);
+ kfree(offload);
+ prog->aux->offload = NULL;
+}
+
+static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap,
+ enum bpf_netdev_command cmd)
+{
+ struct netdev_bpf data = {};
+ struct net_device *netdev;
+
+ ASSERT_RTNL();
+
+ data.command = cmd;
+ data.offmap = offmap;
+ /* Caller must make sure netdev is valid */
+ netdev = offmap->netdev;
+
+ return netdev->netdev_ops->ndo_bpf(netdev, &data);
+}
+
+static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap)
+{
+ WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE));
+ /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */
+ bpf_map_free_id(&offmap->map);
+ list_del_init(&offmap->offloads);
+ offmap->netdev = NULL;
+}
+
+static void __bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev,
+ struct net_device *netdev)
+{
+ struct bpf_offload_netdev *ondev, *altdev = NULL;
+ struct bpf_offloaded_map *offmap, *mtmp;
+ struct bpf_prog_offload *offload, *ptmp;
+
+ ASSERT_RTNL();
+
+ ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params);
+ if (WARN_ON(!ondev))
+ return;
+
+ WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params));
+
+ /* Try to move the objects to another netdev of the device */
+ if (offdev) {
+ list_del(&ondev->offdev_netdevs);
+ altdev = list_first_entry_or_null(&offdev->netdevs,
+ struct bpf_offload_netdev,
+ offdev_netdevs);
+ }
+
+ if (altdev) {
+ list_for_each_entry(offload, &ondev->progs, offloads)
+ offload->netdev = altdev->netdev;
+ list_splice_init(&ondev->progs, &altdev->progs);
+
+ list_for_each_entry(offmap, &ondev->maps, offloads)
+ offmap->netdev = altdev->netdev;
+ list_splice_init(&ondev->maps, &altdev->maps);
+ } else {
+ list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads)
+ __bpf_prog_offload_destroy(offload->prog);
+ list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads)
+ __bpf_map_offload_destroy(offmap);
+ }
+
+ WARN_ON(!list_empty(&ondev->progs));
+ WARN_ON(!list_empty(&ondev->maps));
+ kfree(ondev);
+}
+
+static int __bpf_prog_dev_bound_init(struct bpf_prog *prog, struct net_device *netdev)
+{
+ struct bpf_offload_netdev *ondev;
+ struct bpf_prog_offload *offload;
+ int err;
offload = kzalloc(sizeof(*offload), GFP_USER);
if (!offload)
return -ENOMEM;
offload->prog = prog;
+ offload->netdev = netdev;
- offload->netdev = dev_get_by_index(current->nsproxy->net_ns,
- attr->prog_ifindex);
- err = bpf_dev_offload_check(offload->netdev);
- if (err)
- goto err_maybe_put;
-
- down_write(&bpf_devs_lock);
ondev = bpf_offload_find_netdev(offload->netdev);
- if (!ondev) {
+ /* When program is offloaded require presence of "true"
+ * bpf_offload_netdev, avoid the one created for !ondev case below.
+ */
+ if (bpf_prog_is_offloaded(prog->aux) && (!ondev || !ondev->offdev)) {
err = -EINVAL;
- goto err_unlock;
+ goto err_free;
+ }
+ if (!ondev) {
+ /* When only binding to the device, explicitly
+ * create an entry in the hashtable.
+ */
+ err = __bpf_offload_dev_netdev_register(NULL, offload->netdev);
+ if (err)
+ goto err_free;
+ ondev = bpf_offload_find_netdev(offload->netdev);
}
offload->offdev = ondev->offdev;
prog->aux->offload = offload;
list_add_tail(&offload->offloads, &ondev->progs);
- dev_put(offload->netdev);
- up_write(&bpf_devs_lock);
return 0;
-err_unlock:
- up_write(&bpf_devs_lock);
-err_maybe_put:
- if (offload->netdev)
- dev_put(offload->netdev);
+err_free:
kfree(offload);
return err;
}
+int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr)
+{
+ struct net_device *netdev;
+ int err;
+
+ if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS &&
+ attr->prog_type != BPF_PROG_TYPE_XDP)
+ return -EINVAL;
+
+ if (attr->prog_flags & ~(BPF_F_XDP_DEV_BOUND_ONLY | BPF_F_XDP_HAS_FRAGS))
+ return -EINVAL;
+
+ /* Frags are allowed only if program is dev-bound-only, but not
+ * if it is requesting bpf offload.
+ */
+ if (attr->prog_flags & BPF_F_XDP_HAS_FRAGS &&
+ !(attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY))
+ return -EINVAL;
+
+ if (attr->prog_type == BPF_PROG_TYPE_SCHED_CLS &&
+ attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY)
+ return -EINVAL;
+
+ netdev = dev_get_by_index(current->nsproxy->net_ns, attr->prog_ifindex);
+ if (!netdev)
+ return -EINVAL;
+
+ err = bpf_dev_offload_check(netdev);
+ if (err)
+ goto out;
+
+ prog->aux->offload_requested = !(attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY);
+
+ down_write(&bpf_devs_lock);
+ err = __bpf_prog_dev_bound_init(prog, netdev);
+ up_write(&bpf_devs_lock);
+
+out:
+ dev_put(netdev);
+ return err;
+}
+
+int bpf_prog_dev_bound_inherit(struct bpf_prog *new_prog, struct bpf_prog *old_prog)
+{
+ int err;
+
+ if (!bpf_prog_is_dev_bound(old_prog->aux))
+ return 0;
+
+ if (bpf_prog_is_offloaded(old_prog->aux))
+ return -EINVAL;
+
+ new_prog->aux->dev_bound = old_prog->aux->dev_bound;
+ new_prog->aux->offload_requested = old_prog->aux->offload_requested;
+
+ down_write(&bpf_devs_lock);
+ if (!old_prog->aux->offload) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = __bpf_prog_dev_bound_init(new_prog, old_prog->aux->offload->netdev);
+
+out:
+ up_write(&bpf_devs_lock);
+ return err;
+}
+
int bpf_prog_offload_verifier_prep(struct bpf_prog *prog)
{
struct bpf_prog_offload *offload;
@@ -209,27 +379,25 @@ bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
up_read(&bpf_devs_lock);
}
-static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
+void bpf_prog_dev_bound_destroy(struct bpf_prog *prog)
{
- struct bpf_prog_offload *offload = prog->aux->offload;
-
- if (offload->dev_state)
- offload->offdev->ops->destroy(prog);
-
- /* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */
- bpf_prog_free_id(prog, true);
-
- list_del_init(&offload->offloads);
- kfree(offload);
- prog->aux->offload = NULL;
-}
+ struct bpf_offload_netdev *ondev;
+ struct net_device *netdev;
-void bpf_prog_offload_destroy(struct bpf_prog *prog)
-{
+ rtnl_lock();
down_write(&bpf_devs_lock);
- if (prog->aux->offload)
+ if (prog->aux->offload) {
+ list_del_init(&prog->aux->offload->offloads);
+
+ netdev = prog->aux->offload->netdev;
__bpf_prog_offload_destroy(prog);
+
+ ondev = bpf_offload_find_netdev(netdev);
+ if (!ondev->offdev && list_empty(&ondev->progs))
+ __bpf_offload_dev_netdev_unregister(NULL, netdev);
+ }
up_write(&bpf_devs_lock);
+ rtnl_unlock();
}
static int bpf_prog_offload_translate(struct bpf_prog *prog)
@@ -343,22 +511,6 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
const struct bpf_prog_ops bpf_offload_prog_ops = {
};
-static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap,
- enum bpf_netdev_command cmd)
-{
- struct netdev_bpf data = {};
- struct net_device *netdev;
-
- ASSERT_RTNL();
-
- data.command = cmd;
- data.offmap = offmap;
- /* Caller must make sure netdev is valid */
- netdev = offmap->netdev;
-
- return netdev->netdev_ops->ndo_bpf(netdev, &data);
-}
-
struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
{
struct net *net = current->nsproxy->net_ns;
@@ -372,18 +524,19 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
attr->map_type != BPF_MAP_TYPE_HASH)
return ERR_PTR(-EINVAL);
- offmap = kzalloc(sizeof(*offmap), GFP_USER);
+ offmap = bpf_map_area_alloc(sizeof(*offmap), NUMA_NO_NODE);
if (!offmap)
return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&offmap->map, attr);
-
rtnl_lock();
- down_write(&bpf_devs_lock);
offmap->netdev = __dev_get_by_index(net, attr->map_ifindex);
err = bpf_dev_offload_check(offmap->netdev);
if (err)
- goto err_unlock;
+ goto err_unlock_rtnl;
+
+ netdev_lock_ops(offmap->netdev);
+ down_write(&bpf_devs_lock);
ondev = bpf_offload_find_netdev(offmap->netdev);
if (!ondev) {
@@ -397,26 +550,20 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
list_add_tail(&offmap->offloads, &ondev->maps);
up_write(&bpf_devs_lock);
+ netdev_unlock_ops(offmap->netdev);
rtnl_unlock();
return &offmap->map;
err_unlock:
up_write(&bpf_devs_lock);
+ netdev_unlock_ops(offmap->netdev);
+err_unlock_rtnl:
rtnl_unlock();
- kfree(offmap);
+ bpf_map_area_free(offmap);
return ERR_PTR(err);
}
-static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap)
-{
- WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE));
- /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */
- bpf_map_free_id(&offmap->map, true);
- list_del_init(&offmap->offloads);
- offmap->netdev = NULL;
-}
-
void bpf_map_offload_map_free(struct bpf_map *map)
{
struct bpf_offloaded_map *offmap = map_to_offmap(map);
@@ -428,7 +575,13 @@ void bpf_map_offload_map_free(struct bpf_map *map)
up_write(&bpf_devs_lock);
rtnl_unlock();
- kfree(offmap);
+ bpf_map_area_free(offmap);
+}
+
+u64 bpf_map_offload_map_mem_usage(const struct bpf_map *map)
+{
+ /* The memory dynamically allocated in netdev dev_ops is not counted */
+ return sizeof(struct bpf_offloaded_map);
}
int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value)
@@ -576,12 +729,28 @@ bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev)
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_match);
+bool bpf_prog_dev_bound_match(const struct bpf_prog *lhs, const struct bpf_prog *rhs)
+{
+ bool ret;
+
+ if (bpf_prog_is_offloaded(lhs->aux) != bpf_prog_is_offloaded(rhs->aux))
+ return false;
+
+ down_read(&bpf_devs_lock);
+ ret = lhs->aux->offload && rhs->aux->offload &&
+ lhs->aux->offload->netdev &&
+ lhs->aux->offload->netdev == rhs->aux->offload->netdev;
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map)
{
struct bpf_offloaded_map *offmap;
bool ret;
- if (!bpf_map_is_dev_bound(map))
+ if (!bpf_map_is_offloaded(map))
return bpf_map_offload_neutral(map);
offmap = map_to_offmap(map);
@@ -595,32 +764,11 @@ bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map)
int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev,
struct net_device *netdev)
{
- struct bpf_offload_netdev *ondev;
int err;
- ondev = kzalloc(sizeof(*ondev), GFP_KERNEL);
- if (!ondev)
- return -ENOMEM;
-
- ondev->netdev = netdev;
- ondev->offdev = offdev;
- INIT_LIST_HEAD(&ondev->progs);
- INIT_LIST_HEAD(&ondev->maps);
-
down_write(&bpf_devs_lock);
- err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params);
- if (err) {
- netdev_warn(netdev, "failed to register for BPF offload\n");
- goto err_unlock_free;
- }
-
- list_add(&ondev->offdev_netdevs, &offdev->netdevs);
- up_write(&bpf_devs_lock);
- return 0;
-
-err_unlock_free:
+ err = __bpf_offload_dev_netdev_register(offdev, netdev);
up_write(&bpf_devs_lock);
- kfree(ondev);
return err;
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register);
@@ -628,43 +776,8 @@ EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register);
void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev,
struct net_device *netdev)
{
- struct bpf_offload_netdev *ondev, *altdev;
- struct bpf_offloaded_map *offmap, *mtmp;
- struct bpf_prog_offload *offload, *ptmp;
-
- ASSERT_RTNL();
-
down_write(&bpf_devs_lock);
- ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params);
- if (WARN_ON(!ondev))
- goto unlock;
-
- WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params));
- list_del(&ondev->offdev_netdevs);
-
- /* Try to move the objects to another netdev of the device */
- altdev = list_first_entry_or_null(&offdev->netdevs,
- struct bpf_offload_netdev,
- offdev_netdevs);
- if (altdev) {
- list_for_each_entry(offload, &ondev->progs, offloads)
- offload->netdev = altdev->netdev;
- list_splice_init(&ondev->progs, &altdev->progs);
-
- list_for_each_entry(offmap, &ondev->maps, offloads)
- offmap->netdev = altdev->netdev;
- list_splice_init(&ondev->maps, &altdev->maps);
- } else {
- list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads)
- __bpf_prog_offload_destroy(offload->prog);
- list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads)
- __bpf_map_offload_destroy(offmap);
- }
-
- WARN_ON(!list_empty(&ondev->progs));
- WARN_ON(!list_empty(&ondev->maps));
- kfree(ondev);
-unlock:
+ __bpf_offload_dev_netdev_unregister(offdev, netdev);
up_write(&bpf_devs_lock);
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister);
@@ -673,18 +786,6 @@ struct bpf_offload_dev *
bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv)
{
struct bpf_offload_dev *offdev;
- int err;
-
- down_write(&bpf_devs_lock);
- if (!offdevs_inited) {
- err = rhashtable_init(&offdevs, &offdevs_params);
- if (err) {
- up_write(&bpf_devs_lock);
- return ERR_PTR(err);
- }
- offdevs_inited = true;
- }
- up_write(&bpf_devs_lock);
offdev = kzalloc(sizeof(*offdev), GFP_KERNEL);
if (!offdev)
@@ -710,3 +811,68 @@ void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev)
return offdev->priv;
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_priv);
+
+void bpf_dev_bound_netdev_unregister(struct net_device *dev)
+{
+ struct bpf_offload_netdev *ondev;
+
+ ASSERT_RTNL();
+
+ down_write(&bpf_devs_lock);
+ ondev = bpf_offload_find_netdev(dev);
+ if (ondev && !ondev->offdev)
+ __bpf_offload_dev_netdev_unregister(NULL, ondev->netdev);
+ up_write(&bpf_devs_lock);
+}
+
+int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log,
+ struct bpf_prog_aux *prog_aux)
+{
+ if (!bpf_prog_is_dev_bound(prog_aux)) {
+ bpf_log(log, "metadata kfuncs require device-bound program\n");
+ return -EINVAL;
+ }
+
+ if (bpf_prog_is_offloaded(prog_aux)) {
+ bpf_log(log, "metadata kfuncs can't be offloaded\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id)
+{
+ const struct xdp_metadata_ops *ops;
+ void *p = NULL;
+
+ /* We don't hold bpf_devs_lock while resolving several
+ * kfuncs and can race with the unregister_netdevice().
+ * We rely on bpf_dev_bound_match() check at attach
+ * to render this program unusable.
+ */
+ down_read(&bpf_devs_lock);
+ if (!prog->aux->offload)
+ goto out;
+
+ ops = prog->aux->offload->netdev->xdp_metadata_ops;
+ if (!ops)
+ goto out;
+
+#define XDP_METADATA_KFUNC(name, _, __, xmo) \
+ if (func_id == bpf_xdp_metadata_kfunc_id(name)) p = ops->xmo;
+ XDP_METADATA_KFUNC_xxx
+#undef XDP_METADATA_KFUNC
+
+out:
+ up_read(&bpf_devs_lock);
+
+ return p;
+}
+
+static int __init bpf_offload_init(void)
+{
+ return rhashtable_init(&offdevs, &offdevs_params);
+}
+
+core_initcall(bpf_offload_init);
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index 3d897de89061..632762b57299 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -14,11 +14,9 @@ int pcpu_freelist_init(struct pcpu_freelist *s)
for_each_possible_cpu(cpu) {
struct pcpu_freelist_head *head = per_cpu_ptr(s->freelist, cpu);
- raw_spin_lock_init(&head->lock);
+ raw_res_spin_lock_init(&head->lock);
head->first = NULL;
}
- raw_spin_lock_init(&s->extralist.lock);
- s->extralist.first = NULL;
return 0;
}
@@ -31,63 +29,42 @@ static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head,
struct pcpu_freelist_node *node)
{
node->next = head->first;
- head->first = node;
+ WRITE_ONCE(head->first, node);
}
-static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
+static inline bool ___pcpu_freelist_push(struct pcpu_freelist_head *head,
struct pcpu_freelist_node *node)
{
- raw_spin_lock(&head->lock);
- pcpu_freelist_push_node(head, node);
- raw_spin_unlock(&head->lock);
-}
-
-static inline bool pcpu_freelist_try_push_extra(struct pcpu_freelist *s,
- struct pcpu_freelist_node *node)
-{
- if (!raw_spin_trylock(&s->extralist.lock))
+ if (raw_res_spin_lock(&head->lock))
return false;
-
- pcpu_freelist_push_node(&s->extralist, node);
- raw_spin_unlock(&s->extralist.lock);
+ pcpu_freelist_push_node(head, node);
+ raw_res_spin_unlock(&head->lock);
return true;
}
-static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s,
- struct pcpu_freelist_node *node)
+void __pcpu_freelist_push(struct pcpu_freelist *s,
+ struct pcpu_freelist_node *node)
{
- int cpu, orig_cpu;
+ struct pcpu_freelist_head *head;
+ int cpu;
- orig_cpu = cpu = raw_smp_processor_id();
- while (1) {
- struct pcpu_freelist_head *head;
+ if (___pcpu_freelist_push(this_cpu_ptr(s->freelist), node))
+ return;
- head = per_cpu_ptr(s->freelist, cpu);
- if (raw_spin_trylock(&head->lock)) {
+ while (true) {
+ for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) {
+ if (cpu == raw_smp_processor_id())
+ continue;
+ head = per_cpu_ptr(s->freelist, cpu);
+ if (raw_res_spin_lock(&head->lock))
+ continue;
pcpu_freelist_push_node(head, node);
- raw_spin_unlock(&head->lock);
+ raw_res_spin_unlock(&head->lock);
return;
}
- cpu = cpumask_next(cpu, cpu_possible_mask);
- if (cpu >= nr_cpu_ids)
- cpu = 0;
-
- /* cannot lock any per cpu lock, try extralist */
- if (cpu == orig_cpu &&
- pcpu_freelist_try_push_extra(s, node))
- return;
}
}
-void __pcpu_freelist_push(struct pcpu_freelist *s,
- struct pcpu_freelist_node *node)
-{
- if (in_nmi())
- ___pcpu_freelist_push_nmi(s, node);
- else
- ___pcpu_freelist_push(this_cpu_ptr(s->freelist), node);
-}
-
void pcpu_freelist_push(struct pcpu_freelist *s,
struct pcpu_freelist_node *node)
{
@@ -102,98 +79,49 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
u32 nr_elems)
{
struct pcpu_freelist_head *head;
- int i, cpu, pcpu_entries;
+ unsigned int cpu, cpu_idx, i, j, n, m;
- pcpu_entries = nr_elems / num_possible_cpus() + 1;
- i = 0;
+ n = nr_elems / num_possible_cpus();
+ m = nr_elems % num_possible_cpus();
+ cpu_idx = 0;
for_each_possible_cpu(cpu) {
-again:
head = per_cpu_ptr(s->freelist, cpu);
- /* No locking required as this is not visible yet. */
- pcpu_freelist_push_node(head, buf);
- i++;
- buf += elem_size;
- if (i == nr_elems)
- break;
- if (i % pcpu_entries)
- goto again;
+ j = n + (cpu_idx < m ? 1 : 0);
+ for (i = 0; i < j; i++) {
+ /* No locking required as this is not visible yet. */
+ pcpu_freelist_push_node(head, buf);
+ buf += elem_size;
+ }
+ cpu_idx++;
}
}
static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
{
+ struct pcpu_freelist_node *node = NULL;
struct pcpu_freelist_head *head;
- struct pcpu_freelist_node *node;
- int orig_cpu, cpu;
+ int cpu;
- orig_cpu = cpu = raw_smp_processor_id();
- while (1) {
+ for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) {
head = per_cpu_ptr(s->freelist, cpu);
- raw_spin_lock(&head->lock);
+ if (!READ_ONCE(head->first))
+ continue;
+ if (raw_res_spin_lock(&head->lock))
+ continue;
node = head->first;
if (node) {
- head->first = node->next;
- raw_spin_unlock(&head->lock);
+ WRITE_ONCE(head->first, node->next);
+ raw_res_spin_unlock(&head->lock);
return node;
}
- raw_spin_unlock(&head->lock);
- cpu = cpumask_next(cpu, cpu_possible_mask);
- if (cpu >= nr_cpu_ids)
- cpu = 0;
- if (cpu == orig_cpu)
- break;
+ raw_res_spin_unlock(&head->lock);
}
-
- /* per cpu lists are all empty, try extralist */
- raw_spin_lock(&s->extralist.lock);
- node = s->extralist.first;
- if (node)
- s->extralist.first = node->next;
- raw_spin_unlock(&s->extralist.lock);
- return node;
-}
-
-static struct pcpu_freelist_node *
-___pcpu_freelist_pop_nmi(struct pcpu_freelist *s)
-{
- struct pcpu_freelist_head *head;
- struct pcpu_freelist_node *node;
- int orig_cpu, cpu;
-
- orig_cpu = cpu = raw_smp_processor_id();
- while (1) {
- head = per_cpu_ptr(s->freelist, cpu);
- if (raw_spin_trylock(&head->lock)) {
- node = head->first;
- if (node) {
- head->first = node->next;
- raw_spin_unlock(&head->lock);
- return node;
- }
- raw_spin_unlock(&head->lock);
- }
- cpu = cpumask_next(cpu, cpu_possible_mask);
- if (cpu >= nr_cpu_ids)
- cpu = 0;
- if (cpu == orig_cpu)
- break;
- }
-
- /* cannot pop from per cpu lists, try extralist */
- if (!raw_spin_trylock(&s->extralist.lock))
- return NULL;
- node = s->extralist.first;
- if (node)
- s->extralist.first = node->next;
- raw_spin_unlock(&s->extralist.lock);
return node;
}
struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
{
- if (in_nmi())
- return ___pcpu_freelist_pop_nmi(s);
return ___pcpu_freelist_pop(s);
}
diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h
index 3c76553cfe57..914798b74967 100644
--- a/kernel/bpf/percpu_freelist.h
+++ b/kernel/bpf/percpu_freelist.h
@@ -5,15 +5,15 @@
#define __PERCPU_FREELIST_H__
#include <linux/spinlock.h>
#include <linux/percpu.h>
+#include <asm/rqspinlock.h>
struct pcpu_freelist_head {
struct pcpu_freelist_node *first;
- raw_spinlock_t lock;
+ rqspinlock_t lock;
};
struct pcpu_freelist {
struct pcpu_freelist_head __percpu *freelist;
- struct pcpu_freelist_head extralist;
};
struct pcpu_freelist_node {
diff --git a/kernel/bpf/preload/Kconfig b/kernel/bpf/preload/Kconfig
index 26bced262473..aef7b0bc96d6 100644
--- a/kernel/bpf/preload/Kconfig
+++ b/kernel/bpf/preload/Kconfig
@@ -1,8 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
-config USERMODE_DRIVER
- bool
- default n
-
menuconfig BPF_PRELOAD
bool "Preload BPF file system with kernel specific program and map iterators"
depends on BPF
@@ -10,7 +6,6 @@ menuconfig BPF_PRELOAD
# The dependency on !COMPILE_TEST prevents it from being enabled
# in allmodconfig or allyesconfig configurations
depends on !COMPILE_TEST
- select USERMODE_DRIVER
help
This builds kernel module with several embedded BPF programs that are
pinned into BPF FS mount point as human readable files that are
@@ -18,10 +13,9 @@ menuconfig BPF_PRELOAD
if BPF_PRELOAD
config BPF_PRELOAD_UMD
- tristate "bpf_preload kernel module with user mode driver"
- depends on CC_CAN_LINK
- depends on m || CC_CAN_LINK_STATIC
+ tristate "bpf_preload kernel module"
default m
help
- This builds bpf_preload kernel module with embedded user mode driver.
+ This builds bpf_preload kernel module with embedded BPF programs for
+ introspection in bpffs.
endif
diff --git a/kernel/bpf/preload/Makefile b/kernel/bpf/preload/Makefile
index 1400ac58178e..20f89cc0a0a6 100644
--- a/kernel/bpf/preload/Makefile
+++ b/kernel/bpf/preload/Makefile
@@ -1,42 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
-LIBBPF_SRCS = $(srctree)/tools/lib/bpf/
-LIBBPF_OUT = $(abspath $(obj))/libbpf
-LIBBPF_A = $(LIBBPF_OUT)/libbpf.a
-LIBBPF_DESTDIR = $(LIBBPF_OUT)
-LIBBPF_INCLUDE = $(LIBBPF_DESTDIR)/include
-
-# Although not in use by libbpf's Makefile, set $(O) so that the "dummy" test
-# in tools/scripts/Makefile.include always succeeds when building the kernel
-# with $(O) pointing to a relative path, as in "make O=build bindeb-pkg".
-$(LIBBPF_A): | $(LIBBPF_OUT)
- $(Q)$(MAKE) -C $(LIBBPF_SRCS) O=$(LIBBPF_OUT)/ OUTPUT=$(LIBBPF_OUT)/ \
- DESTDIR=$(LIBBPF_DESTDIR) prefix= \
- $(LIBBPF_OUT)/libbpf.a install_headers
-
-libbpf_hdrs: $(LIBBPF_A)
-
-.PHONY: libbpf_hdrs
-
-$(LIBBPF_OUT):
- $(call msg,MKDIR,$@)
- $(Q)mkdir -p $@
-
-userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi \
- -I $(LIBBPF_INCLUDE) -Wno-unused-result
-
-userprogs := bpf_preload_umd
-
-clean-files := libbpf/
-
-$(obj)/iterators/iterators.o: | libbpf_hdrs
-
-bpf_preload_umd-objs := iterators/iterators.o
-bpf_preload_umd-userldlibs := $(LIBBPF_A) -lelf -lz
-
-$(obj)/bpf_preload_umd: $(LIBBPF_A)
-
-$(obj)/bpf_preload_umd_blob.o: $(obj)/bpf_preload_umd
+LIBBPF_INCLUDE = $(srctree)/tools/lib
obj-$(CONFIG_BPF_PRELOAD_UMD) += bpf_preload.o
-bpf_preload-objs += bpf_preload_kern.o bpf_preload_umd_blob.o
+CFLAGS_bpf_preload_kern.o += -I$(LIBBPF_INCLUDE)
+bpf_preload-objs += bpf_preload_kern.o
diff --git a/kernel/bpf/preload/bpf_preload.h b/kernel/bpf/preload/bpf_preload.h
index 2f9932276f2e..f065c91213a0 100644
--- a/kernel/bpf/preload/bpf_preload.h
+++ b/kernel/bpf/preload/bpf_preload.h
@@ -2,13 +2,13 @@
#ifndef _BPF_PRELOAD_H
#define _BPF_PRELOAD_H
-#include <linux/usermode_driver.h>
-#include "iterators/bpf_preload_common.h"
+struct bpf_preload_info {
+ char link_name[16];
+ struct bpf_link *link;
+};
struct bpf_preload_ops {
- struct umd_info info;
int (*preload)(struct bpf_preload_info *);
- int (*finish)(void);
struct module *owner;
};
extern struct bpf_preload_ops *bpf_preload_ops;
diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c
index 53736e52c1df..774e5a538811 100644
--- a/kernel/bpf/preload/bpf_preload_kern.c
+++ b/kernel/bpf/preload/bpf_preload_kern.c
@@ -2,101 +2,93 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/init.h>
#include <linux/module.h>
-#include <linux/pid.h>
-#include <linux/fs.h>
-#include <linux/sched/signal.h>
#include "bpf_preload.h"
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#include "iterators/iterators.lskel-little-endian.h"
+#else
+#include "iterators/iterators.lskel-big-endian.h"
+#endif
-extern char bpf_preload_umd_start;
-extern char bpf_preload_umd_end;
+static struct bpf_link *maps_link, *progs_link;
+static struct iterators_bpf *skel;
-static int preload(struct bpf_preload_info *obj);
-static int finish(void);
+static void free_links_and_skel(void)
+{
+ if (!IS_ERR_OR_NULL(maps_link))
+ bpf_link_put(maps_link);
+ if (!IS_ERR_OR_NULL(progs_link))
+ bpf_link_put(progs_link);
+ iterators_bpf__destroy(skel);
+}
+
+static int preload(struct bpf_preload_info *obj)
+{
+ strscpy(obj[0].link_name, "maps.debug", sizeof(obj[0].link_name));
+ obj[0].link = maps_link;
+ strscpy(obj[1].link_name, "progs.debug", sizeof(obj[1].link_name));
+ obj[1].link = progs_link;
+ return 0;
+}
-static struct bpf_preload_ops umd_ops = {
- .info.driver_name = "bpf_preload",
+static struct bpf_preload_ops ops = {
.preload = preload,
- .finish = finish,
.owner = THIS_MODULE,
};
-static int preload(struct bpf_preload_info *obj)
+static int load_skel(void)
{
- int magic = BPF_PRELOAD_START;
- loff_t pos = 0;
- int i, err;
- ssize_t n;
+ int err;
- err = fork_usermode_driver(&umd_ops.info);
+ skel = iterators_bpf__open();
+ if (!skel)
+ return -ENOMEM;
+ err = iterators_bpf__load(skel);
if (err)
- return err;
-
- /* send the start magic to let UMD proceed with loading BPF progs */
- n = kernel_write(umd_ops.info.pipe_to_umh,
- &magic, sizeof(magic), &pos);
- if (n != sizeof(magic))
- return -EPIPE;
-
- /* receive bpf_link IDs and names from UMD */
- pos = 0;
- for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
- n = kernel_read(umd_ops.info.pipe_from_umh,
- &obj[i], sizeof(*obj), &pos);
- if (n != sizeof(*obj))
- return -EPIPE;
+ goto out;
+ err = iterators_bpf__attach(skel);
+ if (err)
+ goto out;
+ maps_link = bpf_link_get_from_fd(skel->links.dump_bpf_map_fd);
+ if (IS_ERR(maps_link)) {
+ err = PTR_ERR(maps_link);
+ goto out;
}
- return 0;
-}
-
-static int finish(void)
-{
- int magic = BPF_PRELOAD_END;
- struct pid *tgid;
- loff_t pos = 0;
- ssize_t n;
-
- /* send the last magic to UMD. It will do a normal exit. */
- n = kernel_write(umd_ops.info.pipe_to_umh,
- &magic, sizeof(magic), &pos);
- if (n != sizeof(magic))
- return -EPIPE;
-
- tgid = umd_ops.info.tgid;
- if (tgid) {
- wait_event(tgid->wait_pidfd, thread_group_exited(tgid));
- umd_cleanup_helper(&umd_ops.info);
+ progs_link = bpf_link_get_from_fd(skel->links.dump_bpf_prog_fd);
+ if (IS_ERR(progs_link)) {
+ err = PTR_ERR(progs_link);
+ goto out;
}
+ /* Avoid taking over stdin/stdout/stderr of init process. Zeroing out
+ * makes skel_closenz() a no-op later in iterators_bpf__destroy().
+ */
+ close_fd(skel->links.dump_bpf_map_fd);
+ skel->links.dump_bpf_map_fd = 0;
+ close_fd(skel->links.dump_bpf_prog_fd);
+ skel->links.dump_bpf_prog_fd = 0;
return 0;
+out:
+ free_links_and_skel();
+ return err;
}
-static int __init load_umd(void)
+static int __init load(void)
{
int err;
- err = umd_load_blob(&umd_ops.info, &bpf_preload_umd_start,
- &bpf_preload_umd_end - &bpf_preload_umd_start);
+ err = load_skel();
if (err)
return err;
- bpf_preload_ops = &umd_ops;
+ bpf_preload_ops = &ops;
return err;
}
-static void __exit fini_umd(void)
+static void __exit fini(void)
{
- struct pid *tgid;
-
bpf_preload_ops = NULL;
-
- /* kill UMD in case it's still there due to earlier error */
- tgid = umd_ops.info.tgid;
- if (tgid) {
- kill_pid(tgid, SIGKILL, 1);
-
- wait_event(tgid->wait_pidfd, thread_group_exited(tgid));
- umd_cleanup_helper(&umd_ops.info);
- }
- umd_unload_blob(&umd_ops.info);
+ free_links_and_skel();
}
-late_initcall(load_umd);
-module_exit(fini_umd);
+late_initcall(load);
+module_exit(fini);
+MODULE_IMPORT_NS("BPF_INTERNAL");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Embedded BPF programs for introspection in bpffs");
diff --git a/kernel/bpf/preload/bpf_preload_umd_blob.S b/kernel/bpf/preload/bpf_preload_umd_blob.S
deleted file mode 100644
index f1f40223b5c3..000000000000
--- a/kernel/bpf/preload/bpf_preload_umd_blob.S
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
- .section .init.rodata, "a"
- .global bpf_preload_umd_start
-bpf_preload_umd_start:
- .incbin "kernel/bpf/preload/bpf_preload_umd"
- .global bpf_preload_umd_end
-bpf_preload_umd_end:
diff --git a/kernel/bpf/preload/iterators/Makefile b/kernel/bpf/preload/iterators/Makefile
index b8bd60511227..b83c2f5e9be1 100644
--- a/kernel/bpf/preload/iterators/Makefile
+++ b/kernel/bpf/preload/iterators/Makefile
@@ -9,7 +9,7 @@ LLVM_STRIP ?= llvm-strip
TOOLS_PATH := $(abspath ../../../../tools)
BPFTOOL_SRC := $(TOOLS_PATH)/bpf/bpftool
BPFTOOL_OUTPUT := $(abs_out)/bpftool
-DEFAULT_BPFTOOL := $(OUTPUT)/sbin/bpftool
+DEFAULT_BPFTOOL := $(BPFTOOL_OUTPUT)/bootstrap/bpftool
BPFTOOL ?= $(DEFAULT_BPFTOOL)
LIBBPF_SRC := $(TOOLS_PATH)/lib/bpf
@@ -35,20 +35,22 @@ endif
.PHONY: all clean
-all: iterators.skel.h
+all: iterators.lskel-little-endian.h
+
+big: iterators.lskel-big-endian.h
clean:
$(call msg,CLEAN)
$(Q)rm -rf $(OUTPUT) iterators
-iterators.skel.h: $(OUTPUT)/iterators.bpf.o | $(BPFTOOL)
+iterators.lskel-%.h: $(OUTPUT)/%/iterators.bpf.o | $(BPFTOOL)
$(call msg,GEN-SKEL,$@)
- $(Q)$(BPFTOOL) gen skeleton $< > $@
-
+ $(Q)$(BPFTOOL) gen skeleton -L $< > $@
-$(OUTPUT)/iterators.bpf.o: iterators.bpf.c $(BPFOBJ) | $(OUTPUT)
+$(OUTPUT)/%/iterators.bpf.o: iterators.bpf.c $(BPFOBJ) | $(OUTPUT)
$(call msg,BPF,$@)
- $(Q)$(CLANG) -g -O2 -target bpf $(INCLUDES) \
+ $(Q)mkdir -p $(@D)
+ $(Q)$(CLANG) -g -O2 --target=bpf -m$* $(INCLUDES) \
-c $(filter %.c,$^) -o $@ && \
$(LLVM_STRIP) -g $@
@@ -61,9 +63,5 @@ $(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(LIBBPF_OU
OUTPUT=$(abspath $(dir $@))/ prefix= \
DESTDIR=$(LIBBPF_DESTDIR) $(abspath $@) install_headers
-$(DEFAULT_BPFTOOL): $(BPFOBJ) | $(BPFTOOL_OUTPUT)
- $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOL_SRC) \
- OUTPUT=$(BPFTOOL_OUTPUT)/ \
- LIBBPF_OUTPUT=$(LIBBPF_OUTPUT)/ \
- LIBBPF_DESTDIR=$(LIBBPF_DESTDIR)/ \
- prefix= DESTDIR=$(abs_out)/ install-bin
+$(DEFAULT_BPFTOOL): | $(BPFTOOL_OUTPUT)
+ $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOL_SRC) OUTPUT=$(BPFTOOL_OUTPUT)/ bootstrap
diff --git a/kernel/bpf/preload/iterators/README b/kernel/bpf/preload/iterators/README
index 7fd6d39a9ad2..98e7c90ea012 100644
--- a/kernel/bpf/preload/iterators/README
+++ b/kernel/bpf/preload/iterators/README
@@ -1,4 +1,7 @@
WARNING:
-If you change "iterators.bpf.c" do "make -j" in this directory to rebuild "iterators.skel.h".
+If you change "iterators.bpf.c" do "make -j" in this directory to
+rebuild "iterators.lskel-little-endian.h". Then, on a big-endian
+machine, do "make -j big" in this directory to rebuild
+"iterators.lskel-big-endian.h". Commit both resulting headers.
Make sure to have clang 10 installed.
See Documentation/bpf/bpf_devel_QA.rst
diff --git a/kernel/bpf/preload/iterators/bpf_preload_common.h b/kernel/bpf/preload/iterators/bpf_preload_common.h
deleted file mode 100644
index 8464d1a48c05..000000000000
--- a/kernel/bpf/preload/iterators/bpf_preload_common.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BPF_PRELOAD_COMMON_H
-#define _BPF_PRELOAD_COMMON_H
-
-#define BPF_PRELOAD_START 0x5555
-#define BPF_PRELOAD_END 0xAAAA
-
-struct bpf_preload_info {
- char link_name[16];
- int link_id;
-};
-
-#endif
diff --git a/kernel/bpf/preload/iterators/iterators.bpf.c b/kernel/bpf/preload/iterators/iterators.bpf.c
index 03af863314ea..b78968b63fab 100644
--- a/kernel/bpf/preload/iterators/iterators.bpf.c
+++ b/kernel/bpf/preload/iterators/iterators.bpf.c
@@ -73,6 +73,8 @@ static const char *get_name(struct btf *btf, long btf_id, const char *fallback)
return str + name_off;
}
+__s64 bpf_map_sum_elem_count(struct bpf_map *map) __ksym;
+
SEC("iter/bpf_map")
int dump_bpf_map(struct bpf_iter__bpf_map *ctx)
{
@@ -84,9 +86,12 @@ int dump_bpf_map(struct bpf_iter__bpf_map *ctx)
return 0;
if (seq_num == 0)
- BPF_SEQ_PRINTF(seq, " id name max_entries\n");
+ BPF_SEQ_PRINTF(seq, " id name max_entries cur_entries\n");
+
+ BPF_SEQ_PRINTF(seq, "%4u %-16s %10d %10lld\n",
+ map->id, map->name, map->max_entries,
+ bpf_map_sum_elem_count(map));
- BPF_SEQ_PRINTF(seq, "%4u %-16s%6d\n", map->id, map->name, map->max_entries);
return 0;
}
diff --git a/kernel/bpf/preload/iterators/iterators.c b/kernel/bpf/preload/iterators/iterators.c
deleted file mode 100644
index 5d872a705470..000000000000
--- a/kernel/bpf/preload/iterators/iterators.c
+++ /dev/null
@@ -1,94 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (c) 2020 Facebook */
-#include <errno.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/resource.h>
-#include <bpf/libbpf.h>
-#include <bpf/bpf.h>
-#include <sys/mount.h>
-#include "iterators.skel.h"
-#include "bpf_preload_common.h"
-
-int to_kernel = -1;
-int from_kernel = 0;
-
-static int send_link_to_kernel(struct bpf_link *link, const char *link_name)
-{
- struct bpf_preload_info obj = {};
- struct bpf_link_info info = {};
- __u32 info_len = sizeof(info);
- int err;
-
- err = bpf_obj_get_info_by_fd(bpf_link__fd(link), &info, &info_len);
- if (err)
- return err;
- obj.link_id = info.id;
- if (strlen(link_name) >= sizeof(obj.link_name))
- return -E2BIG;
- strcpy(obj.link_name, link_name);
- if (write(to_kernel, &obj, sizeof(obj)) != sizeof(obj))
- return -EPIPE;
- return 0;
-}
-
-int main(int argc, char **argv)
-{
- struct rlimit rlim = { RLIM_INFINITY, RLIM_INFINITY };
- struct iterators_bpf *skel;
- int err, magic;
- int debug_fd;
-
- debug_fd = open("/dev/console", O_WRONLY | O_NOCTTY | O_CLOEXEC);
- if (debug_fd < 0)
- return 1;
- to_kernel = dup(1);
- close(1);
- dup(debug_fd);
- /* now stdin and stderr point to /dev/console */
-
- read(from_kernel, &magic, sizeof(magic));
- if (magic != BPF_PRELOAD_START) {
- printf("bad start magic %d\n", magic);
- return 1;
- }
- setrlimit(RLIMIT_MEMLOCK, &rlim);
- /* libbpf opens BPF object and loads it into the kernel */
- skel = iterators_bpf__open_and_load();
- if (!skel) {
- /* iterators.skel.h is little endian.
- * libbpf doesn't support automatic little->big conversion
- * of BPF bytecode yet.
- * The program load will fail in such case.
- */
- printf("Failed load could be due to wrong endianness\n");
- return 1;
- }
- err = iterators_bpf__attach(skel);
- if (err)
- goto cleanup;
-
- /* send two bpf_link IDs with names to the kernel */
- err = send_link_to_kernel(skel->links.dump_bpf_map, "maps.debug");
- if (err)
- goto cleanup;
- err = send_link_to_kernel(skel->links.dump_bpf_prog, "progs.debug");
- if (err)
- goto cleanup;
-
- /* The kernel will proceed with pinnging the links in bpffs.
- * UMD will wait on read from pipe.
- */
- read(from_kernel, &magic, sizeof(magic));
- if (magic != BPF_PRELOAD_END) {
- printf("bad final magic %d\n", magic);
- err = -EINVAL;
- }
-cleanup:
- iterators_bpf__destroy(skel);
-
- return err != 0;
-}
diff --git a/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h b/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h
new file mode 100644
index 000000000000..49b1d515a847
--- /dev/null
+++ b/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h
@@ -0,0 +1,437 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+/* THIS FILE IS AUTOGENERATED BY BPFTOOL! */
+#ifndef __ITERATORS_BPF_SKEL_H__
+#define __ITERATORS_BPF_SKEL_H__
+
+#include <bpf/skel_internal.h>
+
+struct iterators_bpf {
+ struct bpf_loader_ctx ctx;
+ struct {
+ struct bpf_map_desc rodata;
+ } maps;
+ struct {
+ struct bpf_prog_desc dump_bpf_map;
+ struct bpf_prog_desc dump_bpf_prog;
+ } progs;
+ struct {
+ int dump_bpf_map_fd;
+ int dump_bpf_prog_fd;
+ } links;
+};
+
+static inline int
+iterators_bpf__dump_bpf_map__attach(struct iterators_bpf *skel)
+{
+ int prog_fd = skel->progs.dump_bpf_map.prog_fd;
+ int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER);
+
+ if (fd > 0)
+ skel->links.dump_bpf_map_fd = fd;
+ return fd;
+}
+
+static inline int
+iterators_bpf__dump_bpf_prog__attach(struct iterators_bpf *skel)
+{
+ int prog_fd = skel->progs.dump_bpf_prog.prog_fd;
+ int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER);
+
+ if (fd > 0)
+ skel->links.dump_bpf_prog_fd = fd;
+ return fd;
+}
+
+static inline int
+iterators_bpf__attach(struct iterators_bpf *skel)
+{
+ int ret = 0;
+
+ ret = ret < 0 ? ret : iterators_bpf__dump_bpf_map__attach(skel);
+ ret = ret < 0 ? ret : iterators_bpf__dump_bpf_prog__attach(skel);
+ return ret < 0 ? ret : 0;
+}
+
+static inline void
+iterators_bpf__detach(struct iterators_bpf *skel)
+{
+ skel_closenz(skel->links.dump_bpf_map_fd);
+ skel_closenz(skel->links.dump_bpf_prog_fd);
+}
+static void
+iterators_bpf__destroy(struct iterators_bpf *skel)
+{
+ if (!skel)
+ return;
+ iterators_bpf__detach(skel);
+ skel_closenz(skel->progs.dump_bpf_map.prog_fd);
+ skel_closenz(skel->progs.dump_bpf_prog.prog_fd);
+ skel_closenz(skel->maps.rodata.map_fd);
+ skel_free(skel);
+}
+static inline struct iterators_bpf *
+iterators_bpf__open(void)
+{
+ struct iterators_bpf *skel;
+
+ skel = skel_alloc(sizeof(*skel));
+ if (!skel)
+ goto cleanup;
+ skel->ctx.sz = (void *)&skel->links - (void *)skel;
+ return skel;
+cleanup:
+ iterators_bpf__destroy(skel);
+ return NULL;
+}
+
+static inline int
+iterators_bpf__load(struct iterators_bpf *skel)
+{
+ struct bpf_load_and_run_opts opts = {};
+ int err;
+ static const char opts_data[] __attribute__((__aligned__(8))) = "\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xeb\x9f\x01\0\
+\0\0\0\x18\0\0\0\0\0\0\x04\x80\0\0\x04\x80\0\0\x05\x44\0\0\0\0\x02\0\0\0\0\0\0\
+\x02\0\0\0\x01\x04\0\0\x02\0\0\0\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\0\0\
+\0\x04\0\0\0\x40\0\0\0\0\x02\0\0\0\0\0\0\x08\0\0\0\0\x02\0\0\0\0\0\0\x0d\0\0\0\
+\0\x0d\0\0\x01\0\0\0\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\x01\0\0\0\0\0\0\x04\x01\
+\0\0\x20\0\0\0\x24\x0c\0\0\x01\0\0\0\x05\0\0\0\xc3\x04\0\0\x03\0\0\0\x18\0\0\0\
+\xd1\0\0\0\x09\0\0\0\0\0\0\0\xd5\0\0\0\x0b\0\0\0\x40\0\0\0\xe0\0\0\0\x0b\0\0\0\
+\x80\0\0\0\0\x02\0\0\0\0\0\0\x0a\0\0\0\xe8\x07\0\0\0\0\0\0\0\0\0\0\xf1\x08\0\0\
+\0\0\0\0\x0c\0\0\0\xf7\x01\0\0\0\0\0\0\x08\0\0\0\x40\0\0\x01\xc1\x04\0\0\x03\0\
+\0\0\x18\0\0\x01\xc9\0\0\0\x0e\0\0\0\0\0\0\x01\xcc\0\0\0\x11\0\0\0\x20\0\0\x01\
+\xd1\0\0\0\x0e\0\0\0\xa0\0\0\x01\xdd\x08\0\0\0\0\0\0\x0f\0\0\x01\xe3\x01\0\0\0\
+\0\0\0\x04\0\0\0\x20\0\0\x01\xf0\x01\0\0\0\0\0\0\x01\x01\0\0\x08\0\0\0\0\x03\0\
+\0\0\0\0\0\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\x01\xf5\x01\0\0\0\0\0\0\x04\0\0\
+\0\x20\0\0\0\0\x0d\0\0\x01\0\0\0\x14\0\0\x05\x39\0\0\0\x04\0\0\x02\x3e\x08\0\0\
+\0\0\0\0\x15\0\0\x02\x44\x01\0\0\0\0\0\0\x08\x01\0\0\x40\0\0\x02\x4e\x0c\0\0\
+\x01\0\0\0\x13\0\0\0\0\x02\0\0\0\0\0\0\x18\0\0\x02\x65\x04\0\0\x02\0\0\0\x10\0\
+\0\0\x13\0\0\0\x03\0\0\0\0\0\0\x02\x78\0\0\0\x19\0\0\0\x40\0\0\0\0\x02\0\0\0\0\
+\0\0\x1c\0\0\0\0\x0d\0\0\x01\0\0\0\x06\0\0\0\x1c\0\0\0\x17\0\0\x02\x7d\x0c\0\0\
+\x01\0\0\0\x1a\0\0\x02\xc9\x04\0\0\x01\0\0\0\x08\0\0\x02\xd2\0\0\0\x1d\0\0\0\0\
+\0\0\0\0\x02\0\0\0\0\0\0\x1e\0\0\x03\x23\x04\0\0\x06\0\0\0\x38\0\0\x01\xc9\0\0\
+\0\x0e\0\0\0\0\0\0\x01\xcc\0\0\0\x11\0\0\0\x20\0\0\x03\x30\0\0\0\x1f\0\0\0\xc0\
+\0\0\x03\x41\0\0\0\x19\0\0\x01\0\0\0\x03\x4a\0\0\0\x21\0\0\x01\x40\0\0\x03\x54\
+\0\0\0\x22\0\0\x01\x80\0\0\0\0\x02\0\0\0\0\0\0\x20\0\0\0\0\x0a\0\0\0\0\0\0\x10\
+\0\0\0\0\x02\0\0\0\0\0\0\x23\0\0\0\0\x02\0\0\0\0\0\0\x24\0\0\x03\x9e\x04\0\0\
+\x02\0\0\0\x08\0\0\x03\xac\0\0\0\x0e\0\0\0\0\0\0\x03\xb5\0\0\0\x0e\0\0\0\x20\0\
+\0\x03\x54\x04\0\0\x03\0\0\0\x18\0\0\x03\xbf\0\0\0\x1f\0\0\0\0\0\0\x03\xc7\0\0\
+\0\x25\0\0\0\x40\0\0\x03\xcd\0\0\0\x27\0\0\0\x80\0\0\0\0\x02\0\0\0\0\0\0\x26\0\
+\0\0\0\x02\0\0\0\0\0\0\x28\0\0\x03\xd1\x04\0\0\x01\0\0\0\x04\0\0\x03\xdc\0\0\0\
+\x0e\0\0\0\0\0\0\x04\x45\x04\0\0\x01\0\0\0\x04\0\0\x04\x4e\0\0\0\x0e\0\0\0\0\0\
+\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x20\0\0\0\x12\0\0\0\x30\0\0\x04\xc4\x0e\0\0\0\0\
+\0\0\x29\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x20\0\0\0\x12\0\0\0\x1a\0\0\
+\x04\xd8\x0e\0\0\0\0\0\0\x2b\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x20\0\0\0\
+\x12\0\0\0\x20\0\0\x04\xee\x0e\0\0\0\0\0\0\x2d\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\
+\0\0\0\0\x20\0\0\0\x12\0\0\0\x11\0\0\x05\x03\x0e\0\0\0\0\0\0\x2f\0\0\0\0\0\0\0\
+\0\x03\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\x05\x1a\x0e\0\0\0\0\0\0\
+\x31\0\0\0\x01\0\0\x05\x22\x0f\0\0\x01\0\0\0\x04\0\0\0\x36\0\0\0\0\0\0\0\x04\0\
+\0\x05\x29\x0f\0\0\x04\0\0\0\x7b\0\0\0\x2a\0\0\0\0\0\0\0\x30\0\0\0\x2c\0\0\0\
+\x30\0\0\0\x1a\0\0\0\x2e\0\0\0\x4a\0\0\0\x20\0\0\0\x30\0\0\0\x6a\0\0\0\x11\0\0\
+\x05\x31\x0f\0\0\x01\0\0\0\x04\0\0\0\x32\0\0\0\0\0\0\0\x04\0\0\x05\x39\x0e\0\0\
+\0\0\0\0\x06\0\0\0\x01\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\
+\x5f\x6d\x61\x70\0\x6d\x65\x74\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\
+\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\
+\x66\x5f\x6d\x61\x70\0\x30\x3a\x30\0\x2f\x68\x6f\x6d\x65\x32\x2f\x69\x69\x69\
+\x2f\x6c\x69\x6e\x75\x78\x2d\x6b\x65\x72\x6e\x65\x6c\x2d\x74\x6f\x6f\x6c\x63\
+\x68\x61\x69\x6e\x2f\x73\x72\x63\x2f\x6c\x69\x6e\x75\x78\x2f\x6b\x65\x72\x6e\
+\x65\x6c\x2f\x62\x70\x66\x2f\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\
+\x61\x74\x6f\x72\x73\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\
+\x2e\x63\0\x09\x73\x74\x72\x75\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\
+\x2a\x73\x65\x71\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\
+\x65\x71\x3b\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\
+\x71\0\x73\x65\x73\x73\x69\x6f\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\
+\x73\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\x5f\x75\x36\x34\0\x75\x6e\x73\x69\x67\
+\x6e\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\0\x30\x3a\x31\0\x09\x73\
+\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\
+\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\
+\x70\x29\0\x30\x3a\x32\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\
+\x6d\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\
+\x6e\x75\x6d\x3b\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\
+\x20\x30\x29\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\
+\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\
+\x65\x73\x20\x20\x63\x75\x72\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\x6e\x22\x29\
+\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\x6d\x61\x78\
+\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\x73\x69\x67\
+\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\x52\x41\x59\
+\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\x5f\x53\x45\
+\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\
+\x25\x2d\x31\x36\x73\x20\x20\x25\x31\x30\x64\x20\x20\x20\x25\x31\x30\x6c\x6c\
+\x64\x5c\x6e\x22\x2c\0\x7d\0\x5f\x5f\x73\x36\x34\0\x6c\x6f\x6e\x67\x20\x6c\x6f\
+\x6e\x67\0\x62\x70\x66\x5f\x6d\x61\x70\x5f\x73\x75\x6d\x5f\x65\x6c\x65\x6d\x5f\
+\x63\x6f\x75\x6e\x74\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\
+\x70\x72\x6f\x67\0\x70\x72\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\
+\x72\x6f\x67\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\
+\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\
+\x67\x20\x3d\x20\x63\x74\x78\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\
+\x21\x70\x72\x6f\x67\x29\0\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\
+\x61\x75\x78\x20\x3d\x20\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\
+\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\
+\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\
+\x66\x5f\x70\x72\x6f\x67\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\
+\x6e\x63\x5f\x6e\x61\x6d\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\
+\x63\x5f\x69\x6e\x66\x6f\0\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\
+\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\
+\x31\x36\x73\x20\x25\x73\x20\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\
+\x69\x64\x2c\0\x30\x3a\x34\0\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\
+\x29\0\x62\x70\x66\x5f\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\
+\x5f\x6f\x66\x66\0\x74\x79\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\
+\x73\0\x74\x79\x70\x65\x73\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\
+\x72\0\x73\x74\x72\x5f\x6c\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\
+\x74\x66\x2d\x3e\x74\x79\x70\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\
+\x65\x5f\x72\x65\x61\x64\x5f\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\
+\x69\x7a\x65\x6f\x66\x28\x74\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\
+\x74\x66\x5f\x69\x64\x29\x3b\0\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\
+\x73\x74\x72\x69\x6e\x67\x73\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\
+\x6d\x65\x5f\x6f\x66\x66\0\x09\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\
+\x50\x46\x5f\x43\x4f\x52\x45\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\
+\x65\x5f\x6f\x66\x66\x29\x3b\0\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\
+\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\
+\x2e\x73\x74\x72\x5f\x6c\x65\x6e\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\
+\x72\x20\x2b\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\
+\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\
+\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\
+\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\
+\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\
+\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\x4e\x53\x45\0\x2e\x6b\x73\x79\x6d\x73\0\x2e\
+\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\x64\x75\x6d\x6d\x79\
+\x5f\x6b\x73\x79\x6d\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x09\xdc\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\0\x04\0\0\0\x7b\0\0\0\x01\0\0\0\x80\0\0\0\0\
+\0\0\0\0\x69\x74\x65\x72\x61\x74\x6f\x72\x2e\x72\x6f\x64\x61\x74\x61\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\x34\0\0\0\0\0\0\0\0\0\0\0\0\x20\x20\x69\x64\x20\x6e\x61\
+\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\
+\x65\x6e\x74\x72\x69\x65\x73\x20\x20\x63\x75\x72\x5f\x65\x6e\x74\x72\x69\x65\
+\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x20\x25\x31\x30\x64\x20\x20\
+\x20\x25\x31\x30\x6c\x6c\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\
+\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x21\0\0\0\0\0\0\x79\x62\0\0\0\
+\0\0\0\x79\x71\0\x08\0\0\0\0\x15\x70\0\x1d\0\0\0\0\x79\x12\0\x10\0\0\0\0\x55\
+\x10\0\x08\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\0\0\xff\xff\xff\xe0\xbf\x16\0\0\
+\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xb4\x30\0\0\0\0\0\x30\xb4\x50\0\0\
+\0\0\0\0\x85\0\0\0\0\0\0\x7e\x61\x17\0\0\0\0\0\0\x7b\xa1\xff\xe0\0\0\0\0\xb7\
+\x10\0\0\0\0\0\x04\xbf\x27\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\0\x7b\xa2\xff\xe8\0\0\
+\0\0\x61\x17\0\x14\0\0\0\0\x7b\xa1\xff\xf0\0\0\0\0\xbf\x17\0\0\0\0\0\0\x85\x02\
+\0\0\0\0\0\0\x7b\xa0\xff\xf8\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\0\0\xff\xff\
+\xff\xe0\xbf\x16\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\x30\xb4\x30\0\0\
+\0\0\0\x1a\xb4\x50\0\0\0\0\0\x20\x85\0\0\0\0\0\0\x7e\xb4\0\0\0\0\0\0\0\x95\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x9b\0\x01\x44\x1e\0\0\0\
+\x01\0\0\0\x42\0\0\0\x9b\0\x01\x44\x24\0\0\0\x02\0\0\0\x42\0\0\x01\x0e\0\x01\
+\x4c\x1d\0\0\0\x03\0\0\0\x42\0\0\x01\x2f\0\x01\x54\x06\0\0\0\x04\0\0\0\x42\0\0\
+\x01\x3e\0\x01\x48\x1d\0\0\0\x05\0\0\0\x42\0\0\x01\x63\0\x01\x60\x0e\0\0\0\x08\
+\0\0\0\x42\0\0\x01\x76\0\x01\x64\x03\0\0\0\x0e\0\0\0\x42\0\0\x02\x09\0\x01\x6c\
+\x02\0\0\0\x21\0\0\0\x42\0\0\x02\x3c\0\x01\x80\x01\0\0\0\0\0\0\0\x02\0\0\0\x3e\
+\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x02\0\0\x01\x0a\
+\0\0\0\0\0\0\0\x20\0\0\0\x08\0\0\x01\x3a\0\0\0\0\0\0\0\x70\0\0\0\x0d\0\0\0\x3e\
+\0\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\x01\x0a\0\0\0\0\0\0\0\xa0\0\0\0\x0d\0\0\x01\
+\x3a\0\0\0\0\0\0\0\x1a\0\0\0\x23\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\
+\x70\0\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\
+\0\x10\0\0\0\0\0\0\0\0\0\0\0\x09\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\
+\x66\x5f\x6d\x61\x70\0\0\0\0\0\0\0\0\x62\x70\x66\x5f\x6d\x61\x70\x5f\x73\x75\
+\x6d\x5f\x65\x6c\x65\x6d\x5f\x63\x6f\x75\x6e\x74\0\0\x47\x50\x4c\0\0\0\0\0\x79\
+\x21\0\0\0\0\0\0\x79\x62\0\0\0\0\0\0\x79\x11\0\x08\0\0\0\0\x15\x10\0\x3b\0\0\0\
+\0\x79\x71\0\0\0\0\0\0\x79\x12\0\x10\0\0\0\0\x55\x10\0\x08\0\0\0\0\xbf\x4a\0\0\
+\0\0\0\0\x07\x40\0\0\xff\xff\xff\xd0\xbf\x16\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\x4a\xb4\x30\0\0\0\0\0\x20\xb4\x50\0\0\0\0\0\0\x85\0\0\0\0\0\0\x7e\
+\x7b\xa6\xff\xc8\0\0\0\0\x61\x17\0\0\0\0\0\0\x7b\xa1\xff\xd0\0\0\0\0\xb7\x30\0\
+\0\0\0\0\x04\xbf\x97\0\0\0\0\0\0\x0f\x93\0\0\0\0\0\0\x79\x17\0\x28\0\0\0\0\x79\
+\x87\0\x30\0\0\0\0\x15\x80\0\x18\0\0\0\0\xb7\x20\0\0\0\0\0\0\x0f\x12\0\0\0\0\0\
+\0\x61\x11\0\x04\0\0\0\0\x79\x38\0\x08\0\0\0\0\x67\x10\0\0\0\0\0\x03\x0f\x31\0\
+\0\0\0\0\0\x79\x68\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\xf8\
+\xb4\x20\0\0\0\0\0\x08\x85\0\0\0\0\0\0\x71\xb7\x10\0\0\0\0\0\0\x79\x3a\xff\xf8\
+\0\0\0\0\x0f\x31\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\xf4\
+\xb4\x20\0\0\0\0\0\x04\x85\0\0\0\0\0\0\x71\xb7\x30\0\0\0\0\0\x04\x61\x1a\xff\
+\xf4\0\0\0\0\x61\x28\0\x10\0\0\0\0\x3e\x12\0\x02\0\0\0\0\x0f\x61\0\0\0\0\0\0\
+\xbf\x96\0\0\0\0\0\0\x7b\xa9\xff\xd8\0\0\0\0\x79\x17\0\x18\0\0\0\0\x7b\xa1\xff\
+\xe0\0\0\0\0\x79\x17\0\x20\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x13\0\0\0\0\0\0\x7b\
+\xa1\xff\xe8\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\0\0\xff\xff\xff\xd0\x79\x1a\
+\xff\xc8\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\x6a\xb4\x30\0\0\0\0\0\x11\
+\xb4\x50\0\0\0\0\0\x20\x85\0\0\0\0\0\0\x7e\xb4\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\x1b\0\0\0\0\0\0\0\x42\0\0\0\x9b\0\x01\x94\x1e\0\0\0\x01\0\0\0\
+\x42\0\0\0\x9b\0\x01\x94\x24\0\0\0\x02\0\0\0\x42\0\0\x02\x99\0\x01\x9c\x1f\0\0\
+\0\x03\0\0\0\x42\0\0\x02\xbd\0\x01\xa8\x06\0\0\0\x04\0\0\0\x42\0\0\x02\xd6\0\
+\x01\xb4\x0e\0\0\0\x05\0\0\0\x42\0\0\x01\x3e\0\x01\x98\x1d\0\0\0\x06\0\0\0\x42\
+\0\0\x01\x63\0\x01\xb8\x0e\0\0\0\x09\0\0\0\x42\0\0\x02\xe8\0\x01\xbc\x03\0\0\0\
+\x10\0\0\0\x42\0\0\x03\x58\0\x01\xc4\x02\0\0\0\x17\0\0\0\x42\0\0\x03\x93\0\x01\
+\x04\x06\0\0\0\x1a\0\0\0\x42\0\0\x03\x58\0\x01\xc4\x02\0\0\0\x1b\0\0\0\x42\0\0\
+\x03\xe4\0\x01\x10\x0f\0\0\0\x1c\0\0\0\x42\0\0\x03\xf9\0\x01\x14\x2d\0\0\0\x1e\
+\0\0\0\x42\0\0\x04\x30\0\x01\x0c\x0d\0\0\0\x21\0\0\0\x42\0\0\x03\xf9\0\x01\x14\
+\x02\0\0\0\x24\0\0\0\x42\0\0\x04\x57\0\x01\x18\x0d\0\0\0\x2b\0\0\0\x42\0\0\x04\
+\x57\0\x01\x18\x0d\0\0\0\x2c\0\0\0\x42\0\0\x04\x85\0\x01\x1c\x1b\0\0\0\x2d\0\0\
+\0\x42\0\0\x04\x85\0\x01\x1c\x0f\0\0\0\x2e\0\0\0\x42\0\0\x04\xa8\0\x01\x24\x0d\
+\0\0\0\x30\0\0\0\x42\0\0\x03\x58\0\x01\xc4\x02\0\0\0\x3f\0\0\0\x42\0\0\x02\x3c\
+\0\x01\xd4\x01\0\0\0\0\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\
+\x3e\0\0\0\0\0\0\0\x10\0\0\0\x18\0\0\x01\x0a\0\0\0\0\0\0\0\x20\0\0\0\x1c\0\0\0\
+\x3e\0\0\0\0\0\0\0\x28\0\0\0\x08\0\0\x01\x3a\0\0\0\0\0\0\0\x80\0\0\0\x1e\0\0\0\
+\x3e\0\0\0\0\0\0\0\x90\0\0\0\x1e\0\0\x01\x0a\0\0\0\0\0\0\0\xa8\0\0\0\x1e\0\0\
+\x03\x8b\0\0\0\0\0\0\0\xb0\0\0\0\x1e\0\0\x03\x8f\0\0\0\0\0\0\0\xc0\0\0\0\x23\0\
+\0\x03\xbd\0\0\0\0\0\0\0\xd8\0\0\0\x24\0\0\x01\x0a\0\0\0\0\0\0\0\xf0\0\0\0\x24\
+\0\0\0\x3e\0\0\0\0\0\0\x01\x18\0\0\0\x28\0\0\0\x3e\0\0\0\0\0\0\x01\x50\0\0\0\
+\x1e\0\0\x01\x0a\0\0\0\0\0\0\x01\x60\0\0\0\x24\0\0\x04\x7f\0\0\0\0\0\0\x01\x88\
+\0\0\0\x1e\0\0\x01\x3a\0\0\0\0\0\0\x01\x98\0\0\0\x1e\0\0\x04\xc0\0\0\0\0\0\0\
+\x01\xa0\0\0\0\x1c\0\0\0\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x41\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\
+\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\
+\0\0\0\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x16\0\0\0\x01\0\0\0\0\0\
+\0\0\x12\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\x62\x70\x66\x5f\x69\
+\x74\x65\x72\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0";
+ static const char opts_insn[] __attribute__((__aligned__(8))) = "\
+\xbf\x61\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\x78\xb7\x20\0\
+\0\0\0\0\x88\xb7\x30\0\0\0\0\0\0\x85\0\0\0\0\0\0\x71\x05\0\0\x14\0\0\0\0\x61\
+\x1a\xff\x78\0\0\0\0\xd5\x10\0\x01\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x1a\xff\x7c\
+\0\0\0\0\xd5\x10\0\x01\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x1a\xff\x80\0\0\0\0\xd5\
+\x10\0\x01\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x1a\xff\x84\0\0\0\0\xd5\x10\0\x01\0\
+\0\0\0\x85\0\0\0\0\0\0\xa8\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\
+\0\0\xd5\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\xbf\x07\0\0\
+\0\0\0\0\x95\0\0\0\0\0\0\0\x61\x06\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\
+\0\x0e\xf8\x63\x10\0\0\0\0\0\0\x61\x06\0\x0c\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\
+\0\0\0\x0e\xf4\x63\x10\0\0\0\0\0\0\x79\x06\0\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\
+\0\0\0\0\0\x0e\xe8\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x05\0\
+\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0e\xe0\x7b\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\
+\x12\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0e\xe0\xb7\x30\0\0\0\0\0\x1c\x85\0\0\0\0\
+\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\xd4\0\0\0\0\x63\xa7\xff\x78\0\0\0\0\
+\x61\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x30\x63\x10\0\0\0\
+\0\0\0\x61\x06\0\x1c\0\0\0\0\x15\0\0\x03\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\
+\0\x0f\x0c\x63\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\
+\0\0\x0f\0\xb7\x30\0\0\0\0\0\x48\x85\0\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\
+\x70\xff\xc3\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x63\x17\0\0\0\0\0\0\
+\x79\x36\0\x20\0\0\0\0\x15\x30\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\
+\x0f\x48\xb7\x20\0\0\0\0\0\x7b\x61\x06\0\x04\0\0\0\0\x45\0\0\x02\0\0\0\x01\x85\
+\0\0\0\0\0\0\x94\x05\0\0\x01\0\0\0\0\x85\0\0\0\0\0\0\x71\x18\x26\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\x61\x02\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xd0\x63\
+\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xc8\x18\x16\0\0\0\0\0\0\0\
+\0\0\0\0\0\x0f\xd8\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x48\
+\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xe0\x7b\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\
+\x02\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xd0\xb7\x30\0\0\0\0\0\x20\x85\0\0\0\0\
+\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\x9f\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\x61\x02\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xf0\x63\x10\
+\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\x16\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xf0\xb7\
+\x30\0\0\0\0\0\x04\x85\0\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\x92\0\0\
+\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xf8\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\
+\x12\x30\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\x18\x16\0\0\
+\0\0\0\0\0\0\0\0\0\0\x12\x28\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\
+\0\x11\x18\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x70\x7b\x10\0\0\0\0\0\0\x18\x06\
+\0\0\0\0\0\0\0\0\0\0\0\0\x11\x20\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x80\x7b\
+\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x11\xb0\x18\x16\0\0\0\0\0\0\0\
+\0\0\0\0\0\x12\xa0\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x18\
+\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x98\x7b\x10\0\0\0\0\0\0\x61\x06\0\x08\0\0\0\0\
+\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x38\x63\x10\0\0\0\0\0\0\x61\x06\0\x0c\0\0\
+\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x3c\x63\x10\0\0\0\0\0\0\x79\x06\0\x10\
+\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x40\x7b\x10\0\0\0\0\0\0\x61\x0a\
+\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x68\x63\x10\0\0\0\0\0\0\
+\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\xb0\xb7\x20\0\0\0\0\0\x11\xb7\x30\0\0\0\0\
+\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\
+\x5c\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x12\x20\x63\x07\0\x6c\0\0\0\0\x77\
+\x70\0\0\0\0\0\x20\x63\x07\0\x70\0\0\0\0\x18\x86\0\0\0\0\0\0\0\0\0\0\0\0\x10\
+\xb8\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\xc8\xb7\x20\0\0\0\0\0\x17\xb7\x30\0\0\
+\0\0\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\0\0\0\0\0\0\xc5\x70\
+\xff\x4d\0\0\0\0\x75\x70\0\x03\0\0\0\0\x62\x80\0\x04\0\0\0\0\x6a\x80\0\x02\0\0\
+\0\0\x05\0\0\x0a\0\0\0\0\x63\x87\0\x04\0\0\0\0\xbf\x97\0\0\0\0\0\0\x77\x90\0\0\
+\0\0\0\x20\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\x63\x09\0\0\0\0\0\0\x55\x90\0\
+\x02\0\0\0\0\x6a\x80\0\x02\0\0\0\0\x05\0\0\x01\0\0\0\0\x6a\x80\0\x02\0\0\0\x40\
+\xb7\x10\0\0\0\0\0\x05\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x12\x20\xb7\x30\0\0\0\0\
+\0\x8c\x85\0\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\
+\x01\0\x61\x10\0\0\0\0\0\0\xd5\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\
+\0\0\0\xa8\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x12\x90\x61\x10\0\0\0\0\0\0\xd5\x10\
+\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\xc5\x70\xff\x2c\0\0\0\0\
+\x63\xa7\xff\x80\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x12\xe0\x18\x16\0\0\0\
+\0\0\0\0\0\0\0\0\0\x17\x88\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\
+\x12\xe8\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x80\x7b\x10\0\0\0\0\0\0\x18\x06\0\
+\0\0\0\0\0\0\0\0\0\0\0\x14\xf0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\xc8\x7b\x10\
+\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x14\xf8\x18\x16\0\0\0\0\0\0\0\0\0\
+\0\0\0\x17\xd8\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x16\x58\x18\
+\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\xf8\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\xf0\x7b\x10\0\0\0\0\0\0\x61\
+\x06\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x90\x63\x10\0\0\0\0\0\0\
+\x61\x06\0\x0c\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x94\x63\x10\0\0\0\0\
+\0\0\x79\x06\0\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x98\x7b\x10\0\0\
+\0\0\0\0\x61\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\xc0\x63\
+\x10\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x18\x08\xb7\x20\0\0\0\0\0\x12\
+\xb7\x30\0\0\0\0\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\0\0\0\0\
+\0\0\xc5\x70\xfe\xf5\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x17\x78\x63\x07\0\
+\x6c\0\0\0\0\x77\x70\0\0\0\0\0\x20\x63\x07\0\x70\0\0\0\0\xb7\x10\0\0\0\0\0\x05\
+\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x17\x78\xb7\x30\0\0\0\0\0\x8c\x85\0\0\0\0\0\0\
+\xa6\xbf\x70\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x17\xe8\x61\x10\0\0\0\
+\0\0\0\xd5\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\xc5\x70\
+\xfe\xe3\0\0\0\0\x63\xa7\xff\x84\0\0\0\0\x61\x1a\xff\x78\0\0\0\0\xd5\x10\0\x02\
+\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x0a\xff\x80\0\0\0\0\x63\
+\x60\0\x28\0\0\0\0\x61\x0a\xff\x84\0\0\0\0\x63\x60\0\x2c\0\0\0\0\x18\x16\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\x61\x01\0\0\0\0\0\0\x63\x60\0\x18\0\0\0\0\xb7\0\0\0\0\0\
+\0\0\x95\0\0\0\0\0\0\0";
+
+ opts.ctx = (struct bpf_loader_ctx *)skel;
+ opts.data_sz = sizeof(opts_data) - 1;
+ opts.data = (void *)opts_data;
+ opts.insns_sz = sizeof(opts_insn) - 1;
+ opts.insns = (void *)opts_insn;
+
+ err = bpf_load_and_run(&opts);
+ if (err < 0)
+ return err;
+ return 0;
+}
+
+static inline struct iterators_bpf *
+iterators_bpf__open_and_load(void)
+{
+ struct iterators_bpf *skel;
+
+ skel = iterators_bpf__open();
+ if (!skel)
+ return NULL;
+ if (iterators_bpf__load(skel)) {
+ iterators_bpf__destroy(skel);
+ return NULL;
+ }
+ return skel;
+}
+
+__attribute__((unused)) static void
+iterators_bpf__assert(struct iterators_bpf *s __attribute__((unused)))
+{
+#ifdef __cplusplus
+#define _Static_assert static_assert
+#endif
+#ifdef __cplusplus
+#undef _Static_assert
+#endif
+}
+
+#endif /* __ITERATORS_BPF_SKEL_H__ */
diff --git a/kernel/bpf/preload/iterators/iterators.lskel-little-endian.h b/kernel/bpf/preload/iterators/iterators.lskel-little-endian.h
new file mode 100644
index 000000000000..5b98ab02025e
--- /dev/null
+++ b/kernel/bpf/preload/iterators/iterators.lskel-little-endian.h
@@ -0,0 +1,435 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+/* THIS FILE IS AUTOGENERATED BY BPFTOOL! */
+#ifndef __ITERATORS_BPF_SKEL_H__
+#define __ITERATORS_BPF_SKEL_H__
+
+#include <bpf/skel_internal.h>
+
+struct iterators_bpf {
+ struct bpf_loader_ctx ctx;
+ struct {
+ struct bpf_map_desc rodata;
+ } maps;
+ struct {
+ struct bpf_prog_desc dump_bpf_map;
+ struct bpf_prog_desc dump_bpf_prog;
+ } progs;
+ struct {
+ int dump_bpf_map_fd;
+ int dump_bpf_prog_fd;
+ } links;
+};
+
+static inline int
+iterators_bpf__dump_bpf_map__attach(struct iterators_bpf *skel)
+{
+ int prog_fd = skel->progs.dump_bpf_map.prog_fd;
+ int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER);
+
+ if (fd > 0)
+ skel->links.dump_bpf_map_fd = fd;
+ return fd;
+}
+
+static inline int
+iterators_bpf__dump_bpf_prog__attach(struct iterators_bpf *skel)
+{
+ int prog_fd = skel->progs.dump_bpf_prog.prog_fd;
+ int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER);
+
+ if (fd > 0)
+ skel->links.dump_bpf_prog_fd = fd;
+ return fd;
+}
+
+static inline int
+iterators_bpf__attach(struct iterators_bpf *skel)
+{
+ int ret = 0;
+
+ ret = ret < 0 ? ret : iterators_bpf__dump_bpf_map__attach(skel);
+ ret = ret < 0 ? ret : iterators_bpf__dump_bpf_prog__attach(skel);
+ return ret < 0 ? ret : 0;
+}
+
+static inline void
+iterators_bpf__detach(struct iterators_bpf *skel)
+{
+ skel_closenz(skel->links.dump_bpf_map_fd);
+ skel_closenz(skel->links.dump_bpf_prog_fd);
+}
+static void
+iterators_bpf__destroy(struct iterators_bpf *skel)
+{
+ if (!skel)
+ return;
+ iterators_bpf__detach(skel);
+ skel_closenz(skel->progs.dump_bpf_map.prog_fd);
+ skel_closenz(skel->progs.dump_bpf_prog.prog_fd);
+ skel_closenz(skel->maps.rodata.map_fd);
+ skel_free(skel);
+}
+static inline struct iterators_bpf *
+iterators_bpf__open(void)
+{
+ struct iterators_bpf *skel;
+
+ skel = skel_alloc(sizeof(*skel));
+ if (!skel)
+ goto cleanup;
+ skel->ctx.sz = (void *)&skel->links - (void *)skel;
+ return skel;
+cleanup:
+ iterators_bpf__destroy(skel);
+ return NULL;
+}
+
+static inline int
+iterators_bpf__load(struct iterators_bpf *skel)
+{
+ struct bpf_load_and_run_opts opts = {};
+ int err;
+
+ opts.ctx = (struct bpf_loader_ctx *)skel;
+ opts.data_sz = 6208;
+ opts.data = (void *)"\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x9f\xeb\x01\0\
+\x18\0\0\0\0\0\0\0\x80\x04\0\0\x80\x04\0\0\x31\x05\0\0\0\0\0\0\0\0\0\x02\x02\0\
+\0\0\x01\0\0\0\x02\0\0\x04\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\0\0\0\x04\
+\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x08\0\0\0\0\0\0\0\0\0\0\x02\x0d\0\0\0\0\0\0\
+\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\0\0\0\0\0\0\x01\x04\0\0\0\x20\
+\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xb0\0\0\0\x03\0\0\x04\x18\0\0\0\xbe\0\
+\0\0\x09\0\0\0\0\0\0\0\xc2\0\0\0\x0b\0\0\0\x40\0\0\0\xcd\0\0\0\x0b\0\0\0\x80\0\
+\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xd5\0\0\0\0\0\0\x07\0\0\0\0\xde\0\0\0\0\0\0\
+\x08\x0c\0\0\0\xe4\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\xae\x01\0\0\x03\0\0\x04\
+\x18\0\0\0\xb6\x01\0\0\x0e\0\0\0\0\0\0\0\xb9\x01\0\0\x11\0\0\0\x20\0\0\0\xbe\
+\x01\0\0\x0e\0\0\0\xa0\0\0\0\xca\x01\0\0\0\0\0\x08\x0f\0\0\0\xd0\x01\0\0\0\0\0\
+\x01\x04\0\0\0\x20\0\0\0\xdd\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\0\0\0\
+\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xe2\x01\0\0\0\0\0\x01\x04\0\0\0\
+\x20\0\0\0\0\0\0\0\x01\0\0\x0d\x14\0\0\0\x26\x05\0\0\x04\0\0\0\x2b\x02\0\0\0\0\
+\0\x08\x15\0\0\0\x31\x02\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\x01\x3b\x02\0\0\x01\0\
+\0\x0c\x13\0\0\0\0\0\0\0\0\0\0\x02\x18\0\0\0\x52\x02\0\0\x02\0\0\x04\x10\0\0\0\
+\x13\0\0\0\x03\0\0\0\0\0\0\0\x65\x02\0\0\x19\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\
+\x1c\0\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x17\0\0\0\x6a\x02\0\0\x01\0\
+\0\x0c\x1a\0\0\0\xb6\x02\0\0\x01\0\0\x04\x08\0\0\0\xbf\x02\0\0\x1d\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\x02\x1e\0\0\0\x10\x03\0\0\x06\0\0\x04\x38\0\0\0\xb6\x01\0\0\
+\x0e\0\0\0\0\0\0\0\xb9\x01\0\0\x11\0\0\0\x20\0\0\0\x1d\x03\0\0\x1f\0\0\0\xc0\0\
+\0\0\x2e\x03\0\0\x19\0\0\0\0\x01\0\0\x37\x03\0\0\x21\0\0\0\x40\x01\0\0\x41\x03\
+\0\0\x22\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\
+\0\0\0\0\0\0\0\0\0\x02\x23\0\0\0\0\0\0\0\0\0\0\x02\x24\0\0\0\x8b\x03\0\0\x02\0\
+\0\x04\x08\0\0\0\x99\x03\0\0\x0e\0\0\0\0\0\0\0\xa2\x03\0\0\x0e\0\0\0\x20\0\0\0\
+\x41\x03\0\0\x03\0\0\x04\x18\0\0\0\xac\x03\0\0\x1f\0\0\0\0\0\0\0\xb4\x03\0\0\
+\x25\0\0\0\x40\0\0\0\xba\x03\0\0\x27\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x26\0\0\
+\0\0\0\0\0\0\0\0\x02\x28\0\0\0\xbe\x03\0\0\x01\0\0\x04\x04\0\0\0\xc9\x03\0\0\
+\x0e\0\0\0\0\0\0\0\x32\x04\0\0\x01\0\0\x04\x04\0\0\0\x3b\x04\0\0\x0e\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x20\0\0\0\x12\0\0\0\x30\0\0\0\xb1\x04\0\0\0\0\0\
+\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x20\0\0\0\x12\0\0\0\x1a\0\0\0\
+\xc5\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x20\0\0\0\
+\x12\0\0\0\x20\0\0\0\xdb\x04\0\0\0\0\0\x0e\x2d\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\
+\0\0\0\0\x20\0\0\0\x12\0\0\0\x11\0\0\0\xf0\x04\0\0\0\0\0\x0e\x2f\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\x07\x05\0\0\0\0\0\x0e\
+\x31\0\0\0\x01\0\0\0\x0f\x05\0\0\x01\0\0\x0f\x04\0\0\0\x36\0\0\0\0\0\0\0\x04\0\
+\0\0\x16\x05\0\0\x04\0\0\x0f\x7b\0\0\0\x2a\0\0\0\0\0\0\0\x30\0\0\0\x2c\0\0\0\
+\x30\0\0\0\x1a\0\0\0\x2e\0\0\0\x4a\0\0\0\x20\0\0\0\x30\0\0\0\x6a\0\0\0\x11\0\0\
+\0\x1e\x05\0\0\x01\0\0\x0f\x04\0\0\0\x32\0\0\0\0\0\0\0\x04\0\0\0\x26\x05\0\0\0\
+\0\0\x0e\x06\0\0\0\x01\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\
+\x66\x5f\x6d\x61\x70\0\x6d\x65\x74\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\
+\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\
+\x62\x70\x66\x5f\x6d\x61\x70\0\x30\x3a\x30\0\x2f\x68\x6f\x6d\x65\x2f\x61\x73\
+\x70\x73\x6b\x2f\x73\x72\x63\x2f\x62\x70\x66\x2d\x6e\x65\x78\x74\x2f\x6b\x65\
+\x72\x6e\x65\x6c\x2f\x62\x70\x66\x2f\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\
+\x65\x72\x61\x74\x6f\x72\x73\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\
+\x70\x66\x2e\x63\0\x09\x73\x74\x72\x75\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\
+\x65\x20\x2a\x73\x65\x71\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\
+\x3e\x73\x65\x71\x3b\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\
+\x73\x65\x71\0\x73\x65\x73\x73\x69\x6f\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\
+\x75\x6d\0\x73\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\x5f\x75\x36\x34\0\x75\x6e\x73\
+\x69\x67\x6e\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\0\x30\x3a\x31\0\
+\x09\x73\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\
+\x70\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\
+\x6d\x61\x70\x29\0\x30\x3a\x32\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\
+\x6e\x75\x6d\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\
+\x71\x5f\x6e\x75\x6d\x3b\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\
+\x3d\x3d\x20\x30\x29\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\
+\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\
+\x72\x69\x65\x73\x20\x20\x63\x75\x72\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\x6e\
+\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\x6d\
+\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\x73\
+\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\x52\
+\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\x5f\
+\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\
+\x75\x20\x25\x2d\x31\x36\x73\x20\x20\x25\x31\x30\x64\x20\x20\x20\x25\x31\x30\
+\x6c\x6c\x64\x5c\x6e\x22\x2c\0\x7d\0\x5f\x5f\x73\x36\x34\0\x6c\x6f\x6e\x67\x20\
+\x6c\x6f\x6e\x67\0\x62\x70\x66\x5f\x6d\x61\x70\x5f\x73\x75\x6d\x5f\x65\x6c\x65\
+\x6d\x5f\x63\x6f\x75\x6e\x74\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\
+\x66\x5f\x70\x72\x6f\x67\0\x70\x72\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\
+\x5f\x70\x72\x6f\x67\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\
+\x09\x73\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\
+\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\
+\x20\x28\x21\x70\x72\x6f\x67\x29\0\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\
+\x78\0\x09\x61\x75\x78\x20\x3d\x20\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\
+\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\
+\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\
+\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\
+\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\
+\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\
+\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\
+\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\x75\
+\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\
+\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\
+\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\
+\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\
+\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\
+\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\
+\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\
+\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\
+\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\x29\x2c\x20\x74\x79\x70\x65\x73\x20\
+\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\x09\x73\x74\x72\x20\x3d\x20\x62\x74\
+\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\
+\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\
+\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\
+\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\x30\x3a\x32\x3a\x30\0\x09\x69\x66\
+\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3e\x3d\x20\x62\x74\x66\x2d\x3e\
+\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\x29\0\x09\x72\x65\x74\x75\x72\x6e\
+\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\
+\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\
+\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\
+\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\
+\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\
+\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\x4e\x53\x45\0\x2e\x6b\x73\x79\x6d\
+\x73\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\x64\x75\x6d\
+\x6d\x79\x5f\x6b\x73\x79\x6d\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\xc9\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\0\x04\0\0\0\x7b\0\0\0\x01\0\0\0\
+\x80\0\0\0\0\0\0\0\0\0\0\0\x69\x74\x65\x72\x61\x74\x6f\x72\x2e\x72\x6f\x64\x61\
+\x74\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\x34\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x20\x20\
+\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x20\x20\x63\x75\x72\x5f\x65\
+\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x20\x25\
+\x31\x30\x64\x20\x20\x20\x25\x31\x30\x6c\x6c\x64\x0a\0\x20\x20\x69\x64\x20\x6e\
+\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\
+\x61\x63\x68\x65\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\
+\x25\x73\x0a\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x12\0\0\0\
+\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\x1d\0\0\0\0\0\x79\x21\
+\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe0\xff\
+\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\
+\x30\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\
+\xe0\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\0\0\0\0\0\x0f\x12\0\0\0\0\0\0\
+\x7b\x2a\xe8\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\x7b\x1a\xf0\xff\0\0\0\0\xbf\x71\
+\0\0\0\0\0\0\x85\x20\0\0\0\0\0\0\x7b\x0a\xf8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\
+\x07\x04\0\0\xe0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\
+\x30\0\0\0\xb7\x03\0\0\x1a\0\0\0\xb7\x05\0\0\x20\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\
+\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x88\0\0\0\
+\x1e\x44\x01\0\x01\0\0\0\x42\0\0\0\x88\0\0\0\x24\x44\x01\0\x02\0\0\0\x42\0\0\0\
+\xfb\0\0\0\x1d\x4c\x01\0\x03\0\0\0\x42\0\0\0\x1c\x01\0\0\x06\x54\x01\0\x04\0\0\
+\0\x42\0\0\0\x2b\x01\0\0\x1d\x48\x01\0\x05\0\0\0\x42\0\0\0\x50\x01\0\0\x06\x60\
+\x01\0\x07\0\0\0\x42\0\0\0\x63\x01\0\0\x03\x64\x01\0\x0e\0\0\0\x42\0\0\0\xf6\
+\x01\0\0\x02\x6c\x01\0\x21\0\0\0\x42\0\0\0\x29\x02\0\0\x01\x80\x01\0\0\0\0\0\
+\x02\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\
+\x02\0\0\0\xf7\0\0\0\0\0\0\0\x20\0\0\0\x08\0\0\0\x27\x01\0\0\0\0\0\0\x70\0\0\0\
+\x0d\0\0\0\x3e\0\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\0\xf7\0\0\0\0\0\0\0\xa0\0\0\0\
+\x0d\0\0\0\x27\x01\0\0\0\0\0\0\x1a\0\0\0\x23\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\
+\x70\x66\x5f\x6d\x61\x70\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\
+\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x09\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x62\x70\x66\x5f\x69\x74\
+\x65\x72\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\0\0\0\0\0\0\0\x62\x70\x66\x5f\x6d\
+\x61\x70\x5f\x73\x75\x6d\x5f\x65\x6c\x65\x6d\x5f\x63\x6f\x75\x6e\x74\0\0\x47\
+\x50\x4c\0\0\0\0\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x11\x08\0\0\0\0\
+\0\x15\x01\x3b\0\0\0\0\0\x79\x17\0\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\
+\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\
+\x18\x62\0\0\0\0\0\0\0\0\0\0\x4a\0\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\
+\0\x85\0\0\0\x7e\0\0\0\x7b\x6a\xc8\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\
+\xff\0\0\0\0\xb7\x03\0\0\x04\0\0\0\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\
+\x71\x28\0\0\0\0\0\x79\x78\x30\0\0\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\
+\0\0\x0f\x21\0\0\0\0\0\0\x61\x11\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\
+\0\x03\0\0\0\x0f\x13\0\0\0\0\0\0\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\
+\x01\0\0\xf8\xff\xff\xff\xb7\x02\0\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\
+\0\0\0\0\x79\xa3\xf8\xff\0\0\0\0\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\
+\x01\0\0\xf4\xff\xff\xff\xb7\x02\0\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\
+\x04\0\0\0\x61\xa1\xf4\xff\0\0\0\0\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\
+\x0f\x16\0\0\0\0\0\0\xbf\x69\0\0\0\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\
+\0\0\0\0\x7b\x1a\xe0\xff\0\0\0\0\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\
+\x31\0\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\
+\xff\xff\xff\x79\xa1\xc8\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x6a\0\0\0\xb7\
+\x03\0\0\x11\0\0\0\xb7\x05\0\0\x20\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\
+\x95\0\0\0\0\0\0\0\0\0\0\0\x1b\0\0\0\0\0\0\0\x42\0\0\0\x88\0\0\0\x1e\x94\x01\0\
+\x01\0\0\0\x42\0\0\0\x88\0\0\0\x24\x94\x01\0\x02\0\0\0\x42\0\0\0\x86\x02\0\0\
+\x1f\x9c\x01\0\x03\0\0\0\x42\0\0\0\xaa\x02\0\0\x06\xa8\x01\0\x04\0\0\0\x42\0\0\
+\0\xc3\x02\0\0\x0e\xb4\x01\0\x05\0\0\0\x42\0\0\0\x2b\x01\0\0\x1d\x98\x01\0\x06\
+\0\0\0\x42\0\0\0\x50\x01\0\0\x06\xb8\x01\0\x08\0\0\0\x42\0\0\0\xd5\x02\0\0\x03\
+\xbc\x01\0\x10\0\0\0\x42\0\0\0\x45\x03\0\0\x02\xc4\x01\0\x17\0\0\0\x42\0\0\0\
+\x80\x03\0\0\x06\x04\x01\0\x1a\0\0\0\x42\0\0\0\x45\x03\0\0\x02\xc4\x01\0\x1b\0\
+\0\0\x42\0\0\0\xd1\x03\0\0\x0f\x10\x01\0\x1c\0\0\0\x42\0\0\0\xe6\x03\0\0\x2d\
+\x14\x01\0\x1e\0\0\0\x42\0\0\0\x1d\x04\0\0\x0d\x0c\x01\0\x20\0\0\0\x42\0\0\0\
+\x45\x03\0\0\x02\xc4\x01\0\x21\0\0\0\x42\0\0\0\xe6\x03\0\0\x02\x14\x01\0\x24\0\
+\0\0\x42\0\0\0\x44\x04\0\0\x0d\x18\x01\0\x27\0\0\0\x42\0\0\0\x45\x03\0\0\x02\
+\xc4\x01\0\x28\0\0\0\x42\0\0\0\x44\x04\0\0\x0d\x18\x01\0\x2b\0\0\0\x42\0\0\0\
+\x44\x04\0\0\x0d\x18\x01\0\x2c\0\0\0\x42\0\0\0\x72\x04\0\0\x1b\x1c\x01\0\x2d\0\
+\0\0\x42\0\0\0\x72\x04\0\0\x06\x1c\x01\0\x2e\0\0\0\x42\0\0\0\x95\x04\0\0\x0d\
+\x24\x01\0\x30\0\0\0\x42\0\0\0\x45\x03\0\0\x02\xc4\x01\0\x3f\0\0\0\x42\0\0\0\
+\x29\x02\0\0\x01\xd4\x01\0\0\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\
+\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x18\0\0\0\xf7\0\0\0\0\0\0\0\x20\0\0\0\x1c\0\0\
+\0\x3e\0\0\0\0\0\0\0\x28\0\0\0\x08\0\0\0\x27\x01\0\0\0\0\0\0\x80\0\0\0\x1e\0\0\
+\0\x3e\0\0\0\0\0\0\0\x90\0\0\0\x1e\0\0\0\xf7\0\0\0\0\0\0\0\xa8\0\0\0\x1e\0\0\0\
+\x78\x03\0\0\0\0\0\0\xb0\0\0\0\x1e\0\0\0\x7c\x03\0\0\0\0\0\0\xc0\0\0\0\x23\0\0\
+\0\xaa\x03\0\0\0\0\0\0\xd8\0\0\0\x24\0\0\0\xf7\0\0\0\0\0\0\0\xf0\0\0\0\x24\0\0\
+\0\x3e\0\0\0\0\0\0\0\x18\x01\0\0\x28\0\0\0\x3e\0\0\0\0\0\0\0\x50\x01\0\0\x1e\0\
+\0\0\xf7\0\0\0\0\0\0\0\x60\x01\0\0\x24\0\0\0\x6c\x04\0\0\0\0\0\0\x88\x01\0\0\
+\x1e\0\0\0\x27\x01\0\0\0\0\0\0\x98\x01\0\0\x1e\0\0\0\xad\x04\0\0\0\0\0\0\xa0\
+\x01\0\0\x1c\0\0\0\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x41\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\
+\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\
+\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x19\0\0\0\x01\0\0\0\0\0\0\0\
+\x12\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x62\x70\x66\x5f\
+\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0";
+ opts.insns_sz = 2456;
+ opts.insns = (void *)"\
+\xbf\x16\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\x78\xff\xff\xff\xb7\x02\0\
+\0\x88\0\0\0\xb7\x03\0\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x05\0\x14\0\0\0\0\0\x61\
+\xa1\x78\xff\0\0\0\0\xd5\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x7c\xff\
+\0\0\0\0\xd5\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x80\xff\0\0\0\0\xd5\
+\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x84\xff\0\0\0\0\xd5\x01\x01\0\0\
+\0\0\0\x85\0\0\0\xa8\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x01\0\0\0\0\
+\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xbf\x70\0\0\
+\0\0\0\0\x95\0\0\0\0\0\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
+\xe8\x0e\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\
+\0\0\xe4\x0e\0\0\x63\x01\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\
+\0\0\0\0\xd8\x0e\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x05\0\0\
+\x18\x61\0\0\0\0\0\0\0\0\0\0\xd0\x0e\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x12\0\
+\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\xd0\x0e\0\0\xb7\x03\0\0\x1c\0\0\0\x85\0\0\0\
+\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\xd4\xff\0\0\0\0\x63\x7a\x78\xff\0\0\0\0\
+\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x20\x0f\0\0\x63\x01\0\0\0\
+\0\0\0\x61\x60\x1c\0\0\0\0\0\x15\0\x03\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
+\xfc\x0e\0\0\x63\x01\0\0\0\0\0\0\xb7\x01\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\
+\0\xf0\x0e\0\0\xb7\x03\0\0\x48\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\
+\xc5\x07\xc3\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x63\x71\0\0\0\0\0\
+\0\x79\x63\x20\0\0\0\0\0\x15\x03\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x38\
+\x0f\0\0\xb7\x02\0\0\x7b\0\0\0\x61\x60\x04\0\0\0\0\0\x45\0\x02\0\x01\0\0\0\x85\
+\0\0\0\x94\0\0\0\x05\0\x01\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x18\x62\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xc0\x0f\0\0\x63\
+\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xb8\x0f\0\0\x18\x61\0\0\0\0\0\0\0\
+\0\0\0\xc8\x0f\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x38\x0f\0\0\
+\x18\x61\0\0\0\0\0\0\0\0\0\0\xd0\x0f\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x02\0\
+\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\xc0\x0f\0\0\xb7\x03\0\0\x20\0\0\0\x85\0\0\0\
+\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x9f\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xe0\x0f\0\0\x63\
+\x01\0\0\0\0\0\0\xb7\x01\0\0\x16\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\xe0\x0f\0\0\
+\xb7\x03\0\0\x04\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x92\xff\
+\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xe8\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
+\x20\x12\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xf0\x0f\0\0\x18\
+\x61\0\0\0\0\0\0\0\0\0\0\x18\x12\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\
+\0\0\0\x08\x11\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x60\x12\0\0\x7b\x01\0\0\0\0\0\0\
+\x18\x60\0\0\0\0\0\0\0\0\0\0\x10\x11\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x70\x12\0\
+\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xa0\x11\0\0\x18\x61\0\0\0\0\
+\0\0\0\0\0\0\x90\x12\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x88\x12\0\0\x7b\x01\0\0\0\0\0\0\x61\x60\x08\0\0\
+\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x28\x12\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\
+\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x2c\x12\0\0\x63\x01\0\0\0\0\0\0\x79\x60\
+\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x30\x12\0\0\x7b\x01\0\0\0\0\0\0\x61\
+\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x58\x12\0\0\x63\x01\0\0\0\0\0\
+\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xa0\x12\0\0\xb7\x02\0\0\x11\0\0\0\xb7\x03\0\0\
+\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\
+\x5c\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x10\x12\0\0\x63\x70\x6c\0\0\0\0\0\
+\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\x18\x68\0\0\0\0\0\0\0\0\0\0\xa8\
+\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb8\x12\0\0\xb7\x02\0\0\x17\0\0\0\xb7\x03\
+\0\0\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\
+\x07\x4d\xff\0\0\0\0\x75\x07\x03\0\0\0\0\0\x62\x08\x04\0\0\0\0\0\x6a\x08\x02\0\
+\0\0\0\0\x05\0\x0a\0\0\0\0\0\x63\x78\x04\0\0\0\0\0\xbf\x79\0\0\0\0\0\0\x77\x09\
+\0\0\x20\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\x63\x90\0\0\0\0\0\0\x55\
+\x09\x02\0\0\0\0\0\x6a\x08\x02\0\0\0\0\0\x05\0\x01\0\0\0\0\0\x6a\x08\x02\0\x40\
+\0\0\0\xb7\x01\0\0\x05\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x10\x12\0\0\xb7\x03\0\
+\0\x8c\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\
+\0\0\x01\0\0\x61\x01\0\0\0\0\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\
+\0\0\0\xa8\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x80\x12\0\0\x61\x01\0\0\0\0\0\0\
+\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\x07\x2c\xff\
+\0\0\0\0\x63\x7a\x80\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd0\x12\0\0\x18\
+\x61\0\0\0\0\0\0\0\0\0\0\xa8\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\
+\0\0\0\xd8\x12\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xa0\x17\0\0\x7b\x01\0\0\0\0\0\0\
+\x18\x60\0\0\0\0\0\0\0\0\0\0\xe0\x14\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xe8\x17\0\
+\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xe8\x14\0\0\x18\x61\0\0\0\0\
+\0\0\0\0\0\0\xf8\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x78\
+\x16\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x18\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x10\x18\0\0\x7b\x01\0\0\
+\0\0\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb0\x17\0\0\x63\x01\
+\0\0\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb4\x17\0\0\x63\
+\x01\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb8\x17\0\0\
+\x7b\x01\0\0\0\0\0\0\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xe0\
+\x17\0\0\x63\x01\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x28\x18\0\0\xb7\x02\0\
+\0\x12\0\0\0\xb7\x03\0\0\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\
+\x07\0\0\0\0\0\0\xc5\x07\xf5\xfe\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x98\x17\0\
+\0\x63\x70\x6c\0\0\0\0\0\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\
+\0\x05\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x98\x17\0\0\xb7\x03\0\0\x8c\0\0\0\x85\
+\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x08\x18\0\0\
+\x61\x01\0\0\0\0\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\
+\0\0\xc5\x07\xe3\xfe\0\0\0\0\x63\x7a\x84\xff\0\0\0\0\x61\xa1\x78\xff\0\0\0\0\
+\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa0\x80\xff\
+\0\0\0\0\x63\x06\x28\0\0\0\0\0\x61\xa0\x84\xff\0\0\0\0\x63\x06\x2c\0\0\0\0\0\
+\x18\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\0\0\x63\x06\x18\0\0\0\0\0\
+\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0";
+ err = bpf_load_and_run(&opts);
+ if (err < 0)
+ return err;
+ return 0;
+}
+
+static inline struct iterators_bpf *
+iterators_bpf__open_and_load(void)
+{
+ struct iterators_bpf *skel;
+
+ skel = iterators_bpf__open();
+ if (!skel)
+ return NULL;
+ if (iterators_bpf__load(skel)) {
+ iterators_bpf__destroy(skel);
+ return NULL;
+ }
+ return skel;
+}
+
+__attribute__((unused)) static void
+iterators_bpf__assert(struct iterators_bpf *s __attribute__((unused)))
+{
+#ifdef __cplusplus
+#define _Static_assert static_assert
+#endif
+#ifdef __cplusplus
+#undef _Static_assert
+#endif
+}
+
+#endif /* __ITERATORS_BPF_SKEL_H__ */
diff --git a/kernel/bpf/preload/iterators/iterators.skel.h b/kernel/bpf/preload/iterators/iterators.skel.h
deleted file mode 100644
index cf9a6a94b3a4..000000000000
--- a/kernel/bpf/preload/iterators/iterators.skel.h
+++ /dev/null
@@ -1,412 +0,0 @@
-/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
-
-/* THIS FILE IS AUTOGENERATED! */
-#ifndef __ITERATORS_BPF_SKEL_H__
-#define __ITERATORS_BPF_SKEL_H__
-
-#include <stdlib.h>
-#include <bpf/libbpf.h>
-
-struct iterators_bpf {
- struct bpf_object_skeleton *skeleton;
- struct bpf_object *obj;
- struct {
- struct bpf_map *rodata;
- } maps;
- struct {
- struct bpf_program *dump_bpf_map;
- struct bpf_program *dump_bpf_prog;
- } progs;
- struct {
- struct bpf_link *dump_bpf_map;
- struct bpf_link *dump_bpf_prog;
- } links;
- struct iterators_bpf__rodata {
- char dump_bpf_map____fmt[35];
- char dump_bpf_map____fmt_1[14];
- char dump_bpf_prog____fmt[32];
- char dump_bpf_prog____fmt_2[17];
- } *rodata;
-};
-
-static void
-iterators_bpf__destroy(struct iterators_bpf *obj)
-{
- if (!obj)
- return;
- if (obj->skeleton)
- bpf_object__destroy_skeleton(obj->skeleton);
- free(obj);
-}
-
-static inline int
-iterators_bpf__create_skeleton(struct iterators_bpf *obj);
-
-static inline struct iterators_bpf *
-iterators_bpf__open_opts(const struct bpf_object_open_opts *opts)
-{
- struct iterators_bpf *obj;
-
- obj = (struct iterators_bpf *)calloc(1, sizeof(*obj));
- if (!obj)
- return NULL;
- if (iterators_bpf__create_skeleton(obj))
- goto err;
- if (bpf_object__open_skeleton(obj->skeleton, opts))
- goto err;
-
- return obj;
-err:
- iterators_bpf__destroy(obj);
- return NULL;
-}
-
-static inline struct iterators_bpf *
-iterators_bpf__open(void)
-{
- return iterators_bpf__open_opts(NULL);
-}
-
-static inline int
-iterators_bpf__load(struct iterators_bpf *obj)
-{
- return bpf_object__load_skeleton(obj->skeleton);
-}
-
-static inline struct iterators_bpf *
-iterators_bpf__open_and_load(void)
-{
- struct iterators_bpf *obj;
-
- obj = iterators_bpf__open();
- if (!obj)
- return NULL;
- if (iterators_bpf__load(obj)) {
- iterators_bpf__destroy(obj);
- return NULL;
- }
- return obj;
-}
-
-static inline int
-iterators_bpf__attach(struct iterators_bpf *obj)
-{
- return bpf_object__attach_skeleton(obj->skeleton);
-}
-
-static inline void
-iterators_bpf__detach(struct iterators_bpf *obj)
-{
- return bpf_object__detach_skeleton(obj->skeleton);
-}
-
-static inline int
-iterators_bpf__create_skeleton(struct iterators_bpf *obj)
-{
- struct bpf_object_skeleton *s;
-
- s = (struct bpf_object_skeleton *)calloc(1, sizeof(*s));
- if (!s)
- return -1;
- obj->skeleton = s;
-
- s->sz = sizeof(*s);
- s->name = "iterators_bpf";
- s->obj = &obj->obj;
-
- /* maps */
- s->map_cnt = 1;
- s->map_skel_sz = sizeof(*s->maps);
- s->maps = (struct bpf_map_skeleton *)calloc(s->map_cnt, s->map_skel_sz);
- if (!s->maps)
- goto err;
-
- s->maps[0].name = "iterator.rodata";
- s->maps[0].map = &obj->maps.rodata;
- s->maps[0].mmaped = (void **)&obj->rodata;
-
- /* programs */
- s->prog_cnt = 2;
- s->prog_skel_sz = sizeof(*s->progs);
- s->progs = (struct bpf_prog_skeleton *)calloc(s->prog_cnt, s->prog_skel_sz);
- if (!s->progs)
- goto err;
-
- s->progs[0].name = "dump_bpf_map";
- s->progs[0].prog = &obj->progs.dump_bpf_map;
- s->progs[0].link = &obj->links.dump_bpf_map;
-
- s->progs[1].name = "dump_bpf_prog";
- s->progs[1].prog = &obj->progs.dump_bpf_prog;
- s->progs[1].link = &obj->links.dump_bpf_prog;
-
- s->data_sz = 7176;
- s->data = (void *)"\
-\x7f\x45\x4c\x46\x02\x01\x01\0\0\0\0\0\0\0\0\0\x01\0\xf7\0\x01\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\x48\x18\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\x40\0\x0f\0\
-\x0e\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\
-\x1a\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\
-\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x02\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\xb7\x03\0\0\x23\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x61\x71\0\
-\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\0\0\0\0\0\
-\x0f\x12\0\0\0\0\0\0\x7b\x2a\xf0\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\x7b\x1a\xf8\
-\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\
-\0\x18\x02\0\0\x23\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x0e\0\0\0\xb7\x05\0\0\x18\
-\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\x79\x12\0\0\0\0\
-\0\0\x79\x26\0\0\0\0\0\0\x79\x11\x08\0\0\0\0\0\x15\x01\x3b\0\0\0\0\0\x79\x17\0\
-\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\
-\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x02\0\0\x31\0\0\0\0\0\0\0\0\0\
-\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x7b\x6a\xc8\
-\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\xff\0\0\0\0\xb7\x03\0\0\x04\0\0\0\
-\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\x71\x28\0\0\0\0\0\x79\x78\x30\0\0\
-\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\0\x61\x11\
-\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\0\x03\0\0\0\x0f\x13\0\0\0\0\0\0\
-\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf8\xff\xff\xff\xb7\x02\0\
-\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\0\0\0\0\x79\xa3\xf8\xff\0\0\0\0\
-\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf4\xff\xff\xff\xb7\x02\0\
-\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\x04\0\0\0\x61\xa1\xf4\xff\0\0\0\0\
-\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\x0f\x16\0\0\0\0\0\0\xbf\x69\0\0\0\
-\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\0\0\0\0\x7b\x1a\xe0\xff\0\0\0\0\
-\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x31\0\0\0\0\0\0\x7b\x1a\xe8\xff\
-\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\x79\xa1\xc8\xff\0\0\0\
-\0\x18\x02\0\0\x51\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x11\0\0\0\xb7\x05\0\0\x20\
-\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\x20\x20\x69\x64\
-\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\
-\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\
-\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\
-\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x0a\0\x25\x34\
-\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0\x47\x50\x4c\0\x9f\
-\xeb\x01\0\x18\0\0\0\0\0\0\0\x1c\x04\0\0\x1c\x04\0\0\x09\x05\0\0\0\0\0\0\0\0\0\
-\x02\x02\0\0\0\x01\0\0\0\x02\0\0\x04\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\
-\0\0\0\x04\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x08\0\0\0\0\0\0\0\0\0\0\x02\x0d\0\
-\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\0\0\0\0\0\0\x01\x04\
-\0\0\0\x20\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xaf\0\0\0\x03\0\0\x04\x18\0\
-\0\0\xbd\0\0\0\x09\0\0\0\0\0\0\0\xc1\0\0\0\x0b\0\0\0\x40\0\0\0\xcc\0\0\0\x0b\0\
-\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xd4\0\0\0\0\0\0\x07\0\0\0\0\xdd\0\0\
-\0\0\0\0\x08\x0c\0\0\0\xe3\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\xa4\x01\0\0\x03\
-\0\0\x04\x18\0\0\0\xac\x01\0\0\x0e\0\0\0\0\0\0\0\xaf\x01\0\0\x11\0\0\0\x20\0\0\
-\0\xb4\x01\0\0\x0e\0\0\0\xa0\0\0\0\xc0\x01\0\0\0\0\0\x08\x0f\0\0\0\xc6\x01\0\0\
-\0\0\0\x01\x04\0\0\0\x20\0\0\0\xd3\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\
-\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xd8\x01\0\0\0\0\0\x01\x04\
-\0\0\0\x20\0\0\0\0\0\0\0\0\0\0\x02\x14\0\0\0\x3c\x02\0\0\x02\0\0\x04\x10\0\0\0\
-\x13\0\0\0\x03\0\0\0\0\0\0\0\x4f\x02\0\0\x15\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\
-\x18\0\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x13\0\0\0\x54\x02\0\0\x01\0\
-\0\x0c\x16\0\0\0\xa0\x02\0\0\x01\0\0\x04\x08\0\0\0\xa9\x02\0\0\x19\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\x02\x1a\0\0\0\xfa\x02\0\0\x06\0\0\x04\x38\0\0\0\xac\x01\0\0\
-\x0e\0\0\0\0\0\0\0\xaf\x01\0\0\x11\0\0\0\x20\0\0\0\x07\x03\0\0\x1b\0\0\0\xc0\0\
-\0\0\x18\x03\0\0\x15\0\0\0\0\x01\0\0\x21\x03\0\0\x1d\0\0\0\x40\x01\0\0\x2b\x03\
-\0\0\x1e\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x1c\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\
-\0\0\0\0\0\0\0\0\0\x02\x1f\0\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\x75\x03\0\0\x02\0\
-\0\x04\x08\0\0\0\x83\x03\0\0\x0e\0\0\0\0\0\0\0\x8c\x03\0\0\x0e\0\0\0\x20\0\0\0\
-\x2b\x03\0\0\x03\0\0\x04\x18\0\0\0\x96\x03\0\0\x1b\0\0\0\0\0\0\0\x9e\x03\0\0\
-\x21\0\0\0\x40\0\0\0\xa4\x03\0\0\x23\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x22\0\0\
-\0\0\0\0\0\0\0\0\x02\x24\0\0\0\xa8\x03\0\0\x01\0\0\x04\x04\0\0\0\xb3\x03\0\0\
-\x0e\0\0\0\0\0\0\0\x1c\x04\0\0\x01\0\0\x04\x04\0\0\0\x25\x04\0\0\x0e\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\0\x9b\x04\0\0\0\0\0\
-\x0e\x25\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\0\
-\xaf\x04\0\0\0\0\0\x0e\x27\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\
-\x12\0\0\0\x20\0\0\0\xc5\x04\0\0\0\0\0\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\
-\0\0\0\0\x1c\0\0\0\x12\0\0\0\x11\0\0\0\xda\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\xf1\x04\0\0\0\0\0\x0e\
-\x2d\0\0\0\x01\0\0\0\xf9\x04\0\0\x04\0\0\x0f\0\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\
-\0\x28\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\
-\0\0\x11\0\0\0\x01\x05\0\0\x01\0\0\x0f\0\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\0\0\0\
-\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\
-\x74\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\
-\x70\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\
-\x30\x3a\x30\0\x2f\x68\x6f\x6d\x65\x2f\x61\x6c\x72\x75\x61\x2f\x62\x75\x69\x6c\
-\x64\x2f\x6c\x69\x6e\x75\x78\x2f\x6b\x65\x72\x6e\x65\x6c\x2f\x62\x70\x66\x2f\
-\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2f\x69\
-\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\x09\x73\x74\x72\x75\
-\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\x73\x65\x71\x20\x3d\x20\
-\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x3b\0\x62\x70\x66\x5f\
-\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\x73\x65\x73\x73\x69\x6f\
-\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\x65\x71\x5f\x66\x69\x6c\
-\x65\0\x5f\x5f\x75\x36\x34\0\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\x20\x75\x6e\
-\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x30\x3a\x31\0\x09\x73\x74\x72\x75\
-\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\x20\x63\
-\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\x29\0\
-\x30\x3a\x32\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\
-\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\
-\x3b\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\x29\
-\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\
-\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\
-\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\
-\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\
-\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\
-\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\
-\x52\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\
-\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\
-\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\
-\x2d\x3e\x69\x64\x2c\x20\x6d\x61\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\
-\x70\x2d\x3e\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\
-\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\
-\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\
-\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\
-\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\
-\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\
-\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\
-\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\
-\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\
-\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\
-\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\
-\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\
-\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\
-\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\
-\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\
-\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\
-\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\
-\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\
-\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\
-\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\
-\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\
-\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\
-\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\
-\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\
-\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\
-\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\
-\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\
-\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\
-\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\
-\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\
-\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\
-\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\
-\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\
-\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\
-\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\
-\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\
-\x4e\x53\x45\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\x9f\
-\xeb\x01\0\x20\0\0\0\0\0\0\0\x24\0\0\0\x24\0\0\0\x44\x02\0\0\x68\x02\0\0\xa4\
-\x01\0\0\x08\0\0\0\x31\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\0\x62\x02\0\0\x01\0\0\0\
-\0\0\0\0\x17\0\0\0\x10\0\0\0\x31\0\0\0\x09\0\0\0\0\0\0\0\x42\0\0\0\x87\0\0\0\
-\x1e\x40\x01\0\x08\0\0\0\x42\0\0\0\x87\0\0\0\x24\x40\x01\0\x10\0\0\0\x42\0\0\0\
-\xfe\0\0\0\x1d\x48\x01\0\x18\0\0\0\x42\0\0\0\x1f\x01\0\0\x06\x50\x01\0\x20\0\0\
-\0\x42\0\0\0\x2e\x01\0\0\x1d\x44\x01\0\x28\0\0\0\x42\0\0\0\x53\x01\0\0\x06\x5c\
-\x01\0\x38\0\0\0\x42\0\0\0\x66\x01\0\0\x03\x60\x01\0\x70\0\0\0\x42\0\0\0\xec\
-\x01\0\0\x02\x68\x01\0\xf0\0\0\0\x42\0\0\0\x3a\x02\0\0\x01\x70\x01\0\x62\x02\0\
-\0\x1a\0\0\0\0\0\0\0\x42\0\0\0\x87\0\0\0\x1e\x84\x01\0\x08\0\0\0\x42\0\0\0\x87\
-\0\0\0\x24\x84\x01\0\x10\0\0\0\x42\0\0\0\x70\x02\0\0\x1f\x8c\x01\0\x18\0\0\0\
-\x42\0\0\0\x94\x02\0\0\x06\x98\x01\0\x20\0\0\0\x42\0\0\0\xad\x02\0\0\x0e\xa4\
-\x01\0\x28\0\0\0\x42\0\0\0\x2e\x01\0\0\x1d\x88\x01\0\x30\0\0\0\x42\0\0\0\x53\
-\x01\0\0\x06\xa8\x01\0\x40\0\0\0\x42\0\0\0\xbf\x02\0\0\x03\xac\x01\0\x80\0\0\0\
-\x42\0\0\0\x2f\x03\0\0\x02\xb4\x01\0\xb8\0\0\0\x42\0\0\0\x6a\x03\0\0\x06\x08\
-\x01\0\xd0\0\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\xd8\0\0\0\x42\0\0\0\xbb\x03\0\0\x0f\
-\x14\x01\0\xe0\0\0\0\x42\0\0\0\xd0\x03\0\0\x2d\x18\x01\0\xf0\0\0\0\x42\0\0\0\
-\x07\x04\0\0\x0d\x10\x01\0\0\x01\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x08\x01\0\0\x42\
-\0\0\0\xd0\x03\0\0\x02\x18\x01\0\x20\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\x1c\x01\
-\0\x38\x01\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x40\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\
-\x1c\x01\0\x58\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\x1c\x01\0\x60\x01\0\0\x42\0\0\
-\0\x5c\x04\0\0\x1b\x20\x01\0\x68\x01\0\0\x42\0\0\0\x5c\x04\0\0\x06\x20\x01\0\
-\x70\x01\0\0\x42\0\0\0\x7f\x04\0\0\x0d\x28\x01\0\x78\x01\0\0\x42\0\0\0\0\0\0\0\
-\0\0\0\0\x80\x01\0\0\x42\0\0\0\x2f\x03\0\0\x02\xb4\x01\0\xf8\x01\0\0\x42\0\0\0\
-\x3a\x02\0\0\x01\xc4\x01\0\x10\0\0\0\x31\0\0\0\x07\0\0\0\0\0\0\0\x02\0\0\0\x3e\
-\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x02\0\0\0\xfa\0\
-\0\0\0\0\0\0\x20\0\0\0\x08\0\0\0\x2a\x01\0\0\0\0\0\0\x70\0\0\0\x0d\0\0\0\x3e\0\
-\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\0\xfa\0\0\0\0\0\0\0\xa0\0\0\0\x0d\0\0\0\x2a\x01\
-\0\0\0\0\0\0\x62\x02\0\0\x12\0\0\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\
-\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x14\0\0\0\xfa\0\0\0\0\0\0\0\x20\0\0\0\
-\x18\0\0\0\x3e\0\0\0\0\0\0\0\x28\0\0\0\x08\0\0\0\x2a\x01\0\0\0\0\0\0\x80\0\0\0\
-\x1a\0\0\0\x3e\0\0\0\0\0\0\0\x90\0\0\0\x1a\0\0\0\xfa\0\0\0\0\0\0\0\xa8\0\0\0\
-\x1a\0\0\0\x62\x03\0\0\0\0\0\0\xb0\0\0\0\x1a\0\0\0\x66\x03\0\0\0\0\0\0\xc0\0\0\
-\0\x1f\0\0\0\x94\x03\0\0\0\0\0\0\xd8\0\0\0\x20\0\0\0\xfa\0\0\0\0\0\0\0\xf0\0\0\
-\0\x20\0\0\0\x3e\0\0\0\0\0\0\0\x18\x01\0\0\x24\0\0\0\x3e\0\0\0\0\0\0\0\x50\x01\
-\0\0\x1a\0\0\0\xfa\0\0\0\0\0\0\0\x60\x01\0\0\x20\0\0\0\x56\x04\0\0\0\0\0\0\x88\
-\x01\0\0\x1a\0\0\0\x2a\x01\0\0\0\0\0\0\x98\x01\0\0\x1a\0\0\0\x97\x04\0\0\0\0\0\
-\0\xa0\x01\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\x91\0\0\0\x04\0\xf1\xff\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xe6\0\0\
-\0\0\0\x02\0\x70\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xd8\0\0\0\0\0\x02\0\xf0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\xdf\0\0\0\0\0\x03\0\x78\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\xd1\0\0\0\0\0\x03\0\x80\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xca\0\0\0\0\0\x03\0\
-\xf8\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x14\0\0\0\x01\0\x04\0\0\0\0\0\0\0\0\0\x23\
-\0\0\0\0\0\0\0\x04\x01\0\0\x01\0\x04\0\x23\0\0\0\0\0\0\0\x0e\0\0\0\0\0\0\0\x28\
-\0\0\0\x01\0\x04\0\x31\0\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\xed\0\0\0\x01\0\x04\0\
-\x51\0\0\0\0\0\0\0\x11\0\0\0\0\0\0\0\0\0\0\0\x03\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\x03\0\x03\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\
-\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xc2\0\0\0\x11\0\x05\0\0\0\0\0\0\0\0\0\
-\x04\0\0\0\0\0\0\0\x3d\0\0\0\x12\0\x02\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\x5b\
-\0\0\0\x12\0\x03\0\0\0\0\0\0\0\0\0\x08\x02\0\0\0\0\0\0\x48\0\0\0\0\0\0\0\x01\0\
-\0\0\x0d\0\0\0\xc8\0\0\0\0\0\0\0\x01\0\0\0\x0d\0\0\0\x50\0\0\0\0\0\0\0\x01\0\0\
-\0\x0d\0\0\0\xd0\x01\0\0\0\0\0\0\x01\0\0\0\x0d\0\0\0\xf0\x03\0\0\0\0\0\0\x0a\0\
-\0\0\x0d\0\0\0\xfc\x03\0\0\0\0\0\0\x0a\0\0\0\x0d\0\0\0\x08\x04\0\0\0\0\0\0\x0a\
-\0\0\0\x0d\0\0\0\x14\x04\0\0\0\0\0\0\x0a\0\0\0\x0d\0\0\0\x2c\x04\0\0\0\0\0\0\0\
-\0\0\0\x0e\0\0\0\x2c\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x3c\0\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x50\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x60\0\0\0\0\0\0\0\0\0\0\0\x0b\0\
-\0\0\x70\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\
-\x90\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xa0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xb0\0\
-\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xc0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xd0\0\0\0\0\
-\0\0\0\0\0\0\0\x0b\0\0\0\xe8\0\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xf8\0\0\0\0\0\0\0\
-\0\0\0\0\x0c\0\0\0\x08\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x18\x01\0\0\0\0\0\0\0\
-\0\0\0\x0c\0\0\0\x28\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x38\x01\0\0\0\0\0\0\0\0\
-\0\0\x0c\0\0\0\x48\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x58\x01\0\0\0\0\0\0\0\0\0\
-\0\x0c\0\0\0\x68\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x78\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x88\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x98\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xa8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xb8\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xc8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xd8\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xe8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xf8\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x08\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x18\x02\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x28\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x38\x02\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x48\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x58\x02\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x68\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x78\x02\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x94\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xa4\x02\0\0\0\0\0\0\0\0\0\0\
-\x0b\0\0\0\xb4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xc4\x02\0\0\0\0\0\0\0\0\0\0\
-\x0b\0\0\0\xd4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xe4\x02\0\0\0\0\0\0\0\0\0\0\
-\x0b\0\0\0\xf4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x0c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x1c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x2c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x3c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x4c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x5c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x6c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x7c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x8c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x9c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xac\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xbc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xcc\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xdc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xec\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xfc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x0c\x04\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x1c\x04\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x4d\x4e\x40\x41\x42\x43\x4c\0\
-\x2e\x74\x65\x78\x74\0\x2e\x72\x65\x6c\x2e\x42\x54\x46\x2e\x65\x78\x74\0\x64\
-\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\
-\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\
-\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x2e\x72\x65\x6c\x69\x74\x65\
-\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\
-\x72\x6f\x67\0\x2e\x72\x65\x6c\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\
-\x67\0\x2e\x6c\x6c\x76\x6d\x5f\x61\x64\x64\x72\x73\x69\x67\0\x6c\x69\x63\x65\
-\x6e\x73\x65\0\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\
-\x2e\x73\x74\x72\x74\x61\x62\0\x2e\x73\x79\x6d\x74\x61\x62\0\x2e\x72\x6f\x64\
-\x61\x74\x61\0\x2e\x72\x65\x6c\x2e\x42\x54\x46\0\x4c\x49\x43\x45\x4e\x53\x45\0\
-\x4c\x42\x42\x31\x5f\x37\0\x4c\x42\x42\x31\x5f\x36\0\x4c\x42\x42\x30\x5f\x34\0\
-\x4c\x42\x42\x31\x5f\x33\0\x4c\x42\x42\x30\x5f\x33\0\x64\x75\x6d\x70\x5f\x62\
-\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x64\x75\x6d\
-\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x01\0\0\
-\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x4e\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\x6d\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\x40\x01\0\0\0\0\0\0\x08\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\xb1\0\0\0\x01\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x48\x03\0\
-\0\0\0\0\0\x62\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\x89\0\0\0\x01\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xaa\x03\0\0\0\0\0\0\x04\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xbd\0\0\0\x01\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xae\x03\0\0\0\0\0\0\x3d\x09\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x01\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\xeb\x0c\0\0\0\0\0\0\x2c\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xa9\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\x18\x11\0\0\0\0\0\0\x98\x01\0\0\0\0\0\0\x0e\0\0\0\x0e\0\0\0\x08\0\0\
-\0\0\0\0\0\x18\0\0\0\0\0\0\0\x4a\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\xb0\x12\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\x08\0\0\0\x02\0\0\0\x08\0\0\0\0\0\0\0\
-\x10\0\0\0\0\0\0\0\x69\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xd0\x12\
-\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\x08\0\0\0\x03\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\
-\0\0\0\0\xb9\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xf0\x12\0\0\0\0\0\
-\0\x50\0\0\0\0\0\0\0\x08\0\0\0\x06\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\
-\x07\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\x13\0\0\0\0\0\0\xe0\
-\x03\0\0\0\0\0\0\x08\0\0\0\x07\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x7b\0\
-\0\0\x03\x4c\xff\x6f\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\0\x20\x17\0\0\0\0\0\0\x07\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xa1\0\0\0\x03\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x27\x17\0\0\0\0\0\0\x1a\x01\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
-
- return 0;
-err:
- bpf_object__destroy_skeleton(s);
- return -1;
-}
-
-#endif /* __ITERATORS_BPF_SKEL_H__ */
diff --git a/kernel/bpf/prog_iter.c b/kernel/bpf/prog_iter.c
index 53a73c841c13..85d8fcb56fb7 100644
--- a/kernel/bpf/prog_iter.c
+++ b/kernel/bpf/prog_iter.c
@@ -78,8 +78,7 @@ static const struct seq_operations bpf_prog_seq_ops = {
.show = bpf_prog_seq_show,
};
-BTF_ID_LIST(btf_bpf_prog_id)
-BTF_ID(struct, bpf_prog)
+BTF_ID_LIST_SINGLE(btf_bpf_prog_id, struct, bpf_prog)
static const struct bpf_iter_seq_info bpf_prog_seq_info = {
.seq_ops = &bpf_prog_seq_ops,
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index f9c734aaa990..9a5f94371e50 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -7,15 +7,16 @@
#include <linux/bpf.h>
#include <linux/list.h>
#include <linux/slab.h>
-#include <linux/capability.h>
+#include <linux/btf_ids.h>
#include "percpu_freelist.h"
+#include <asm/rqspinlock.h>
#define QUEUE_STACK_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK)
struct bpf_queue_stack {
struct bpf_map map;
- raw_spinlock_t lock;
+ rqspinlock_t lock;
u32 head, tail;
u32 size; /* max_entries + 1 */
@@ -45,9 +46,6 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs)
/* Called from syscall */
static int queue_stack_map_alloc_check(union bpf_attr *attr)
{
- if (!bpf_capable())
- return -EPERM;
-
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 0 ||
attr->value_size == 0 ||
@@ -77,13 +75,11 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
if (!qs)
return ERR_PTR(-ENOMEM);
- memset(qs, 0, sizeof(*qs));
-
bpf_map_init_from_attr(&qs->map, attr);
qs->size = size;
- raw_spin_lock_init(&qs->lock);
+ raw_res_spin_lock_init(&qs->lock);
return &qs->map;
}
@@ -96,14 +92,15 @@ static void queue_stack_map_free(struct bpf_map *map)
bpf_map_area_free(qs);
}
-static int __queue_map_get(struct bpf_map *map, void *value, bool delete)
+static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
{
struct bpf_queue_stack *qs = bpf_queue_stack(map);
unsigned long flags;
int err = 0;
void *ptr;
- raw_spin_lock_irqsave(&qs->lock, flags);
+ if (raw_res_spin_lock_irqsave(&qs->lock, flags))
+ return -EBUSY;
if (queue_stack_map_is_empty(qs)) {
memset(value, 0, qs->map.value_size);
@@ -120,12 +117,12 @@ static int __queue_map_get(struct bpf_map *map, void *value, bool delete)
}
out:
- raw_spin_unlock_irqrestore(&qs->lock, flags);
+ raw_res_spin_unlock_irqrestore(&qs->lock, flags);
return err;
}
-static int __stack_map_get(struct bpf_map *map, void *value, bool delete)
+static long __stack_map_get(struct bpf_map *map, void *value, bool delete)
{
struct bpf_queue_stack *qs = bpf_queue_stack(map);
unsigned long flags;
@@ -133,7 +130,8 @@ static int __stack_map_get(struct bpf_map *map, void *value, bool delete)
void *ptr;
u32 index;
- raw_spin_lock_irqsave(&qs->lock, flags);
+ if (raw_res_spin_lock_irqsave(&qs->lock, flags))
+ return -EBUSY;
if (queue_stack_map_is_empty(qs)) {
memset(value, 0, qs->map.value_size);
@@ -152,37 +150,37 @@ static int __stack_map_get(struct bpf_map *map, void *value, bool delete)
qs->head = index;
out:
- raw_spin_unlock_irqrestore(&qs->lock, flags);
+ raw_res_spin_unlock_irqrestore(&qs->lock, flags);
return err;
}
/* Called from syscall or from eBPF program */
-static int queue_map_peek_elem(struct bpf_map *map, void *value)
+static long queue_map_peek_elem(struct bpf_map *map, void *value)
{
return __queue_map_get(map, value, false);
}
/* Called from syscall or from eBPF program */
-static int stack_map_peek_elem(struct bpf_map *map, void *value)
+static long stack_map_peek_elem(struct bpf_map *map, void *value)
{
return __stack_map_get(map, value, false);
}
/* Called from syscall or from eBPF program */
-static int queue_map_pop_elem(struct bpf_map *map, void *value)
+static long queue_map_pop_elem(struct bpf_map *map, void *value)
{
return __queue_map_get(map, value, true);
}
/* Called from syscall or from eBPF program */
-static int stack_map_pop_elem(struct bpf_map *map, void *value)
+static long stack_map_pop_elem(struct bpf_map *map, void *value)
{
return __stack_map_get(map, value, true);
}
/* Called from syscall or from eBPF program */
-static int queue_stack_map_push_elem(struct bpf_map *map, void *value,
- u64 flags)
+static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
+ u64 flags)
{
struct bpf_queue_stack *qs = bpf_queue_stack(map);
unsigned long irq_flags;
@@ -198,7 +196,8 @@ static int queue_stack_map_push_elem(struct bpf_map *map, void *value,
if (flags & BPF_NOEXIST || flags > BPF_EXIST)
return -EINVAL;
- raw_spin_lock_irqsave(&qs->lock, irq_flags);
+ if (raw_res_spin_lock_irqsave(&qs->lock, irq_flags))
+ return -EBUSY;
if (queue_stack_map_is_full(qs)) {
if (!replace) {
@@ -217,7 +216,7 @@ static int queue_stack_map_push_elem(struct bpf_map *map, void *value,
qs->head = 0;
out:
- raw_spin_unlock_irqrestore(&qs->lock, irq_flags);
+ raw_res_spin_unlock_irqrestore(&qs->lock, irq_flags);
return err;
}
@@ -228,14 +227,14 @@ static void *queue_stack_map_lookup_elem(struct bpf_map *map, void *key)
}
/* Called from syscall or from eBPF program */
-static int queue_stack_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 flags)
+static long queue_stack_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
{
return -EINVAL;
}
/* Called from syscall or from eBPF program */
-static int queue_stack_map_delete_elem(struct bpf_map *map, void *key)
+static long queue_stack_map_delete_elem(struct bpf_map *map, void *key)
{
return -EINVAL;
}
@@ -247,7 +246,15 @@ static int queue_stack_map_get_next_key(struct bpf_map *map, void *key,
return -EINVAL;
}
-static int queue_map_btf_id;
+static u64 queue_stack_map_mem_usage(const struct bpf_map *map)
+{
+ u64 usage = sizeof(struct bpf_queue_stack);
+
+ usage += ((u64)map->max_entries + 1) * map->value_size;
+ return usage;
+}
+
+BTF_ID_LIST_SINGLE(queue_map_btf_ids, struct, bpf_queue_stack)
const struct bpf_map_ops queue_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = queue_stack_map_alloc_check,
@@ -260,11 +267,10 @@ const struct bpf_map_ops queue_map_ops = {
.map_pop_elem = queue_map_pop_elem,
.map_peek_elem = queue_map_peek_elem,
.map_get_next_key = queue_stack_map_get_next_key,
- .map_btf_name = "bpf_queue_stack",
- .map_btf_id = &queue_map_btf_id,
+ .map_mem_usage = queue_stack_map_mem_usage,
+ .map_btf_id = &queue_map_btf_ids[0],
};
-static int stack_map_btf_id;
const struct bpf_map_ops stack_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = queue_stack_map_alloc_check,
@@ -277,6 +283,6 @@ const struct bpf_map_ops stack_map_ops = {
.map_pop_elem = stack_map_pop_elem,
.map_peek_elem = stack_map_peek_elem,
.map_get_next_key = queue_stack_map_get_next_key,
- .map_btf_name = "bpf_queue_stack",
- .map_btf_id = &stack_map_btf_id,
+ .map_mem_usage = queue_stack_map_mem_usage,
+ .map_btf_id = &queue_map_btf_ids[0],
};
diff --git a/kernel/bpf/range_tree.c b/kernel/bpf/range_tree.c
new file mode 100644
index 000000000000..99c63d982c5d
--- /dev/null
+++ b/kernel/bpf/range_tree.c
@@ -0,0 +1,261 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#include <linux/interval_tree_generic.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include "range_tree.h"
+
+/*
+ * struct range_tree is a data structure used to allocate contiguous memory
+ * ranges in bpf arena. It's a large bitmap. The contiguous sequence of bits is
+ * represented by struct range_node or 'rn' for short.
+ * rn->rn_rbnode links it into an interval tree while
+ * rn->rb_range_size links it into a second rbtree sorted by size of the range.
+ * __find_range() performs binary search and best fit algorithm to find the
+ * range less or equal requested size.
+ * range_tree_clear/set() clears or sets a range of bits in this bitmap. The
+ * adjacent ranges are merged or split at the same time.
+ *
+ * The split/merge logic is based/borrowed from XFS's xbitmap32 added
+ * in commit 6772fcc8890a ("xfs: convert xbitmap to interval tree").
+ *
+ * The implementation relies on external lock to protect rbtree-s.
+ * The alloc/free of range_node-s is done via kmalloc_nolock().
+ *
+ * bpf arena is using range_tree to represent unallocated slots.
+ * At init time:
+ * range_tree_set(rt, 0, max);
+ * Then:
+ * start = range_tree_find(rt, len);
+ * if (start >= 0)
+ * range_tree_clear(rt, start, len);
+ * to find free range and mark slots as allocated and later:
+ * range_tree_set(rt, start, len);
+ * to mark as unallocated after use.
+ */
+struct range_node {
+ struct rb_node rn_rbnode;
+ struct rb_node rb_range_size;
+ u32 rn_start;
+ u32 rn_last; /* inclusive */
+ u32 __rn_subtree_last;
+};
+
+static struct range_node *rb_to_range_node(struct rb_node *rb)
+{
+ return rb_entry(rb, struct range_node, rb_range_size);
+}
+
+static u32 rn_size(struct range_node *rn)
+{
+ return rn->rn_last - rn->rn_start + 1;
+}
+
+/* Find range that fits best to requested size */
+static inline struct range_node *__find_range(struct range_tree *rt, u32 len)
+{
+ struct rb_node *rb = rt->range_size_root.rb_root.rb_node;
+ struct range_node *best = NULL;
+
+ while (rb) {
+ struct range_node *rn = rb_to_range_node(rb);
+
+ if (len <= rn_size(rn)) {
+ best = rn;
+ rb = rb->rb_right;
+ } else {
+ rb = rb->rb_left;
+ }
+ }
+
+ return best;
+}
+
+s64 range_tree_find(struct range_tree *rt, u32 len)
+{
+ struct range_node *rn;
+
+ rn = __find_range(rt, len);
+ if (!rn)
+ return -ENOENT;
+ return rn->rn_start;
+}
+
+/* Insert the range into rbtree sorted by the range size */
+static inline void __range_size_insert(struct range_node *rn,
+ struct rb_root_cached *root)
+{
+ struct rb_node **link = &root->rb_root.rb_node, *rb = NULL;
+ u64 size = rn_size(rn);
+ bool leftmost = true;
+
+ while (*link) {
+ rb = *link;
+ if (size > rn_size(rb_to_range_node(rb))) {
+ link = &rb->rb_left;
+ } else {
+ link = &rb->rb_right;
+ leftmost = false;
+ }
+ }
+
+ rb_link_node(&rn->rb_range_size, rb, link);
+ rb_insert_color_cached(&rn->rb_range_size, root, leftmost);
+}
+
+#define START(node) ((node)->rn_start)
+#define LAST(node) ((node)->rn_last)
+
+INTERVAL_TREE_DEFINE(struct range_node, rn_rbnode, u32,
+ __rn_subtree_last, START, LAST,
+ static inline __maybe_unused,
+ __range_it)
+
+static inline __maybe_unused void
+range_it_insert(struct range_node *rn, struct range_tree *rt)
+{
+ __range_size_insert(rn, &rt->range_size_root);
+ __range_it_insert(rn, &rt->it_root);
+}
+
+static inline __maybe_unused void
+range_it_remove(struct range_node *rn, struct range_tree *rt)
+{
+ rb_erase_cached(&rn->rb_range_size, &rt->range_size_root);
+ RB_CLEAR_NODE(&rn->rb_range_size);
+ __range_it_remove(rn, &rt->it_root);
+}
+
+static inline __maybe_unused struct range_node *
+range_it_iter_first(struct range_tree *rt, u32 start, u32 last)
+{
+ return __range_it_iter_first(&rt->it_root, start, last);
+}
+
+/* Clear the range in this range tree */
+int range_tree_clear(struct range_tree *rt, u32 start, u32 len)
+{
+ u32 last = start + len - 1;
+ struct range_node *new_rn;
+ struct range_node *rn;
+
+ while ((rn = range_it_iter_first(rt, start, last))) {
+ if (rn->rn_start < start && rn->rn_last > last) {
+ u32 old_last = rn->rn_last;
+
+ /* Overlaps with the entire clearing range */
+ range_it_remove(rn, rt);
+ rn->rn_last = start - 1;
+ range_it_insert(rn, rt);
+
+ /* Add a range */
+ new_rn = kmalloc_nolock(sizeof(struct range_node), 0, NUMA_NO_NODE);
+ if (!new_rn)
+ return -ENOMEM;
+ new_rn->rn_start = last + 1;
+ new_rn->rn_last = old_last;
+ range_it_insert(new_rn, rt);
+ } else if (rn->rn_start < start) {
+ /* Overlaps with the left side of the clearing range */
+ range_it_remove(rn, rt);
+ rn->rn_last = start - 1;
+ range_it_insert(rn, rt);
+ } else if (rn->rn_last > last) {
+ /* Overlaps with the right side of the clearing range */
+ range_it_remove(rn, rt);
+ rn->rn_start = last + 1;
+ range_it_insert(rn, rt);
+ break;
+ } else {
+ /* in the middle of the clearing range */
+ range_it_remove(rn, rt);
+ kfree_nolock(rn);
+ }
+ }
+ return 0;
+}
+
+/* Is the whole range set ? */
+int is_range_tree_set(struct range_tree *rt, u32 start, u32 len)
+{
+ u32 last = start + len - 1;
+ struct range_node *left;
+
+ /* Is this whole range set ? */
+ left = range_it_iter_first(rt, start, last);
+ if (left && left->rn_start <= start && left->rn_last >= last)
+ return 0;
+ return -ESRCH;
+}
+
+/* Set the range in this range tree */
+int range_tree_set(struct range_tree *rt, u32 start, u32 len)
+{
+ u32 last = start + len - 1;
+ struct range_node *right;
+ struct range_node *left;
+ int err;
+
+ /* Is this whole range already set ? */
+ left = range_it_iter_first(rt, start, last);
+ if (left && left->rn_start <= start && left->rn_last >= last)
+ return 0;
+
+ /* Clear out everything in the range we want to set. */
+ err = range_tree_clear(rt, start, len);
+ if (err)
+ return err;
+
+ /* Do we have a left-adjacent range ? */
+ left = range_it_iter_first(rt, start - 1, start - 1);
+ if (left && left->rn_last + 1 != start)
+ return -EFAULT;
+
+ /* Do we have a right-adjacent range ? */
+ right = range_it_iter_first(rt, last + 1, last + 1);
+ if (right && right->rn_start != last + 1)
+ return -EFAULT;
+
+ if (left && right) {
+ /* Combine left and right adjacent ranges */
+ range_it_remove(left, rt);
+ range_it_remove(right, rt);
+ left->rn_last = right->rn_last;
+ range_it_insert(left, rt);
+ kfree_nolock(right);
+ } else if (left) {
+ /* Combine with the left range */
+ range_it_remove(left, rt);
+ left->rn_last = last;
+ range_it_insert(left, rt);
+ } else if (right) {
+ /* Combine with the right range */
+ range_it_remove(right, rt);
+ right->rn_start = start;
+ range_it_insert(right, rt);
+ } else {
+ left = kmalloc_nolock(sizeof(struct range_node), 0, NUMA_NO_NODE);
+ if (!left)
+ return -ENOMEM;
+ left->rn_start = start;
+ left->rn_last = last;
+ range_it_insert(left, rt);
+ }
+ return 0;
+}
+
+void range_tree_destroy(struct range_tree *rt)
+{
+ struct range_node *rn;
+
+ while ((rn = range_it_iter_first(rt, 0, -1U))) {
+ range_it_remove(rn, rt);
+ kfree_nolock(rn);
+ }
+}
+
+void range_tree_init(struct range_tree *rt)
+{
+ rt->it_root = RB_ROOT_CACHED;
+ rt->range_size_root = RB_ROOT_CACHED;
+}
diff --git a/kernel/bpf/range_tree.h b/kernel/bpf/range_tree.h
new file mode 100644
index 000000000000..ff0b9110eb71
--- /dev/null
+++ b/kernel/bpf/range_tree.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#ifndef _RANGE_TREE_H
+#define _RANGE_TREE_H 1
+
+struct range_tree {
+ /* root of interval tree */
+ struct rb_root_cached it_root;
+ /* root of rbtree of interval sizes */
+ struct rb_root_cached range_size_root;
+};
+
+void range_tree_init(struct range_tree *rt);
+void range_tree_destroy(struct range_tree *rt);
+
+int range_tree_clear(struct range_tree *rt, u32 start, u32 len);
+int range_tree_set(struct range_tree *rt, u32 start, u32 len);
+int is_range_tree_set(struct range_tree *rt, u32 start, u32 len);
+s64 range_tree_find(struct range_tree *rt, u32 len);
+
+#endif
diff --git a/kernel/bpf/relo_core.c b/kernel/bpf/relo_core.c
new file mode 100644
index 000000000000..aa822c9fcfde
--- /dev/null
+++ b/kernel/bpf/relo_core.c
@@ -0,0 +1,2 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+#include "../../tools/lib/bpf/relo_core.c"
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 556a769b5b80..49b8e5a0c6b4 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -6,6 +6,7 @@
#include <linux/err.h>
#include <linux/sock_diag.h>
#include <net/sock_reuseport.h>
+#include <linux/btf_ids.h>
struct reuseport_array {
struct bpf_map map;
@@ -20,14 +21,11 @@ static struct reuseport_array *reuseport_array(struct bpf_map *map)
/* The caller must hold the reuseport_lock */
void bpf_sk_reuseport_detach(struct sock *sk)
{
- uintptr_t sk_user_data;
+ struct sock __rcu **socks;
write_lock_bh(&sk->sk_callback_lock);
- sk_user_data = (uintptr_t)sk->sk_user_data;
- if (sk_user_data & SK_USER_DATA_BPF) {
- struct sock __rcu **socks;
-
- socks = (void *)(sk_user_data & SK_USER_DATA_PTRMASK);
+ socks = __locked_read_sk_user_data_with_flags(sk, SK_USER_DATA_BPF);
+ if (socks) {
WRITE_ONCE(sk->sk_user_data, NULL);
/*
* Do not move this NULL assignment outside of
@@ -61,7 +59,7 @@ static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key)
}
/* Called from syscall only */
-static int reuseport_array_delete_elem(struct bpf_map *map, void *key)
+static long reuseport_array_delete_elem(struct bpf_map *map, void *key)
{
struct reuseport_array *array = reuseport_array(map);
u32 index = *(u32 *)key;
@@ -143,7 +141,7 @@ static void reuseport_array_free(struct bpf_map *map)
/*
* Once reaching here, all sk->sk_user_data is not
- * referenceing this "array". "array" can be freed now.
+ * referencing this "array". "array" can be freed now.
*/
bpf_map_area_free(array);
}
@@ -153,9 +151,6 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
int numa_node = bpf_map_attr_numa_node(attr);
struct reuseport_array *array;
- if (!bpf_capable())
- return ERR_PTR(-EPERM);
-
/* allocate all map elements and zero-initialize them */
array = bpf_map_area_alloc(struct_size(array, ptrs, attr->max_entries), numa_node);
if (!array)
@@ -313,7 +308,7 @@ put_file_unlock:
spin_unlock_bh(&reuseport_lock);
put_file:
- fput(socket->file);
+ sockfd_put(socket);
return err;
}
@@ -337,7 +332,14 @@ static int reuseport_array_get_next_key(struct bpf_map *map, void *key,
return 0;
}
-static int reuseport_array_map_btf_id;
+static u64 reuseport_array_mem_usage(const struct bpf_map *map)
+{
+ struct reuseport_array *array;
+
+ return struct_size(array, ptrs, map->max_entries);
+}
+
+BTF_ID_LIST_SINGLE(reuseport_array_map_btf_ids, struct, reuseport_array)
const struct bpf_map_ops reuseport_array_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = reuseport_array_alloc_check,
@@ -346,6 +348,6 @@ const struct bpf_map_ops reuseport_array_ops = {
.map_lookup_elem = reuseport_array_lookup_elem,
.map_get_next_key = reuseport_array_get_next_key,
.map_delete_elem = reuseport_array_delete_elem,
- .map_btf_name = "reuseport_array",
- .map_btf_id = &reuseport_array_map_btf_id,
+ .map_mem_usage = reuseport_array_mem_usage,
+ .map_btf_id = &reuseport_array_map_btf_ids[0],
};
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 638d7fd7b375..f6a075ffac63 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -10,40 +10,71 @@
#include <linux/poll.h>
#include <linux/kmemleak.h>
#include <uapi/linux/btf.h>
+#include <linux/btf_ids.h>
+#include <asm/rqspinlock.h>
-#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE)
+#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE | BPF_F_RB_OVERWRITE)
/* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */
#define RINGBUF_PGOFF \
(offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT)
/* consumer page and producer page */
#define RINGBUF_POS_PAGES 2
+#define RINGBUF_NR_META_PAGES (RINGBUF_PGOFF + RINGBUF_POS_PAGES)
#define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4)
-/* Maximum size of ring buffer area is limited by 32-bit page offset within
- * record header, counted in pages. Reserve 8 bits for extensibility, and take
- * into account few extra pages for consumer/producer pages and
- * non-mmap()'able parts. This gives 64GB limit, which seems plenty for single
- * ring buffer.
- */
-#define RINGBUF_MAX_DATA_SZ \
- (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE)
-
struct bpf_ringbuf {
wait_queue_head_t waitq;
struct irq_work work;
u64 mask;
struct page **pages;
int nr_pages;
- spinlock_t spinlock ____cacheline_aligned_in_smp;
- /* Consumer and producer counters are put into separate pages to allow
- * mapping consumer page as r/w, but restrict producer page to r/o.
- * This protects producer position from being modified by user-space
- * application and ruining in-kernel position tracking.
+ bool overwrite_mode;
+ rqspinlock_t spinlock ____cacheline_aligned_in_smp;
+ /* For user-space producer ring buffers, an atomic_t busy bit is used
+ * to synchronize access to the ring buffers in the kernel, rather than
+ * the spinlock that is used for kernel-producer ring buffers. This is
+ * done because the ring buffer must hold a lock across a BPF program's
+ * callback:
+ *
+ * __bpf_user_ringbuf_peek() // lock acquired
+ * -> program callback_fn()
+ * -> __bpf_user_ringbuf_sample_release() // lock released
+ *
+ * It is unsafe and incorrect to hold an IRQ spinlock across what could
+ * be a long execution window, so we instead simply disallow concurrent
+ * access to the ring buffer by kernel consumers, and return -EBUSY from
+ * __bpf_user_ringbuf_peek() if the busy bit is held by another task.
+ */
+ atomic_t busy ____cacheline_aligned_in_smp;
+ /* Consumer and producer counters are put into separate pages to
+ * allow each position to be mapped with different permissions.
+ * This prevents a user-space application from modifying the
+ * position and ruining in-kernel tracking. The permissions of the
+ * pages depend on who is producing samples: user-space or the
+ * kernel. Note that the pending counter is placed in the same
+ * page as the producer, so that it shares the same cache line.
+ *
+ * Kernel-producer
+ * ---------------
+ * The producer position and data pages are mapped as r/o in
+ * userspace. For this approach, bits in the header of samples are
+ * used to signal to user-space, and to other producers, whether a
+ * sample is currently being written.
+ *
+ * User-space producer
+ * -------------------
+ * Only the page containing the consumer position is mapped r/o in
+ * user-space. User-space producers also use bits of the header to
+ * communicate to the kernel, but the kernel must carefully check and
+ * validate each sample to ensure that they're correctly formatted, and
+ * fully contained within the ring buffer.
*/
unsigned long consumer_pos __aligned(PAGE_SIZE);
unsigned long producer_pos __aligned(PAGE_SIZE);
+ unsigned long pending_pos;
+ unsigned long overwrite_pos; /* position after the last overwritten record */
char data[] __aligned(PAGE_SIZE);
};
@@ -62,7 +93,7 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
{
const gfp_t flags = GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL |
__GFP_NOWARN | __GFP_ZERO;
- int nr_meta_pages = RINGBUF_PGOFF + RINGBUF_POS_PAGES;
+ int nr_meta_pages = RINGBUF_NR_META_PAGES;
int nr_data_pages = data_sz >> PAGE_SHIFT;
int nr_pages = nr_meta_pages + nr_data_pages;
struct page **pages, *page;
@@ -104,7 +135,7 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
}
rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages,
- VM_ALLOC | VM_USERMAP, PAGE_KERNEL);
+ VM_MAP | VM_USERMAP, PAGE_KERNEL);
if (rb) {
kmemleak_not_leak(pages);
rb->pages = pages;
@@ -115,7 +146,7 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
err_free_pages:
for (i = 0; i < nr_pages; i++)
__free_page(pages[i]);
- kvfree(pages);
+ bpf_map_area_free(pages);
return NULL;
}
@@ -126,7 +157,18 @@ static void bpf_ringbuf_notify(struct irq_work *work)
wake_up_all(&rb->waitq);
}
-static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
+/* Maximum size of ring buffer area is limited by 32-bit page offset within
+ * record header, counted in pages. Reserve 8 bits for extensibility, and
+ * take into account few extra pages for consumer/producer pages and
+ * non-mmap()'able parts, the current maximum size would be:
+ *
+ * (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE)
+ *
+ * This gives 64GB limit, which seems plenty for single ring buffer. Now
+ * considering that the maximum value of data_sz is (4GB - 1), there
+ * will be no overflow, so just note the size limit in the comments.
+ */
+static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node, bool overwrite_mode)
{
struct bpf_ringbuf *rb;
@@ -134,44 +176,48 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
if (!rb)
return NULL;
- spin_lock_init(&rb->spinlock);
+ raw_res_spin_lock_init(&rb->spinlock);
+ atomic_set(&rb->busy, 0);
init_waitqueue_head(&rb->waitq);
init_irq_work(&rb->work, bpf_ringbuf_notify);
rb->mask = data_sz - 1;
rb->consumer_pos = 0;
rb->producer_pos = 0;
+ rb->pending_pos = 0;
+ rb->overwrite_mode = overwrite_mode;
return rb;
}
static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
{
+ bool overwrite_mode = false;
struct bpf_ringbuf_map *rb_map;
if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
+ if (attr->map_flags & BPF_F_RB_OVERWRITE) {
+ if (attr->map_type != BPF_MAP_TYPE_RINGBUF)
+ return ERR_PTR(-EINVAL);
+ overwrite_mode = true;
+ }
+
if (attr->key_size || attr->value_size ||
!is_power_of_2(attr->max_entries) ||
!PAGE_ALIGNED(attr->max_entries))
return ERR_PTR(-EINVAL);
-#ifdef CONFIG_64BIT
- /* on 32-bit arch, it's impossible to overflow record's hdr->pgoff */
- if (attr->max_entries > RINGBUF_MAX_DATA_SZ)
- return ERR_PTR(-E2BIG);
-#endif
-
- rb_map = kzalloc(sizeof(*rb_map), GFP_USER | __GFP_ACCOUNT);
+ rb_map = bpf_map_area_alloc(sizeof(*rb_map), NUMA_NO_NODE);
if (!rb_map)
return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&rb_map->map, attr);
- rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node);
+ rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node, overwrite_mode);
if (!rb_map->rb) {
- kfree(rb_map);
+ bpf_map_area_free(rb_map);
return ERR_PTR(-ENOMEM);
}
@@ -180,6 +226,8 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
static void bpf_ringbuf_free(struct bpf_ringbuf *rb)
{
+ irq_work_sync(&rb->work);
+
/* copy pages pointer and nr_pages to local variable, as we are going
* to unmap rb itself with vunmap() below
*/
@@ -189,7 +237,7 @@ static void bpf_ringbuf_free(struct bpf_ringbuf *rb)
vunmap(rb);
for (i = 0; i < nr_pages; i++)
__free_page(pages[i]);
- kvfree(pages);
+ bpf_map_area_free(pages);
}
static void ringbuf_map_free(struct bpf_map *map)
@@ -198,7 +246,7 @@ static void ringbuf_map_free(struct bpf_map *map)
rb_map = container_of(map, struct bpf_ringbuf_map, map);
bpf_ringbuf_free(rb_map->rb);
- kfree(rb_map);
+ bpf_map_area_free(rb_map);
}
static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key)
@@ -206,13 +254,13 @@ static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key)
return ERR_PTR(-ENOTSUPP);
}
-static int ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 flags)
+static long ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 flags)
{
return -ENOTSUPP;
}
-static int ringbuf_map_delete_elem(struct bpf_map *map, void *key)
+static long ringbuf_map_delete_elem(struct bpf_map *map, void *key)
{
return -ENOTSUPP;
}
@@ -223,7 +271,7 @@ static int ringbuf_map_get_next_key(struct bpf_map *map, void *key,
return -ENOTSUPP;
}
-static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
+static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma)
{
struct bpf_ringbuf_map *rb_map;
@@ -233,25 +281,59 @@ static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
/* allow writable mapping for the consumer_pos only */
if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE)
return -EPERM;
- } else {
- vma->vm_flags &= ~VM_MAYWRITE;
}
/* remap_vmalloc_range() checks size and offset constraints */
return remap_vmalloc_range(vma, rb_map->rb,
vma->vm_pgoff + RINGBUF_PGOFF);
}
+static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma)
+{
+ struct bpf_ringbuf_map *rb_map;
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
+
+ if (vma->vm_flags & VM_WRITE) {
+ if (vma->vm_pgoff == 0)
+ /* Disallow writable mappings to the consumer pointer,
+ * and allow writable mappings to both the producer
+ * position, and the ring buffer data itself.
+ */
+ return -EPERM;
+ }
+ /* remap_vmalloc_range() checks size and offset constraints */
+ return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF);
+}
+
+/*
+ * Return an estimate of the available data in the ring buffer.
+ * Note: the returned value can exceed the actual ring buffer size because the
+ * function is not synchronized with the producer. The producer acquires the
+ * ring buffer's spinlock, but this function does not.
+ */
static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
{
- unsigned long cons_pos, prod_pos;
+ unsigned long cons_pos, prod_pos, over_pos;
cons_pos = smp_load_acquire(&rb->consumer_pos);
- prod_pos = smp_load_acquire(&rb->producer_pos);
- return prod_pos - cons_pos;
+
+ if (unlikely(rb->overwrite_mode)) {
+ over_pos = smp_load_acquire(&rb->overwrite_pos);
+ prod_pos = smp_load_acquire(&rb->producer_pos);
+ return prod_pos - max(cons_pos, over_pos);
+ } else {
+ prod_pos = smp_load_acquire(&rb->producer_pos);
+ return prod_pos - cons_pos;
+ }
}
-static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp,
- struct poll_table_struct *pts)
+static u32 ringbuf_total_data_sz(const struct bpf_ringbuf *rb)
+{
+ return rb->mask + 1;
+}
+
+static __poll_t ringbuf_map_poll_kern(struct bpf_map *map, struct file *filp,
+ struct poll_table_struct *pts)
{
struct bpf_ringbuf_map *rb_map;
@@ -263,19 +345,62 @@ static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp,
return 0;
}
-static int ringbuf_map_btf_id;
+static __poll_t ringbuf_map_poll_user(struct bpf_map *map, struct file *filp,
+ struct poll_table_struct *pts)
+{
+ struct bpf_ringbuf_map *rb_map;
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
+ poll_wait(filp, &rb_map->rb->waitq, pts);
+
+ if (ringbuf_avail_data_sz(rb_map->rb) < ringbuf_total_data_sz(rb_map->rb))
+ return EPOLLOUT | EPOLLWRNORM;
+ return 0;
+}
+
+static u64 ringbuf_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_ringbuf *rb;
+ int nr_data_pages;
+ int nr_meta_pages;
+ u64 usage = sizeof(struct bpf_ringbuf_map);
+
+ rb = container_of(map, struct bpf_ringbuf_map, map)->rb;
+ usage += (u64)rb->nr_pages << PAGE_SHIFT;
+ nr_meta_pages = RINGBUF_NR_META_PAGES;
+ nr_data_pages = map->max_entries >> PAGE_SHIFT;
+ usage += (nr_meta_pages + 2 * nr_data_pages) * sizeof(struct page *);
+ return usage;
+}
+
+BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
const struct bpf_map_ops ringbuf_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc = ringbuf_map_alloc,
.map_free = ringbuf_map_free,
- .map_mmap = ringbuf_map_mmap,
- .map_poll = ringbuf_map_poll,
+ .map_mmap = ringbuf_map_mmap_kern,
+ .map_poll = ringbuf_map_poll_kern,
.map_lookup_elem = ringbuf_map_lookup_elem,
.map_update_elem = ringbuf_map_update_elem,
.map_delete_elem = ringbuf_map_delete_elem,
.map_get_next_key = ringbuf_map_get_next_key,
- .map_btf_name = "bpf_ringbuf_map",
- .map_btf_id = &ringbuf_map_btf_id,
+ .map_mem_usage = ringbuf_map_mem_usage,
+ .map_btf_id = &ringbuf_map_btf_ids[0],
+};
+
+BTF_ID_LIST_SINGLE(user_ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
+const struct bpf_map_ops user_ringbuf_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc = ringbuf_map_alloc,
+ .map_free = ringbuf_map_free,
+ .map_mmap = ringbuf_map_mmap_user,
+ .map_poll = ringbuf_map_poll_user,
+ .map_lookup_elem = ringbuf_map_lookup_elem,
+ .map_update_elem = ringbuf_map_update_elem,
+ .map_delete_elem = ringbuf_map_delete_elem,
+ .map_get_next_key = ringbuf_map_get_next_key,
+ .map_mem_usage = ringbuf_map_mem_usage,
+ .map_btf_id = &user_ringbuf_map_btf_ids[0],
};
/* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself,
@@ -302,39 +427,102 @@ bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr)
return (void*)((addr & PAGE_MASK) - off);
}
+static bool bpf_ringbuf_has_space(const struct bpf_ringbuf *rb,
+ unsigned long new_prod_pos,
+ unsigned long cons_pos,
+ unsigned long pend_pos)
+{
+ /*
+ * No space if oldest not yet committed record until the newest
+ * record span more than (ringbuf_size - 1).
+ */
+ if (new_prod_pos - pend_pos > rb->mask)
+ return false;
+
+ /* Ok, we have space in overwrite mode */
+ if (unlikely(rb->overwrite_mode))
+ return true;
+
+ /*
+ * No space if producer position advances more than (ringbuf_size - 1)
+ * ahead of consumer position when not in overwrite mode.
+ */
+ if (new_prod_pos - cons_pos > rb->mask)
+ return false;
+
+ return true;
+}
+
+static u32 bpf_ringbuf_round_up_hdr_len(u32 hdr_len)
+{
+ hdr_len &= ~BPF_RINGBUF_DISCARD_BIT;
+ return round_up(hdr_len + BPF_RINGBUF_HDR_SZ, 8);
+}
+
static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
{
- unsigned long cons_pos, prod_pos, new_prod_pos, flags;
- u32 len, pg_off;
+ unsigned long cons_pos, prod_pos, new_prod_pos, pend_pos, over_pos, flags;
struct bpf_ringbuf_hdr *hdr;
+ u32 len, pg_off, hdr_len;
if (unlikely(size > RINGBUF_MAX_RECORD_SZ))
return NULL;
len = round_up(size + BPF_RINGBUF_HDR_SZ, 8);
- if (len > rb->mask + 1)
+ if (len > ringbuf_total_data_sz(rb))
return NULL;
cons_pos = smp_load_acquire(&rb->consumer_pos);
- if (in_nmi()) {
- if (!spin_trylock_irqsave(&rb->spinlock, flags))
- return NULL;
- } else {
- spin_lock_irqsave(&rb->spinlock, flags);
- }
+ if (raw_res_spin_lock_irqsave(&rb->spinlock, flags))
+ return NULL;
+ pend_pos = rb->pending_pos;
prod_pos = rb->producer_pos;
new_prod_pos = prod_pos + len;
- /* check for out of ringbuf space by ensuring producer position
- * doesn't advance more than (ringbuf_size - 1) ahead
- */
- if (new_prod_pos - cons_pos > rb->mask) {
- spin_unlock_irqrestore(&rb->spinlock, flags);
+ while (pend_pos < prod_pos) {
+ hdr = (void *)rb->data + (pend_pos & rb->mask);
+ hdr_len = READ_ONCE(hdr->len);
+ if (hdr_len & BPF_RINGBUF_BUSY_BIT)
+ break;
+ pend_pos += bpf_ringbuf_round_up_hdr_len(hdr_len);
+ }
+ rb->pending_pos = pend_pos;
+
+ if (!bpf_ringbuf_has_space(rb, new_prod_pos, cons_pos, pend_pos)) {
+ raw_res_spin_unlock_irqrestore(&rb->spinlock, flags);
return NULL;
}
+ /*
+ * In overwrite mode, advance overwrite_pos when the ring buffer is full.
+ * The key points are to stay on record boundaries and consume enough records
+ * to fit the new one.
+ */
+ if (unlikely(rb->overwrite_mode)) {
+ over_pos = rb->overwrite_pos;
+ while (new_prod_pos - over_pos > rb->mask) {
+ hdr = (void *)rb->data + (over_pos & rb->mask);
+ hdr_len = READ_ONCE(hdr->len);
+ /*
+ * The bpf_ringbuf_has_space() check above ensures we won’t
+ * step over a record currently being worked on by another
+ * producer.
+ */
+ over_pos += bpf_ringbuf_round_up_hdr_len(hdr_len);
+ }
+ /*
+ * smp_store_release(&rb->producer_pos, new_prod_pos) at
+ * the end of the function ensures that when consumer sees
+ * the updated rb->producer_pos, it always sees the updated
+ * rb->overwrite_pos, so when consumer reads overwrite_pos
+ * after smp_load_acquire(r->producer_pos), the overwrite_pos
+ * will always be valid.
+ */
+ WRITE_ONCE(rb->overwrite_pos, over_pos);
+ }
+
hdr = (void *)rb->data + (prod_pos & rb->mask);
pg_off = bpf_ringbuf_rec_pg_off(rb, hdr);
hdr->len = size | BPF_RINGBUF_BUSY_BIT;
@@ -343,7 +531,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
/* pairs with consumer's smp_load_acquire() */
smp_store_release(&rb->producer_pos, new_prod_pos);
- spin_unlock_irqrestore(&rb->spinlock, flags);
+ raw_res_spin_unlock_irqrestore(&rb->spinlock, flags);
return (void *)hdr + BPF_RINGBUF_HDR_SZ;
}
@@ -361,7 +549,7 @@ BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags)
const struct bpf_func_proto bpf_ringbuf_reserve_proto = {
.func = bpf_ringbuf_reserve,
- .ret_type = RET_PTR_TO_ALLOC_MEM_OR_NULL,
+ .ret_type = RET_PTR_TO_RINGBUF_MEM_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_CONST_ALLOC_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
@@ -404,7 +592,7 @@ BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags)
const struct bpf_func_proto bpf_ringbuf_submit_proto = {
.func = bpf_ringbuf_submit,
.ret_type = RET_VOID,
- .arg1_type = ARG_PTR_TO_ALLOC_MEM,
+ .arg1_type = ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE,
.arg2_type = ARG_ANYTHING,
};
@@ -417,7 +605,7 @@ BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags)
const struct bpf_func_proto bpf_ringbuf_discard_proto = {
.func = bpf_ringbuf_discard,
.ret_type = RET_VOID,
- .arg1_type = ARG_PTR_TO_ALLOC_MEM,
+ .arg1_type = ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE,
.arg2_type = ARG_ANYTHING,
};
@@ -459,11 +647,13 @@ BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags)
case BPF_RB_AVAIL_DATA:
return ringbuf_avail_data_sz(rb);
case BPF_RB_RING_SIZE:
- return rb->mask + 1;
+ return ringbuf_total_data_sz(rb);
case BPF_RB_CONS_POS:
return smp_load_acquire(&rb->consumer_pos);
case BPF_RB_PROD_POS:
return smp_load_acquire(&rb->producer_pos);
+ case BPF_RB_OVERWRITE_POS:
+ return smp_load_acquire(&rb->overwrite_pos);
default:
return 0;
}
@@ -475,3 +665,215 @@ const struct bpf_func_proto bpf_ringbuf_query_proto = {
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_ANYTHING,
};
+
+BPF_CALL_4(bpf_ringbuf_reserve_dynptr, struct bpf_map *, map, u32, size, u64, flags,
+ struct bpf_dynptr_kern *, ptr)
+{
+ struct bpf_ringbuf_map *rb_map;
+ void *sample;
+ int err;
+
+ if (unlikely(flags)) {
+ bpf_dynptr_set_null(ptr);
+ return -EINVAL;
+ }
+
+ err = bpf_dynptr_check_size(size);
+ if (err) {
+ bpf_dynptr_set_null(ptr);
+ return err;
+ }
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
+
+ sample = __bpf_ringbuf_reserve(rb_map->rb, size);
+ if (!sample) {
+ bpf_dynptr_set_null(ptr);
+ return -EINVAL;
+ }
+
+ bpf_dynptr_init(ptr, sample, BPF_DYNPTR_TYPE_RINGBUF, 0, size);
+
+ return 0;
+}
+
+const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto = {
+ .func = bpf_ringbuf_reserve_dynptr,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT | MEM_WRITE,
+};
+
+BPF_CALL_2(bpf_ringbuf_submit_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags)
+{
+ if (!ptr->data)
+ return 0;
+
+ bpf_ringbuf_commit(ptr->data, flags, false /* discard */);
+
+ bpf_dynptr_set_null(ptr);
+
+ return 0;
+}
+
+const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto = {
+ .func = bpf_ringbuf_submit_dynptr,
+ .ret_type = RET_VOID,
+ .arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_ringbuf_discard_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags)
+{
+ if (!ptr->data)
+ return 0;
+
+ bpf_ringbuf_commit(ptr->data, flags, true /* discard */);
+
+ bpf_dynptr_set_null(ptr);
+
+ return 0;
+}
+
+const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto = {
+ .func = bpf_ringbuf_discard_dynptr,
+ .ret_type = RET_VOID,
+ .arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE,
+ .arg2_type = ARG_ANYTHING,
+};
+
+static int __bpf_user_ringbuf_peek(struct bpf_ringbuf *rb, void **sample, u32 *size)
+{
+ int err;
+ u32 hdr_len, sample_len, total_len, flags, *hdr;
+ u64 cons_pos, prod_pos;
+
+ /* Synchronizes with smp_store_release() in user-space producer. */
+ prod_pos = smp_load_acquire(&rb->producer_pos);
+ if (prod_pos % 8)
+ return -EINVAL;
+
+ /* Synchronizes with smp_store_release() in __bpf_user_ringbuf_sample_release() */
+ cons_pos = smp_load_acquire(&rb->consumer_pos);
+ if (cons_pos >= prod_pos)
+ return -ENODATA;
+
+ hdr = (u32 *)((uintptr_t)rb->data + (uintptr_t)(cons_pos & rb->mask));
+ /* Synchronizes with smp_store_release() in user-space producer. */
+ hdr_len = smp_load_acquire(hdr);
+ flags = hdr_len & (BPF_RINGBUF_BUSY_BIT | BPF_RINGBUF_DISCARD_BIT);
+ sample_len = hdr_len & ~flags;
+ total_len = round_up(sample_len + BPF_RINGBUF_HDR_SZ, 8);
+
+ /* The sample must fit within the region advertised by the producer position. */
+ if (total_len > prod_pos - cons_pos)
+ return -EINVAL;
+
+ /* The sample must fit within the data region of the ring buffer. */
+ if (total_len > ringbuf_total_data_sz(rb))
+ return -E2BIG;
+
+ /* The sample must fit into a struct bpf_dynptr. */
+ err = bpf_dynptr_check_size(sample_len);
+ if (err)
+ return -E2BIG;
+
+ if (flags & BPF_RINGBUF_DISCARD_BIT) {
+ /* If the discard bit is set, the sample should be skipped.
+ *
+ * Update the consumer pos, and return -EAGAIN so the caller
+ * knows to skip this sample and try to read the next one.
+ */
+ smp_store_release(&rb->consumer_pos, cons_pos + total_len);
+ return -EAGAIN;
+ }
+
+ if (flags & BPF_RINGBUF_BUSY_BIT)
+ return -ENODATA;
+
+ *sample = (void *)((uintptr_t)rb->data +
+ (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask));
+ *size = sample_len;
+ return 0;
+}
+
+static void __bpf_user_ringbuf_sample_release(struct bpf_ringbuf *rb, size_t size, u64 flags)
+{
+ u64 consumer_pos;
+ u32 rounded_size = round_up(size + BPF_RINGBUF_HDR_SZ, 8);
+
+ /* Using smp_load_acquire() is unnecessary here, as the busy-bit
+ * prevents another task from writing to consumer_pos after it was read
+ * by this task with smp_load_acquire() in __bpf_user_ringbuf_peek().
+ */
+ consumer_pos = rb->consumer_pos;
+ /* Synchronizes with smp_load_acquire() in user-space producer. */
+ smp_store_release(&rb->consumer_pos, consumer_pos + rounded_size);
+}
+
+BPF_CALL_4(bpf_user_ringbuf_drain, struct bpf_map *, map,
+ void *, callback_fn, void *, callback_ctx, u64, flags)
+{
+ struct bpf_ringbuf *rb;
+ long samples, discarded_samples = 0, ret = 0;
+ bpf_callback_t callback = (bpf_callback_t)callback_fn;
+ u64 wakeup_flags = BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP;
+ int busy = 0;
+
+ if (unlikely(flags & ~wakeup_flags))
+ return -EINVAL;
+
+ rb = container_of(map, struct bpf_ringbuf_map, map)->rb;
+
+ /* If another consumer is already consuming a sample, wait for them to finish. */
+ if (!atomic_try_cmpxchg(&rb->busy, &busy, 1))
+ return -EBUSY;
+
+ for (samples = 0; samples < BPF_MAX_USER_RINGBUF_SAMPLES && ret == 0; samples++) {
+ int err;
+ u32 size;
+ void *sample;
+ struct bpf_dynptr_kern dynptr;
+
+ err = __bpf_user_ringbuf_peek(rb, &sample, &size);
+ if (err) {
+ if (err == -ENODATA) {
+ break;
+ } else if (err == -EAGAIN) {
+ discarded_samples++;
+ continue;
+ } else {
+ ret = err;
+ goto schedule_work_return;
+ }
+ }
+
+ bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size);
+ ret = callback((uintptr_t)&dynptr, (uintptr_t)callback_ctx, 0, 0, 0);
+ __bpf_user_ringbuf_sample_release(rb, size, flags);
+ }
+ ret = samples - discarded_samples;
+
+schedule_work_return:
+ /* Prevent the clearing of the busy-bit from being reordered before the
+ * storing of any rb consumer or producer positions.
+ */
+ atomic_set_release(&rb->busy, 0);
+
+ if (flags & BPF_RB_FORCE_WAKEUP)
+ irq_work_queue(&rb->work);
+ else if (!(flags & BPF_RB_NO_WAKEUP) && samples > 0)
+ irq_work_queue(&rb->work);
+ return ret;
+}
+
+const struct bpf_func_proto bpf_user_ringbuf_drain_proto = {
+ .func = bpf_user_ringbuf_drain,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_FUNC,
+ .arg3_type = ARG_PTR_TO_STACK_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
new file mode 100644
index 000000000000..f7d0c8d4644e
--- /dev/null
+++ b/kernel/bpf/rqspinlock.c
@@ -0,0 +1,762 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Resilient Queued Spin Lock
+ *
+ * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
+ * (C) Copyright 2013-2014,2018 Red Hat, Inc.
+ * (C) Copyright 2015 Intel Corp.
+ * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
+ * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates.
+ *
+ * Authors: Waiman Long <longman@redhat.com>
+ * Peter Zijlstra <peterz@infradead.org>
+ * Kumar Kartikeya Dwivedi <memxor@gmail.com>
+ */
+
+#include <linux/smp.h>
+#include <linux/bug.h>
+#include <linux/bpf.h>
+#include <linux/err.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/mutex.h>
+#include <linux/prefetch.h>
+#include <asm/byteorder.h>
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include <asm/qspinlock.h>
+#endif
+#include <trace/events/lock.h>
+#include <asm/rqspinlock.h>
+#include <linux/timekeeping.h>
+
+/*
+ * Include queued spinlock definitions and statistics code
+ */
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include "../locking/qspinlock.h"
+#include "../locking/lock_events.h"
+#include "rqspinlock.h"
+#include "../locking/mcs_spinlock.h"
+#endif
+
+/*
+ * The basic principle of a queue-based spinlock can best be understood
+ * by studying a classic queue-based spinlock implementation called the
+ * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable
+ * Synchronization on Shared-Memory Multiprocessors by Mellor-Crummey and
+ * Scott") is available at
+ *
+ * https://bugzilla.kernel.org/show_bug.cgi?id=206115
+ *
+ * This queued spinlock implementation is based on the MCS lock, however to
+ * make it fit the 4 bytes we assume spinlock_t to be, and preserve its
+ * existing API, we must modify it somehow.
+ *
+ * In particular; where the traditional MCS lock consists of a tail pointer
+ * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
+ * unlock the next pending (next->locked), we compress both these: {tail,
+ * next->locked} into a single u32 value.
+ *
+ * Since a spinlock disables recursion of its own context and there is a limit
+ * to the contexts that can nest; namely: task, softirq, hardirq, nmi. As there
+ * are at most 4 nesting levels, it can be encoded by a 2-bit number. Now
+ * we can encode the tail by combining the 2-bit nesting level with the cpu
+ * number. With one byte for the lock value and 3 bytes for the tail, only a
+ * 32-bit word is now needed. Even though we only need 1 bit for the lock,
+ * we extend it to a full byte to achieve better performance for architectures
+ * that support atomic byte write.
+ *
+ * We also change the first spinner to spin on the lock bit instead of its
+ * node; whereby avoiding the need to carry a node from lock to unlock, and
+ * preserving existing lock API. This also makes the unlock code simpler and
+ * faster.
+ *
+ * N.B. The current implementation only supports architectures that allow
+ * atomic operations on smaller 8-bit and 16-bit data types.
+ *
+ */
+
+struct rqspinlock_timeout {
+ u64 timeout_end;
+ u64 duration;
+ u64 cur;
+ u16 spin;
+};
+
+#define RES_TIMEOUT_VAL 2
+
+DEFINE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks);
+EXPORT_SYMBOL_GPL(rqspinlock_held_locks);
+
+static bool is_lock_released(rqspinlock_t *lock, u32 mask)
+{
+ if (!(atomic_read_acquire(&lock->val) & (mask)))
+ return true;
+ return false;
+}
+
+static noinline int check_deadlock_AA(rqspinlock_t *lock)
+{
+ struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
+ int cnt = min(RES_NR_HELD, rqh->cnt);
+
+ /*
+ * Return an error if we hold the lock we are attempting to acquire.
+ * We'll iterate over max 32 locks; no need to do is_lock_released.
+ */
+ for (int i = 0; i < cnt - 1; i++) {
+ if (rqh->locks[i] == lock)
+ return -EDEADLK;
+ }
+ return 0;
+}
+
+/*
+ * This focuses on the most common case of ABBA deadlocks (or ABBA involving
+ * more locks, which reduce to ABBA). This is not exhaustive, and we rely on
+ * timeouts as the final line of defense.
+ */
+static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask)
+{
+ struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
+ int rqh_cnt = min(RES_NR_HELD, rqh->cnt);
+ void *remote_lock;
+ int cpu;
+
+ /*
+ * Find the CPU holding the lock that we want to acquire. If there is a
+ * deadlock scenario, we will read a stable set on the remote CPU and
+ * find the target. This would be a constant time operation instead of
+ * O(NR_CPUS) if we could determine the owning CPU from a lock value, but
+ * that requires increasing the size of the lock word.
+ */
+ for_each_possible_cpu(cpu) {
+ struct rqspinlock_held *rqh_cpu = per_cpu_ptr(&rqspinlock_held_locks, cpu);
+ int real_cnt = READ_ONCE(rqh_cpu->cnt);
+ int cnt = min(RES_NR_HELD, real_cnt);
+
+ /*
+ * Let's ensure to break out of this loop if the lock is available for
+ * us to potentially acquire.
+ */
+ if (is_lock_released(lock, mask))
+ return 0;
+
+ /*
+ * Skip ourselves, and CPUs whose count is less than 2, as they need at
+ * least one held lock and one acquisition attempt (reflected as top
+ * most entry) to participate in an ABBA deadlock.
+ *
+ * If cnt is more than RES_NR_HELD, it means the current lock being
+ * acquired won't appear in the table, and other locks in the table are
+ * already held, so we can't determine ABBA.
+ */
+ if (cpu == smp_processor_id() || real_cnt < 2 || real_cnt > RES_NR_HELD)
+ continue;
+
+ /*
+ * Obtain the entry at the top, this corresponds to the lock the
+ * remote CPU is attempting to acquire in a deadlock situation,
+ * and would be one of the locks we hold on the current CPU.
+ */
+ remote_lock = READ_ONCE(rqh_cpu->locks[cnt - 1]);
+ /*
+ * If it is NULL, we've raced and cannot determine a deadlock
+ * conclusively, skip this CPU.
+ */
+ if (!remote_lock)
+ continue;
+ /*
+ * Find if the lock we're attempting to acquire is held by this CPU.
+ * Don't consider the topmost entry, as that must be the latest lock
+ * being held or acquired. For a deadlock, the target CPU must also
+ * attempt to acquire a lock we hold, so for this search only 'cnt - 1'
+ * entries are important.
+ */
+ for (int i = 0; i < cnt - 1; i++) {
+ if (READ_ONCE(rqh_cpu->locks[i]) != lock)
+ continue;
+ /*
+ * We found our lock as held on the remote CPU. Is the
+ * acquisition attempt on the remote CPU for a lock held
+ * by us? If so, we have a deadlock situation, and need
+ * to recover.
+ */
+ for (int i = 0; i < rqh_cnt - 1; i++) {
+ if (rqh->locks[i] == remote_lock)
+ return -EDEADLK;
+ }
+ /*
+ * Inconclusive; retry again later.
+ */
+ return 0;
+ }
+ }
+ return 0;
+}
+
+static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
+ struct rqspinlock_timeout *ts)
+{
+ u64 prev = ts->cur;
+ u64 time;
+
+ if (!ts->timeout_end) {
+ if (check_deadlock_AA(lock))
+ return -EDEADLK;
+ ts->cur = ktime_get_mono_fast_ns();
+ ts->timeout_end = ts->cur + ts->duration;
+ return 0;
+ }
+
+ time = ktime_get_mono_fast_ns();
+ if (time > ts->timeout_end)
+ return -ETIMEDOUT;
+
+ /*
+ * A millisecond interval passed from last time? Trigger deadlock
+ * checks.
+ */
+ if (prev + NSEC_PER_MSEC < time) {
+ ts->cur = time;
+ return check_deadlock_ABBA(lock, mask);
+ }
+
+ return 0;
+}
+
+/*
+ * Do not amortize with spins when res_smp_cond_load_acquire is defined,
+ * as the macro does internal amortization for us.
+ */
+#ifndef res_smp_cond_load_acquire
+#define RES_CHECK_TIMEOUT(ts, ret, mask) \
+ ({ \
+ if (!(ts).spin++) \
+ (ret) = check_timeout((lock), (mask), &(ts)); \
+ (ret); \
+ })
+#else
+#define RES_CHECK_TIMEOUT(ts, ret, mask) \
+ ({ (ret) = check_timeout((lock), (mask), &(ts)); })
+#endif
+
+/*
+ * Initialize the 'spin' member.
+ * Set spin member to 0 to trigger AA/ABBA checks immediately.
+ */
+#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; })
+
+/*
+ * We only need to reset 'timeout_end', 'spin' will just wrap around as necessary.
+ * Duration is defined for each spin attempt, so set it here.
+ */
+#define RES_RESET_TIMEOUT(ts, _duration) ({ (ts).timeout_end = 0; (ts).duration = _duration; })
+
+/*
+ * Provide a test-and-set fallback for cases when queued spin lock support is
+ * absent from the architecture.
+ */
+int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock)
+{
+ struct rqspinlock_timeout ts;
+ int val, ret = 0;
+
+ RES_INIT_TIMEOUT(ts);
+ /*
+ * The fast path is not invoked for the TAS fallback, so we must grab
+ * the deadlock detection entry here.
+ */
+ grab_held_lock_entry(lock);
+
+ /*
+ * Since the waiting loop's time is dependent on the amount of
+ * contention, a short timeout unlike rqspinlock waiting loops
+ * isn't enough. Choose a second as the timeout value.
+ */
+ RES_RESET_TIMEOUT(ts, NSEC_PER_SEC);
+retry:
+ val = atomic_read(&lock->val);
+
+ if (val || !atomic_try_cmpxchg(&lock->val, &val, 1)) {
+ if (RES_CHECK_TIMEOUT(ts, ret, ~0u))
+ goto out;
+ cpu_relax();
+ goto retry;
+ }
+
+ return 0;
+out:
+ release_held_lock_entry();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(resilient_tas_spin_lock);
+
+#ifdef CONFIG_QUEUED_SPINLOCKS
+
+/*
+ * Per-CPU queue node structures; we can never have more than 4 nested
+ * contexts: task, softirq, hardirq, nmi.
+ *
+ * Exactly fits one 64-byte cacheline on a 64-bit architecture.
+ */
+static DEFINE_PER_CPU_ALIGNED(struct qnode, rqnodes[_Q_MAX_NODES]);
+
+#ifndef res_smp_cond_load_acquire
+#define res_smp_cond_load_acquire(v, c) smp_cond_load_acquire(v, c)
+#endif
+
+#define res_atomic_cond_read_acquire(v, c) res_smp_cond_load_acquire(&(v)->counter, (c))
+
+/**
+ * resilient_queued_spin_lock_slowpath - acquire the queued spinlock
+ * @lock: Pointer to queued spinlock structure
+ * @val: Current value of the queued spinlock 32-bit word
+ *
+ * Return:
+ * * 0 - Lock was acquired successfully.
+ * * -EDEADLK - Lock acquisition failed because of AA/ABBA deadlock.
+ * * -ETIMEDOUT - Lock acquisition failed because of timeout.
+ *
+ * (queue tail, pending bit, lock value)
+ *
+ * fast : slow : unlock
+ * : :
+ * uncontended (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0)
+ * : | ^--------.------. / :
+ * : v \ \ | :
+ * pending : (0,1,1) +--> (0,1,0) \ | :
+ * : | ^--' | | :
+ * : v | | :
+ * uncontended : (n,x,y) +--> (n,0,0) --' | :
+ * queue : | ^--' | :
+ * : v | :
+ * contended : (*,x,y) +--> (*,0,0) ---> (*,0,1) -' :
+ * queue : ^--' :
+ */
+int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
+{
+ struct mcs_spinlock *prev, *next, *node;
+ struct rqspinlock_timeout ts;
+ int idx, ret = 0;
+ u32 old, tail;
+
+ BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
+
+ if (resilient_virt_spin_lock_enabled())
+ return resilient_virt_spin_lock(lock);
+
+ RES_INIT_TIMEOUT(ts);
+
+ /*
+ * Wait for in-progress pending->locked hand-overs with a bounded
+ * number of spins so that we guarantee forward progress.
+ *
+ * 0,1,0 -> 0,0,1
+ */
+ if (val == _Q_PENDING_VAL) {
+ int cnt = _Q_PENDING_LOOPS;
+ val = atomic_cond_read_relaxed(&lock->val,
+ (VAL != _Q_PENDING_VAL) || !cnt--);
+ }
+
+ /*
+ * If we observe any contention; queue.
+ */
+ if (val & ~_Q_LOCKED_MASK)
+ goto queue;
+
+ /*
+ * trylock || pending
+ *
+ * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
+ */
+ val = queued_fetch_set_pending_acquire(lock);
+
+ /*
+ * If we observe contention, there is a concurrent locker.
+ *
+ * Undo and queue; our setting of PENDING might have made the
+ * n,0,0 -> 0,0,0 transition fail and it will now be waiting
+ * on @next to become !NULL.
+ */
+ if (unlikely(val & ~_Q_LOCKED_MASK)) {
+
+ /* Undo PENDING if we set it. */
+ if (!(val & _Q_PENDING_MASK))
+ clear_pending(lock);
+
+ goto queue;
+ }
+
+ /* Deadlock detection entry already held after failing fast path. */
+
+ /*
+ * We're pending, wait for the owner to go away.
+ *
+ * 0,1,1 -> *,1,0
+ *
+ * this wait loop must be a load-acquire such that we match the
+ * store-release that clears the locked bit and create lock
+ * sequentiality; this is because not all
+ * clear_pending_set_locked() implementations imply full
+ * barriers.
+ */
+ if (val & _Q_LOCKED_MASK) {
+ RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
+ res_smp_cond_load_acquire(&lock->locked, !VAL || RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_MASK));
+ }
+
+ if (ret) {
+ /*
+ * We waited for the locked bit to go back to 0, as the pending
+ * waiter, but timed out. We need to clear the pending bit since
+ * we own it. Once a stuck owner has been recovered, the lock
+ * must be restored to a valid state, hence removing the pending
+ * bit is necessary.
+ *
+ * *,1,* -> *,0,*
+ */
+ clear_pending(lock);
+ lockevent_inc(rqspinlock_lock_timeout);
+ goto err_release_entry;
+ }
+
+ /*
+ * take ownership and clear the pending bit.
+ *
+ * 0,1,0 -> 0,0,1
+ */
+ clear_pending_set_locked(lock);
+ lockevent_inc(lock_pending);
+ return 0;
+
+ /*
+ * End of pending bit optimistic spinning and beginning of MCS
+ * queuing.
+ */
+queue:
+ /*
+ * Do not queue if we're a waiter and someone is attempting this lock on
+ * the same CPU. In case of NMIs, this prevents long timeouts where we
+ * interrupt the pending waiter, and the owner, that will eventually
+ * signal the head of our queue, both of which are logically but not
+ * physically part of the queue, hence outside the scope of the idx > 0
+ * check above for the trylock fallback.
+ */
+ if (check_deadlock_AA(lock)) {
+ ret = -EDEADLK;
+ goto err_release_entry;
+ }
+
+ lockevent_inc(lock_slowpath);
+ /* Deadlock detection entry already held after failing fast path. */
+ node = this_cpu_ptr(&rqnodes[0].mcs);
+ idx = node->count++;
+ tail = encode_tail(smp_processor_id(), idx);
+
+ trace_contention_begin(lock, LCB_F_SPIN);
+
+ /*
+ * 4 nodes are allocated based on the assumption that there will
+ * not be nested NMIs taking spinlocks. That may not be true in
+ * some architectures even though the chance of needing more than
+ * 4 nodes will still be extremely unlikely. When that happens,
+ * we fall back to attempting a trylock operation without using
+ * any MCS node. Unlike qspinlock which cannot fail, we have the
+ * option of failing the slow path, and under contention, such a
+ * trylock spinning will likely be treated unfairly due to lack of
+ * queueing, hence do not spin.
+ */
+ if (unlikely(idx >= _Q_MAX_NODES || (in_nmi() && idx > 0))) {
+ lockevent_inc(lock_no_node);
+ if (!queued_spin_trylock(lock)) {
+ ret = -EDEADLK;
+ goto err_release_node;
+ }
+ goto release;
+ }
+
+ node = grab_mcs_node(node, idx);
+
+ /*
+ * Keep counts of non-zero index values:
+ */
+ lockevent_cond_inc(lock_use_node2 + idx - 1, idx);
+
+ /*
+ * Ensure that we increment the head node->count before initialising
+ * the actual node. If the compiler is kind enough to reorder these
+ * stores, then an IRQ could overwrite our assignments.
+ */
+ barrier();
+
+ node->locked = 0;
+ node->next = NULL;
+
+ /*
+ * We touched a (possibly) cold cacheline in the per-cpu queue node;
+ * attempt the trylock once more in the hope someone let go while we
+ * weren't watching.
+ */
+ if (queued_spin_trylock(lock))
+ goto release;
+
+ /*
+ * Ensure that the initialisation of @node is complete before we
+ * publish the updated tail via xchg_tail() and potentially link
+ * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
+ */
+ smp_wmb();
+
+ /*
+ * Publish the updated tail.
+ * We have already touched the queueing cacheline; don't bother with
+ * pending stuff.
+ *
+ * p,*,* -> n,*,*
+ */
+ old = xchg_tail(lock, tail);
+ next = NULL;
+
+ /*
+ * if there was a previous node; link it and wait until reaching the
+ * head of the waitqueue.
+ */
+ if (old & _Q_TAIL_MASK) {
+ int val;
+
+ prev = decode_tail(old, rqnodes);
+
+ /* Link @node into the waitqueue. */
+ WRITE_ONCE(prev->next, node);
+
+ val = arch_mcs_spin_lock_contended(&node->locked);
+ if (val == RES_TIMEOUT_VAL) {
+ ret = -ETIMEDOUT;
+ goto waitq_timeout;
+ }
+
+ /*
+ * While waiting for the MCS lock, the next pointer may have
+ * been set by another lock waiter. We optimistically load
+ * the next pointer & prefetch the cacheline for writing
+ * to reduce latency in the upcoming MCS unlock operation.
+ */
+ next = READ_ONCE(node->next);
+ if (next)
+ prefetchw(next);
+ }
+
+ /*
+ * we're at the head of the waitqueue, wait for the owner & pending to
+ * go away.
+ *
+ * *,x,y -> *,0,0
+ *
+ * this wait loop must use a load-acquire such that we match the
+ * store-release that clears the locked bit and create lock
+ * sequentiality; this is because the set_locked() function below
+ * does not imply a full barrier.
+ *
+ * We use RES_DEF_TIMEOUT * 2 as the duration, as RES_DEF_TIMEOUT is
+ * meant to span maximum allowed time per critical section, and we may
+ * have both the owner of the lock and the pending bit waiter ahead of
+ * us.
+ */
+ RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT * 2);
+ val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) ||
+ RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK));
+
+ /* Disable queue destruction when we detect deadlocks. */
+ if (ret == -EDEADLK) {
+ if (!next)
+ next = smp_cond_load_relaxed(&node->next, (VAL));
+ arch_mcs_spin_unlock_contended(&next->locked);
+ goto err_release_node;
+ }
+
+waitq_timeout:
+ if (ret) {
+ /*
+ * If the tail is still pointing to us, then we are the final waiter,
+ * and are responsible for resetting the tail back to 0. Otherwise, if
+ * the cmpxchg operation fails, we signal the next waiter to take exit
+ * and try the same. For a waiter with tail node 'n':
+ *
+ * n,*,* -> 0,*,*
+ *
+ * When performing cmpxchg for the whole word (NR_CPUS > 16k), it is
+ * possible locked/pending bits keep changing and we see failures even
+ * when we remain the head of wait queue. However, eventually,
+ * pending bit owner will unset the pending bit, and new waiters
+ * will queue behind us. This will leave the lock owner in
+ * charge, and it will eventually either set locked bit to 0, or
+ * leave it as 1, allowing us to make progress.
+ *
+ * We terminate the whole wait queue for two reasons. Firstly,
+ * we eschew per-waiter timeouts with one applied at the head of
+ * the wait queue. This allows everyone to break out faster
+ * once we've seen the owner / pending waiter not responding for
+ * the timeout duration from the head. Secondly, it avoids
+ * complicated synchronization, because when not leaving in FIFO
+ * order, prev's next pointer needs to be fixed up etc.
+ */
+ if (!try_cmpxchg_tail(lock, tail, 0)) {
+ next = smp_cond_load_relaxed(&node->next, VAL);
+ WRITE_ONCE(next->locked, RES_TIMEOUT_VAL);
+ }
+ lockevent_inc(rqspinlock_lock_timeout);
+ goto err_release_node;
+ }
+
+ /*
+ * claim the lock:
+ *
+ * n,0,0 -> 0,0,1 : lock, uncontended
+ * *,*,0 -> *,*,1 : lock, contended
+ *
+ * If the queue head is the only one in the queue (lock value == tail)
+ * and nobody is pending, clear the tail code and grab the lock.
+ * Otherwise, we only need to grab the lock.
+ */
+
+ /*
+ * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
+ * above wait condition, therefore any concurrent setting of
+ * PENDING will make the uncontended transition fail.
+ */
+ if ((val & _Q_TAIL_MASK) == tail) {
+ if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
+ goto release; /* No contention */
+ }
+
+ /*
+ * Either somebody is queued behind us or _Q_PENDING_VAL got set
+ * which will then detect the remaining tail and queue behind us
+ * ensuring we'll see a @next.
+ */
+ set_locked(lock);
+
+ /*
+ * contended path; wait for next if not observed yet, release.
+ */
+ if (!next)
+ next = smp_cond_load_relaxed(&node->next, (VAL));
+
+ arch_mcs_spin_unlock_contended(&next->locked);
+
+release:
+ trace_contention_end(lock, 0);
+
+ /*
+ * release the node
+ */
+ __this_cpu_dec(rqnodes[0].mcs.count);
+ return ret;
+err_release_node:
+ trace_contention_end(lock, ret);
+ __this_cpu_dec(rqnodes[0].mcs.count);
+err_release_entry:
+ release_held_lock_entry();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(resilient_queued_spin_lock_slowpath);
+
+#endif /* CONFIG_QUEUED_SPINLOCKS */
+
+__bpf_kfunc_start_defs();
+
+static void bpf_prog_report_rqspinlock_violation(const char *str, void *lock, bool irqsave)
+{
+ struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
+ struct bpf_stream_stage ss;
+ struct bpf_prog *prog;
+
+ prog = bpf_prog_find_from_stack();
+ if (!prog)
+ return;
+ bpf_stream_stage(ss, prog, BPF_STDERR, ({
+ bpf_stream_printk(ss, "ERROR: %s for bpf_res_spin_lock%s\n", str, irqsave ? "_irqsave" : "");
+ bpf_stream_printk(ss, "Attempted lock = 0x%px\n", lock);
+ bpf_stream_printk(ss, "Total held locks = %d\n", rqh->cnt);
+ for (int i = 0; i < min(RES_NR_HELD, rqh->cnt); i++)
+ bpf_stream_printk(ss, "Held lock[%2d] = 0x%px\n", i, rqh->locks[i]);
+ bpf_stream_dump_stack(ss);
+ }));
+}
+
+#define REPORT_STR(ret) ({ (ret) == -ETIMEDOUT ? "Timeout detected" : "AA or ABBA deadlock detected"; })
+
+__bpf_kfunc int bpf_res_spin_lock(struct bpf_res_spin_lock *lock)
+{
+ int ret;
+
+ BUILD_BUG_ON(sizeof(rqspinlock_t) != sizeof(struct bpf_res_spin_lock));
+ BUILD_BUG_ON(__alignof__(rqspinlock_t) != __alignof__(struct bpf_res_spin_lock));
+
+ preempt_disable();
+ ret = res_spin_lock((rqspinlock_t *)lock);
+ if (unlikely(ret)) {
+ bpf_prog_report_rqspinlock_violation(REPORT_STR(ret), lock, false);
+ preempt_enable();
+ return ret;
+ }
+ return 0;
+}
+
+__bpf_kfunc void bpf_res_spin_unlock(struct bpf_res_spin_lock *lock)
+{
+ res_spin_unlock((rqspinlock_t *)lock);
+ preempt_enable();
+}
+
+__bpf_kfunc int bpf_res_spin_lock_irqsave(struct bpf_res_spin_lock *lock, unsigned long *flags__irq_flag)
+{
+ u64 *ptr = (u64 *)flags__irq_flag;
+ unsigned long flags;
+ int ret;
+
+ preempt_disable();
+ local_irq_save(flags);
+ ret = res_spin_lock((rqspinlock_t *)lock);
+ if (unlikely(ret)) {
+ bpf_prog_report_rqspinlock_violation(REPORT_STR(ret), lock, true);
+ local_irq_restore(flags);
+ preempt_enable();
+ return ret;
+ }
+ *ptr = flags;
+ return 0;
+}
+
+__bpf_kfunc void bpf_res_spin_unlock_irqrestore(struct bpf_res_spin_lock *lock, unsigned long *flags__irq_flag)
+{
+ u64 *ptr = (u64 *)flags__irq_flag;
+ unsigned long flags = *ptr;
+
+ res_spin_unlock((rqspinlock_t *)lock);
+ local_irq_restore(flags);
+ preempt_enable();
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(rqspinlock_kfunc_ids)
+BTF_ID_FLAGS(func, bpf_res_spin_lock, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_res_spin_unlock)
+BTF_ID_FLAGS(func, bpf_res_spin_lock_irqsave, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_res_spin_unlock_irqrestore)
+BTF_KFUNCS_END(rqspinlock_kfunc_ids)
+
+static const struct btf_kfunc_id_set rqspinlock_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &rqspinlock_kfunc_ids,
+};
+
+static __init int rqspinlock_register_kfuncs(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &rqspinlock_kfunc_set);
+}
+late_initcall(rqspinlock_register_kfuncs);
diff --git a/kernel/bpf/rqspinlock.h b/kernel/bpf/rqspinlock.h
new file mode 100644
index 000000000000..5d8cb1b1aab4
--- /dev/null
+++ b/kernel/bpf/rqspinlock.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Resilient Queued Spin Lock defines
+ *
+ * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates.
+ *
+ * Authors: Kumar Kartikeya Dwivedi <memxor@gmail.com>
+ */
+#ifndef __LINUX_RQSPINLOCK_H
+#define __LINUX_RQSPINLOCK_H
+
+#include "../locking/qspinlock.h"
+
+/*
+ * try_cmpxchg_tail - Return result of cmpxchg of tail word with a new value
+ * @lock: Pointer to queued spinlock structure
+ * @tail: The tail to compare against
+ * @new_tail: The new queue tail code word
+ * Return: Bool to indicate whether the cmpxchg operation succeeded
+ *
+ * This is used by the head of the wait queue to clean up the queue.
+ * Provides relaxed ordering, since observers only rely on initialized
+ * state of the node which was made visible through the xchg_tail operation,
+ * i.e. through the smp_wmb preceding xchg_tail.
+ *
+ * We avoid using 16-bit cmpxchg, which is not available on all architectures.
+ */
+static __always_inline bool try_cmpxchg_tail(struct qspinlock *lock, u32 tail, u32 new_tail)
+{
+ u32 old, new;
+
+ old = atomic_read(&lock->val);
+ do {
+ /*
+ * Is the tail part we compare to already stale? Fail.
+ */
+ if ((old & _Q_TAIL_MASK) != tail)
+ return false;
+ /*
+ * Encode latest locked/pending state for new tail.
+ */
+ new = (old & _Q_LOCKED_PENDING_MASK) | new_tail;
+ } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new));
+
+ return true;
+}
+
+#endif /* __LINUX_RQSPINLOCK_H */
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 49e567209c6b..da3d328f5c15 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -28,7 +28,7 @@ struct bpf_stack_map {
void *elems;
struct pcpu_freelist freelist;
u32 n_buckets;
- struct stack_map_bucket *buckets[];
+ struct stack_map_bucket *buckets[] __counted_by(n_buckets);
};
static inline bool stack_map_use_build_id(struct bpf_map *map)
@@ -42,6 +42,28 @@ static inline int stack_map_data_size(struct bpf_map *map)
sizeof(struct bpf_stack_build_id) : sizeof(u64);
}
+/**
+ * stack_map_calculate_max_depth - Calculate maximum allowed stack trace depth
+ * @size: Size of the buffer/map value in bytes
+ * @elem_size: Size of each stack trace element
+ * @flags: BPF stack trace flags (BPF_F_USER_STACK, BPF_F_USER_BUILD_ID, ...)
+ *
+ * Return: Maximum number of stack trace entries that can be safely stored
+ */
+static u32 stack_map_calculate_max_depth(u32 size, u32 elem_size, u64 flags)
+{
+ u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
+ u32 max_depth;
+ u32 curr_sysctl_max_stack = READ_ONCE(sysctl_perf_event_max_stack);
+
+ max_depth = size / elem_size;
+ max_depth += skip;
+ if (max_depth > curr_sysctl_max_stack)
+ return curr_sysctl_max_stack;
+
+ return max_depth;
+}
+
static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
{
u64 elem_size = sizeof(struct stack_map_bucket) +
@@ -74,9 +96,6 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
u64 cost, n_buckets;
int err;
- if (!bpf_capable())
- return ERR_PTR(-EPERM);
-
if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
@@ -94,19 +113,20 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
} else if (value_size / 8 > sysctl_perf_event_max_stack)
return ERR_PTR(-EINVAL);
- /* hash table size must be power of 2 */
- n_buckets = roundup_pow_of_two(attr->max_entries);
- if (!n_buckets)
+ /* hash table size must be power of 2; roundup_pow_of_two() can overflow
+ * into UB on 32-bit arches, so check that first
+ */
+ if (attr->max_entries > 1UL << 31)
return ERR_PTR(-E2BIG);
+ n_buckets = roundup_pow_of_two(attr->max_entries);
+
cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
- cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
if (!smap)
return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&smap->map, attr);
- smap->map.value_size = value_size;
smap->n_buckets = n_buckets;
err = get_callchain_buffers(sysctl_perf_event_max_stack);
@@ -126,13 +146,30 @@ free_smap:
return ERR_PTR(err);
}
+static int fetch_build_id(struct vm_area_struct *vma, unsigned char *build_id, bool may_fault)
+{
+ return may_fault ? build_id_parse(vma, build_id, NULL)
+ : build_id_parse_nofault(vma, build_id, NULL);
+}
+
+/*
+ * Expects all id_offs[i].ip values to be set to correct initial IPs.
+ * They will be subsequently:
+ * - either adjusted in place to a file offset, if build ID fetching
+ * succeeds; in this case id_offs[i].build_id is set to correct build ID,
+ * and id_offs[i].status is set to BPF_STACK_BUILD_ID_VALID;
+ * - or IP will be kept intact, if build ID fetching failed; in this case
+ * id_offs[i].build_id is zeroed out and id_offs[i].status is set to
+ * BPF_STACK_BUILD_ID_IP.
+ */
static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
- u64 *ips, u32 trace_nr, bool user)
+ u32 trace_nr, bool user, bool may_fault)
{
int i;
struct mmap_unlock_irq_work *work = NULL;
bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
- struct vm_area_struct *vma;
+ struct vm_area_struct *vma, *prev_vma = NULL;
+ const char *prev_build_id;
/* If the irq_work is in use, fall back to report ips. Same
* fallback is used for kernel stack (!user) on a stackmap with
@@ -143,30 +180,37 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
/* cannot access current->mm, fall back to ips */
for (i = 0; i < trace_nr; i++) {
id_offs[i].status = BPF_STACK_BUILD_ID_IP;
- id_offs[i].ip = ips[i];
memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
}
return;
}
for (i = 0; i < trace_nr; i++) {
- vma = find_vma(current->mm, ips[i]);
- if (!vma || build_id_parse(vma, id_offs[i].build_id, NULL)) {
+ u64 ip = READ_ONCE(id_offs[i].ip);
+
+ if (range_in_vma(prev_vma, ip, ip)) {
+ vma = prev_vma;
+ memcpy(id_offs[i].build_id, prev_build_id, BUILD_ID_SIZE_MAX);
+ goto build_id_valid;
+ }
+ vma = find_vma(current->mm, ip);
+ if (!vma || fetch_build_id(vma, id_offs[i].build_id, may_fault)) {
/* per entry fall back to ips */
id_offs[i].status = BPF_STACK_BUILD_ID_IP;
- id_offs[i].ip = ips[i];
memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
continue;
}
- id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
- - vma->vm_start;
+build_id_valid:
+ id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ip - vma->vm_start;
id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
+ prev_vma = vma;
+ prev_build_id = id_offs[i].build_id;
}
bpf_mmap_unlock_mm(work, current->mm);
}
static struct perf_callchain_entry *
-get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
+get_callchain_entry_for_task(struct task_struct *task, u32 max_depth)
{
#ifdef CONFIG_STACKTRACE
struct perf_callchain_entry *entry;
@@ -177,9 +221,8 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
if (!entry)
return NULL;
- entry->nr = init_nr +
- stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr),
- sysctl_perf_event_max_stack - init_nr, 0);
+ entry->nr = stack_trace_save_tsk(task, (unsigned long *)entry->ip,
+ max_depth, 0);
/* stack_trace_save_tsk() works on unsigned long array, while
* perf_callchain_entry uses u64 array. For 32-bit systems, it is
@@ -191,7 +234,7 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
int i;
/* copy data from the end to avoid using extra buffer */
- for (i = entry->nr - 1; i >= (int)init_nr; i--)
+ for (i = entry->nr - 1; i >= 0; i--)
to[i] = (u64)(from[i]);
}
@@ -208,27 +251,20 @@ static long __bpf_get_stackid(struct bpf_map *map,
{
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
- u32 max_depth = map->value_size / stack_map_data_size(map);
- /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
- u32 init_nr = sysctl_perf_event_max_stack - max_depth;
+ u32 hash, id, trace_nr, trace_len, i, max_depth;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
- u32 hash, id, trace_nr, trace_len;
bool user = flags & BPF_F_USER_STACK;
u64 *ips;
bool hash_matches;
- /* get_perf_callchain() guarantees that trace->nr >= init_nr
- * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
- */
- trace_nr = trace->nr - init_nr;
-
- if (trace_nr <= skip)
+ if (trace->nr <= skip)
/* skipping more than usable stack trace */
return -EFAULT;
- trace_nr -= skip;
+ max_depth = stack_map_calculate_max_depth(map->value_size, stack_map_data_size(map), flags);
+ trace_nr = min_t(u32, trace->nr - skip, max_depth - skip);
trace_len = trace_nr * sizeof(u64);
- ips = trace->ip + skip + init_nr;
+ ips = trace->ip + skip;
hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
id = hash & (smap->n_buckets - 1);
bucket = READ_ONCE(smap->buckets[id]);
@@ -239,15 +275,18 @@ static long __bpf_get_stackid(struct bpf_map *map,
return id;
if (stack_map_use_build_id(map)) {
+ struct bpf_stack_build_id *id_offs;
+
/* for build_id+offset, pop a bucket before slow cmp */
new_bucket = (struct stack_map_bucket *)
pcpu_freelist_pop(&smap->freelist);
if (unlikely(!new_bucket))
return -ENOMEM;
new_bucket->nr = trace_nr;
- stack_map_get_build_id_offset(
- (struct bpf_stack_build_id *)new_bucket->data,
- ips, trace_nr, user);
+ id_offs = (struct bpf_stack_build_id *)new_bucket->data;
+ for (i = 0; i < trace_nr; i++)
+ id_offs[i].ip = ips[i];
+ stack_map_get_build_id_offset(id_offs, trace_nr, user, false /* !may_fault */);
trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
if (hash_matches && bucket->nr == trace_nr &&
memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
@@ -284,19 +323,19 @@ static long __bpf_get_stackid(struct bpf_map *map,
BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
u64, flags)
{
- u32 max_depth = map->value_size / stack_map_data_size(map);
- /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
- u32 init_nr = sysctl_perf_event_max_stack - max_depth;
+ u32 elem_size = stack_map_data_size(map);
bool user = flags & BPF_F_USER_STACK;
struct perf_callchain_entry *trace;
bool kernel = !user;
+ u32 max_depth;
if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
return -EINVAL;
- trace = get_perf_callchain(regs, init_nr, kernel, user,
- sysctl_perf_event_max_stack, false, false);
+ max_depth = stack_map_calculate_max_depth(map->value_size, elem_size, flags);
+ trace = get_perf_callchain(regs, kernel, user, max_depth,
+ false, false, 0);
if (unlikely(!trace))
/* couldn't fetch the stack trace */
@@ -336,7 +375,7 @@ BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx,
int ret;
/* perf_sample_data doesn't have callchain, use bpf_get_stackid */
- if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
+ if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
return bpf_get_stackid((unsigned long)(ctx->regs),
(unsigned long) map, flags, 0, 0);
@@ -352,15 +391,11 @@ BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx,
return -EFAULT;
nr_kernel = count_kernel_ip(trace);
+ __u64 nr = trace->nr; /* save original */
if (kernel) {
- __u64 nr = trace->nr;
-
trace->nr = nr_kernel;
ret = __bpf_get_stackid(map, trace, flags);
-
- /* restore nr */
- trace->nr = nr;
} else { /* user */
u64 skip = flags & BPF_F_SKIP_FIELD_MASK;
@@ -371,6 +406,10 @@ BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx,
flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
ret = __bpf_get_stackid(map, trace, flags);
}
+
+ /* restore nr */
+ trace->nr = nr;
+
return ret;
}
@@ -385,10 +424,11 @@ const struct bpf_func_proto bpf_get_stackid_proto_pe = {
static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
struct perf_callchain_entry *trace_in,
- void *buf, u32 size, u64 flags)
+ void *buf, u32 size, u64 flags, bool may_fault)
{
- u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
+ u32 trace_nr, copy_len, elem_size, max_depth;
bool user_build_id = flags & BPF_F_USER_BUILD_ID;
+ bool crosstask = task && task != current;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
bool user = flags & BPF_F_USER_STACK;
struct perf_callchain_entry *trace;
@@ -402,8 +442,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
if (kernel && user_build_id)
goto clear;
- elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id)
- : sizeof(u64);
+ elem_size = user_build_id ? sizeof(struct bpf_stack_build_id) : sizeof(u64);
if (unlikely(size % elem_size))
goto clear;
@@ -411,35 +450,55 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
if (task && user && !user_mode(regs))
goto err_fault;
- num_elem = size / elem_size;
- if (sysctl_perf_event_max_stack < num_elem)
- init_nr = 0;
- else
- init_nr = sysctl_perf_event_max_stack - num_elem;
+ /* get_perf_callchain does not support crosstask user stack walking
+ * but returns an empty stack instead of NULL.
+ */
+ if (crosstask && user) {
+ err = -EOPNOTSUPP;
+ goto clear;
+ }
- if (trace_in)
+ max_depth = stack_map_calculate_max_depth(size, elem_size, flags);
+
+ if (may_fault)
+ rcu_read_lock(); /* need RCU for perf's callchain below */
+
+ if (trace_in) {
trace = trace_in;
- else if (kernel && task)
- trace = get_callchain_entry_for_task(task, init_nr);
- else
- trace = get_perf_callchain(regs, init_nr, kernel, user,
- sysctl_perf_event_max_stack,
- false, false);
- if (unlikely(!trace))
- goto err_fault;
+ trace->nr = min_t(u32, trace->nr, max_depth);
+ } else if (kernel && task) {
+ trace = get_callchain_entry_for_task(task, max_depth);
+ } else {
+ trace = get_perf_callchain(regs, kernel, user, max_depth,
+ crosstask, false, 0);
+ }
- trace_nr = trace->nr - init_nr;
- if (trace_nr < skip)
+ if (unlikely(!trace) || trace->nr < skip) {
+ if (may_fault)
+ rcu_read_unlock();
goto err_fault;
+ }
- trace_nr -= skip;
- trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
+ trace_nr = trace->nr - skip;
copy_len = trace_nr * elem_size;
- ips = trace->ip + skip + init_nr;
- if (user && user_build_id)
- stack_map_get_build_id_offset(buf, ips, trace_nr, user);
- else
+
+ ips = trace->ip + skip;
+ if (user_build_id) {
+ struct bpf_stack_build_id *id_offs = buf;
+ u32 i;
+
+ for (i = 0; i < trace_nr; i++)
+ id_offs[i].ip = ips[i];
+ } else {
memcpy(buf, ips, copy_len);
+ }
+
+ /* trace/ips should not be dereferenced after this point */
+ if (may_fault)
+ rcu_read_unlock();
+
+ if (user_build_id)
+ stack_map_get_build_id_offset(buf, trace_nr, user, may_fault);
if (size > copy_len)
memset(buf + copy_len, 0, size - copy_len);
@@ -455,7 +514,7 @@ clear:
BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
u64, flags)
{
- return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
+ return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */);
}
const struct bpf_func_proto bpf_get_stack_proto = {
@@ -468,22 +527,45 @@ const struct bpf_func_proto bpf_get_stack_proto = {
.arg4_type = ARG_ANYTHING,
};
-BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
- u32, size, u64, flags)
+BPF_CALL_4(bpf_get_stack_sleepable, struct pt_regs *, regs, void *, buf, u32, size,
+ u64, flags)
+{
+ return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, true /* may_fault */);
+}
+
+const struct bpf_func_proto bpf_get_stack_sleepable_proto = {
+ .func = bpf_get_stack_sleepable,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
+static long __bpf_get_task_stack(struct task_struct *task, void *buf, u32 size,
+ u64 flags, bool may_fault)
{
struct pt_regs *regs;
- long res;
+ long res = -EINVAL;
if (!try_get_task_stack(task))
return -EFAULT;
regs = task_pt_regs(task);
- res = __bpf_get_stack(regs, task, NULL, buf, size, flags);
+ if (regs)
+ res = __bpf_get_stack(regs, task, NULL, buf, size, flags, may_fault);
put_task_stack(task);
return res;
}
+BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
+ u32, size, u64, flags)
+{
+ return __bpf_get_task_stack(task, buf, size, flags, false /* !may_fault */);
+}
+
const struct bpf_func_proto bpf_get_task_stack_proto = {
.func = bpf_get_task_stack,
.gpl_only = false,
@@ -495,6 +577,23 @@ const struct bpf_func_proto bpf_get_task_stack_proto = {
.arg4_type = ARG_ANYTHING,
};
+BPF_CALL_4(bpf_get_task_stack_sleepable, struct task_struct *, task, void *, buf,
+ u32, size, u64, flags)
+{
+ return __bpf_get_task_stack(task, buf, size, flags, true /* !may_fault */);
+}
+
+const struct bpf_func_proto bpf_get_task_stack_sleepable_proto = {
+ .func = bpf_get_task_stack_sleepable,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
void *, buf, u32, size, u64, flags)
{
@@ -505,8 +604,8 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
int err = -EINVAL;
__u64 nr_kernel;
- if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
- return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
+ if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
+ return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */);
if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
BPF_F_USER_BUILD_ID)))
@@ -526,7 +625,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
__u64 nr = trace->nr;
trace->nr = nr_kernel;
- err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);
+ err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */);
/* restore nr */
trace->nr = nr;
@@ -538,7 +637,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
goto clear;
flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
- err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);
+ err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */);
}
return err;
@@ -565,7 +664,15 @@ static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
}
/* Called from syscall */
-int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
+static int stack_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ return bpf_stackmap_extract(map, key, value, true);
+}
+
+/* Called from syscall */
+int bpf_stackmap_extract(struct bpf_map *map, void *key, void *value,
+ bool delete)
{
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
struct stack_map_bucket *bucket, *old_bucket;
@@ -582,7 +689,10 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
memcpy(value, bucket->data, trace_len);
memset(value + trace_len, 0, map->value_size - trace_len);
- old_bucket = xchg(&smap->buckets[id], bucket);
+ if (delete)
+ old_bucket = bucket;
+ else
+ old_bucket = xchg(&smap->buckets[id], bucket);
if (old_bucket)
pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
return 0;
@@ -617,14 +727,14 @@ static int stack_map_get_next_key(struct bpf_map *map, void *key,
return 0;
}
-static int stack_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long stack_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
return -EINVAL;
}
/* Called from syscall or from eBPF program */
-static int stack_map_delete_elem(struct bpf_map *map, void *key)
+static long stack_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
struct stack_map_bucket *old_bucket;
@@ -653,16 +763,30 @@ static void stack_map_free(struct bpf_map *map)
put_callchain_buffers();
}
-static int stack_trace_map_btf_id;
+static u64 stack_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
+ u64 value_size = map->value_size;
+ u64 n_buckets = smap->n_buckets;
+ u64 enties = map->max_entries;
+ u64 usage = sizeof(*smap);
+
+ usage += n_buckets * sizeof(struct stack_map_bucket *);
+ usage += enties * (sizeof(struct stack_map_bucket) + value_size);
+ return usage;
+}
+
+BTF_ID_LIST_SINGLE(stack_trace_map_btf_ids, struct, bpf_stack_map)
const struct bpf_map_ops stack_trace_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc = stack_map_alloc,
.map_free = stack_map_free,
.map_get_next_key = stack_map_get_next_key,
.map_lookup_elem = stack_map_lookup_elem,
+ .map_lookup_and_delete_elem = stack_map_lookup_and_delete_elem,
.map_update_elem = stack_map_update_elem,
.map_delete_elem = stack_map_delete_elem,
.map_check_btf = map_check_no_btf,
- .map_btf_name = "bpf_stack_map",
- .map_btf_id = &stack_trace_map_btf_id,
+ .map_mem_usage = stack_map_mem_usage,
+ .map_btf_id = &stack_trace_map_btf_ids[0],
};
diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c
new file mode 100644
index 000000000000..0b6bc3f30335
--- /dev/null
+++ b/kernel/bpf/stream.c
@@ -0,0 +1,384 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/gfp.h>
+#include <linux/memory.h>
+#include <linux/mutex.h>
+
+static void bpf_stream_elem_init(struct bpf_stream_elem *elem, int len)
+{
+ init_llist_node(&elem->node);
+ elem->total_len = len;
+ elem->consumed_len = 0;
+}
+
+static struct bpf_stream_elem *bpf_stream_elem_alloc(int len)
+{
+ const int max_len = ARRAY_SIZE((struct bpf_bprintf_buffers){}.buf);
+ struct bpf_stream_elem *elem;
+ size_t alloc_size;
+
+ /*
+ * Length denotes the amount of data to be written as part of stream element,
+ * thus includes '\0' byte. We're capped by how much bpf_bprintf_buffers can
+ * accomodate, therefore deny allocations that won't fit into them.
+ */
+ if (len < 0 || len > max_len)
+ return NULL;
+
+ alloc_size = offsetof(struct bpf_stream_elem, str[len]);
+ elem = kmalloc_nolock(alloc_size, __GFP_ZERO, -1);
+ if (!elem)
+ return NULL;
+
+ bpf_stream_elem_init(elem, len);
+
+ return elem;
+}
+
+static int __bpf_stream_push_str(struct llist_head *log, const char *str, int len)
+{
+ struct bpf_stream_elem *elem = NULL;
+
+ /*
+ * Allocate a bpf_prog_stream_elem and push it to the bpf_prog_stream
+ * log, elements will be popped at once and reversed to print the log.
+ */
+ elem = bpf_stream_elem_alloc(len);
+ if (!elem)
+ return -ENOMEM;
+
+ memcpy(elem->str, str, len);
+ llist_add(&elem->node, log);
+
+ return 0;
+}
+
+static int bpf_stream_consume_capacity(struct bpf_stream *stream, int len)
+{
+ if (atomic_read(&stream->capacity) >= BPF_STREAM_MAX_CAPACITY)
+ return -ENOSPC;
+ if (atomic_add_return(len, &stream->capacity) >= BPF_STREAM_MAX_CAPACITY) {
+ atomic_sub(len, &stream->capacity);
+ return -ENOSPC;
+ }
+ return 0;
+}
+
+static void bpf_stream_release_capacity(struct bpf_stream *stream, struct bpf_stream_elem *elem)
+{
+ int len = elem->total_len;
+
+ atomic_sub(len, &stream->capacity);
+}
+
+static int bpf_stream_push_str(struct bpf_stream *stream, const char *str, int len)
+{
+ int ret = bpf_stream_consume_capacity(stream, len);
+
+ return ret ?: __bpf_stream_push_str(&stream->log, str, len);
+}
+
+static struct bpf_stream *bpf_stream_get(enum bpf_stream_id stream_id, struct bpf_prog_aux *aux)
+{
+ if (stream_id != BPF_STDOUT && stream_id != BPF_STDERR)
+ return NULL;
+ return &aux->stream[stream_id - 1];
+}
+
+static void bpf_stream_free_elem(struct bpf_stream_elem *elem)
+{
+ kfree_nolock(elem);
+}
+
+static void bpf_stream_free_list(struct llist_node *list)
+{
+ struct bpf_stream_elem *elem, *tmp;
+
+ llist_for_each_entry_safe(elem, tmp, list, node)
+ bpf_stream_free_elem(elem);
+}
+
+static struct llist_node *bpf_stream_backlog_peek(struct bpf_stream *stream)
+{
+ return stream->backlog_head;
+}
+
+static struct llist_node *bpf_stream_backlog_pop(struct bpf_stream *stream)
+{
+ struct llist_node *node;
+
+ node = stream->backlog_head;
+ if (stream->backlog_head == stream->backlog_tail)
+ stream->backlog_head = stream->backlog_tail = NULL;
+ else
+ stream->backlog_head = node->next;
+ return node;
+}
+
+static void bpf_stream_backlog_fill(struct bpf_stream *stream)
+{
+ struct llist_node *head, *tail;
+
+ if (llist_empty(&stream->log))
+ return;
+ tail = llist_del_all(&stream->log);
+ if (!tail)
+ return;
+ head = llist_reverse_order(tail);
+
+ if (!stream->backlog_head) {
+ stream->backlog_head = head;
+ stream->backlog_tail = tail;
+ } else {
+ stream->backlog_tail->next = head;
+ stream->backlog_tail = tail;
+ }
+
+ return;
+}
+
+static bool bpf_stream_consume_elem(struct bpf_stream_elem *elem, int *len)
+{
+ int rem = elem->total_len - elem->consumed_len;
+ int used = min(rem, *len);
+
+ elem->consumed_len += used;
+ *len -= used;
+
+ return elem->consumed_len == elem->total_len;
+}
+
+static int bpf_stream_read(struct bpf_stream *stream, void __user *buf, int len)
+{
+ int rem_len = len, cons_len, ret = 0;
+ struct bpf_stream_elem *elem = NULL;
+ struct llist_node *node;
+
+ mutex_lock(&stream->lock);
+
+ while (rem_len) {
+ int pos = len - rem_len;
+ bool cont;
+
+ node = bpf_stream_backlog_peek(stream);
+ if (!node) {
+ bpf_stream_backlog_fill(stream);
+ node = bpf_stream_backlog_peek(stream);
+ }
+ if (!node)
+ break;
+ elem = container_of(node, typeof(*elem), node);
+
+ cons_len = elem->consumed_len;
+ cont = bpf_stream_consume_elem(elem, &rem_len) == false;
+
+ ret = copy_to_user(buf + pos, elem->str + cons_len,
+ elem->consumed_len - cons_len);
+ /* Restore in case of error. */
+ if (ret) {
+ ret = -EFAULT;
+ elem->consumed_len = cons_len;
+ break;
+ }
+
+ if (cont)
+ continue;
+ bpf_stream_backlog_pop(stream);
+ bpf_stream_release_capacity(stream, elem);
+ bpf_stream_free_elem(elem);
+ }
+
+ mutex_unlock(&stream->lock);
+ return ret ? ret : len - rem_len;
+}
+
+int bpf_prog_stream_read(struct bpf_prog *prog, enum bpf_stream_id stream_id, void __user *buf, int len)
+{
+ struct bpf_stream *stream;
+
+ stream = bpf_stream_get(stream_id, prog->aux);
+ if (!stream)
+ return -ENOENT;
+ return bpf_stream_read(stream, buf, len);
+}
+
+__bpf_kfunc_start_defs();
+
+/*
+ * Avoid using enum bpf_stream_id so that kfunc users don't have to pull in the
+ * enum in headers.
+ */
+__bpf_kfunc int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const void *args,
+ u32 len__sz, void *aux__prog)
+{
+ struct bpf_bprintf_data data = {
+ .get_bin_args = true,
+ .get_buf = true,
+ };
+ struct bpf_prog_aux *aux = aux__prog;
+ u32 fmt_size = strlen(fmt__str) + 1;
+ struct bpf_stream *stream;
+ u32 data_len = len__sz;
+ int ret, num_args;
+
+ stream = bpf_stream_get(stream_id, aux);
+ if (!stream)
+ return -ENOENT;
+
+ if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 ||
+ (data_len && !args))
+ return -EINVAL;
+ num_args = data_len / 8;
+
+ ret = bpf_bprintf_prepare(fmt__str, fmt_size, args, num_args, &data);
+ if (ret < 0)
+ return ret;
+
+ ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt__str, data.bin_args);
+ /* Exclude NULL byte during push. */
+ ret = bpf_stream_push_str(stream, data.buf, ret);
+ bpf_bprintf_cleanup(&data);
+
+ return ret;
+}
+
+__bpf_kfunc_end_defs();
+
+/* Added kfunc to common_btf_ids */
+
+void bpf_prog_stream_init(struct bpf_prog *prog)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(prog->aux->stream); i++) {
+ atomic_set(&prog->aux->stream[i].capacity, 0);
+ init_llist_head(&prog->aux->stream[i].log);
+ mutex_init(&prog->aux->stream[i].lock);
+ prog->aux->stream[i].backlog_head = NULL;
+ prog->aux->stream[i].backlog_tail = NULL;
+ }
+}
+
+void bpf_prog_stream_free(struct bpf_prog *prog)
+{
+ struct llist_node *list;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(prog->aux->stream); i++) {
+ list = llist_del_all(&prog->aux->stream[i].log);
+ bpf_stream_free_list(list);
+ bpf_stream_free_list(prog->aux->stream[i].backlog_head);
+ }
+}
+
+void bpf_stream_stage_init(struct bpf_stream_stage *ss)
+{
+ init_llist_head(&ss->log);
+ ss->len = 0;
+}
+
+void bpf_stream_stage_free(struct bpf_stream_stage *ss)
+{
+ struct llist_node *node;
+
+ node = llist_del_all(&ss->log);
+ bpf_stream_free_list(node);
+}
+
+int bpf_stream_stage_printk(struct bpf_stream_stage *ss, const char *fmt, ...)
+{
+ struct bpf_bprintf_buffers *buf;
+ va_list args;
+ int ret;
+
+ if (bpf_try_get_buffers(&buf))
+ return -EBUSY;
+
+ va_start(args, fmt);
+ ret = vsnprintf(buf->buf, ARRAY_SIZE(buf->buf), fmt, args);
+ va_end(args);
+ ss->len += ret;
+ /* Exclude NULL byte during push. */
+ ret = __bpf_stream_push_str(&ss->log, buf->buf, ret);
+ bpf_put_buffers();
+ return ret;
+}
+
+int bpf_stream_stage_commit(struct bpf_stream_stage *ss, struct bpf_prog *prog,
+ enum bpf_stream_id stream_id)
+{
+ struct llist_node *list, *head, *tail;
+ struct bpf_stream *stream;
+ int ret;
+
+ stream = bpf_stream_get(stream_id, prog->aux);
+ if (!stream)
+ return -EINVAL;
+
+ ret = bpf_stream_consume_capacity(stream, ss->len);
+ if (ret)
+ return ret;
+
+ list = llist_del_all(&ss->log);
+ head = tail = list;
+
+ if (!list)
+ return 0;
+ while (llist_next(list)) {
+ tail = llist_next(list);
+ list = tail;
+ }
+ llist_add_batch(head, tail, &stream->log);
+ return 0;
+}
+
+struct dump_stack_ctx {
+ struct bpf_stream_stage *ss;
+ int err;
+};
+
+static bool dump_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp)
+{
+ struct dump_stack_ctx *ctxp = cookie;
+ const char *file = "", *line = "";
+ struct bpf_prog *prog;
+ int num, ret;
+
+ rcu_read_lock();
+ prog = bpf_prog_ksym_find(ip);
+ rcu_read_unlock();
+ if (prog) {
+ ret = bpf_prog_get_file_line(prog, ip, &file, &line, &num);
+ if (ret < 0)
+ goto end;
+ ctxp->err = bpf_stream_stage_printk(ctxp->ss, "%pS\n %s @ %s:%d\n",
+ (void *)(long)ip, line, file, num);
+ return !ctxp->err;
+ }
+end:
+ ctxp->err = bpf_stream_stage_printk(ctxp->ss, "%pS\n", (void *)(long)ip);
+ return !ctxp->err;
+}
+
+int bpf_stream_stage_dump_stack(struct bpf_stream_stage *ss)
+{
+ struct dump_stack_ctx ctx = { .ss = ss };
+ int ret;
+
+ ret = bpf_stream_stage_printk(ss, "CPU: %d UID: %d PID: %d Comm: %s\n",
+ raw_smp_processor_id(), __kuid_val(current_real_cred()->euid),
+ current->pid, current->comm);
+ if (ret)
+ return ret;
+ ret = bpf_stream_stage_printk(ss, "Call trace:\n");
+ if (ret)
+ return ret;
+ arch_bpf_stack_walk(dump_stack_cb, &ctx);
+ if (ctx.err)
+ return ctx.err;
+ return bpf_stream_stage_printk(ss, "\n");
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index fa4505f9b611..4ff82144f885 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1,11 +1,13 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
*/
+#include <crypto/sha2.h>
#include <linux/bpf.h>
#include <linux/bpf-cgroup.h>
#include <linux/bpf_trace.h>
#include <linux/bpf_lirc.h>
#include <linux/bpf_verifier.h>
+#include <linux/bsearch.h>
#include <linux/btf.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
@@ -29,9 +31,19 @@
#include <linux/pgtable.h>
#include <linux/bpf_lsm.h>
#include <linux/poll.h>
+#include <linux/sort.h>
#include <linux/bpf-netns.h>
#include <linux/rcupdate_trace.h>
#include <linux/memcontrol.h>
+#include <linux/trace_events.h>
+#include <linux/tracepoint.h>
+#include <linux/overflow.h>
+#include <linux/cookie.h>
+#include <linux/verification.h>
+
+#include <net/netfilter/nf_bpf_link.h>
+#include <net/netkit.h>
+#include <net/tcx.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
@@ -44,6 +56,7 @@
#define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY)
DEFINE_PER_CPU(int, bpf_prog_active);
+DEFINE_COOKIE(bpf_map_cookie);
static DEFINE_IDR(prog_idr);
static DEFINE_SPINLOCK(prog_idr_lock);
static DEFINE_IDR(map_idr);
@@ -102,37 +115,9 @@ const struct bpf_map_ops bpf_map_offload_ops = {
.map_alloc = bpf_map_offload_map_alloc,
.map_free = bpf_map_offload_map_free,
.map_check_btf = map_check_no_btf,
+ .map_mem_usage = bpf_map_offload_map_mem_usage,
};
-static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
-{
- const struct bpf_map_ops *ops;
- u32 type = attr->map_type;
- struct bpf_map *map;
- int err;
-
- if (type >= ARRAY_SIZE(bpf_map_types))
- return ERR_PTR(-EINVAL);
- type = array_index_nospec(type, ARRAY_SIZE(bpf_map_types));
- ops = bpf_map_types[type];
- if (!ops)
- return ERR_PTR(-EINVAL);
-
- if (ops->map_alloc_check) {
- err = ops->map_alloc_check(attr);
- if (err)
- return ERR_PTR(err);
- }
- if (attr->map_ifindex)
- ops = &bpf_map_offload_ops;
- map = ops->map_alloc(attr);
- if (IS_ERR(map))
- return map;
- map->ops = ops;
- map->map_type = type;
- return map;
-}
-
static void bpf_map_write_active_inc(struct bpf_map *map)
{
atomic64_inc(&map->writecnt);
@@ -163,31 +148,119 @@ static u32 bpf_map_value_size(const struct bpf_map *map)
static void maybe_wait_bpf_programs(struct bpf_map *map)
{
- /* Wait for any running BPF programs to complete so that
- * userspace, when we return to it, knows that all programs
- * that could be running use the new map value.
+ /* Wait for any running non-sleepable BPF programs to complete so that
+ * userspace, when we return to it, knows that all non-sleepable
+ * programs that could be running use the new map value. For sleepable
+ * BPF programs, synchronize_rcu_tasks_trace() should be used to wait
+ * for the completions of these programs, but considering the waiting
+ * time can be very long and userspace may think it will hang forever,
+ * so don't handle sleepable BPF programs now.
*/
if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
- synchronize_rcu();
+ synchronize_rcu_expedited();
+}
+
+static void unpin_uptr_kaddr(void *kaddr)
+{
+ if (kaddr)
+ unpin_user_page(virt_to_page(kaddr));
+}
+
+static void __bpf_obj_unpin_uptrs(struct btf_record *rec, u32 cnt, void *obj)
+{
+ const struct btf_field *field;
+ void **uptr_addr;
+ int i;
+
+ for (i = 0, field = rec->fields; i < cnt; i++, field++) {
+ if (field->type != BPF_UPTR)
+ continue;
+
+ uptr_addr = obj + field->offset;
+ unpin_uptr_kaddr(*uptr_addr);
+ }
+}
+
+static void bpf_obj_unpin_uptrs(struct btf_record *rec, void *obj)
+{
+ if (!btf_record_has_field(rec, BPF_UPTR))
+ return;
+
+ __bpf_obj_unpin_uptrs(rec, rec->cnt, obj);
}
-static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
- void *value, __u64 flags)
+static int bpf_obj_pin_uptrs(struct btf_record *rec, void *obj)
+{
+ const struct btf_field *field;
+ const struct btf_type *t;
+ unsigned long start, end;
+ struct page *page;
+ void **uptr_addr;
+ int i, err;
+
+ if (!btf_record_has_field(rec, BPF_UPTR))
+ return 0;
+
+ for (i = 0, field = rec->fields; i < rec->cnt; i++, field++) {
+ if (field->type != BPF_UPTR)
+ continue;
+
+ uptr_addr = obj + field->offset;
+ start = *(unsigned long *)uptr_addr;
+ if (!start)
+ continue;
+
+ t = btf_type_by_id(field->kptr.btf, field->kptr.btf_id);
+ /* t->size was checked for zero before */
+ if (check_add_overflow(start, t->size - 1, &end)) {
+ err = -EFAULT;
+ goto unpin_all;
+ }
+
+ /* The uptr's struct cannot span across two pages */
+ if ((start & PAGE_MASK) != (end & PAGE_MASK)) {
+ err = -EOPNOTSUPP;
+ goto unpin_all;
+ }
+
+ err = pin_user_pages_fast(start, 1, FOLL_LONGTERM | FOLL_WRITE, &page);
+ if (err != 1)
+ goto unpin_all;
+
+ if (PageHighMem(page)) {
+ err = -EOPNOTSUPP;
+ unpin_user_page(page);
+ goto unpin_all;
+ }
+
+ *uptr_addr = page_address(page) + offset_in_page(start);
+ }
+
+ return 0;
+
+unpin_all:
+ __bpf_obj_unpin_uptrs(rec, i, obj);
+ return err;
+}
+
+static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
+ void *key, void *value, __u64 flags)
{
int err;
/* Need to create a kthread, thus must support schedule */
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
return bpf_map_offload_update_elem(map, key, value, flags);
} else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
+ map->map_type == BPF_MAP_TYPE_ARENA ||
map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
return map->ops->map_update_elem(map, key, value, flags);
} else if (map->map_type == BPF_MAP_TYPE_SOCKHASH ||
map->map_type == BPF_MAP_TYPE_SOCKMAP) {
return sock_map_update_elem_sys(map, key, value, flags);
} else if (IS_FD_PROG_ARRAY(map)) {
- return bpf_fd_array_map_update_elem(map, f.file, key, value,
+ return bpf_fd_array_map_update_elem(map, map_file, key, value,
flags);
}
@@ -201,15 +274,11 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
err = bpf_percpu_cgroup_storage_update(map, key, value,
flags);
} else if (IS_FD_ARRAY(map)) {
- rcu_read_lock();
- err = bpf_fd_array_map_update_elem(map, f.file, key, value,
+ err = bpf_fd_array_map_update_elem(map, map_file, key, value,
flags);
- rcu_read_unlock();
} else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
- rcu_read_lock();
- err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
+ err = bpf_fd_htab_map_update_elem(map, map_file, key, value,
flags);
- rcu_read_unlock();
} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
/* rcu_read_lock() is not needed */
err = bpf_fd_reuseport_array_update_elem(map, key, value,
@@ -219,12 +288,16 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
err = map->ops->map_push_elem(map, value, flags);
} else {
- rcu_read_lock();
- err = map->ops->map_update_elem(map, key, value, flags);
- rcu_read_unlock();
+ err = bpf_obj_pin_uptrs(map->record, value);
+ if (!err) {
+ rcu_read_lock();
+ err = map->ops->map_update_elem(map, key, value, flags);
+ rcu_read_unlock();
+ if (err)
+ bpf_obj_unpin_uptrs(map->record, value);
+ }
}
bpf_enable_instrumentation();
- maybe_wait_bpf_programs(map);
return err;
}
@@ -235,7 +308,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
void *ptr;
int err;
- if (bpf_map_is_dev_bound(map))
+ if (bpf_map_is_offloaded(map))
return bpf_map_offload_lookup_elem(map, key, value);
bpf_disable_instrumentation();
@@ -247,7 +320,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
err = bpf_percpu_cgroup_storage_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
- err = bpf_stackmap_copy(map, key, value);
+ err = bpf_stackmap_extract(map, key, value, false);
} else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
err = bpf_fd_array_map_lookup_elem(map, key, value);
} else if (IS_FD_HASH(map)) {
@@ -285,7 +358,6 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
}
bpf_enable_instrumentation();
- maybe_wait_bpf_programs(map);
return err;
}
@@ -306,7 +378,7 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
* __GFP_RETRY_MAYFAIL to avoid such situations.
*/
- const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_ACCOUNT;
+ gfp_t gfp = bpf_memcg_flags(__GFP_NOWARN | __GFP_ZERO);
unsigned int flags = 0;
unsigned long align = 1;
void *area;
@@ -387,7 +459,7 @@ static int bpf_map_alloc_id(struct bpf_map *map)
return id > 0 ? 0 : id;
}
-void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
+void bpf_map_free_id(struct bpf_map *map)
{
unsigned long flags;
@@ -399,52 +471,95 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
if (!map->id)
return;
- if (do_idr_lock)
- spin_lock_irqsave(&map_idr_lock, flags);
- else
- __acquire(&map_idr_lock);
+ spin_lock_irqsave(&map_idr_lock, flags);
idr_remove(&map_idr, map->id);
map->id = 0;
- if (do_idr_lock)
- spin_unlock_irqrestore(&map_idr_lock, flags);
- else
- __release(&map_idr_lock);
+ spin_unlock_irqrestore(&map_idr_lock, flags);
}
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
static void bpf_map_save_memcg(struct bpf_map *map)
{
- map->memcg = get_mem_cgroup_from_mm(current->mm);
+ /* Currently if a map is created by a process belonging to the root
+ * memory cgroup, get_obj_cgroup_from_current() will return NULL.
+ * So we have to check map->objcg for being NULL each time it's
+ * being used.
+ */
+ if (memcg_bpf_enabled())
+ map->objcg = get_obj_cgroup_from_current();
}
static void bpf_map_release_memcg(struct bpf_map *map)
{
- mem_cgroup_put(map->memcg);
+ if (map->objcg)
+ obj_cgroup_put(map->objcg);
+}
+
+static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map)
+{
+ if (map->objcg)
+ return get_mem_cgroup_from_objcg(map->objcg);
+
+ return root_mem_cgroup;
}
void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
int node)
{
- struct mem_cgroup *old_memcg;
+ struct mem_cgroup *memcg, *old_memcg;
void *ptr;
- old_memcg = set_active_memcg(map->memcg);
+ memcg = bpf_map_get_memcg(map);
+ old_memcg = set_active_memcg(memcg);
ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node);
set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+
+ return ptr;
+}
+
+void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags,
+ int node)
+{
+ struct mem_cgroup *memcg, *old_memcg;
+ void *ptr;
+
+ memcg = bpf_map_get_memcg(map);
+ old_memcg = set_active_memcg(memcg);
+ ptr = kmalloc_nolock(size, flags | __GFP_ACCOUNT, node);
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
return ptr;
}
void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
{
- struct mem_cgroup *old_memcg;
+ struct mem_cgroup *memcg, *old_memcg;
void *ptr;
- old_memcg = set_active_memcg(map->memcg);
+ memcg = bpf_map_get_memcg(map);
+ old_memcg = set_active_memcg(memcg);
ptr = kzalloc(size, flags | __GFP_ACCOUNT);
set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+
+ return ptr;
+}
+
+void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size,
+ gfp_t flags)
+{
+ struct mem_cgroup *memcg, *old_memcg;
+ void *ptr;
+
+ memcg = bpf_map_get_memcg(map);
+ old_memcg = set_active_memcg(memcg);
+ ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT);
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
return ptr;
}
@@ -452,12 +567,14 @@ void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
size_t align, gfp_t flags)
{
- struct mem_cgroup *old_memcg;
+ struct mem_cgroup *memcg, *old_memcg;
void __percpu *ptr;
- old_memcg = set_active_memcg(map->memcg);
+ memcg = bpf_map_get_memcg(map);
+ old_memcg = set_active_memcg(memcg);
ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT);
set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
return ptr;
}
@@ -472,6 +589,327 @@ static void bpf_map_release_memcg(struct bpf_map *map)
}
#endif
+static bool can_alloc_pages(void)
+{
+ return preempt_count() == 0 && !irqs_disabled() &&
+ !IS_ENABLED(CONFIG_PREEMPT_RT);
+}
+
+static struct page *__bpf_alloc_page(int nid)
+{
+ if (!can_alloc_pages())
+ return alloc_pages_nolock(__GFP_ACCOUNT, nid, 0);
+
+ return alloc_pages_node(nid,
+ GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT
+ | __GFP_NOWARN,
+ 0);
+}
+
+int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
+ unsigned long nr_pages, struct page **pages)
+{
+ unsigned long i, j;
+ struct page *pg;
+ int ret = 0;
+#ifdef CONFIG_MEMCG
+ struct mem_cgroup *memcg, *old_memcg;
+
+ memcg = bpf_map_get_memcg(map);
+ old_memcg = set_active_memcg(memcg);
+#endif
+ for (i = 0; i < nr_pages; i++) {
+ pg = __bpf_alloc_page(nid);
+
+ if (pg) {
+ pages[i] = pg;
+ continue;
+ }
+ for (j = 0; j < i; j++)
+ free_pages_nolock(pages[j], 0);
+ ret = -ENOMEM;
+ break;
+ }
+
+#ifdef CONFIG_MEMCG
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+#endif
+ return ret;
+}
+
+
+static int btf_field_cmp(const void *a, const void *b)
+{
+ const struct btf_field *f1 = a, *f2 = b;
+
+ if (f1->offset < f2->offset)
+ return -1;
+ else if (f1->offset > f2->offset)
+ return 1;
+ return 0;
+}
+
+struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset,
+ u32 field_mask)
+{
+ struct btf_field *field;
+
+ if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & field_mask))
+ return NULL;
+ field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp);
+ if (!field || !(field->type & field_mask))
+ return NULL;
+ return field;
+}
+
+void btf_record_free(struct btf_record *rec)
+{
+ int i;
+
+ if (IS_ERR_OR_NULL(rec))
+ return;
+ for (i = 0; i < rec->cnt; i++) {
+ switch (rec->fields[i].type) {
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
+ case BPF_UPTR:
+ if (rec->fields[i].kptr.module)
+ module_put(rec->fields[i].kptr.module);
+ if (btf_is_kernel(rec->fields[i].kptr.btf))
+ btf_put(rec->fields[i].kptr.btf);
+ break;
+ case BPF_LIST_HEAD:
+ case BPF_LIST_NODE:
+ case BPF_RB_ROOT:
+ case BPF_RB_NODE:
+ case BPF_SPIN_LOCK:
+ case BPF_RES_SPIN_LOCK:
+ case BPF_TIMER:
+ case BPF_REFCOUNT:
+ case BPF_WORKQUEUE:
+ case BPF_TASK_WORK:
+ /* Nothing to release */
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ continue;
+ }
+ }
+ kfree(rec);
+}
+
+void bpf_map_free_record(struct bpf_map *map)
+{
+ btf_record_free(map->record);
+ map->record = NULL;
+}
+
+struct btf_record *btf_record_dup(const struct btf_record *rec)
+{
+ const struct btf_field *fields;
+ struct btf_record *new_rec;
+ int ret, size, i;
+
+ if (IS_ERR_OR_NULL(rec))
+ return NULL;
+ size = struct_size(rec, fields, rec->cnt);
+ new_rec = kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN);
+ if (!new_rec)
+ return ERR_PTR(-ENOMEM);
+ /* Do a deep copy of the btf_record */
+ fields = rec->fields;
+ new_rec->cnt = 0;
+ for (i = 0; i < rec->cnt; i++) {
+ switch (fields[i].type) {
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
+ case BPF_UPTR:
+ if (btf_is_kernel(fields[i].kptr.btf))
+ btf_get(fields[i].kptr.btf);
+ if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) {
+ ret = -ENXIO;
+ goto free;
+ }
+ break;
+ case BPF_LIST_HEAD:
+ case BPF_LIST_NODE:
+ case BPF_RB_ROOT:
+ case BPF_RB_NODE:
+ case BPF_SPIN_LOCK:
+ case BPF_RES_SPIN_LOCK:
+ case BPF_TIMER:
+ case BPF_REFCOUNT:
+ case BPF_WORKQUEUE:
+ case BPF_TASK_WORK:
+ /* Nothing to acquire */
+ break;
+ default:
+ ret = -EFAULT;
+ WARN_ON_ONCE(1);
+ goto free;
+ }
+ new_rec->cnt++;
+ }
+ return new_rec;
+free:
+ btf_record_free(new_rec);
+ return ERR_PTR(ret);
+}
+
+bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b)
+{
+ bool a_has_fields = !IS_ERR_OR_NULL(rec_a), b_has_fields = !IS_ERR_OR_NULL(rec_b);
+ int size;
+
+ if (!a_has_fields && !b_has_fields)
+ return true;
+ if (a_has_fields != b_has_fields)
+ return false;
+ if (rec_a->cnt != rec_b->cnt)
+ return false;
+ size = struct_size(rec_a, fields, rec_a->cnt);
+ /* btf_parse_fields uses kzalloc to allocate a btf_record, so unused
+ * members are zeroed out. So memcmp is safe to do without worrying
+ * about padding/unused fields.
+ *
+ * While spin_lock, timer, and kptr have no relation to map BTF,
+ * list_head metadata is specific to map BTF, the btf and value_rec
+ * members in particular. btf is the map BTF, while value_rec points to
+ * btf_record in that map BTF.
+ *
+ * So while by default, we don't rely on the map BTF (which the records
+ * were parsed from) matching for both records, which is not backwards
+ * compatible, in case list_head is part of it, we implicitly rely on
+ * that by way of depending on memcmp succeeding for it.
+ */
+ return !memcmp(rec_a, rec_b, size);
+}
+
+void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
+{
+ if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TIMER)))
+ return;
+ bpf_timer_cancel_and_free(obj + rec->timer_off);
+}
+
+void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj)
+{
+ if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_WORKQUEUE)))
+ return;
+ bpf_wq_cancel_and_free(obj + rec->wq_off);
+}
+
+void bpf_obj_free_task_work(const struct btf_record *rec, void *obj)
+{
+ if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TASK_WORK)))
+ return;
+ bpf_task_work_cancel_and_free(obj + rec->task_work_off);
+}
+
+void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
+{
+ const struct btf_field *fields;
+ int i;
+
+ if (IS_ERR_OR_NULL(rec))
+ return;
+ fields = rec->fields;
+ for (i = 0; i < rec->cnt; i++) {
+ struct btf_struct_meta *pointee_struct_meta;
+ const struct btf_field *field = &fields[i];
+ void *field_ptr = obj + field->offset;
+ void *xchgd_field;
+
+ switch (fields[i].type) {
+ case BPF_SPIN_LOCK:
+ case BPF_RES_SPIN_LOCK:
+ break;
+ case BPF_TIMER:
+ bpf_timer_cancel_and_free(field_ptr);
+ break;
+ case BPF_WORKQUEUE:
+ bpf_wq_cancel_and_free(field_ptr);
+ break;
+ case BPF_TASK_WORK:
+ bpf_task_work_cancel_and_free(field_ptr);
+ break;
+ case BPF_KPTR_UNREF:
+ WRITE_ONCE(*(u64 *)field_ptr, 0);
+ break;
+ case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
+ xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0);
+ if (!xchgd_field)
+ break;
+
+ if (!btf_is_kernel(field->kptr.btf)) {
+ pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
+ field->kptr.btf_id);
+ __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
+ pointee_struct_meta->record : NULL,
+ fields[i].type == BPF_KPTR_PERCPU);
+ } else {
+ field->kptr.dtor(xchgd_field);
+ }
+ break;
+ case BPF_UPTR:
+ /* The caller ensured that no one is using the uptr */
+ unpin_uptr_kaddr(*(void **)field_ptr);
+ break;
+ case BPF_LIST_HEAD:
+ if (WARN_ON_ONCE(rec->spin_lock_off < 0))
+ continue;
+ bpf_list_head_free(field, field_ptr, obj + rec->spin_lock_off);
+ break;
+ case BPF_RB_ROOT:
+ if (WARN_ON_ONCE(rec->spin_lock_off < 0))
+ continue;
+ bpf_rb_root_free(field, field_ptr, obj + rec->spin_lock_off);
+ break;
+ case BPF_LIST_NODE:
+ case BPF_RB_NODE:
+ case BPF_REFCOUNT:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ continue;
+ }
+ }
+}
+
+static void bpf_map_free(struct bpf_map *map)
+{
+ struct btf_record *rec = map->record;
+ struct btf *btf = map->btf;
+
+ /* implementation dependent freeing. Disabling migration to simplify
+ * the free of values or special fields allocated from bpf memory
+ * allocator.
+ */
+ kfree(map->excl_prog_sha);
+ migrate_disable();
+ map->ops->map_free(map);
+ migrate_enable();
+
+ /* Delay freeing of btf_record for maps, as map_free
+ * callback usually needs access to them. It is better to do it here
+ * than require each callback to do the free itself manually.
+ *
+ * Note that the btf_record stashed in map->inner_map_meta->record was
+ * already freed using the map_free callback for map in map case which
+ * eventually calls bpf_map_free_meta, since inner_map_meta is only a
+ * template bpf_map struct used during verification.
+ */
+ btf_record_free(rec);
+ /* Delay freeing of btf for maps, as map_free callback may need
+ * struct_meta info which will be freed with btf_put().
+ */
+ btf_put(btf);
+}
+
/* called from workqueue */
static void bpf_map_free_deferred(struct work_struct *work)
{
@@ -479,8 +917,8 @@ static void bpf_map_free_deferred(struct work_struct *work)
security_bpf_map_free(map);
bpf_map_release_memcg(map);
- /* implementation dependent freeing */
- map->ops->map_free(map);
+ bpf_map_owner_free(map);
+ bpf_map_free(map);
}
static void bpf_map_put_uref(struct bpf_map *map)
@@ -491,23 +929,45 @@ static void bpf_map_put_uref(struct bpf_map *map)
}
}
+static void bpf_map_free_in_work(struct bpf_map *map)
+{
+ INIT_WORK(&map->work, bpf_map_free_deferred);
+ /* Avoid spawning kworkers, since they all might contend
+ * for the same mutex like slab_mutex.
+ */
+ queue_work(system_dfl_wq, &map->work);
+}
+
+static void bpf_map_free_rcu_gp(struct rcu_head *rcu)
+{
+ bpf_map_free_in_work(container_of(rcu, struct bpf_map, rcu));
+}
+
+static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu)
+{
+ if (rcu_trace_implies_rcu_gp())
+ bpf_map_free_rcu_gp(rcu);
+ else
+ call_rcu(rcu, bpf_map_free_rcu_gp);
+}
+
/* decrement map refcnt and schedule it for freeing via workqueue
- * (unrelying map implementation ops->map_free() might sleep)
+ * (underlying map implementation ops->map_free() might sleep)
*/
-static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
+void bpf_map_put(struct bpf_map *map)
{
if (atomic64_dec_and_test(&map->refcnt)) {
/* bpf_map_free_id() must be called first */
- bpf_map_free_id(map, do_idr_lock);
- btf_put(map->btf);
- INIT_WORK(&map->work, bpf_map_free_deferred);
- schedule_work(&map->work);
- }
-}
+ bpf_map_free_id(map);
-void bpf_map_put(struct bpf_map *map)
-{
- __bpf_map_put(map, true);
+ WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt));
+ if (READ_ONCE(map->free_after_mult_rcu_gp))
+ call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp);
+ else if (READ_ONCE(map->free_after_rcu_gp))
+ call_rcu(&map->rcu, bpf_map_free_rcu_gp);
+ else
+ bpf_map_free_in_work(map);
+ }
}
EXPORT_SYMBOL_GPL(bpf_map_put);
@@ -530,7 +990,7 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f)
{
- fmode_t mode = f.file->f_mode;
+ fmode_t mode = fd_file(f)->f_mode;
/* Our file permissions may have been overridden by global
* map permissions facing syscall side.
@@ -541,32 +1001,23 @@ static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f)
}
#ifdef CONFIG_PROC_FS
-/* Provides an approximation of the map's memory footprint.
- * Used only to provide a backward compatibility and display
- * a reasonable "memlock" info.
- */
-static unsigned long bpf_map_memory_footprint(const struct bpf_map *map)
+/* Show the memory usage of a bpf map */
+static u64 bpf_map_memory_usage(const struct bpf_map *map)
{
- unsigned long size;
-
- size = round_up(map->key_size + bpf_map_value_size(map), 8);
-
- return round_up(map->max_entries * size, PAGE_SIZE);
+ return map->ops->map_mem_usage(map);
}
static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
{
- const struct bpf_map *map = filp->private_data;
- const struct bpf_array *array;
+ struct bpf_map *map = filp->private_data;
u32 type = 0, jited = 0;
- if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
- array = container_of(map, struct bpf_array, map);
- spin_lock(&array->aux->owner.lock);
- type = array->aux->owner.type;
- jited = array->aux->owner.jited;
- spin_unlock(&array->aux->owner.lock);
+ spin_lock(&map->owner_lock);
+ if (map->owner) {
+ type = map->owner->type;
+ jited = map->owner->jited;
}
+ spin_unlock(&map->owner_lock);
seq_printf(m,
"map_type:\t%u\n"
@@ -575,7 +1026,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
"max_entries:\t%u\n"
"map_flags:\t%#x\n"
"map_extra:\t%#llx\n"
- "memlock:\t%lu\n"
+ "memlock:\t%llu\n"
"map_id:\t%u\n"
"frozen:\t%u\n",
map->map_type,
@@ -584,7 +1035,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
map->max_entries,
map->map_flags,
(unsigned long long)map->map_extra,
- bpf_map_memory_footprint(map),
+ bpf_map_memory_usage(map),
map->id,
READ_ONCE(map->frozen));
if (type) {
@@ -638,10 +1089,9 @@ static const struct vm_operations_struct bpf_map_default_vmops = {
static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
{
struct bpf_map *map = filp->private_data;
- int err;
+ int err = 0;
- if (!map->ops->map_mmap || map_value_has_spin_lock(map) ||
- map_value_has_timer(map))
+ if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record))
return -ENOTSUPP;
if (!(vma->vm_flags & VM_SHARED))
@@ -663,24 +1113,33 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
err = -EACCES;
goto out;
}
+ bpf_map_write_active_inc(map);
}
+out:
+ mutex_unlock(&map->freeze_mutex);
+ if (err)
+ return err;
/* set default open/close callbacks */
vma->vm_ops = &bpf_map_default_vmops;
vma->vm_private_data = map;
- vma->vm_flags &= ~VM_MAYEXEC;
+ vm_flags_clear(vma, VM_MAYEXEC);
+ /* If mapping is read-only, then disallow potentially re-mapping with
+ * PROT_WRITE by dropping VM_MAYWRITE flag. This VM_MAYWRITE clearing
+ * means that as far as BPF map's memory-mapped VMAs are concerned,
+ * VM_WRITE and VM_MAYWRITE and equivalent, if one of them is set,
+ * both should be set, so we can forget about VM_MAYWRITE and always
+ * check just VM_WRITE
+ */
if (!(vma->vm_flags & VM_WRITE))
- /* disallow re-mapping with PROT_WRITE */
- vma->vm_flags &= ~VM_MAYWRITE;
+ vm_flags_clear(vma, VM_MAYWRITE);
err = map->ops->map_mmap(map, vma);
- if (err)
- goto out;
+ if (err) {
+ if (vma->vm_flags & VM_WRITE)
+ bpf_map_write_active_dec(map);
+ }
- if (vma->vm_flags & VM_MAYWRITE)
- bpf_map_write_active_inc(map);
-out:
- mutex_unlock(&map->freeze_mutex);
return err;
}
@@ -694,6 +1153,21 @@ static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts)
return EPOLLERR;
}
+static unsigned long bpf_get_unmapped_area(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ struct bpf_map *map = filp->private_data;
+
+ if (map->ops->map_get_unmapped_area)
+ return map->ops->map_get_unmapped_area(filp, addr, len, pgoff, flags);
+#ifdef CONFIG_MMU
+ return mm_get_unmapped_area(filp, addr, len, pgoff, flags);
+#else
+ return addr;
+#endif
+}
+
const struct file_operations bpf_map_fops = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = bpf_map_show_fdinfo,
@@ -703,6 +1177,7 @@ const struct file_operations bpf_map_fops = {
.write = bpf_dummy_write,
.mmap = bpf_map_mmap,
.poll = bpf_map_poll,
+ .get_unmapped_area = bpf_get_unmapped_area,
};
int bpf_map_new_fd(struct bpf_map *map, int flags)
@@ -759,6 +1234,7 @@ int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size)
return src - orig_src;
}
+EXPORT_SYMBOL_GPL(bpf_obj_name_cpy);
int map_check_no_btf(const struct bpf_map *map,
const struct btf *btf,
@@ -768,8 +1244,8 @@ int map_check_no_btf(const struct bpf_map *map,
return -ENOTSUPP;
}
-static int map_check_btf(struct bpf_map *map, const struct btf *btf,
- u32 btf_key_id, u32 btf_value_id)
+static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
+ const struct btf *btf, u32 btf_key_id, u32 btf_value_id)
{
const struct btf_type *key_type, *value_type;
u32 key_size, value_size;
@@ -790,49 +1266,121 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
if (!value_type || value_size != map->value_size)
return -EINVAL;
- map->spin_lock_off = btf_find_spin_lock(btf, value_type);
-
- if (map_value_has_spin_lock(map)) {
- if (map->map_flags & BPF_F_RDONLY_PROG)
- return -EACCES;
- if (map->map_type != BPF_MAP_TYPE_HASH &&
- map->map_type != BPF_MAP_TYPE_ARRAY &&
- map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
- map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
- map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
- map->map_type != BPF_MAP_TYPE_TASK_STORAGE)
- return -ENOTSUPP;
- if (map->spin_lock_off + sizeof(struct bpf_spin_lock) >
- map->value_size) {
- WARN_ONCE(1,
- "verifier bug spin_lock_off %d value_size %d\n",
- map->spin_lock_off, map->value_size);
- return -EFAULT;
+ map->record = btf_parse_fields(btf, value_type,
+ BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
+ BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR |
+ BPF_TASK_WORK,
+ map->value_size);
+ if (!IS_ERR_OR_NULL(map->record)) {
+ int i;
+
+ if (!bpf_token_capable(token, CAP_BPF)) {
+ ret = -EPERM;
+ goto free_map_tab;
+ }
+ if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) {
+ ret = -EACCES;
+ goto free_map_tab;
+ }
+ for (i = 0; i < sizeof(map->record->field_mask) * 8; i++) {
+ switch (map->record->field_mask & (1 << i)) {
+ case 0:
+ continue;
+ case BPF_SPIN_LOCK:
+ case BPF_RES_SPIN_LOCK:
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ break;
+ case BPF_TIMER:
+ case BPF_WORKQUEUE:
+ case BPF_TASK_WORK:
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY) {
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ break;
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
+ case BPF_REFCOUNT:
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ break;
+ case BPF_UPTR:
+ if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE) {
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ break;
+ case BPF_LIST_HEAD:
+ case BPF_RB_ROOT:
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY) {
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ break;
+ default:
+ /* Fail if map_type checks are missing for a field type */
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
}
}
- map->timer_off = btf_find_timer(btf, value_type);
- if (map_value_has_timer(map)) {
- if (map->map_flags & BPF_F_RDONLY_PROG)
- return -EACCES;
- if (map->map_type != BPF_MAP_TYPE_HASH &&
- map->map_type != BPF_MAP_TYPE_LRU_HASH &&
- map->map_type != BPF_MAP_TYPE_ARRAY)
- return -EOPNOTSUPP;
- }
+ ret = btf_check_and_fixup_fields(btf, map->record);
+ if (ret < 0)
+ goto free_map_tab;
- if (map->ops->map_check_btf)
+ if (map->ops->map_check_btf) {
ret = map->ops->map_check_btf(map, btf, key_type, value_type);
+ if (ret < 0)
+ goto free_map_tab;
+ }
return ret;
+free_map_tab:
+ bpf_map_free_record(map);
+ return ret;
}
-#define BPF_MAP_CREATE_LAST_FIELD map_extra
+static bool bpf_net_capable(void)
+{
+ return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN);
+}
+
+#define BPF_MAP_CREATE_LAST_FIELD excl_prog_hash_size
/* called via syscall */
-static int map_create(union bpf_attr *attr)
+static int map_create(union bpf_attr *attr, bpfptr_t uattr)
{
+ const struct bpf_map_ops *ops;
+ struct bpf_token *token = NULL;
int numa_node = bpf_map_attr_numa_node(attr);
+ u32 map_type = attr->map_type;
struct bpf_map *map;
+ bool token_flag;
int f_flags;
int err;
@@ -840,6 +1388,12 @@ static int map_create(union bpf_attr *attr)
if (err)
return -EINVAL;
+ /* check BPF_F_TOKEN_FD flag, remember if it's set, and then clear it
+ * to avoid per-map type checks tripping on unknown flag
+ */
+ token_flag = attr->map_flags & BPF_F_TOKEN_FD;
+ attr->map_flags &= ~BPF_F_TOKEN_FD;
+
if (attr->btf_vmlinux_value_type_id) {
if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
attr->btf_key_type_id || attr->btf_value_type_id)
@@ -849,6 +1403,7 @@ static int map_create(union bpf_attr *attr)
}
if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER &&
+ attr->map_type != BPF_MAP_TYPE_ARENA &&
attr->map_extra != 0)
return -EINVAL;
@@ -862,21 +1417,122 @@ static int map_create(union bpf_attr *attr)
return -EINVAL;
/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
- map = find_and_alloc_map(attr);
- if (IS_ERR(map))
- return PTR_ERR(map);
+ map_type = attr->map_type;
+ if (map_type >= ARRAY_SIZE(bpf_map_types))
+ return -EINVAL;
+ map_type = array_index_nospec(map_type, ARRAY_SIZE(bpf_map_types));
+ ops = bpf_map_types[map_type];
+ if (!ops)
+ return -EINVAL;
+
+ if (ops->map_alloc_check) {
+ err = ops->map_alloc_check(attr);
+ if (err)
+ return err;
+ }
+ if (attr->map_ifindex)
+ ops = &bpf_map_offload_ops;
+ if (!ops->map_mem_usage)
+ return -EINVAL;
+
+ if (token_flag) {
+ token = bpf_token_get_from_fd(attr->map_token_fd);
+ if (IS_ERR(token))
+ return PTR_ERR(token);
+
+ /* if current token doesn't grant map creation permissions,
+ * then we can't use this token, so ignore it and rely on
+ * system-wide capabilities checks
+ */
+ if (!bpf_token_allow_cmd(token, BPF_MAP_CREATE) ||
+ !bpf_token_allow_map_type(token, attr->map_type)) {
+ bpf_token_put(token);
+ token = NULL;
+ }
+ }
+
+ err = -EPERM;
+
+ /* Intent here is for unprivileged_bpf_disabled to block BPF map
+ * creation for unprivileged users; other actions depend
+ * on fd availability and access to bpffs, so are dependent on
+ * object creation success. Even with unprivileged BPF disabled,
+ * capability checks are still carried out.
+ */
+ if (sysctl_unprivileged_bpf_disabled && !bpf_token_capable(token, CAP_BPF))
+ goto put_token;
+
+ /* check privileged map type permissions */
+ switch (map_type) {
+ case BPF_MAP_TYPE_ARRAY:
+ case BPF_MAP_TYPE_PERCPU_ARRAY:
+ case BPF_MAP_TYPE_PROG_ARRAY:
+ case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
+ case BPF_MAP_TYPE_CGROUP_ARRAY:
+ case BPF_MAP_TYPE_ARRAY_OF_MAPS:
+ case BPF_MAP_TYPE_HASH:
+ case BPF_MAP_TYPE_PERCPU_HASH:
+ case BPF_MAP_TYPE_HASH_OF_MAPS:
+ case BPF_MAP_TYPE_RINGBUF:
+ case BPF_MAP_TYPE_USER_RINGBUF:
+ case BPF_MAP_TYPE_CGROUP_STORAGE:
+ case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
+ /* unprivileged */
+ break;
+ case BPF_MAP_TYPE_SK_STORAGE:
+ case BPF_MAP_TYPE_INODE_STORAGE:
+ case BPF_MAP_TYPE_TASK_STORAGE:
+ case BPF_MAP_TYPE_CGRP_STORAGE:
+ case BPF_MAP_TYPE_BLOOM_FILTER:
+ case BPF_MAP_TYPE_LPM_TRIE:
+ case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
+ case BPF_MAP_TYPE_STACK_TRACE:
+ case BPF_MAP_TYPE_QUEUE:
+ case BPF_MAP_TYPE_STACK:
+ case BPF_MAP_TYPE_LRU_HASH:
+ case BPF_MAP_TYPE_LRU_PERCPU_HASH:
+ case BPF_MAP_TYPE_STRUCT_OPS:
+ case BPF_MAP_TYPE_CPUMAP:
+ case BPF_MAP_TYPE_ARENA:
+ case BPF_MAP_TYPE_INSN_ARRAY:
+ if (!bpf_token_capable(token, CAP_BPF))
+ goto put_token;
+ break;
+ case BPF_MAP_TYPE_SOCKMAP:
+ case BPF_MAP_TYPE_SOCKHASH:
+ case BPF_MAP_TYPE_DEVMAP:
+ case BPF_MAP_TYPE_DEVMAP_HASH:
+ case BPF_MAP_TYPE_XSKMAP:
+ if (!bpf_token_capable(token, CAP_NET_ADMIN))
+ goto put_token;
+ break;
+ default:
+ WARN(1, "unsupported map type %d", map_type);
+ goto put_token;
+ }
+
+ map = ops->map_alloc(attr);
+ if (IS_ERR(map)) {
+ err = PTR_ERR(map);
+ goto put_token;
+ }
+ map->ops = ops;
+ map->map_type = map_type;
err = bpf_obj_name_cpy(map->name, attr->map_name,
sizeof(attr->map_name));
if (err < 0)
goto free_map;
+ preempt_disable();
+ map->cookie = gen_cookie_next(&bpf_map_cookie);
+ preempt_enable();
+
atomic64_set(&map->refcnt, 1);
atomic64_set(&map->usercnt, 1);
mutex_init(&map->freeze_mutex);
+ spin_lock_init(&map->owner_lock);
- map->spin_lock_off = -EINVAL;
- map->timer_off = -EINVAL;
if (attr->btf_key_type_id || attr->btf_value_type_id ||
/* Even the map's value is a kernel's struct,
* the bpf_prog.o must have BTF to begin with
@@ -900,7 +1556,7 @@ static int map_create(union bpf_attr *attr)
map->btf = btf;
if (attr->btf_value_type_id) {
- err = map_check_btf(map, btf, attr->btf_key_type_id,
+ err = map_check_btf(map, token, btf, attr->btf_key_type_id,
attr->btf_value_type_id);
if (err)
goto free_map;
@@ -912,15 +1568,39 @@ static int map_create(union bpf_attr *attr)
attr->btf_vmlinux_value_type_id;
}
- err = security_bpf_map_alloc(map);
- if (err)
+ if (attr->excl_prog_hash) {
+ bpfptr_t uprog_hash = make_bpfptr(attr->excl_prog_hash, uattr.is_kernel);
+
+ if (attr->excl_prog_hash_size != SHA256_DIGEST_SIZE) {
+ err = -EINVAL;
+ goto free_map;
+ }
+
+ map->excl_prog_sha = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
+ if (!map->excl_prog_sha) {
+ err = -ENOMEM;
+ goto free_map;
+ }
+
+ if (copy_from_bpfptr(map->excl_prog_sha, uprog_hash, SHA256_DIGEST_SIZE)) {
+ err = -EFAULT;
+ goto free_map;
+ }
+ } else if (attr->excl_prog_hash_size) {
+ err = -EINVAL;
goto free_map;
+ }
+
+ err = security_bpf_map_create(map, attr, token, uattr.is_kernel);
+ if (err)
+ goto free_map_sec;
err = bpf_map_alloc_id(map);
if (err)
goto free_map_sec;
bpf_map_save_memcg(map);
+ bpf_token_put(token);
err = bpf_map_new_fd(map, f_flags);
if (err < 0) {
@@ -939,26 +1619,12 @@ static int map_create(union bpf_attr *attr)
free_map_sec:
security_bpf_map_free(map);
free_map:
- btf_put(map->btf);
- map->ops->map_free(map);
+ bpf_map_free(map);
+put_token:
+ bpf_token_put(token);
return err;
}
-/* if error is returned, fd is released.
- * On success caller should complete fd access with matching fdput()
- */
-struct bpf_map *__bpf_map_get(struct fd f)
-{
- if (!f.file)
- return ERR_PTR(-EBADF);
- if (f.file->f_op != &bpf_map_fops) {
- fdput(f);
- return ERR_PTR(-EINVAL);
- }
-
- return f.file->private_data;
-}
-
void bpf_map_inc(struct bpf_map *map)
{
atomic64_inc(&map->refcnt);
@@ -974,36 +1640,31 @@ EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref);
struct bpf_map *bpf_map_get(u32 ufd)
{
- struct fd f = fdget(ufd);
- struct bpf_map *map;
+ CLASS(fd, f)(ufd);
+ struct bpf_map *map = __bpf_map_get(f);
- map = __bpf_map_get(f);
- if (IS_ERR(map))
- return map;
-
- bpf_map_inc(map);
- fdput(f);
+ if (!IS_ERR(map))
+ bpf_map_inc(map);
return map;
}
+EXPORT_SYMBOL_NS(bpf_map_get, "BPF_INTERNAL");
struct bpf_map *bpf_map_get_with_uref(u32 ufd)
{
- struct fd f = fdget(ufd);
- struct bpf_map *map;
-
- map = __bpf_map_get(f);
- if (IS_ERR(map))
- return map;
+ CLASS(fd, f)(ufd);
+ struct bpf_map *map = __bpf_map_get(f);
- bpf_map_inc_with_uref(map);
- fdput(f);
+ if (!IS_ERR(map))
+ bpf_map_inc_with_uref(map);
return map;
}
-/* map_idr_lock should have been held */
-static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
+/* map_idr_lock should have been held or the map should have been
+ * protected by rcu read lock.
+ */
+struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
{
int refold;
@@ -1018,15 +1679,13 @@ static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map)
{
- spin_lock_bh(&map_idr_lock);
- map = __bpf_map_inc_not_zero(map, false);
- spin_unlock_bh(&map_idr_lock);
-
- return map;
+ lockdep_assert(rcu_read_lock_held());
+ return __bpf_map_inc_not_zero(map, false);
}
EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero);
-int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
+int __weak bpf_stackmap_extract(struct bpf_map *map, void *key, void *value,
+ bool delete)
{
return -ENOTSUPP;
}
@@ -1060,39 +1719,28 @@ static int map_lookup_elem(union bpf_attr *attr)
{
void __user *ukey = u64_to_user_ptr(attr->key);
void __user *uvalue = u64_to_user_ptr(attr->value);
- int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *value;
u32 value_size;
- struct fd f;
int err;
if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
return -EINVAL;
- if (attr->flags & ~BPF_F_LOCK)
- return -EINVAL;
-
- f = fdget(ufd);
+ CLASS(fd, f)(attr->map_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
- if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
- err = -EPERM;
- goto err_put;
- }
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ))
+ return -EPERM;
- if ((attr->flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)) {
- err = -EINVAL;
- goto err_put;
- }
+ err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK);
+ if (err)
+ return err;
key = __bpf_copy_key(ukey, map->key_size);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
- goto err_put;
- }
+ if (IS_ERR(key))
+ return PTR_ERR(key);
value_size = bpf_map_value_size(map);
@@ -1123,8 +1771,6 @@ free_value:
kvfree(value);
free_key:
kvfree(key);
-err_put:
- fdput(f);
return err;
}
@@ -1135,17 +1781,15 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
{
bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel);
- int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *value;
u32 value_size;
- struct fd f;
int err;
if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
return -EINVAL;
- f = fdget(ufd);
+ CLASS(fd, f)(attr->map_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
@@ -1155,11 +1799,9 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
goto err_put;
}
- if ((attr->flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)) {
- err = -EINVAL;
+ err = bpf_map_check_op_flags(map, attr->flags, ~0);
+ if (err)
goto err_put;
- }
key = ___bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
@@ -1168,43 +1810,37 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
}
value_size = bpf_map_value_size(map);
-
- err = -ENOMEM;
- value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
- if (!value)
+ value = kvmemdup_bpfptr(uvalue, value_size);
+ if (IS_ERR(value)) {
+ err = PTR_ERR(value);
goto free_key;
+ }
- err = -EFAULT;
- if (copy_from_bpfptr(value, uvalue, value_size) != 0)
- goto free_value;
-
- err = bpf_map_update_value(map, f, key, value, attr->flags);
+ err = bpf_map_update_value(map, fd_file(f), key, value, attr->flags);
+ if (!err)
+ maybe_wait_bpf_programs(map);
-free_value:
kvfree(value);
free_key:
kvfree(key);
err_put:
bpf_map_write_active_dec(map);
- fdput(f);
return err;
}
#define BPF_MAP_DELETE_ELEM_LAST_FIELD key
-static int map_delete_elem(union bpf_attr *attr)
+static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)
{
- void __user *ukey = u64_to_user_ptr(attr->key);
- int ufd = attr->map_fd;
+ bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
struct bpf_map *map;
- struct fd f;
void *key;
int err;
if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
return -EINVAL;
- f = fdget(ufd);
+ CLASS(fd, f)(attr->map_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
@@ -1214,13 +1850,13 @@ static int map_delete_elem(union bpf_attr *attr)
goto err_put;
}
- key = __bpf_copy_key(ukey, map->key_size);
+ key = ___bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
goto err_put;
}
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
err = bpf_map_offload_delete_elem(map, key);
goto out;
} else if (IS_FD_PROG_ARRAY(map) ||
@@ -1235,12 +1871,12 @@ static int map_delete_elem(union bpf_attr *attr)
err = map->ops->map_delete_elem(map, key);
rcu_read_unlock();
bpf_enable_instrumentation();
- maybe_wait_bpf_programs(map);
+ if (!err)
+ maybe_wait_bpf_programs(map);
out:
kvfree(key);
err_put:
bpf_map_write_active_dec(map);
- fdput(f);
return err;
}
@@ -1251,30 +1887,24 @@ static int map_get_next_key(union bpf_attr *attr)
{
void __user *ukey = u64_to_user_ptr(attr->key);
void __user *unext_key = u64_to_user_ptr(attr->next_key);
- int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *next_key;
- struct fd f;
int err;
if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
return -EINVAL;
- f = fdget(ufd);
+ CLASS(fd, f)(attr->map_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
- if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
- err = -EPERM;
- goto err_put;
- }
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ))
+ return -EPERM;
if (ukey) {
key = __bpf_copy_key(ukey, map->key_size);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
- goto err_put;
- }
+ if (IS_ERR(key))
+ return PTR_ERR(key);
} else {
key = NULL;
}
@@ -1284,7 +1914,7 @@ static int map_get_next_key(union bpf_attr *attr)
if (!next_key)
goto free_key;
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
err = bpf_map_offload_get_next_key(map, key, next_key);
goto out;
}
@@ -1306,8 +1936,6 @@ free_next_key:
kvfree(next_key);
free_key:
kvfree(key);
-err_put:
- fdput(f);
return err;
}
@@ -1324,7 +1952,7 @@ int generic_map_delete_batch(struct bpf_map *map,
return -EINVAL;
if ((attr->batch.elem_flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)) {
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
return -EINVAL;
}
@@ -1332,6 +1960,9 @@ int generic_map_delete_batch(struct bpf_map *map,
if (!max_count)
return 0;
+ if (put_user(0, &uattr->batch.count))
+ return -EFAULT;
+
key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
if (!key)
return -ENOMEM;
@@ -1342,7 +1973,7 @@ int generic_map_delete_batch(struct bpf_map *map,
map->key_size))
break;
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
err = bpf_map_offload_delete_elem(map, key);
break;
}
@@ -1352,36 +1983,31 @@ int generic_map_delete_batch(struct bpf_map *map,
err = map->ops->map_delete_elem(map, key);
rcu_read_unlock();
bpf_enable_instrumentation();
- maybe_wait_bpf_programs(map);
if (err)
break;
+ cond_resched();
}
if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
err = -EFAULT;
kvfree(key);
+
return err;
}
-int generic_map_update_batch(struct bpf_map *map,
+int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
void __user *values = u64_to_user_ptr(attr->batch.values);
void __user *keys = u64_to_user_ptr(attr->batch.keys);
u32 value_size, cp, max_count;
- int ufd = attr->batch.map_fd;
void *key, *value;
- struct fd f;
int err = 0;
- if (attr->batch.elem_flags & ~BPF_F_LOCK)
- return -EINVAL;
-
- if ((attr->batch.elem_flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)) {
- return -EINVAL;
- }
+ err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK);
+ if (err)
+ return err;
value_size = bpf_map_value_size(map);
@@ -1389,6 +2015,9 @@ int generic_map_update_batch(struct bpf_map *map,
if (!max_count)
return 0;
+ if (put_user(0, &uattr->batch.count))
+ return -EFAULT;
+
key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
if (!key)
return -ENOMEM;
@@ -1399,7 +2028,6 @@ int generic_map_update_batch(struct bpf_map *map,
return -ENOMEM;
}
- f = fdget(ufd); /* bpf_map_do_batch() guarantees ufd is valid */
for (cp = 0; cp < max_count; cp++) {
err = -EFAULT;
if (copy_from_user(key, keys + cp * map->key_size,
@@ -1407,11 +2035,12 @@ int generic_map_update_batch(struct bpf_map *map,
copy_from_user(value, values + cp * value_size, value_size))
break;
- err = bpf_map_update_value(map, f, key, value,
+ err = bpf_map_update_value(map, map_file, key, value,
attr->batch.elem_flags);
if (err)
break;
+ cond_resched();
}
if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
@@ -1419,12 +2048,10 @@ int generic_map_update_batch(struct bpf_map *map,
kvfree(value);
kvfree(key);
- fdput(f);
+
return err;
}
-#define MAP_LOOKUP_RETRIES 3
-
int generic_map_lookup_batch(struct bpf_map *map,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
@@ -1434,15 +2061,12 @@ int generic_map_lookup_batch(struct bpf_map *map,
void __user *values = u64_to_user_ptr(attr->batch.values);
void __user *keys = u64_to_user_ptr(attr->batch.keys);
void *buf, *buf_prevkey, *prev_key, *key, *value;
- int err, retry = MAP_LOOKUP_RETRIES;
u32 value_size, cp, max_count;
+ int err;
- if (attr->batch.elem_flags & ~BPF_F_LOCK)
- return -EINVAL;
-
- if ((attr->batch.elem_flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map))
- return -EINVAL;
+ err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK);
+ if (err)
+ return err;
value_size = bpf_map_value_size(map);
@@ -1481,14 +2105,8 @@ int generic_map_lookup_batch(struct bpf_map *map,
err = bpf_map_copy_value(map, key, value,
attr->batch.elem_flags);
- if (err == -ENOENT) {
- if (retry) {
- retry--;
- continue;
- }
- err = -EINTR;
- break;
- }
+ if (err == -ENOENT)
+ goto next_key;
if (err)
goto free_buf;
@@ -1503,12 +2121,13 @@ int generic_map_lookup_batch(struct bpf_map *map,
goto free_buf;
}
+ cp++;
+next_key:
if (!prev_key)
prev_key = buf_prevkey;
swap(prev_key, key);
- retry = MAP_LOOKUP_RETRIES;
- cp++;
+ cond_resched();
}
if (err == -EFAULT)
@@ -1530,11 +2149,9 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
{
void __user *ukey = u64_to_user_ptr(attr->key);
void __user *uvalue = u64_to_user_ptr(attr->value);
- int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *value;
u32 value_size;
- struct fd f;
int err;
if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
@@ -1543,7 +2160,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
if (attr->flags & ~BPF_F_LOCK)
return -EINVAL;
- f = fdget(ufd);
+ CLASS(fd, f)(attr->map_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
@@ -1562,7 +2179,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
}
if ((attr->flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)) {
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
err = -EINVAL;
goto err_put;
}
@@ -1587,8 +2204,9 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
} else if (map->map_type == BPF_MAP_TYPE_HASH ||
map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_HASH ||
- map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
- if (!bpf_map_is_dev_bound(map)) {
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
+ if (!bpf_map_is_offloaded(map)) {
bpf_disable_instrumentation();
rcu_read_lock();
err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags);
@@ -1613,7 +2231,6 @@ free_key:
kvfree(key);
err_put:
bpf_map_write_active_dec(map);
- fdput(f);
return err;
}
@@ -1621,23 +2238,22 @@ err_put:
static int map_freeze(const union bpf_attr *attr)
{
- int err = 0, ufd = attr->map_fd;
+ int err = 0;
struct bpf_map *map;
- struct fd f;
if (CHECK_ATTR(BPF_MAP_FREEZE))
return -EINVAL;
- f = fdget(ufd);
+ CLASS(fd, f)(attr->map_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
- if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS ||
- map_value_has_timer(map)) {
- fdput(f);
+ if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record))
return -ENOTSUPP;
- }
+
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE))
+ return -EPERM;
mutex_lock(&map->freeze_mutex);
if (bpf_map_write_active(map)) {
@@ -1648,15 +2264,10 @@ static int map_freeze(const union bpf_attr *attr)
err = -EBUSY;
goto err_put;
}
- if (!bpf_capable()) {
- err = -EPERM;
- goto err_put;
- }
WRITE_ONCE(map->frozen, true);
err_put:
mutex_unlock(&map->freeze_mutex);
- fdput(f);
return err;
}
@@ -1682,7 +2293,7 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
if (!ops)
return -EINVAL;
- if (!bpf_prog_is_dev_bound(prog->aux))
+ if (!bpf_prog_is_offloaded(prog->aux))
prog->aux->ops = ops;
else
prog->aux->ops = &bpf_offload_prog_ops;
@@ -1710,7 +2321,7 @@ static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op)
return;
if (audit_enabled == AUDIT_OFF)
return;
- if (op == BPF_AUDIT_LOAD)
+ if (!in_hardirq() && !irqs_disabled())
ctx = audit_context();
ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF);
if (unlikely(!ab))
@@ -1739,7 +2350,7 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog)
return id > 0 ? 0 : id;
}
-void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
+void bpf_prog_free_id(struct bpf_prog *prog)
{
unsigned long flags;
@@ -1751,18 +2362,10 @@ void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
if (!prog->aux->id)
return;
- if (do_idr_lock)
- spin_lock_irqsave(&prog_idr_lock, flags);
- else
- __acquire(&prog_idr_lock);
-
+ spin_lock_irqsave(&prog_idr_lock, flags);
idr_remove(&prog_idr, prog->aux->id);
prog->aux->id = 0;
-
- if (do_idr_lock)
- spin_unlock_irqrestore(&prog_idr_lock, flags);
- else
- __release(&prog_idr_lock);
+ spin_unlock_irqrestore(&prog_idr_lock, flags);
}
static void __bpf_prog_put_rcu(struct rcu_head *rcu)
@@ -1772,7 +2375,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
kvfree(aux->func_info);
kfree(aux->func_info_aux);
free_uid(aux->user);
- security_bpf_prog_free(aux);
+ security_bpf_prog_free(aux->prog);
bpf_prog_free(aux->prog);
}
@@ -1780,14 +2383,16 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
{
bpf_prog_kallsyms_del_all(prog);
btf_put(prog->aux->btf);
+ module_put(prog->aux->mod);
kvfree(prog->aux->jited_linfo);
kvfree(prog->aux->linfo);
kfree(prog->aux->kfunc_tab);
+ kfree(prog->aux->ctx_arg_info);
if (prog->aux->attach_btf)
btf_put(prog->aux->attach_btf);
if (deferred) {
- if (prog->aux->sleepable)
+ if (prog->sleepable)
call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu);
else
call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
@@ -1805,18 +2410,16 @@ static void bpf_prog_put_deferred(struct work_struct *work)
prog = aux->prog;
perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
+ bpf_prog_free_id(prog);
__bpf_prog_put_noref(prog, true);
}
-static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
+static void __bpf_prog_put(struct bpf_prog *prog)
{
struct bpf_prog_aux *aux = prog->aux;
if (atomic64_dec_and_test(&aux->refcnt)) {
- /* bpf_prog_free_id() must be called first */
- bpf_prog_free_id(prog, do_idr_lock);
-
- if (in_irq() || irqs_disabled()) {
+ if (in_hardirq() || irqs_disabled()) {
INIT_WORK(&aux->work, bpf_prog_put_deferred);
schedule_work(&aux->work);
} else {
@@ -1827,7 +2430,7 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
void bpf_prog_put(struct bpf_prog *prog)
{
- __bpf_prog_put(prog, true);
+ __bpf_prog_put(prog);
}
EXPORT_SYMBOL_GPL(bpf_prog_put);
@@ -1845,6 +2448,20 @@ struct bpf_prog_kstats {
u64 misses;
};
+void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog)
+{
+ struct bpf_prog_stats *stats;
+ unsigned int flags;
+
+ if (unlikely(!prog->stats))
+ return;
+
+ stats = this_cpu_ptr(prog->stats);
+ flags = u64_stats_update_begin_irqsave(&stats->syncp);
+ u64_stats_inc(&stats->misses);
+ u64_stats_update_end_irqrestore(&stats->syncp, flags);
+}
+
static void bpf_prog_get_stats(const struct bpf_prog *prog,
struct bpf_prog_kstats *stats)
{
@@ -1858,11 +2475,11 @@ static void bpf_prog_get_stats(const struct bpf_prog *prog,
st = per_cpu_ptr(prog->stats, cpu);
do {
- start = u64_stats_fetch_begin_irq(&st->syncp);
+ start = u64_stats_fetch_begin(&st->syncp);
tnsecs = u64_stats_read(&st->nsecs);
tcnt = u64_stats_read(&st->cnt);
tmisses = u64_stats_read(&st->misses);
- } while (u64_stats_fetch_retry_irq(&st->syncp, start));
+ } while (u64_stats_fetch_retry(&st->syncp, start));
nsecs += tnsecs;
cnt += tcnt;
misses += tmisses;
@@ -1924,18 +2541,6 @@ int bpf_prog_new_fd(struct bpf_prog *prog)
O_RDWR | O_CLOEXEC);
}
-static struct bpf_prog *____bpf_prog_get(struct fd f)
-{
- if (!f.file)
- return ERR_PTR(-EBADF);
- if (f.file->f_op != &bpf_prog_fops) {
- fdput(f);
- return ERR_PTR(-EINVAL);
- }
-
- return f.file->private_data;
-}
-
void bpf_prog_add(struct bpf_prog *prog, int i)
{
atomic64_add(i, &prog->aux->refcnt);
@@ -1982,7 +2587,7 @@ bool bpf_prog_get_ok(struct bpf_prog *prog,
if (prog->type != *attach_type)
return false;
- if (bpf_prog_is_dev_bound(prog->aux) && !attach_drv)
+ if (bpf_prog_is_offloaded(prog->aux) && !attach_drv)
return false;
return true;
@@ -1991,20 +2596,19 @@ bool bpf_prog_get_ok(struct bpf_prog *prog,
static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type,
bool attach_drv)
{
- struct fd f = fdget(ufd);
+ CLASS(fd, f)(ufd);
struct bpf_prog *prog;
- prog = ____bpf_prog_get(f);
- if (IS_ERR(prog))
- return prog;
- if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) {
- prog = ERR_PTR(-EINVAL);
- goto out;
- }
+ if (fd_empty(f))
+ return ERR_PTR(-EBADF);
+ if (fd_file(f)->f_op != &bpf_prog_fops)
+ return ERR_PTR(-EINVAL);
+
+ prog = fd_file(f)->private_data;
+ if (!bpf_prog_get_ok(prog, attach_type, attach_drv))
+ return ERR_PTR(-EINVAL);
bpf_prog_inc(prog);
-out:
- fdput(f);
return prog;
}
@@ -2099,14 +2703,19 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UNIX_CONNECT:
case BPF_CGROUP_INET4_GETPEERNAME:
case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
case BPF_CGROUP_INET4_GETSOCKNAME:
case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UNIX_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
return 0;
default:
return -EINVAL;
@@ -2139,6 +2748,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
default:
return -EINVAL;
}
+ case BPF_PROG_TYPE_NETFILTER:
+ if (expected_attach_type == BPF_NETFILTER)
+ return 0;
+ return -EINVAL;
case BPF_PROG_TYPE_SYSCALL:
case BPF_PROG_TYPE_EXT:
if (expected_attach_type)
@@ -2161,7 +2774,6 @@ static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
case BPF_PROG_TYPE_LWT_SEG6LOCAL:
case BPF_PROG_TYPE_SK_SKB:
case BPF_PROG_TYPE_SK_MSG:
- case BPF_PROG_TYPE_LIRC_MODE2:
case BPF_PROG_TYPE_FLOW_DISSECTOR:
case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SOCK:
@@ -2170,6 +2782,7 @@ static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
case BPF_PROG_TYPE_CGROUP_SYSCTL:
case BPF_PROG_TYPE_SOCK_OPS:
case BPF_PROG_TYPE_EXT: /* extends any prog */
+ case BPF_PROG_TYPE_NETFILTER:
return true;
case BPF_PROG_TYPE_CGROUP_SKB:
/* always unpriv */
@@ -2198,17 +2811,71 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
}
}
+static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr,
+ bool is_kernel)
+{
+ bpfptr_t usig = make_bpfptr(attr->signature, is_kernel);
+ struct bpf_dynptr_kern sig_ptr, insns_ptr;
+ struct bpf_key *key = NULL;
+ void *sig;
+ int err = 0;
+
+ if (system_keyring_id_check(attr->keyring_id) == 0)
+ key = bpf_lookup_system_key(attr->keyring_id);
+ else
+ key = bpf_lookup_user_key(attr->keyring_id, 0);
+
+ if (!key)
+ return -EINVAL;
+
+ sig = kvmemdup_bpfptr(usig, attr->signature_size);
+ if (IS_ERR(sig)) {
+ bpf_key_put(key);
+ return -ENOMEM;
+ }
+
+ bpf_dynptr_init(&sig_ptr, sig, BPF_DYNPTR_TYPE_LOCAL, 0,
+ attr->signature_size);
+ bpf_dynptr_init(&insns_ptr, prog->insnsi, BPF_DYNPTR_TYPE_LOCAL, 0,
+ prog->len * sizeof(struct bpf_insn));
+
+ err = bpf_verify_pkcs7_signature((struct bpf_dynptr *)&insns_ptr,
+ (struct bpf_dynptr *)&sig_ptr, key);
+
+ bpf_key_put(key);
+ kvfree(sig);
+ return err;
+}
+
+static int bpf_prog_mark_insn_arrays_ready(struct bpf_prog *prog)
+{
+ int err;
+ int i;
+
+ for (i = 0; i < prog->aux->used_map_cnt; i++) {
+ if (prog->aux->used_maps[i]->map_type != BPF_MAP_TYPE_INSN_ARRAY)
+ continue;
+
+ err = bpf_insn_array_ready(prog->aux->used_maps[i]);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD core_relo_rec_size
+#define BPF_PROG_LOAD_LAST_FIELD keyring_id
-static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
+static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
{
enum bpf_prog_type type = attr->prog_type;
struct bpf_prog *prog, *dst_prog = NULL;
struct btf *attach_btf = NULL;
+ struct bpf_token *token = NULL;
+ bool bpf_cap;
int err;
char license[128];
- bool is_gpl;
if (CHECK_ATTR(BPF_PROG_LOAD))
return -EINVAL;
@@ -2217,36 +2884,63 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
BPF_F_ANY_ALIGNMENT |
BPF_F_TEST_STATE_FREQ |
BPF_F_SLEEPABLE |
- BPF_F_TEST_RND_HI32))
+ BPF_F_TEST_RND_HI32 |
+ BPF_F_XDP_HAS_FRAGS |
+ BPF_F_XDP_DEV_BOUND_ONLY |
+ BPF_F_TEST_REG_INVARIANTS |
+ BPF_F_TOKEN_FD))
return -EINVAL;
- if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
- (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
- !bpf_capable())
- return -EPERM;
+ bpf_prog_load_fixup_attach_type(attr);
- /* copy eBPF program license from user space */
- if (strncpy_from_bpfptr(license,
- make_bpfptr(attr->license, uattr.is_kernel),
- sizeof(license) - 1) < 0)
- return -EFAULT;
- license[sizeof(license) - 1] = 0;
+ if (attr->prog_flags & BPF_F_TOKEN_FD) {
+ token = bpf_token_get_from_fd(attr->prog_token_fd);
+ if (IS_ERR(token))
+ return PTR_ERR(token);
+ /* if current token doesn't grant prog loading permissions,
+ * then we can't use this token, so ignore it and rely on
+ * system-wide capabilities checks
+ */
+ if (!bpf_token_allow_cmd(token, BPF_PROG_LOAD) ||
+ !bpf_token_allow_prog_type(token, attr->prog_type,
+ attr->expected_attach_type)) {
+ bpf_token_put(token);
+ token = NULL;
+ }
+ }
- /* eBPF programs must be GPL compatible to use GPL-ed functions */
- is_gpl = license_is_gpl_compatible(license);
+ bpf_cap = bpf_token_capable(token, CAP_BPF);
+ err = -EPERM;
+
+ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
+ (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
+ !bpf_cap)
+ goto put_token;
+
+ /* Intent here is for unprivileged_bpf_disabled to block BPF program
+ * creation for unprivileged users; other actions depend
+ * on fd availability and access to bpffs, so are dependent on
+ * object creation success. Even with unprivileged BPF disabled,
+ * capability checks are still carried out for these
+ * and other operations.
+ */
+ if (sysctl_unprivileged_bpf_disabled && !bpf_cap)
+ goto put_token;
if (attr->insn_cnt == 0 ||
- attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
- return -E2BIG;
+ attr->insn_cnt > (bpf_cap ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) {
+ err = -E2BIG;
+ goto put_token;
+ }
if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
type != BPF_PROG_TYPE_CGROUP_SKB &&
- !bpf_capable())
- return -EPERM;
+ !bpf_cap)
+ goto put_token;
- if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN))
- return -EPERM;
- if (is_perfmon_prog_type(type) && !perfmon_capable())
- return -EPERM;
+ if (is_net_admin_prog_type(type) && !bpf_token_capable(token, CAP_NET_ADMIN))
+ goto put_token;
+ if (is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON))
+ goto put_token;
/* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog
* or btf, we need to check which one it is
@@ -2256,27 +2950,33 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
if (IS_ERR(dst_prog)) {
dst_prog = NULL;
attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd);
- if (IS_ERR(attach_btf))
- return -EINVAL;
+ if (IS_ERR(attach_btf)) {
+ err = -EINVAL;
+ goto put_token;
+ }
if (!btf_is_kernel(attach_btf)) {
/* attaching through specifying bpf_prog's BTF
* objects directly might be supported eventually
*/
btf_put(attach_btf);
- return -ENOTSUPP;
+ err = -ENOTSUPP;
+ goto put_token;
}
}
} else if (attr->attach_btf_id) {
/* fall back to vmlinux BTF, if BTF type ID is specified */
attach_btf = bpf_get_btf_vmlinux();
- if (IS_ERR(attach_btf))
- return PTR_ERR(attach_btf);
- if (!attach_btf)
- return -EINVAL;
+ if (IS_ERR(attach_btf)) {
+ err = PTR_ERR(attach_btf);
+ goto put_token;
+ }
+ if (!attach_btf) {
+ err = -EINVAL;
+ goto put_token;
+ }
btf_get(attach_btf);
}
- bpf_prog_load_fixup_attach_type(attr);
if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
attach_btf, attr->attach_btf_id,
dst_prog)) {
@@ -2284,7 +2984,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
bpf_prog_put(dst_prog);
if (attach_btf)
btf_put(attach_btf);
- return -EINVAL;
+ err = -EINVAL;
+ goto put_token;
}
/* plain bpf_prog allocation */
@@ -2294,19 +2995,21 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
bpf_prog_put(dst_prog);
if (attach_btf)
btf_put(attach_btf);
- return -ENOMEM;
+ err = -EINVAL;
+ goto put_token;
}
prog->expected_attach_type = attr->expected_attach_type;
+ prog->sleepable = !!(attr->prog_flags & BPF_F_SLEEPABLE);
prog->aux->attach_btf = attach_btf;
prog->aux->attach_btf_id = attr->attach_btf_id;
prog->aux->dst_prog = dst_prog;
- prog->aux->offload_requested = !!attr->prog_ifindex;
- prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
+ prog->aux->dev_bound = !!attr->prog_ifindex;
+ prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
- err = security_bpf_prog_alloc(prog->aux);
- if (err)
- goto free_prog;
+ /* move token into prog->aux, reuse taken refcnt */
+ prog->aux->token = token;
+ token = NULL;
prog->aux->user = get_current_user();
prog->len = attr->insn_cnt;
@@ -2315,33 +3018,74 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
if (copy_from_bpfptr(prog->insns,
make_bpfptr(attr->insns, uattr.is_kernel),
bpf_prog_insn_size(prog)) != 0)
- goto free_prog_sec;
+ goto free_prog;
+ /* copy eBPF program license from user space */
+ if (strncpy_from_bpfptr(license,
+ make_bpfptr(attr->license, uattr.is_kernel),
+ sizeof(license) - 1) < 0)
+ goto free_prog;
+ license[sizeof(license) - 1] = 0;
+
+ /* eBPF programs must be GPL compatible to use GPL-ed functions */
+ prog->gpl_compatible = license_is_gpl_compatible(license) ? 1 : 0;
+
+ if (attr->signature) {
+ err = bpf_prog_verify_signature(prog, attr, uattr.is_kernel);
+ if (err)
+ goto free_prog;
+ }
prog->orig_prog = NULL;
prog->jited = 0;
atomic64_set(&prog->aux->refcnt, 1);
- prog->gpl_compatible = is_gpl ? 1 : 0;
if (bpf_prog_is_dev_bound(prog->aux)) {
- err = bpf_prog_offload_init(prog, attr);
+ err = bpf_prog_dev_bound_init(prog, attr);
+ if (err)
+ goto free_prog;
+ }
+
+ if (type == BPF_PROG_TYPE_EXT && dst_prog &&
+ bpf_prog_is_dev_bound(dst_prog->aux)) {
+ err = bpf_prog_dev_bound_inherit(prog, dst_prog);
if (err)
- goto free_prog_sec;
+ goto free_prog;
+ }
+
+ /*
+ * Bookkeeping for managing the program attachment chain.
+ *
+ * It might be tempting to set attach_tracing_prog flag at the attachment
+ * time, but this will not prevent from loading bunch of tracing prog
+ * first, then attach them one to another.
+ *
+ * The flag attach_tracing_prog is set for the whole program lifecycle, and
+ * doesn't have to be cleared in bpf_tracing_link_release, since tracing
+ * programs cannot change attachment target.
+ */
+ if (type == BPF_PROG_TYPE_TRACING && dst_prog &&
+ dst_prog->type == BPF_PROG_TYPE_TRACING) {
+ prog->aux->attach_tracing_prog = true;
}
/* find program type: socket_filter vs tracing_filter */
err = find_prog_type(type, prog);
if (err < 0)
- goto free_prog_sec;
+ goto free_prog;
prog->aux->load_time = ktime_get_boottime_ns();
err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name,
sizeof(attr->prog_name));
if (err < 0)
+ goto free_prog;
+
+ err = security_bpf_prog_load(prog, attr, token, uattr.is_kernel);
+ if (err)
goto free_prog_sec;
/* run eBPF verifier */
- err = bpf_check(&prog, attr, uattr);
+ err = bpf_check(&prog, attr, uattr, uattr_size);
if (err < 0)
goto free_used_maps;
@@ -2349,6 +3093,10 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
if (err < 0)
goto free_used_maps;
+ err = bpf_prog_mark_insn_arrays_ready(prog);
+ if (err < 0)
+ goto free_used_maps;
+
err = bpf_prog_alloc_id(prog);
if (err)
goto free_used_maps;
@@ -2381,46 +3129,83 @@ free_used_maps:
* period before we can tear down JIT memory since symbols
* are already exposed under kallsyms.
*/
- __bpf_prog_put_noref(prog, prog->aux->func_cnt);
+ __bpf_prog_put_noref(prog, prog->aux->real_func_cnt);
return err;
+
free_prog_sec:
- free_uid(prog->aux->user);
- security_bpf_prog_free(prog->aux);
+ security_bpf_prog_free(prog);
free_prog:
+ free_uid(prog->aux->user);
if (prog->aux->attach_btf)
btf_put(prog->aux->attach_btf);
bpf_prog_free(prog);
+put_token:
+ bpf_token_put(token);
return err;
}
-#define BPF_OBJ_LAST_FIELD file_flags
+#define BPF_OBJ_LAST_FIELD path_fd
static int bpf_obj_pin(const union bpf_attr *attr)
{
- if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0)
+ int path_fd;
+
+ if (CHECK_ATTR(BPF_OBJ) || attr->file_flags & ~BPF_F_PATH_FD)
return -EINVAL;
- return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname));
+ /* path_fd has to be accompanied by BPF_F_PATH_FD flag */
+ if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd)
+ return -EINVAL;
+
+ path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD;
+ return bpf_obj_pin_user(attr->bpf_fd, path_fd,
+ u64_to_user_ptr(attr->pathname));
}
static int bpf_obj_get(const union bpf_attr *attr)
{
+ int path_fd;
+
if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 ||
- attr->file_flags & ~BPF_OBJ_FLAG_MASK)
+ attr->file_flags & ~(BPF_OBJ_FLAG_MASK | BPF_F_PATH_FD))
return -EINVAL;
- return bpf_obj_get_user(u64_to_user_ptr(attr->pathname),
+ /* path_fd has to be accompanied by BPF_F_PATH_FD flag */
+ if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd)
+ return -EINVAL;
+
+ path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD;
+ return bpf_obj_get_user(path_fd, u64_to_user_ptr(attr->pathname),
attr->file_flags);
}
-void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
- const struct bpf_link_ops *ops, struct bpf_prog *prog)
+/* bpf_link_init_sleepable() allows to specify whether BPF link itself has
+ * "sleepable" semantics, which normally would mean that BPF link's attach
+ * hook can dereference link or link's underlying program for some time after
+ * detachment due to RCU Tasks Trace-based lifetime protection scheme.
+ * BPF program itself can be non-sleepable, yet, because it's transitively
+ * reachable through BPF link, its freeing has to be delayed until after RCU
+ * Tasks Trace GP.
+ */
+void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type,
+ const struct bpf_link_ops *ops, struct bpf_prog *prog,
+ enum bpf_attach_type attach_type, bool sleepable)
{
+ WARN_ON(ops->dealloc && ops->dealloc_deferred);
atomic64_set(&link->refcnt, 1);
link->type = type;
+ link->sleepable = sleepable;
link->id = 0;
link->ops = ops;
link->prog = prog;
+ link->attach_type = attach_type;
+}
+
+void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
+ const struct bpf_link_ops *ops, struct bpf_prog *prog,
+ enum bpf_attach_type attach_type)
+{
+ bpf_link_init_sleepable(link, type, ops, prog, attach_type, false);
}
static void bpf_link_free_id(int id)
@@ -2435,10 +3220,12 @@ static void bpf_link_free_id(int id)
/* Clean up bpf_link and corresponding anon_inode file and FD. After
* anon_inode is created, bpf_link can't be just kfree()'d due to deferred
- * anon_inode's release() call. This helper marksbpf_link as
+ * anon_inode's release() call. This helper marks bpf_link as
* defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt
* is not decremented, it's the responsibility of a calling code that failed
* to complete bpf_link initialization.
+ * This helper eventually calls link's dealloc callback, but does not call
+ * link's release callback.
*/
void bpf_link_cleanup(struct bpf_link_primer *primer)
{
@@ -2453,17 +3240,59 @@ void bpf_link_inc(struct bpf_link *link)
atomic64_inc(&link->refcnt);
}
+static void bpf_link_dealloc(struct bpf_link *link)
+{
+ /* now that we know that bpf_link itself can't be reached, put underlying BPF program */
+ if (link->prog)
+ bpf_prog_put(link->prog);
+
+ /* free bpf_link and its containing memory */
+ if (link->ops->dealloc_deferred)
+ link->ops->dealloc_deferred(link);
+ else
+ link->ops->dealloc(link);
+}
+
+static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu)
+{
+ struct bpf_link *link = container_of(rcu, struct bpf_link, rcu);
+
+ bpf_link_dealloc(link);
+}
+
+static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu)
+{
+ if (rcu_trace_implies_rcu_gp())
+ bpf_link_defer_dealloc_rcu_gp(rcu);
+ else
+ call_rcu(rcu, bpf_link_defer_dealloc_rcu_gp);
+}
+
/* bpf_link_free is guaranteed to be called from process context */
static void bpf_link_free(struct bpf_link *link)
{
+ const struct bpf_link_ops *ops = link->ops;
+
bpf_link_free_id(link->id);
- if (link->prog) {
- /* detach BPF program, clean up used resources */
- link->ops->release(link);
- bpf_prog_put(link->prog);
+ /* detach BPF program, clean up used resources */
+ if (link->prog)
+ ops->release(link);
+ if (ops->dealloc_deferred) {
+ /* Schedule BPF link deallocation, which will only then
+ * trigger putting BPF program refcount.
+ * If underlying BPF program is sleepable or BPF link's target
+ * attach hookpoint is sleepable or otherwise requires RCU GPs
+ * to ensure link and its underlying BPF program is not
+ * reachable anymore, we need to first wait for RCU tasks
+ * trace sync, and then go through "classic" RCU grace period
+ */
+ if (link->sleepable || (link->prog && link->prog->sleepable))
+ call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
+ else
+ call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
+ } else if (ops->dealloc) {
+ bpf_link_dealloc(link);
}
- /* free bpf_link and its containing memory */
- link->ops->dealloc(link);
}
static void bpf_link_put_deferred(struct work_struct *work)
@@ -2473,27 +3302,31 @@ static void bpf_link_put_deferred(struct work_struct *work)
bpf_link_free(link);
}
-/* bpf_link_put can be called from atomic context, but ensures that resources
- * are freed from process context
+/* bpf_link_put might be called from atomic context. It needs to be called
+ * from sleepable context in order to acquire sleeping locks during the process.
*/
void bpf_link_put(struct bpf_link *link)
{
if (!atomic64_dec_and_test(&link->refcnt))
return;
- if (in_atomic()) {
- INIT_WORK(&link->work, bpf_link_put_deferred);
- schedule_work(&link->work);
- } else {
- bpf_link_free(link);
- }
+ INIT_WORK(&link->work, bpf_link_put_deferred);
+ schedule_work(&link->work);
+}
+EXPORT_SYMBOL(bpf_link_put);
+
+static void bpf_link_put_direct(struct bpf_link *link)
+{
+ if (!atomic64_dec_and_test(&link->refcnt))
+ return;
+ bpf_link_free(link);
}
static int bpf_link_release(struct inode *inode, struct file *filp)
{
struct bpf_link *link = filp->private_data;
- bpf_link_put(link);
+ bpf_link_put_direct(link);
return 0;
}
@@ -2513,23 +3346,44 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
{
const struct bpf_link *link = filp->private_data;
const struct bpf_prog *prog = link->prog;
+ enum bpf_link_type type = link->type;
char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
- bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
- seq_printf(m,
- "link_type:\t%s\n"
- "link_id:\t%u\n"
- "prog_tag:\t%s\n"
- "prog_id:\t%u\n",
- bpf_link_type_strs[link->type],
- link->id,
- prog_tag,
- prog->aux->id);
+ if (type < ARRAY_SIZE(bpf_link_type_strs) && bpf_link_type_strs[type]) {
+ if (link->type == BPF_LINK_TYPE_KPROBE_MULTI)
+ seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_KPROBE_MULTI_RETURN ?
+ "kretprobe_multi" : "kprobe_multi");
+ else if (link->type == BPF_LINK_TYPE_UPROBE_MULTI)
+ seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_UPROBE_MULTI_RETURN ?
+ "uretprobe_multi" : "uprobe_multi");
+ else
+ seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]);
+ } else {
+ WARN_ONCE(1, "missing BPF_LINK_TYPE(...) for link type %u\n", type);
+ seq_printf(m, "link_type:\t<%u>\n", type);
+ }
+ seq_printf(m, "link_id:\t%u\n", link->id);
+
+ if (prog) {
+ bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
+ seq_printf(m,
+ "prog_tag:\t%s\n"
+ "prog_id:\t%u\n",
+ prog_tag,
+ prog->aux->id);
+ }
if (link->ops->show_fdinfo)
link->ops->show_fdinfo(link, m);
}
#endif
+static __poll_t bpf_link_poll(struct file *file, struct poll_table_struct *pts)
+{
+ struct bpf_link *link = file->private_data;
+
+ return link->ops->poll(file, pts);
+}
+
static const struct file_operations bpf_link_fops = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = bpf_link_show_fdinfo,
@@ -2539,6 +3393,16 @@ static const struct file_operations bpf_link_fops = {
.write = bpf_dummy_write,
};
+static const struct file_operations bpf_link_fops_poll = {
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = bpf_link_show_fdinfo,
+#endif
+ .release = bpf_link_release,
+ .read = bpf_dummy_read,
+ .write = bpf_dummy_write,
+ .poll = bpf_link_poll,
+};
+
static int bpf_link_alloc_id(struct bpf_link *link)
{
int id;
@@ -2559,7 +3423,7 @@ static int bpf_link_alloc_id(struct bpf_link *link)
* pre-allocated resources are to be freed with bpf_cleanup() call. All the
* transient state is passed around in struct bpf_link_primer.
* This is preferred way to create and initialize bpf_link, especially when
- * there are complicated and expensive operations inbetween creating bpf_link
+ * there are complicated and expensive operations in between creating bpf_link
* itself and attaching it to BPF hook. By using bpf_link_prime() and
* bpf_link_settle() kernel code using bpf_link doesn't have to perform
* expensive (and potentially failing) roll back operations in a rare case
@@ -2581,7 +3445,9 @@ int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer)
return id;
}
- file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC);
+ file = anon_inode_getfile("bpf_link",
+ link->ops->poll ? &bpf_link_fops_poll : &bpf_link_fops,
+ link, O_CLOEXEC);
if (IS_ERR(file)) {
bpf_link_free_id(id);
put_unused_fd(fd);
@@ -2609,42 +3475,35 @@ int bpf_link_settle(struct bpf_link_primer *primer)
int bpf_link_new_fd(struct bpf_link *link)
{
- return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC);
+ return anon_inode_getfd("bpf-link",
+ link->ops->poll ? &bpf_link_fops_poll : &bpf_link_fops,
+ link, O_CLOEXEC);
}
struct bpf_link *bpf_link_get_from_fd(u32 ufd)
{
- struct fd f = fdget(ufd);
+ CLASS(fd, f)(ufd);
struct bpf_link *link;
- if (!f.file)
+ if (fd_empty(f))
return ERR_PTR(-EBADF);
- if (f.file->f_op != &bpf_link_fops) {
- fdput(f);
+ if (fd_file(f)->f_op != &bpf_link_fops && fd_file(f)->f_op != &bpf_link_fops_poll)
return ERR_PTR(-EINVAL);
- }
- link = f.file->private_data;
+ link = fd_file(f)->private_data;
bpf_link_inc(link);
- fdput(f);
-
return link;
}
-
-struct bpf_tracing_link {
- struct bpf_link link;
- enum bpf_attach_type attach_type;
- struct bpf_trampoline *trampoline;
- struct bpf_prog *tgt_prog;
-};
+EXPORT_SYMBOL_NS(bpf_link_get_from_fd, "BPF_INTERNAL");
static void bpf_tracing_link_release(struct bpf_link *link)
{
struct bpf_tracing_link *tr_link =
- container_of(link, struct bpf_tracing_link, link);
+ container_of(link, struct bpf_tracing_link, link.link);
- WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog,
- tr_link->trampoline));
+ WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link,
+ tr_link->trampoline,
+ tr_link->tgt_prog));
bpf_trampoline_put(tr_link->trampoline);
@@ -2656,7 +3515,7 @@ static void bpf_tracing_link_release(struct bpf_link *link)
static void bpf_tracing_link_dealloc(struct bpf_link *link)
{
struct bpf_tracing_link *tr_link =
- container_of(link, struct bpf_tracing_link, link);
+ container_of(link, struct bpf_tracing_link, link.link);
kfree(tr_link);
}
@@ -2665,20 +3524,30 @@ static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link,
struct seq_file *seq)
{
struct bpf_tracing_link *tr_link =
- container_of(link, struct bpf_tracing_link, link);
+ container_of(link, struct bpf_tracing_link, link.link);
+ u32 target_btf_id, target_obj_id;
+ bpf_trampoline_unpack_key(tr_link->trampoline->key,
+ &target_obj_id, &target_btf_id);
seq_printf(seq,
- "attach_type:\t%d\n",
- tr_link->attach_type);
+ "attach_type:\t%d\n"
+ "target_obj_id:\t%u\n"
+ "target_btf_id:\t%u\n"
+ "cookie:\t%llu\n",
+ link->attach_type,
+ target_obj_id,
+ target_btf_id,
+ tr_link->link.cookie);
}
static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
struct bpf_link_info *info)
{
struct bpf_tracing_link *tr_link =
- container_of(link, struct bpf_tracing_link, link);
+ container_of(link, struct bpf_tracing_link, link.link);
- info->tracing.attach_type = tr_link->attach_type;
+ info->tracing.attach_type = link->attach_type;
+ info->tracing.cookie = tr_link->link.cookie;
bpf_trampoline_unpack_key(tr_link->trampoline->key,
&info->tracing.target_obj_id,
&info->tracing.target_btf_id);
@@ -2695,7 +3564,9 @@ static const struct bpf_link_ops bpf_tracing_link_lops = {
static int bpf_tracing_prog_attach(struct bpf_prog *prog,
int tgt_prog_fd,
- u32 btf_id)
+ u32 btf_id,
+ u64 bpf_cookie,
+ enum bpf_attach_type attach_type)
{
struct bpf_link_primer link_primer;
struct bpf_prog *tgt_prog = NULL;
@@ -2736,7 +3607,12 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
}
if (tgt_prog_fd) {
- /* For now we only allow new targets for BPF_PROG_TYPE_EXT */
+ /*
+ * For now we only allow new targets for BPF_PROG_TYPE_EXT. If this
+ * part would be changed to implement the same for
+ * BPF_PROG_TYPE_TRACING, do not forget to update the way how
+ * attach_tracing_prog flag is set.
+ */
if (prog->type != BPF_PROG_TYPE_EXT) {
err = -EINVAL;
goto out_put_prog;
@@ -2757,9 +3633,10 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
err = -ENOMEM;
goto out_put_prog;
}
- bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING,
- &bpf_tracing_link_lops, prog);
- link->attach_type = prog->expected_attach_type;
+ bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING,
+ &bpf_tracing_link_lops, prog, attach_type);
+
+ link->link.cookie = bpf_cookie;
mutex_lock(&prog->aux->dst_mutex);
@@ -2770,7 +3647,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
* in prog->aux
*
* - if prog->aux->dst_trampoline is NULL, the program has already been
- * attached to a target and its initial target was cleared (below)
+ * attached to a target and its initial target was cleared (below)
*
* - if tgt_prog != NULL, the caller specified tgt_prog_fd +
* target_btf_id using the link_create API.
@@ -2780,6 +3657,10 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
*
* - if prog->aux->dst_trampoline and tgt_prog is NULL, the program
* was detached and is going for re-attachment.
+ *
+ * - if prog->aux->dst_trampoline is NULL and tgt_prog and prog->aux->attach_btf
+ * are NULL, then program was already attached and user did not provide
+ * tgt_prog_fd so we have no way to find out or create trampoline
*/
if (!prog->aux->dst_trampoline && !tgt_prog) {
/*
@@ -2793,6 +3674,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
err = -EINVAL;
goto out_unlock;
}
+ /* We can allow re-attach only if we have valid attach_btf. */
+ if (!prog->aux->attach_btf) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
btf_id = prog->aux->attach_btf_id;
key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, btf_id);
}
@@ -2810,6 +3696,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
if (err)
goto out_unlock;
+ if (tgt_info.tgt_mod) {
+ module_put(prog->aux->mod);
+ prog->aux->mod = tgt_info.tgt_mod;
+ }
+
tr = bpf_trampoline_get(key, &tgt_info);
if (!tr) {
err = -ENOMEM;
@@ -2827,11 +3718,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
tgt_prog = prog->aux->dst_prog;
}
- err = bpf_link_prime(&link->link, &link_primer);
+ err = bpf_link_prime(&link->link.link, &link_primer);
if (err)
goto out_unlock;
- err = bpf_trampoline_link_prog(prog, tr);
+ err = bpf_trampoline_link_prog(&link->link, tr, tgt_prog);
if (err) {
bpf_link_cleanup(&link_primer);
link = NULL;
@@ -2869,17 +3760,12 @@ out_put_prog:
return err;
}
-struct bpf_raw_tp_link {
- struct bpf_link link;
- struct bpf_raw_event_map *btp;
-};
-
static void bpf_raw_tp_link_release(struct bpf_link *link)
{
struct bpf_raw_tp_link *raw_tp =
container_of(link, struct bpf_raw_tp_link, link);
- bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog);
+ bpf_probe_unregister(raw_tp->btp, raw_tp);
bpf_put_raw_tracepoint(raw_tp->btp);
}
@@ -2898,8 +3784,29 @@ static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link,
container_of(link, struct bpf_raw_tp_link, link);
seq_printf(seq,
- "tp_name:\t%s\n",
- raw_tp_link->btp->tp->name);
+ "tp_name:\t%s\n"
+ "cookie:\t%llu\n",
+ raw_tp_link->btp->tp->name,
+ raw_tp_link->cookie);
+}
+
+static int bpf_copy_to_user(char __user *ubuf, const char *buf, u32 ulen,
+ u32 len)
+{
+ if (ulen >= len + 1) {
+ if (copy_to_user(ubuf, buf, len + 1))
+ return -EFAULT;
+ } else {
+ char zero = '\0';
+
+ if (copy_to_user(ubuf, buf, ulen - 1))
+ return -EFAULT;
+ if (put_user(zero, ubuf + ulen - 1))
+ return -EFAULT;
+ return -ENOSPC;
+ }
+
+ return 0;
}
static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
@@ -2916,29 +3823,17 @@ static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
return -EINVAL;
info->raw_tracepoint.tp_name_len = tp_len + 1;
+ info->raw_tracepoint.cookie = raw_tp_link->cookie;
if (!ubuf)
return 0;
- if (ulen >= tp_len + 1) {
- if (copy_to_user(ubuf, tp_name, tp_len + 1))
- return -EFAULT;
- } else {
- char zero = '\0';
-
- if (copy_to_user(ubuf, tp_name, ulen - 1))
- return -EFAULT;
- if (put_user(zero, ubuf + ulen - 1))
- return -EFAULT;
- return -ENOSPC;
- }
-
- return 0;
+ return bpf_copy_to_user(ubuf, tp_name, ulen, tp_len);
}
static const struct bpf_link_ops bpf_raw_tp_link_lops = {
.release = bpf_raw_tp_link_release,
- .dealloc = bpf_raw_tp_link_dealloc,
+ .dealloc_deferred = bpf_raw_tp_link_dealloc,
.show_fdinfo = bpf_raw_tp_link_show_fdinfo,
.fill_link_info = bpf_raw_tp_link_fill_link_info,
};
@@ -2965,9 +3860,296 @@ static void bpf_perf_link_dealloc(struct bpf_link *link)
kfree(perf_link);
}
+static int bpf_perf_link_fill_common(const struct perf_event *event,
+ char __user *uname, u32 *ulenp,
+ u64 *probe_offset, u64 *probe_addr,
+ u32 *fd_type, unsigned long *missed)
+{
+ const char *buf;
+ u32 prog_id, ulen;
+ size_t len;
+ int err;
+
+ ulen = *ulenp;
+ if (!ulen ^ !uname)
+ return -EINVAL;
+
+ err = bpf_get_perf_event_info(event, &prog_id, fd_type, &buf,
+ probe_offset, probe_addr, missed);
+ if (err)
+ return err;
+
+ if (buf) {
+ len = strlen(buf);
+ *ulenp = len + 1;
+ } else {
+ *ulenp = 1;
+ }
+ if (!uname)
+ return 0;
+
+ if (buf) {
+ err = bpf_copy_to_user(uname, buf, ulen, len);
+ if (err)
+ return err;
+ } else {
+ char zero = '\0';
+
+ if (put_user(zero, uname))
+ return -EFAULT;
+ }
+ return 0;
+}
+
+#ifdef CONFIG_KPROBE_EVENTS
+static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
+ struct bpf_link_info *info)
+{
+ unsigned long missed;
+ char __user *uname;
+ u64 addr, offset;
+ u32 ulen, type;
+ int err;
+
+ uname = u64_to_user_ptr(info->perf_event.kprobe.func_name);
+ ulen = info->perf_event.kprobe.name_len;
+ err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &addr,
+ &type, &missed);
+ if (err)
+ return err;
+ if (type == BPF_FD_TYPE_KRETPROBE)
+ info->perf_event.type = BPF_PERF_EVENT_KRETPROBE;
+ else
+ info->perf_event.type = BPF_PERF_EVENT_KPROBE;
+ info->perf_event.kprobe.name_len = ulen;
+ info->perf_event.kprobe.offset = offset;
+ info->perf_event.kprobe.missed = missed;
+ if (!kallsyms_show_value(current_cred()))
+ addr = 0;
+ info->perf_event.kprobe.addr = addr;
+ info->perf_event.kprobe.cookie = event->bpf_cookie;
+ return 0;
+}
+
+static void bpf_perf_link_fdinfo_kprobe(const struct perf_event *event,
+ struct seq_file *seq)
+{
+ const char *name;
+ int err;
+ u32 prog_id, type;
+ u64 offset, addr;
+ unsigned long missed;
+
+ err = bpf_get_perf_event_info(event, &prog_id, &type, &name,
+ &offset, &addr, &missed);
+ if (err)
+ return;
+
+ seq_printf(seq,
+ "name:\t%s\n"
+ "offset:\t%#llx\n"
+ "missed:\t%lu\n"
+ "addr:\t%#llx\n"
+ "event_type:\t%s\n"
+ "cookie:\t%llu\n",
+ name, offset, missed, addr,
+ type == BPF_FD_TYPE_KRETPROBE ? "kretprobe" : "kprobe",
+ event->bpf_cookie);
+}
+#endif
+
+#ifdef CONFIG_UPROBE_EVENTS
+static int bpf_perf_link_fill_uprobe(const struct perf_event *event,
+ struct bpf_link_info *info)
+{
+ u64 ref_ctr_offset, offset;
+ char __user *uname;
+ u32 ulen, type;
+ int err;
+
+ uname = u64_to_user_ptr(info->perf_event.uprobe.file_name);
+ ulen = info->perf_event.uprobe.name_len;
+ err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &ref_ctr_offset,
+ &type, NULL);
+ if (err)
+ return err;
+
+ if (type == BPF_FD_TYPE_URETPROBE)
+ info->perf_event.type = BPF_PERF_EVENT_URETPROBE;
+ else
+ info->perf_event.type = BPF_PERF_EVENT_UPROBE;
+ info->perf_event.uprobe.name_len = ulen;
+ info->perf_event.uprobe.offset = offset;
+ info->perf_event.uprobe.cookie = event->bpf_cookie;
+ info->perf_event.uprobe.ref_ctr_offset = ref_ctr_offset;
+ return 0;
+}
+
+static void bpf_perf_link_fdinfo_uprobe(const struct perf_event *event,
+ struct seq_file *seq)
+{
+ const char *name;
+ int err;
+ u32 prog_id, type;
+ u64 offset, ref_ctr_offset;
+ unsigned long missed;
+
+ err = bpf_get_perf_event_info(event, &prog_id, &type, &name,
+ &offset, &ref_ctr_offset, &missed);
+ if (err)
+ return;
+
+ seq_printf(seq,
+ "name:\t%s\n"
+ "offset:\t%#llx\n"
+ "ref_ctr_offset:\t%#llx\n"
+ "event_type:\t%s\n"
+ "cookie:\t%llu\n",
+ name, offset, ref_ctr_offset,
+ type == BPF_FD_TYPE_URETPROBE ? "uretprobe" : "uprobe",
+ event->bpf_cookie);
+}
+#endif
+
+static int bpf_perf_link_fill_probe(const struct perf_event *event,
+ struct bpf_link_info *info)
+{
+#ifdef CONFIG_KPROBE_EVENTS
+ if (event->tp_event->flags & TRACE_EVENT_FL_KPROBE)
+ return bpf_perf_link_fill_kprobe(event, info);
+#endif
+#ifdef CONFIG_UPROBE_EVENTS
+ if (event->tp_event->flags & TRACE_EVENT_FL_UPROBE)
+ return bpf_perf_link_fill_uprobe(event, info);
+#endif
+ return -EOPNOTSUPP;
+}
+
+static int bpf_perf_link_fill_tracepoint(const struct perf_event *event,
+ struct bpf_link_info *info)
+{
+ char __user *uname;
+ u32 ulen;
+ int err;
+
+ uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name);
+ ulen = info->perf_event.tracepoint.name_len;
+ err = bpf_perf_link_fill_common(event, uname, &ulen, NULL, NULL, NULL, NULL);
+ if (err)
+ return err;
+
+ info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT;
+ info->perf_event.tracepoint.name_len = ulen;
+ info->perf_event.tracepoint.cookie = event->bpf_cookie;
+ return 0;
+}
+
+static int bpf_perf_link_fill_perf_event(const struct perf_event *event,
+ struct bpf_link_info *info)
+{
+ info->perf_event.event.type = event->attr.type;
+ info->perf_event.event.config = event->attr.config;
+ info->perf_event.event.cookie = event->bpf_cookie;
+ info->perf_event.type = BPF_PERF_EVENT_EVENT;
+ return 0;
+}
+
+static int bpf_perf_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_perf_link *perf_link;
+ const struct perf_event *event;
+
+ perf_link = container_of(link, struct bpf_perf_link, link);
+ event = perf_get_event(perf_link->perf_file);
+ if (IS_ERR(event))
+ return PTR_ERR(event);
+
+ switch (event->prog->type) {
+ case BPF_PROG_TYPE_PERF_EVENT:
+ return bpf_perf_link_fill_perf_event(event, info);
+ case BPF_PROG_TYPE_TRACEPOINT:
+ return bpf_perf_link_fill_tracepoint(event, info);
+ case BPF_PROG_TYPE_KPROBE:
+ return bpf_perf_link_fill_probe(event, info);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static void bpf_perf_event_link_show_fdinfo(const struct perf_event *event,
+ struct seq_file *seq)
+{
+ seq_printf(seq,
+ "type:\t%u\n"
+ "config:\t%llu\n"
+ "event_type:\t%s\n"
+ "cookie:\t%llu\n",
+ event->attr.type, event->attr.config,
+ "event", event->bpf_cookie);
+}
+
+static void bpf_tracepoint_link_show_fdinfo(const struct perf_event *event,
+ struct seq_file *seq)
+{
+ int err;
+ const char *name;
+ u32 prog_id;
+
+ err = bpf_get_perf_event_info(event, &prog_id, NULL, &name, NULL,
+ NULL, NULL);
+ if (err)
+ return;
+
+ seq_printf(seq,
+ "tp_name:\t%s\n"
+ "event_type:\t%s\n"
+ "cookie:\t%llu\n",
+ name, "tracepoint", event->bpf_cookie);
+}
+
+static void bpf_probe_link_show_fdinfo(const struct perf_event *event,
+ struct seq_file *seq)
+{
+#ifdef CONFIG_KPROBE_EVENTS
+ if (event->tp_event->flags & TRACE_EVENT_FL_KPROBE)
+ return bpf_perf_link_fdinfo_kprobe(event, seq);
+#endif
+
+#ifdef CONFIG_UPROBE_EVENTS
+ if (event->tp_event->flags & TRACE_EVENT_FL_UPROBE)
+ return bpf_perf_link_fdinfo_uprobe(event, seq);
+#endif
+}
+
+static void bpf_perf_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_perf_link *perf_link;
+ const struct perf_event *event;
+
+ perf_link = container_of(link, struct bpf_perf_link, link);
+ event = perf_get_event(perf_link->perf_file);
+ if (IS_ERR(event))
+ return;
+
+ switch (event->prog->type) {
+ case BPF_PROG_TYPE_PERF_EVENT:
+ return bpf_perf_event_link_show_fdinfo(event, seq);
+ case BPF_PROG_TYPE_TRACEPOINT:
+ return bpf_tracepoint_link_show_fdinfo(event, seq);
+ case BPF_PROG_TYPE_KPROBE:
+ return bpf_probe_link_show_fdinfo(event, seq);
+ default:
+ return;
+ }
+}
+
static const struct bpf_link_ops bpf_perf_link_lops = {
.release = bpf_perf_link_release,
.dealloc = bpf_perf_link_dealloc,
+ .fill_link_info = bpf_perf_link_fill_link_info,
+ .show_fdinfo = bpf_perf_link_show_fdinfo,
};
static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
@@ -2990,7 +4172,8 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro
err = -ENOMEM;
goto out_put_file;
}
- bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog);
+ bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog,
+ attr->link_create.attach_type);
link->perf_file = perf_file;
err = bpf_link_prime(&link->link, &link_primer);
@@ -3014,77 +4197,64 @@ out_put_file:
fput(perf_file);
return err;
}
+#else
+static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ return -EOPNOTSUPP;
+}
#endif /* CONFIG_PERF_EVENTS */
-#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
-
-static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
+static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
+ const char __user *user_tp_name, u64 cookie,
+ enum bpf_attach_type attach_type)
{
struct bpf_link_primer link_primer;
struct bpf_raw_tp_link *link;
struct bpf_raw_event_map *btp;
- struct bpf_prog *prog;
const char *tp_name;
char buf[128];
int err;
- if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
- return -EINVAL;
-
- prog = bpf_prog_get(attr->raw_tracepoint.prog_fd);
- if (IS_ERR(prog))
- return PTR_ERR(prog);
-
switch (prog->type) {
case BPF_PROG_TYPE_TRACING:
case BPF_PROG_TYPE_EXT:
case BPF_PROG_TYPE_LSM:
- if (attr->raw_tracepoint.name) {
+ if (user_tp_name)
/* The attach point for this category of programs
* should be specified via btf_id during program load.
*/
- err = -EINVAL;
- goto out_put_prog;
- }
+ return -EINVAL;
if (prog->type == BPF_PROG_TYPE_TRACING &&
prog->expected_attach_type == BPF_TRACE_RAW_TP) {
tp_name = prog->aux->attach_func_name;
break;
}
- err = bpf_tracing_prog_attach(prog, 0, 0);
- if (err >= 0)
- return err;
- goto out_put_prog;
+ return bpf_tracing_prog_attach(prog, 0, 0, 0, attach_type);
case BPF_PROG_TYPE_RAW_TRACEPOINT:
case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
- if (strncpy_from_user(buf,
- u64_to_user_ptr(attr->raw_tracepoint.name),
- sizeof(buf) - 1) < 0) {
- err = -EFAULT;
- goto out_put_prog;
- }
+ if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0)
+ return -EFAULT;
buf[sizeof(buf) - 1] = 0;
tp_name = buf;
break;
default:
- err = -EINVAL;
- goto out_put_prog;
+ return -EINVAL;
}
btp = bpf_get_raw_tracepoint(tp_name);
- if (!btp) {
- err = -ENOENT;
- goto out_put_prog;
- }
+ if (!btp)
+ return -ENOENT;
link = kzalloc(sizeof(*link), GFP_USER);
if (!link) {
err = -ENOMEM;
goto out_put_btp;
}
- bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
- &bpf_raw_tp_link_lops, prog);
+ bpf_link_init_sleepable(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
+ &bpf_raw_tp_link_lops, prog, attach_type,
+ tracepoint_is_faultable(btp->tp));
link->btp = btp;
+ link->cookie = cookie;
err = bpf_link_prime(&link->link, &link_primer);
if (err) {
@@ -3092,7 +4262,7 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
goto out_put_btp;
}
- err = bpf_probe_register(link->btp, prog);
+ err = bpf_probe_register(link->btp, link);
if (err) {
bpf_link_cleanup(&link_primer);
goto out_put_btp;
@@ -3102,32 +4272,31 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
out_put_btp:
bpf_put_raw_tracepoint(btp);
-out_put_prog:
- bpf_prog_put(prog);
return err;
}
-static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
- enum bpf_attach_type attach_type)
+#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.cookie
+
+static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
{
- switch (prog->type) {
- case BPF_PROG_TYPE_CGROUP_SOCK:
- case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
- case BPF_PROG_TYPE_CGROUP_SOCKOPT:
- case BPF_PROG_TYPE_SK_LOOKUP:
- return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
- case BPF_PROG_TYPE_CGROUP_SKB:
- if (!capable(CAP_NET_ADMIN))
- /* cg-skb progs can be loaded by unpriv user.
- * check permissions at attach time.
- */
- return -EPERM;
- return prog->enforce_expected_attach_type &&
- prog->expected_attach_type != attach_type ?
- -EINVAL : 0;
- default:
- return 0;
- }
+ struct bpf_prog *prog;
+ void __user *tp_name;
+ __u64 cookie;
+ int fd;
+
+ if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
+ return -EINVAL;
+
+ prog = bpf_prog_get(attr->raw_tracepoint.prog_fd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ tp_name = u64_to_user_ptr(attr->raw_tracepoint.name);
+ cookie = attr->raw_tracepoint.cookie;
+ fd = bpf_raw_tp_link_attach(prog, tp_name, cookie, prog->expected_attach_type);
+ if (fd < 0)
+ bpf_prog_put(prog);
+ return fd;
}
static enum bpf_prog_type
@@ -3146,14 +4315,19 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UNIX_CONNECT:
case BPF_CGROUP_INET4_GETPEERNAME:
case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
case BPF_CGROUP_INET4_GETSOCKNAME:
case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UNIX_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
return BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
case BPF_CGROUP_SOCK_OPS:
return BPF_PROG_TYPE_SOCK_OPS;
@@ -3175,20 +4349,133 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
case BPF_CGROUP_SETSOCKOPT:
return BPF_PROG_TYPE_CGROUP_SOCKOPT;
case BPF_TRACE_ITER:
+ case BPF_TRACE_RAW_TP:
+ case BPF_TRACE_FENTRY:
+ case BPF_TRACE_FEXIT:
+ case BPF_MODIFY_RETURN:
return BPF_PROG_TYPE_TRACING;
+ case BPF_LSM_MAC:
+ return BPF_PROG_TYPE_LSM;
case BPF_SK_LOOKUP:
return BPF_PROG_TYPE_SK_LOOKUP;
case BPF_XDP:
return BPF_PROG_TYPE_XDP;
+ case BPF_LSM_CGROUP:
+ return BPF_PROG_TYPE_LSM;
+ case BPF_TCX_INGRESS:
+ case BPF_TCX_EGRESS:
+ case BPF_NETKIT_PRIMARY:
+ case BPF_NETKIT_PEER:
+ return BPF_PROG_TYPE_SCHED_CLS;
default:
return BPF_PROG_TYPE_UNSPEC;
}
}
-#define BPF_PROG_ATTACH_LAST_FIELD replace_bpf_fd
+static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
+ enum bpf_attach_type attach_type)
+{
+ enum bpf_prog_type ptype;
+
+ switch (prog->type) {
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ case BPF_PROG_TYPE_SK_LOOKUP:
+ return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ if (!bpf_token_capable(prog->aux->token, CAP_NET_ADMIN))
+ /* cg-skb progs can be loaded by unpriv user.
+ * check permissions at attach time.
+ */
+ return -EPERM;
+
+ ptype = attach_type_to_prog_type(attach_type);
+ if (prog->type != ptype)
+ return -EINVAL;
+
+ return prog->enforce_expected_attach_type &&
+ prog->expected_attach_type != attach_type ?
+ -EINVAL : 0;
+ case BPF_PROG_TYPE_EXT:
+ return 0;
+ case BPF_PROG_TYPE_NETFILTER:
+ if (attach_type != BPF_NETFILTER)
+ return -EINVAL;
+ return 0;
+ case BPF_PROG_TYPE_PERF_EVENT:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ if (attach_type != BPF_PERF_EVENT)
+ return -EINVAL;
+ return 0;
+ case BPF_PROG_TYPE_KPROBE:
+ if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI &&
+ attach_type != BPF_TRACE_KPROBE_MULTI)
+ return -EINVAL;
+ if (prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION &&
+ attach_type != BPF_TRACE_KPROBE_SESSION)
+ return -EINVAL;
+ if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI &&
+ attach_type != BPF_TRACE_UPROBE_MULTI)
+ return -EINVAL;
+ if (prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION &&
+ attach_type != BPF_TRACE_UPROBE_SESSION)
+ return -EINVAL;
+ if (attach_type != BPF_PERF_EVENT &&
+ attach_type != BPF_TRACE_KPROBE_MULTI &&
+ attach_type != BPF_TRACE_KPROBE_SESSION &&
+ attach_type != BPF_TRACE_UPROBE_MULTI &&
+ attach_type != BPF_TRACE_UPROBE_SESSION)
+ return -EINVAL;
+ return 0;
+ case BPF_PROG_TYPE_SCHED_CLS:
+ if (attach_type != BPF_TCX_INGRESS &&
+ attach_type != BPF_TCX_EGRESS &&
+ attach_type != BPF_NETKIT_PRIMARY &&
+ attach_type != BPF_NETKIT_PEER)
+ return -EINVAL;
+ return 0;
+ default:
+ ptype = attach_type_to_prog_type(attach_type);
+ if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type)
+ return -EINVAL;
+ return 0;
+ }
+}
+
+static bool is_cgroup_prog_type(enum bpf_prog_type ptype, enum bpf_attach_type atype,
+ bool check_atype)
+{
+ switch (ptype) {
+ case BPF_PROG_TYPE_CGROUP_DEVICE:
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ case BPF_PROG_TYPE_CGROUP_SYSCTL:
+ case BPF_PROG_TYPE_SOCK_OPS:
+ return true;
+ case BPF_PROG_TYPE_LSM:
+ return check_atype ? atype == BPF_LSM_CGROUP : true;
+ default:
+ return false;
+ }
+}
+
+#define BPF_PROG_ATTACH_LAST_FIELD expected_revision
-#define BPF_F_ATTACH_MASK \
- (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI | BPF_F_REPLACE)
+#define BPF_F_ATTACH_MASK_BASE \
+ (BPF_F_ALLOW_OVERRIDE | \
+ BPF_F_ALLOW_MULTI | \
+ BPF_F_REPLACE | \
+ BPF_F_PREORDER)
+
+#define BPF_F_ATTACH_MASK_MPROG \
+ (BPF_F_REPLACE | \
+ BPF_F_BEFORE | \
+ BPF_F_AFTER | \
+ BPF_F_ID | \
+ BPF_F_LINK)
static int bpf_prog_attach(const union bpf_attr *attr)
{
@@ -3199,12 +4486,22 @@ static int bpf_prog_attach(const union bpf_attr *attr)
if (CHECK_ATTR(BPF_PROG_ATTACH))
return -EINVAL;
- if (attr->attach_flags & ~BPF_F_ATTACH_MASK)
- return -EINVAL;
-
ptype = attach_type_to_prog_type(attr->attach_type);
if (ptype == BPF_PROG_TYPE_UNSPEC)
return -EINVAL;
+ if (bpf_mprog_supported(ptype)) {
+ if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG)
+ return -EINVAL;
+ } else if (is_cgroup_prog_type(ptype, 0, false)) {
+ if (attr->attach_flags & ~(BPF_F_ATTACH_MASK_BASE | BPF_F_ATTACH_MASK_MPROG))
+ return -EINVAL;
+ } else {
+ if (attr->attach_flags & ~BPF_F_ATTACH_MASK_BASE)
+ return -EINVAL;
+ if (attr->relative_fd ||
+ attr->expected_revision)
+ return -EINVAL;
+ }
prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
if (IS_ERR(prog))
@@ -3215,6 +4512,11 @@ static int bpf_prog_attach(const union bpf_attr *attr)
return -EINVAL;
}
+ if (is_cgroup_prog_type(ptype, prog->expected_attach_type, true)) {
+ ret = cgroup_bpf_prog_attach(attr, ptype, prog);
+ goto out;
+ }
+
switch (ptype) {
case BPF_PROG_TYPE_SK_SKB:
case BPF_PROG_TYPE_SK_MSG:
@@ -3226,43 +4528,64 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_PROG_TYPE_FLOW_DISSECTOR:
ret = netns_bpf_prog_attach(attr, prog);
break;
- case BPF_PROG_TYPE_CGROUP_DEVICE:
- case BPF_PROG_TYPE_CGROUP_SKB:
- case BPF_PROG_TYPE_CGROUP_SOCK:
- case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
- case BPF_PROG_TYPE_CGROUP_SOCKOPT:
- case BPF_PROG_TYPE_CGROUP_SYSCTL:
- case BPF_PROG_TYPE_SOCK_OPS:
- ret = cgroup_bpf_prog_attach(attr, ptype, prog);
+ case BPF_PROG_TYPE_SCHED_CLS:
+ if (attr->attach_type == BPF_TCX_INGRESS ||
+ attr->attach_type == BPF_TCX_EGRESS)
+ ret = tcx_prog_attach(attr, prog);
+ else
+ ret = netkit_prog_attach(attr, prog);
break;
default:
ret = -EINVAL;
}
-
+out:
if (ret)
bpf_prog_put(prog);
return ret;
}
-#define BPF_PROG_DETACH_LAST_FIELD attach_type
+#define BPF_PROG_DETACH_LAST_FIELD expected_revision
static int bpf_prog_detach(const union bpf_attr *attr)
{
+ struct bpf_prog *prog = NULL;
enum bpf_prog_type ptype;
+ int ret;
if (CHECK_ATTR(BPF_PROG_DETACH))
return -EINVAL;
ptype = attach_type_to_prog_type(attr->attach_type);
+ if (bpf_mprog_supported(ptype)) {
+ if (ptype == BPF_PROG_TYPE_UNSPEC)
+ return -EINVAL;
+ if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG)
+ return -EINVAL;
+ if (attr->attach_bpf_fd) {
+ prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+ }
+ } else if (is_cgroup_prog_type(ptype, 0, false)) {
+ if (attr->attach_flags || attr->relative_fd)
+ return -EINVAL;
+ } else if (attr->attach_flags ||
+ attr->relative_fd ||
+ attr->expected_revision) {
+ return -EINVAL;
+ }
switch (ptype) {
case BPF_PROG_TYPE_SK_MSG:
case BPF_PROG_TYPE_SK_SKB:
- return sock_map_prog_detach(attr, ptype);
+ ret = sock_map_prog_detach(attr, ptype);
+ break;
case BPF_PROG_TYPE_LIRC_MODE2:
- return lirc_prog_detach(attr);
+ ret = lirc_prog_detach(attr);
+ break;
case BPF_PROG_TYPE_FLOW_DISSECTOR:
- return netns_bpf_prog_detach(attr, ptype);
+ ret = netns_bpf_prog_detach(attr, ptype);
+ break;
case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SKB:
case BPF_PROG_TYPE_CGROUP_SOCK:
@@ -3270,18 +4593,31 @@ static int bpf_prog_detach(const union bpf_attr *attr)
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
case BPF_PROG_TYPE_CGROUP_SYSCTL:
case BPF_PROG_TYPE_SOCK_OPS:
- return cgroup_bpf_prog_detach(attr, ptype);
+ case BPF_PROG_TYPE_LSM:
+ ret = cgroup_bpf_prog_detach(attr, ptype);
+ break;
+ case BPF_PROG_TYPE_SCHED_CLS:
+ if (attr->attach_type == BPF_TCX_INGRESS ||
+ attr->attach_type == BPF_TCX_EGRESS)
+ ret = tcx_prog_detach(attr, prog);
+ else
+ ret = netkit_prog_detach(attr, prog);
+ break;
default:
- return -EINVAL;
+ ret = -EINVAL;
}
+
+ if (prog)
+ bpf_prog_put(prog);
+ return ret;
}
-#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt
+#define BPF_PROG_QUERY_LAST_FIELD query.revision
static int bpf_prog_query(const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
- if (!capable(CAP_NET_ADMIN))
+ if (!bpf_net_capable())
return -EPERM;
if (CHECK_ATTR(BPF_PROG_QUERY))
return -EINVAL;
@@ -3299,31 +4635,48 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_CGROUP_INET6_POST_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UNIX_CONNECT:
case BPF_CGROUP_INET4_GETPEERNAME:
case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
case BPF_CGROUP_INET4_GETSOCKNAME:
case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UNIX_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
case BPF_CGROUP_SOCK_OPS:
case BPF_CGROUP_DEVICE:
case BPF_CGROUP_SYSCTL:
case BPF_CGROUP_GETSOCKOPT:
case BPF_CGROUP_SETSOCKOPT:
+ case BPF_LSM_CGROUP:
return cgroup_bpf_prog_query(attr, uattr);
case BPF_LIRC_MODE2:
return lirc_prog_query(attr, uattr);
case BPF_FLOW_DISSECTOR:
case BPF_SK_LOOKUP:
return netns_bpf_prog_query(attr, uattr);
+ case BPF_SK_SKB_STREAM_PARSER:
+ case BPF_SK_SKB_STREAM_VERDICT:
+ case BPF_SK_MSG_VERDICT:
+ case BPF_SK_SKB_VERDICT:
+ return sock_map_bpf_prog_query(attr, uattr);
+ case BPF_TCX_INGRESS:
+ case BPF_TCX_EGRESS:
+ return tcx_prog_query(attr, uattr);
+ case BPF_NETKIT_PRIMARY:
+ case BPF_NETKIT_PEER:
+ return netkit_prog_query(attr, uattr);
default:
return -EINVAL;
}
}
-#define BPF_PROG_TEST_RUN_LAST_FIELD test.cpu
+#define BPF_PROG_TEST_RUN_LAST_FIELD test.batch_size
static int bpf_prog_test_run(const union bpf_attr *attr,
union bpf_attr __user *uattr)
@@ -3563,6 +4916,12 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog,
continue;
}
+ if ((BPF_CLASS(code) == BPF_LDX || BPF_CLASS(code) == BPF_STX ||
+ BPF_CLASS(code) == BPF_ST) && BPF_MODE(code) == BPF_PROBE_MEM32) {
+ insns[i].code = BPF_CLASS(code) | BPF_SIZE(code) | BPF_MEM;
+ continue;
+ }
+
if (code != (BPF_LD | BPF_IMM | BPF_DW))
continue;
@@ -3616,6 +4975,7 @@ static int bpf_prog_get_info_by_fd(struct file *file,
union bpf_attr __user *uattr)
{
struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+ struct btf *attach_btf = bpf_prog_get_target_btf(prog);
struct bpf_prog_info info;
u32 info_len = attr->info.info_len;
struct bpf_prog_kstats stats;
@@ -3669,6 +5029,8 @@ static int bpf_prog_get_info_by_fd(struct file *file,
info.recursion_misses = stats.misses;
info.verified_insns = prog->aux->verified_insns;
+ if (prog->aux->btf)
+ info.btf_id = btf_obj_id(prog->aux->btf);
if (!bpf_capable()) {
info.jited_prog_len = 0;
@@ -3687,22 +5049,22 @@ static int bpf_prog_get_info_by_fd(struct file *file,
struct bpf_insn *insns_sanitized;
bool fault;
- if (prog->blinded && !bpf_dump_raw_ok(file->f_cred)) {
+ if (!prog->blinded || bpf_dump_raw_ok(file->f_cred)) {
+ insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred);
+ if (!insns_sanitized)
+ return -ENOMEM;
+ uinsns = u64_to_user_ptr(info.xlated_prog_insns);
+ ulen = min_t(u32, info.xlated_prog_len, ulen);
+ fault = copy_to_user(uinsns, insns_sanitized, ulen);
+ kfree(insns_sanitized);
+ if (fault)
+ return -EFAULT;
+ } else {
info.xlated_prog_insns = 0;
- goto done;
}
- insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred);
- if (!insns_sanitized)
- return -ENOMEM;
- uinsns = u64_to_user_ptr(info.xlated_prog_insns);
- ulen = min_t(u32, info.xlated_prog_len, ulen);
- fault = copy_to_user(uinsns, insns_sanitized, ulen);
- kfree(insns_sanitized);
- if (fault)
- return -EFAULT;
}
- if (bpf_prog_is_dev_bound(prog->aux)) {
+ if (bpf_prog_is_offloaded(prog->aux)) {
err = bpf_prog_offload_info_fill(&info, prog);
if (err)
return err;
@@ -3815,8 +5177,9 @@ static int bpf_prog_get_info_by_fd(struct file *file,
}
}
- if (prog->aux->btf)
- info.btf_id = btf_obj_id(prog->aux->btf);
+ info.attach_btf_id = prog->aux->attach_btf_id;
+ if (attach_btf)
+ info.attach_btf_obj_id = btf_obj_id(attach_btf);
ulen = info.nr_func_info;
info.nr_func_info = prog->aux->func_info_cnt;
@@ -3849,14 +5212,15 @@ static int bpf_prog_get_info_by_fd(struct file *file,
info.nr_jited_line_info = 0;
if (info.nr_jited_line_info && ulen) {
if (bpf_dump_raw_ok(file->f_cred)) {
+ unsigned long line_addr;
__u64 __user *user_linfo;
u32 i;
user_linfo = u64_to_user_ptr(info.jited_line_info);
ulen = min_t(u32, info.nr_jited_line_info, ulen);
for (i = 0; i < ulen; i++) {
- if (put_user((__u64)(long)prog->aux->jited_linfo[i],
- &user_linfo[i]))
+ line_addr = (unsigned long)prog->aux->jited_linfo[i];
+ if (put_user((__u64)line_addr, &user_linfo[i]))
return -EFAULT;
}
} else {
@@ -3910,6 +5274,9 @@ static int bpf_map_get_info_by_fd(struct file *file,
info_len = min_t(u32, sizeof(info), info_len);
memset(&info, 0, sizeof(info));
+ if (copy_from_user(&info, uinfo, info_len))
+ return -EFAULT;
+
info.type = map->map_type;
info.id = map->id;
info.key_size = map->key_size;
@@ -3925,13 +5292,34 @@ static int bpf_map_get_info_by_fd(struct file *file,
info.btf_value_type_id = map->btf_value_type_id;
}
info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id;
+ if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS)
+ bpf_map_struct_ops_info_fill(&info, map);
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
err = bpf_map_offload_info_fill(&info, map);
if (err)
return err;
}
+ if (info.hash) {
+ char __user *uhash = u64_to_user_ptr(info.hash);
+
+ if (!map->ops->map_get_hash)
+ return -EINVAL;
+
+ if (info.hash_size != SHA256_DIGEST_SIZE)
+ return -EINVAL;
+
+ err = map->ops->map_get_hash(map, SHA256_DIGEST_SIZE, map->sha);
+ if (err != 0)
+ return err;
+
+ if (copy_to_user(uhash, map->sha, SHA256_DIGEST_SIZE) != 0)
+ return -EFAULT;
+ } else if (info.hash_size) {
+ return -EINVAL;
+ }
+
if (copy_to_user(uinfo, &info, info_len) ||
put_user(info_len, &uattr->info.info_len))
return -EFAULT;
@@ -3976,7 +5364,8 @@ static int bpf_link_get_info_by_fd(struct file *file,
info.type = link->type;
info.id = link->id;
- info.prog_id = link->prog->aux->id;
+ if (link->prog)
+ info.prog_id = link->prog->aux->id;
if (link->ops->fill_link_info) {
err = link->ops->fill_link_info(link, &info);
@@ -3992,62 +5381,110 @@ static int bpf_link_get_info_by_fd(struct file *file,
}
+static int token_get_info_by_fd(struct file *file,
+ struct bpf_token *token,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_token_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+ u32 info_len = attr->info.info_len;
+ int err;
+
+ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len);
+ if (err)
+ return err;
+ return bpf_token_get_info_by_fd(token, attr, uattr);
+}
+
#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
- int ufd = attr->info.bpf_fd;
- struct fd f;
- int err;
-
if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD))
return -EINVAL;
- f = fdget(ufd);
- if (!f.file)
+ CLASS(fd, f)(attr->info.bpf_fd);
+ if (fd_empty(f))
return -EBADFD;
- if (f.file->f_op == &bpf_prog_fops)
- err = bpf_prog_get_info_by_fd(f.file, f.file->private_data, attr,
+ if (fd_file(f)->f_op == &bpf_prog_fops)
+ return bpf_prog_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr,
uattr);
- else if (f.file->f_op == &bpf_map_fops)
- err = bpf_map_get_info_by_fd(f.file, f.file->private_data, attr,
+ else if (fd_file(f)->f_op == &bpf_map_fops)
+ return bpf_map_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr,
uattr);
- else if (f.file->f_op == &btf_fops)
- err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr);
- else if (f.file->f_op == &bpf_link_fops)
- err = bpf_link_get_info_by_fd(f.file, f.file->private_data,
+ else if (fd_file(f)->f_op == &btf_fops)
+ return bpf_btf_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr);
+ else if (fd_file(f)->f_op == &bpf_link_fops || fd_file(f)->f_op == &bpf_link_fops_poll)
+ return bpf_link_get_info_by_fd(fd_file(f), fd_file(f)->private_data,
attr, uattr);
- else
- err = -EINVAL;
-
- fdput(f);
- return err;
+ else if (fd_file(f)->f_op == &bpf_token_fops)
+ return token_get_info_by_fd(fd_file(f), fd_file(f)->private_data,
+ attr, uattr);
+ return -EINVAL;
}
-#define BPF_BTF_LOAD_LAST_FIELD btf_log_level
+#define BPF_BTF_LOAD_LAST_FIELD btf_token_fd
-static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr)
+static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
{
+ struct bpf_token *token = NULL;
+
if (CHECK_ATTR(BPF_BTF_LOAD))
return -EINVAL;
- if (!bpf_capable())
+ if (attr->btf_flags & ~BPF_F_TOKEN_FD)
+ return -EINVAL;
+
+ if (attr->btf_flags & BPF_F_TOKEN_FD) {
+ token = bpf_token_get_from_fd(attr->btf_token_fd);
+ if (IS_ERR(token))
+ return PTR_ERR(token);
+ if (!bpf_token_allow_cmd(token, BPF_BTF_LOAD)) {
+ bpf_token_put(token);
+ token = NULL;
+ }
+ }
+
+ if (!bpf_token_capable(token, CAP_BPF)) {
+ bpf_token_put(token);
return -EPERM;
+ }
+
+ bpf_token_put(token);
- return btf_new_fd(attr, uattr);
+ return btf_new_fd(attr, uattr, uattr_size);
}
-#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id
+#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD fd_by_id_token_fd
static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
{
+ struct bpf_token *token = NULL;
+
if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID))
return -EINVAL;
- if (!capable(CAP_SYS_ADMIN))
+ if (attr->open_flags & ~BPF_F_TOKEN_FD)
+ return -EINVAL;
+
+ if (attr->open_flags & BPF_F_TOKEN_FD) {
+ token = bpf_token_get_from_fd(attr->fd_by_id_token_fd);
+ if (IS_ERR(token))
+ return PTR_ERR(token);
+ if (!bpf_token_allow_cmd(token, BPF_BTF_GET_FD_BY_ID)) {
+ bpf_token_put(token);
+ token = NULL;
+ }
+ }
+
+ if (!bpf_token_capable(token, CAP_SYS_ADMIN)) {
+ bpf_token_put(token);
return -EPERM;
+ }
+
+ bpf_token_put(token);
return btf_get_fd_by_id(attr->btf_id);
}
@@ -4072,21 +5509,10 @@ static int bpf_task_fd_query_copy(const union bpf_attr *attr,
if (put_user(zero, ubuf))
return -EFAULT;
- } else if (input_len >= len + 1) {
- /* ubuf can hold the string with NULL terminator */
- if (copy_to_user(ubuf, buf, len + 1))
- return -EFAULT;
} else {
- /* ubuf cannot hold the string with NULL terminator,
- * do a partial copy with NULL terminator.
- */
- char zero = '\0';
-
- err = -ENOSPC;
- if (copy_to_user(ubuf, buf, input_len - 1))
- return -EFAULT;
- if (put_user(zero, ubuf + input_len - 1))
- return -EFAULT;
+ err = bpf_copy_to_user(ubuf, buf, input_len, len);
+ if (err == -EFAULT)
+ return err;
}
}
@@ -4120,7 +5546,9 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
if (attr->task_fd_query.flags != 0)
return -EINVAL;
+ rcu_read_lock();
task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
+ rcu_read_unlock();
if (!task)
return -ENOENT;
@@ -4130,7 +5558,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
if (!file)
return -EBADF;
- if (file->f_op == &bpf_link_fops) {
+ if (file->f_op == &bpf_link_fops || file->f_op == &bpf_link_fops_poll) {
struct bpf_link *link = file->private_data;
if (link->ops == &bpf_raw_tp_link_lops) {
@@ -4155,7 +5583,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
err = bpf_get_perf_event_info(event, &prog_id, &fd_type,
&buf, &probe_offset,
- &probe_addr);
+ &probe_addr, NULL);
if (!err)
err = bpf_task_fd_query_copy(attr, uattr, prog_id,
fd_type, buf,
@@ -4173,13 +5601,13 @@ put_file:
#define BPF_MAP_BATCH_LAST_FIELD batch.flags
-#define BPF_DO_BATCH(fn) \
+#define BPF_DO_BATCH(fn, ...) \
do { \
if (!fn) { \
err = -ENOTSUPP; \
goto err_put; \
} \
- err = fn(map, attr, uattr); \
+ err = fn(__VA_ARGS__); \
} while (0)
static int bpf_map_do_batch(const union bpf_attr *attr,
@@ -4190,14 +5618,13 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH;
bool has_write = cmd != BPF_MAP_LOOKUP_BATCH;
struct bpf_map *map;
- int err, ufd;
- struct fd f;
+ int err;
if (CHECK_ATTR(BPF_MAP_BATCH))
return -EINVAL;
- ufd = attr->batch.map_fd;
- f = fdget(ufd);
+ CLASS(fd, f)(attr->batch.map_fd);
+
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
@@ -4213,45 +5640,33 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
}
if (cmd == BPF_MAP_LOOKUP_BATCH)
- BPF_DO_BATCH(map->ops->map_lookup_batch);
+ BPF_DO_BATCH(map->ops->map_lookup_batch, map, attr, uattr);
else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH)
- BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch);
+ BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch, map, attr, uattr);
else if (cmd == BPF_MAP_UPDATE_BATCH)
- BPF_DO_BATCH(map->ops->map_update_batch);
+ BPF_DO_BATCH(map->ops->map_update_batch, map, fd_file(f), attr, uattr);
else
- BPF_DO_BATCH(map->ops->map_delete_batch);
+ BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr);
err_put:
- if (has_write)
+ if (has_write) {
+ maybe_wait_bpf_programs(map);
bpf_map_write_active_dec(map);
- fdput(f);
+ }
return err;
}
-static int tracing_bpf_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
- struct bpf_prog *prog)
-{
- if (attr->link_create.attach_type != prog->expected_attach_type)
- return -EINVAL;
-
- if (prog->expected_attach_type == BPF_TRACE_ITER)
- return bpf_iter_link_attach(attr, uattr, prog);
- else if (prog->type == BPF_PROG_TYPE_EXT)
- return bpf_tracing_prog_attach(prog,
- attr->link_create.target_fd,
- attr->link_create.target_btf_id);
- return -EINVAL;
-}
-
-#define BPF_LINK_CREATE_LAST_FIELD link_create.iter_info_len
+#define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.pid
static int link_create(union bpf_attr *attr, bpfptr_t uattr)
{
- enum bpf_prog_type ptype;
struct bpf_prog *prog;
int ret;
if (CHECK_ATTR(BPF_LINK_CREATE))
return -EINVAL;
+ if (attr->link_create.attach_type == BPF_STRUCT_OPS)
+ return bpf_struct_ops_link_create(attr);
+
prog = bpf_prog_get(attr->link_create.prog_fd);
if (IS_ERR(prog))
return PTR_ERR(prog);
@@ -4262,28 +5677,6 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
goto out;
switch (prog->type) {
- case BPF_PROG_TYPE_EXT:
- ret = tracing_bpf_link_attach(attr, uattr, prog);
- goto out;
- case BPF_PROG_TYPE_PERF_EVENT:
- case BPF_PROG_TYPE_KPROBE:
- case BPF_PROG_TYPE_TRACEPOINT:
- if (attr->link_create.attach_type != BPF_PERF_EVENT) {
- ret = -EINVAL;
- goto out;
- }
- ptype = prog->type;
- break;
- default:
- ptype = attach_type_to_prog_type(attr->link_create.attach_type);
- if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) {
- ret = -EINVAL;
- goto out;
- }
- break;
- }
-
- switch (ptype) {
case BPF_PROG_TYPE_CGROUP_SKB:
case BPF_PROG_TYPE_CGROUP_SOCK:
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
@@ -4293,25 +5686,70 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
ret = cgroup_bpf_link_attach(attr, prog);
break;
+ case BPF_PROG_TYPE_EXT:
+ ret = bpf_tracing_prog_attach(prog,
+ attr->link_create.target_fd,
+ attr->link_create.target_btf_id,
+ attr->link_create.tracing.cookie,
+ attr->link_create.attach_type);
+ break;
+ case BPF_PROG_TYPE_LSM:
case BPF_PROG_TYPE_TRACING:
- ret = tracing_bpf_link_attach(attr, uattr, prog);
+ if (attr->link_create.attach_type != prog->expected_attach_type) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (prog->expected_attach_type == BPF_TRACE_RAW_TP)
+ ret = bpf_raw_tp_link_attach(prog, NULL, attr->link_create.tracing.cookie,
+ attr->link_create.attach_type);
+ else if (prog->expected_attach_type == BPF_TRACE_ITER)
+ ret = bpf_iter_link_attach(attr, uattr, prog);
+ else if (prog->expected_attach_type == BPF_LSM_CGROUP)
+ ret = cgroup_bpf_link_attach(attr, prog);
+ else
+ ret = bpf_tracing_prog_attach(prog,
+ attr->link_create.target_fd,
+ attr->link_create.target_btf_id,
+ attr->link_create.tracing.cookie,
+ attr->link_create.attach_type);
break;
case BPF_PROG_TYPE_FLOW_DISSECTOR:
case BPF_PROG_TYPE_SK_LOOKUP:
ret = netns_bpf_link_create(attr, prog);
break;
+ case BPF_PROG_TYPE_SK_MSG:
+ case BPF_PROG_TYPE_SK_SKB:
+ ret = sock_map_link_create(attr, prog);
+ break;
#ifdef CONFIG_NET
case BPF_PROG_TYPE_XDP:
ret = bpf_xdp_link_attach(attr, prog);
break;
+ case BPF_PROG_TYPE_SCHED_CLS:
+ if (attr->link_create.attach_type == BPF_TCX_INGRESS ||
+ attr->link_create.attach_type == BPF_TCX_EGRESS)
+ ret = tcx_link_attach(attr, prog);
+ else
+ ret = netkit_link_attach(attr, prog);
+ break;
+ case BPF_PROG_TYPE_NETFILTER:
+ ret = bpf_nf_link_attach(attr, prog);
+ break;
#endif
-#ifdef CONFIG_PERF_EVENTS
case BPF_PROG_TYPE_PERF_EVENT:
case BPF_PROG_TYPE_TRACEPOINT:
- case BPF_PROG_TYPE_KPROBE:
ret = bpf_perf_link_attach(attr, prog);
break;
-#endif
+ case BPF_PROG_TYPE_KPROBE:
+ if (attr->link_create.attach_type == BPF_PERF_EVENT)
+ ret = bpf_perf_link_attach(attr, prog);
+ else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI ||
+ attr->link_create.attach_type == BPF_TRACE_KPROBE_SESSION)
+ ret = bpf_kprobe_multi_link_attach(attr, prog);
+ else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI ||
+ attr->link_create.attach_type == BPF_TRACE_UPROBE_SESSION)
+ ret = bpf_uprobe_multi_link_attach(attr, prog);
+ break;
default:
ret = -EINVAL;
}
@@ -4322,6 +5760,35 @@ out:
return ret;
}
+static int link_update_map(struct bpf_link *link, union bpf_attr *attr)
+{
+ struct bpf_map *new_map, *old_map = NULL;
+ int ret;
+
+ new_map = bpf_map_get(attr->link_update.new_map_fd);
+ if (IS_ERR(new_map))
+ return PTR_ERR(new_map);
+
+ if (attr->link_update.flags & BPF_F_REPLACE) {
+ old_map = bpf_map_get(attr->link_update.old_map_fd);
+ if (IS_ERR(old_map)) {
+ ret = PTR_ERR(old_map);
+ goto out_put;
+ }
+ } else if (attr->link_update.old_map_fd) {
+ ret = -EINVAL;
+ goto out_put;
+ }
+
+ ret = link->ops->update_map(link, new_map, old_map);
+
+ if (old_map)
+ bpf_map_put(old_map);
+out_put:
+ bpf_map_put(new_map);
+ return ret;
+}
+
#define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd
static int link_update(union bpf_attr *attr)
@@ -4342,6 +5809,11 @@ static int link_update(union bpf_attr *attr)
if (IS_ERR(link))
return PTR_ERR(link);
+ if (link->ops->update_map) {
+ ret = link_update_map(link, attr);
+ goto out_put_link;
+ }
+
new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
if (IS_ERR(new_prog)) {
ret = PTR_ERR(new_prog);
@@ -4371,7 +5843,7 @@ out_put_progs:
if (ret)
bpf_prog_put(new_prog);
out_put_link:
- bpf_link_put(link);
+ bpf_link_put_direct(link);
return ret;
}
@@ -4394,14 +5866,15 @@ static int link_detach(union bpf_attr *attr)
else
ret = -EOPNOTSUPP;
- bpf_link_put(link);
+ bpf_link_put_direct(link);
return ret;
}
-static struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link)
+struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link)
{
return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? link : ERR_PTR(-ENOENT);
}
+EXPORT_SYMBOL(bpf_link_inc_not_zero);
struct bpf_link *bpf_link_by_id(u32 id)
{
@@ -4425,6 +5898,25 @@ struct bpf_link *bpf_link_by_id(u32 id)
return link;
}
+struct bpf_link *bpf_link_get_curr_or_next(u32 *id)
+{
+ struct bpf_link *link;
+
+ spin_lock_bh(&link_idr_lock);
+again:
+ link = idr_get_next(&link_idr, id);
+ if (link) {
+ link = bpf_link_inc_not_zero(link);
+ if (IS_ERR(link)) {
+ (*id)++;
+ goto again;
+ }
+ }
+ spin_unlock_bh(&link_idr_lock);
+
+ return link;
+}
+
#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id
static int bpf_link_get_fd_by_id(const union bpf_attr *attr)
@@ -4445,7 +5937,7 @@ static int bpf_link_get_fd_by_id(const union bpf_attr *attr)
fd = bpf_link_new_fd(link);
if (fd < 0)
- bpf_link_put(link);
+ bpf_link_put_direct(link);
return fd;
}
@@ -4522,7 +6014,7 @@ static int bpf_iter_create(union bpf_attr *attr)
return PTR_ERR(link);
err = bpf_iter_new_fd(link);
- bpf_link_put(link);
+ bpf_link_put_direct(link);
return err;
}
@@ -4570,6 +6062,11 @@ static int bpf_prog_bind_map(union bpf_attr *attr)
goto out_unlock;
}
+ /* The bpf program will not access the bpf map, but for the sake of
+ * simplicity, increase sleepable_refcnt for sleepable program as well.
+ */
+ if (prog->sleepable)
+ atomic64_inc(&map->sleepable_refcnt);
memcpy(used_maps_new, used_maps_old,
sizeof(used_maps_old[0]) * prog->aux->used_map_cnt);
used_maps_new[prog->aux->used_map_cnt] = map;
@@ -4589,14 +6086,47 @@ out_prog_put:
return ret;
}
-static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
+#define BPF_TOKEN_CREATE_LAST_FIELD token_create.bpffs_fd
+
+static int token_create(union bpf_attr *attr)
+{
+ if (CHECK_ATTR(BPF_TOKEN_CREATE))
+ return -EINVAL;
+
+ /* no flags are supported yet */
+ if (attr->token_create.flags)
+ return -EINVAL;
+
+ return bpf_token_create(attr);
+}
+
+#define BPF_PROG_STREAM_READ_BY_FD_LAST_FIELD prog_stream_read.prog_fd
+
+static int prog_stream_read(union bpf_attr *attr)
+{
+ char __user *buf = u64_to_user_ptr(attr->prog_stream_read.stream_buf);
+ u32 len = attr->prog_stream_read.stream_buf_len;
+ struct bpf_prog *prog;
+ int ret;
+
+ if (CHECK_ATTR(BPF_PROG_STREAM_READ_BY_FD))
+ return -EINVAL;
+
+ prog = bpf_prog_get(attr->prog_stream_read.prog_fd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ ret = bpf_prog_stream_read(prog, attr->prog_stream_read.stream_id, buf, len);
+ bpf_prog_put(prog);
+
+ return ret;
+}
+
+static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
{
union bpf_attr attr;
int err;
- if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
- return -EPERM;
-
err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
if (err)
return err;
@@ -4607,13 +6137,13 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
if (copy_from_bpfptr(&attr, uattr, size) != 0)
return -EFAULT;
- err = security_bpf(cmd, &attr, size);
+ err = security_bpf(cmd, &attr, size, uattr.is_kernel);
if (err < 0)
return err;
switch (cmd) {
case BPF_MAP_CREATE:
- err = map_create(&attr);
+ err = map_create(&attr, uattr);
break;
case BPF_MAP_LOOKUP_ELEM:
err = map_lookup_elem(&attr);
@@ -4622,7 +6152,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
err = map_update_elem(&attr, uattr);
break;
case BPF_MAP_DELETE_ELEM:
- err = map_delete_elem(&attr);
+ err = map_delete_elem(&attr, uattr);
break;
case BPF_MAP_GET_NEXT_KEY:
err = map_get_next_key(&attr);
@@ -4631,7 +6161,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
err = map_freeze(&attr);
break;
case BPF_PROG_LOAD:
- err = bpf_prog_load(&attr, uattr);
+ err = bpf_prog_load(&attr, uattr, size);
break;
case BPF_OBJ_PIN:
err = bpf_obj_pin(&attr);
@@ -4676,7 +6206,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
err = bpf_raw_tracepoint_open(&attr);
break;
case BPF_BTF_LOAD:
- err = bpf_btf_load(&attr, uattr);
+ err = bpf_btf_load(&attr, uattr, size);
break;
case BPF_BTF_GET_FD_BY_ID:
err = bpf_btf_get_fd_by_id(&attr);
@@ -4725,6 +6255,12 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
case BPF_PROG_BIND_MAP:
err = bpf_prog_bind_map(&attr);
break;
+ case BPF_TOKEN_CREATE:
+ err = token_create(&attr);
+ break;
+ case BPF_PROG_STREAM_READ_BY_FD:
+ err = prog_stream_read(&attr);
+ break;
default:
err = -EINVAL;
break;
@@ -4750,24 +6286,75 @@ static bool syscall_prog_is_valid_access(int off, int size,
return true;
}
-BPF_CALL_3(bpf_sys_bpf, int, cmd, void *, attr, u32, attr_size)
+BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
{
switch (cmd) {
case BPF_MAP_CREATE:
+ case BPF_MAP_DELETE_ELEM:
case BPF_MAP_UPDATE_ELEM:
case BPF_MAP_FREEZE:
+ case BPF_MAP_GET_FD_BY_ID:
case BPF_PROG_LOAD:
case BPF_BTF_LOAD:
+ case BPF_LINK_CREATE:
+ case BPF_RAW_TRACEPOINT_OPEN:
break;
- /* case BPF_PROG_TEST_RUN:
- * is not part of this list to prevent recursive test_run
- */
default:
return -EINVAL;
}
return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size);
}
+
+/* To shut up -Wmissing-prototypes.
+ * This function is used by the kernel light skeleton
+ * to load bpf programs when modules are loaded or during kernel boot.
+ * See tools/lib/bpf/skel_internal.h
+ */
+int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size);
+
+int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
+{
+ struct bpf_prog * __maybe_unused prog;
+ struct bpf_tramp_run_ctx __maybe_unused run_ctx;
+
+ switch (cmd) {
+#ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */
+ case BPF_PROG_TEST_RUN:
+ if (attr->test.data_in || attr->test.data_out ||
+ attr->test.ctx_out || attr->test.duration ||
+ attr->test.repeat || attr->test.flags)
+ return -EINVAL;
+
+ prog = bpf_prog_get_type(attr->test.prog_fd, BPF_PROG_TYPE_SYSCALL);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ if (attr->test.ctx_size_in < prog->aux->max_ctx_offset ||
+ attr->test.ctx_size_in > U16_MAX) {
+ bpf_prog_put(prog);
+ return -EINVAL;
+ }
+
+ run_ctx.bpf_cookie = 0;
+ if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) {
+ /* recursion detected */
+ __bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx);
+ bpf_prog_put(prog);
+ return -EBUSY;
+ }
+ attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in);
+ __bpf_prog_exit_sleepable_recur(prog, 0 /* bpf_prog_run does runtime stats */,
+ &run_ctx);
+ bpf_prog_put(prog);
+ return 0;
+#endif
+ default:
+ return ____bpf_sys_bpf(cmd, attr, size);
+ }
+}
+EXPORT_SYMBOL_NS(kern_sys_bpf, "BPF_INTERNAL");
+
static const struct bpf_func_proto bpf_sys_bpf_proto = {
.func = bpf_sys_bpf,
.gpl_only = false,
@@ -4780,7 +6367,7 @@ static const struct bpf_func_proto bpf_sys_bpf_proto = {
const struct bpf_func_proto * __weak
tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
- return bpf_base_func_proto(func_id);
+ return bpf_base_func_proto(func_id, prog);
}
BPF_CALL_1(bpf_sys_close, u32, fd)
@@ -4802,6 +6389,7 @@ static const struct bpf_func_proto bpf_sys_close_proto = {
BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flags, u64 *, res)
{
+ *res = 0;
if (flags)
return -EINVAL;
@@ -4815,14 +6403,15 @@ BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flag
return *res ? 0 : -ENOENT;
}
-const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
+static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
.func = bpf_kallsyms_lookup_name,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_MEM,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_LONG,
+ .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
+ .arg4_size = sizeof(u64),
};
static const struct bpf_func_proto *
@@ -4830,7 +6419,8 @@ syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_sys_bpf:
- return &bpf_sys_bpf_proto;
+ return !bpf_token_capable(prog->aux->token, CAP_PERFMON)
+ ? NULL : &bpf_sys_bpf_proto;
case BPF_FUNC_btf_find_by_name_kind:
return &bpf_btf_find_by_name_kind_proto;
case BPF_FUNC_sys_close:
@@ -4850,3 +6440,89 @@ const struct bpf_verifier_ops bpf_syscall_verifier_ops = {
const struct bpf_prog_ops bpf_syscall_prog_ops = {
.test_run = bpf_prog_test_run_syscall,
};
+
+#ifdef CONFIG_SYSCTL
+static int bpf_stats_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct static_key *key = (struct static_key *)table->data;
+ static int saved_val;
+ int val, ret;
+ struct ctl_table tmp = {
+ .data = &val,
+ .maxlen = sizeof(val),
+ .mode = table->mode,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ };
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ mutex_lock(&bpf_stats_enabled_mutex);
+ val = saved_val;
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ if (write && !ret && val != saved_val) {
+ if (val)
+ static_key_slow_inc(key);
+ else
+ static_key_slow_dec(key);
+ saved_val = val;
+ }
+ mutex_unlock(&bpf_stats_enabled_mutex);
+ return ret;
+}
+
+void __weak unpriv_ebpf_notify(int new_state)
+{
+}
+
+static int bpf_unpriv_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret, unpriv_enable = *(int *)table->data;
+ bool locked_state = unpriv_enable == 1;
+ struct ctl_table tmp = *table;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ tmp.data = &unpriv_enable;
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ if (write && !ret) {
+ if (locked_state && unpriv_enable != 1)
+ return -EPERM;
+ *(int *)table->data = unpriv_enable;
+ }
+
+ if (write)
+ unpriv_ebpf_notify(unpriv_enable);
+
+ return ret;
+}
+
+static const struct ctl_table bpf_syscall_table[] = {
+ {
+ .procname = "unprivileged_bpf_disabled",
+ .data = &sysctl_unprivileged_bpf_disabled,
+ .maxlen = sizeof(sysctl_unprivileged_bpf_disabled),
+ .mode = 0644,
+ .proc_handler = bpf_unpriv_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+ {
+ .procname = "bpf_stats_enabled",
+ .data = &bpf_stats_enabled_key.key,
+ .mode = 0644,
+ .proc_handler = bpf_stats_handler,
+ },
+};
+
+static int __init bpf_syscall_sysctl_init(void)
+{
+ register_sysctl_init("kernel", bpf_syscall_table);
+ return 0;
+}
+late_initcall(bpf_syscall_sysctl_init);
+#endif /* CONFIG_SYSCTL */
diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c
index ef6911aee3bb..9cbe15ce3540 100644
--- a/kernel/bpf/sysfs_btf.c
+++ b/kernel/bpf/sysfs_btf.c
@@ -7,32 +7,56 @@
#include <linux/kobject.h>
#include <linux/init.h>
#include <linux/sysfs.h>
+#include <linux/mm.h>
+#include <linux/io.h>
+#include <linux/btf.h>
/* See scripts/link-vmlinux.sh, gen_btf() func for details */
-extern char __weak __start_BTF[];
-extern char __weak __stop_BTF[];
+extern char __start_BTF[];
+extern char __stop_BTF[];
-static ssize_t
-btf_vmlinux_read(struct file *file, struct kobject *kobj,
- struct bin_attribute *bin_attr,
- char *buf, loff_t off, size_t len)
+static int btf_sysfs_vmlinux_mmap(struct file *filp, struct kobject *kobj,
+ const struct bin_attribute *attr,
+ struct vm_area_struct *vma)
{
- memcpy(buf, __start_BTF + off, len);
- return len;
+ unsigned long pages = PAGE_ALIGN(attr->size) >> PAGE_SHIFT;
+ size_t vm_size = vma->vm_end - vma->vm_start;
+ phys_addr_t addr = __pa_symbol(__start_BTF);
+ unsigned long pfn = addr >> PAGE_SHIFT;
+
+ if (attr->private != __start_BTF || !PAGE_ALIGNED(addr))
+ return -EINVAL;
+
+ if (vma->vm_pgoff)
+ return -EINVAL;
+
+ if (vma->vm_flags & (VM_WRITE | VM_EXEC | VM_MAYSHARE))
+ return -EACCES;
+
+ if (pfn + pages < pfn)
+ return -EINVAL;
+
+ if ((vm_size >> PAGE_SHIFT) > pages)
+ return -EINVAL;
+
+ vm_flags_mod(vma, VM_DONTDUMP, VM_MAYEXEC | VM_MAYWRITE);
+ return remap_pfn_range(vma, vma->vm_start, pfn, vm_size, vma->vm_page_prot);
}
static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = {
.attr = { .name = "vmlinux", .mode = 0444, },
- .read = btf_vmlinux_read,
+ .read = sysfs_bin_attr_simple_read,
+ .mmap = btf_sysfs_vmlinux_mmap,
};
struct kobject *btf_kobj;
static int __init btf_vmlinux_init(void)
{
+ bin_attr_btf_vmlinux.private = __start_BTF;
bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF;
- if (!__start_BTF || bin_attr_btf_vmlinux.size == 0)
+ if (bin_attr_btf_vmlinux.size == 0)
return 0;
btf_kobj = kobject_create_and_add("btf", kernel_kobj);
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index d94696198ef8..98d9b4c0daff 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -5,13 +5,23 @@
#include <linux/namei.h>
#include <linux/pid_namespace.h>
#include <linux/fs.h>
-#include <linux/fdtable.h>
#include <linux/filter.h>
+#include <linux/bpf_mem_alloc.h>
#include <linux/btf_ids.h>
+#include <linux/mm_types.h>
#include "mmap_unlock_work.h"
+static const char * const iter_task_type_names[] = {
+ "ALL",
+ "TID",
+ "PID",
+};
+
struct bpf_iter_seq_task_common {
struct pid_namespace *ns;
+ enum bpf_iter_task_type type;
+ u32 pid;
+ u32 pid_visiting;
};
struct bpf_iter_seq_task_info {
@@ -22,18 +32,93 @@ struct bpf_iter_seq_task_info {
u32 tid;
};
-static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
+static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_common *common,
+ u32 *tid,
+ bool skip_if_dup_files)
+{
+ struct task_struct *task;
+ struct pid *pid;
+ u32 next_tid;
+
+ if (!*tid) {
+ /* The first time, the iterator calls this function. */
+ pid = find_pid_ns(common->pid, common->ns);
+ task = get_pid_task(pid, PIDTYPE_TGID);
+ if (!task)
+ return NULL;
+
+ *tid = common->pid;
+ common->pid_visiting = common->pid;
+
+ return task;
+ }
+
+ /* If the control returns to user space and comes back to the
+ * kernel again, *tid and common->pid_visiting should be the
+ * same for task_seq_start() to pick up the correct task.
+ */
+ if (*tid == common->pid_visiting) {
+ pid = find_pid_ns(common->pid_visiting, common->ns);
+ task = get_pid_task(pid, PIDTYPE_PID);
+
+ return task;
+ }
+
+ task = find_task_by_pid_ns(common->pid_visiting, common->ns);
+ if (!task)
+ return NULL;
+
+retry:
+ task = __next_thread(task);
+ if (!task)
+ return NULL;
+
+ next_tid = __task_pid_nr_ns(task, PIDTYPE_PID, common->ns);
+ if (!next_tid)
+ goto retry;
+
+ if (skip_if_dup_files && task->files == task->group_leader->files)
+ goto retry;
+
+ *tid = common->pid_visiting = next_tid;
+ get_task_struct(task);
+ return task;
+}
+
+static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common,
u32 *tid,
bool skip_if_dup_files)
{
struct task_struct *task = NULL;
struct pid *pid;
+ if (common->type == BPF_TASK_ITER_TID) {
+ if (*tid && *tid != common->pid)
+ return NULL;
+ rcu_read_lock();
+ pid = find_pid_ns(common->pid, common->ns);
+ if (pid) {
+ task = get_pid_task(pid, PIDTYPE_PID);
+ *tid = common->pid;
+ }
+ rcu_read_unlock();
+
+ return task;
+ }
+
+ if (common->type == BPF_TASK_ITER_TGID) {
+ rcu_read_lock();
+ task = task_group_seq_get_next(common, tid, skip_if_dup_files);
+ rcu_read_unlock();
+
+ return task;
+ }
+
rcu_read_lock();
retry:
- pid = find_ge_pid(*tid, ns);
+ pid = find_ge_pid(*tid, common->ns);
if (pid) {
- *tid = pid_nr_ns(pid, ns);
+ *tid = pid_nr_ns(pid, common->ns);
task = get_pid_task(pid, PIDTYPE_PID);
if (!task) {
++*tid;
@@ -56,7 +141,7 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos)
struct bpf_iter_seq_task_info *info = seq->private;
struct task_struct *task;
- task = task_seq_get_next(info->common.ns, &info->tid, false);
+ task = task_seq_get_next(&info->common, &info->tid, false);
if (!task)
return NULL;
@@ -73,7 +158,7 @@ static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
++*pos;
++info->tid;
put_task_struct((struct task_struct *)v);
- task = task_seq_get_next(info->common.ns, &info->tid, false);
+ task = task_seq_get_next(&info->common, &info->tid, false);
if (!task)
return NULL;
@@ -99,7 +184,6 @@ static int __task_seq_show(struct seq_file *seq, struct task_struct *task,
if (!prog)
return 0;
- meta.seq = seq;
ctx.meta = &meta;
ctx.task = task;
return bpf_iter_run_prog(prog, &ctx);
@@ -118,6 +202,41 @@ static void task_seq_stop(struct seq_file *seq, void *v)
put_task_struct((struct task_struct *)v);
}
+static int bpf_iter_attach_task(struct bpf_prog *prog,
+ union bpf_iter_link_info *linfo,
+ struct bpf_iter_aux_info *aux)
+{
+ unsigned int flags;
+ struct pid *pid;
+ pid_t tgid;
+
+ if ((!!linfo->task.tid + !!linfo->task.pid + !!linfo->task.pid_fd) > 1)
+ return -EINVAL;
+
+ aux->task.type = BPF_TASK_ITER_ALL;
+ if (linfo->task.tid != 0) {
+ aux->task.type = BPF_TASK_ITER_TID;
+ aux->task.pid = linfo->task.tid;
+ }
+ if (linfo->task.pid != 0) {
+ aux->task.type = BPF_TASK_ITER_TGID;
+ aux->task.pid = linfo->task.pid;
+ }
+ if (linfo->task.pid_fd != 0) {
+ aux->task.type = BPF_TASK_ITER_TGID;
+
+ pid = pidfd_get_pid(linfo->task.pid_fd, &flags);
+ if (IS_ERR(pid))
+ return PTR_ERR(pid);
+
+ tgid = pid_nr_ns(pid, task_active_pid_ns(current));
+ aux->task.pid = tgid;
+ put_pid(pid);
+ }
+
+ return 0;
+}
+
static const struct seq_operations task_seq_ops = {
.start = task_seq_start,
.next = task_seq_next,
@@ -138,10 +257,10 @@ struct bpf_iter_seq_task_file_info {
static struct file *
task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
{
- struct pid_namespace *ns = info->common.ns;
- u32 curr_tid = info->tid;
+ u32 saved_tid = info->tid;
struct task_struct *curr_task;
unsigned int curr_fd = info->fd;
+ struct file *f;
/* If this function returns a non-NULL file object,
* it held a reference to the task/file.
@@ -152,44 +271,38 @@ again:
curr_task = info->task;
curr_fd = info->fd;
} else {
- curr_task = task_seq_get_next(ns, &curr_tid, true);
+ curr_task = task_seq_get_next(&info->common, &info->tid, true);
if (!curr_task) {
info->task = NULL;
- info->tid = curr_tid;
return NULL;
}
- /* set info->task and info->tid */
+ /* set info->task */
info->task = curr_task;
- if (curr_tid == info->tid) {
+ if (saved_tid == info->tid)
curr_fd = info->fd;
- } else {
- info->tid = curr_tid;
+ else
curr_fd = 0;
- }
}
- rcu_read_lock();
- for (;; curr_fd++) {
- struct file *f;
- f = task_lookup_next_fd_rcu(curr_task, &curr_fd);
- if (!f)
- break;
- if (!get_file_rcu(f))
- continue;
-
+ f = fget_task_next(curr_task, &curr_fd);
+ if (f) {
/* set info->fd */
info->fd = curr_fd;
- rcu_read_unlock();
return f;
}
/* the current task is done, go to the next task */
- rcu_read_unlock();
put_task_struct(curr_task);
+
+ if (info->common.type == BPF_TASK_ITER_TID) {
+ info->task = NULL;
+ return NULL;
+ }
+
info->task = NULL;
info->fd = 0;
- curr_tid = ++(info->tid);
+ saved_tid = ++(info->tid);
goto again;
}
@@ -270,6 +383,9 @@ static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux)
struct bpf_iter_seq_task_common *common = priv_data;
common->ns = get_pid_ns(task_active_pid_ns(current));
+ common->type = aux->task.type;
+ common->pid = aux->task.pid;
+
return 0;
}
@@ -293,6 +409,7 @@ struct bpf_iter_seq_task_vma_info {
*/
struct bpf_iter_seq_task_common common;
struct task_struct *task;
+ struct mm_struct *mm;
struct vm_area_struct *vma;
u32 tid;
unsigned long prev_vm_start;
@@ -300,28 +417,30 @@ struct bpf_iter_seq_task_vma_info {
};
enum bpf_task_vma_iter_find_op {
- task_vma_iter_first_vma, /* use mm->mmap */
- task_vma_iter_next_vma, /* use curr_vma->vm_next */
+ task_vma_iter_first_vma, /* use find_vma() with addr 0 */
+ task_vma_iter_next_vma, /* use vma_next() with curr_vma */
task_vma_iter_find_vma, /* use find_vma() to find next vma */
};
static struct vm_area_struct *
task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
{
- struct pid_namespace *ns = info->common.ns;
enum bpf_task_vma_iter_find_op op;
struct vm_area_struct *curr_vma;
struct task_struct *curr_task;
- u32 curr_tid = info->tid;
+ struct mm_struct *curr_mm;
+ u32 saved_tid = info->tid;
/* If this function returns a non-NULL vma, it holds a reference to
- * the task_struct, and holds read lock on vma->mm->mmap_lock.
+ * the task_struct, holds a refcount on mm->mm_users, and holds
+ * read lock on vma->mm->mmap_lock.
* If this function returns NULL, it does not hold any reference or
* lock.
*/
if (info->task) {
curr_task = info->task;
curr_vma = info->vma;
+ curr_mm = info->mm;
/* In case of lock contention, drop mmap_lock to unblock
* the writer.
*
@@ -360,26 +479,27 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
* 4.2) VMA2 and VMA2' covers different ranges, process
* VMA2'.
*/
- if (mmap_lock_is_contended(curr_task->mm)) {
+ if (mmap_lock_is_contended(curr_mm)) {
info->prev_vm_start = curr_vma->vm_start;
info->prev_vm_end = curr_vma->vm_end;
op = task_vma_iter_find_vma;
- mmap_read_unlock(curr_task->mm);
- if (mmap_read_lock_killable(curr_task->mm))
+ mmap_read_unlock(curr_mm);
+ if (mmap_read_lock_killable(curr_mm)) {
+ mmput(curr_mm);
goto finish;
+ }
} else {
op = task_vma_iter_next_vma;
}
} else {
again:
- curr_task = task_seq_get_next(ns, &curr_tid, true);
+ curr_task = task_seq_get_next(&info->common, &info->tid, true);
if (!curr_task) {
- info->tid = curr_tid + 1;
+ info->tid++;
goto finish;
}
- if (curr_tid != info->tid) {
- info->tid = curr_tid;
+ if (saved_tid != info->tid) {
/* new task, process the first vma */
op = task_vma_iter_first_vma;
} else {
@@ -392,48 +512,57 @@ again:
op = task_vma_iter_find_vma;
}
- if (!curr_task->mm)
+ curr_mm = get_task_mm(curr_task);
+ if (!curr_mm)
goto next_task;
- if (mmap_read_lock_killable(curr_task->mm))
+ if (mmap_read_lock_killable(curr_mm)) {
+ mmput(curr_mm);
goto finish;
+ }
}
switch (op) {
case task_vma_iter_first_vma:
- curr_vma = curr_task->mm->mmap;
+ curr_vma = find_vma(curr_mm, 0);
break;
case task_vma_iter_next_vma:
- curr_vma = curr_vma->vm_next;
+ curr_vma = find_vma(curr_mm, curr_vma->vm_end);
break;
case task_vma_iter_find_vma:
/* We dropped mmap_lock so it is necessary to use find_vma
* to find the next vma. This is similar to the mechanism
* in show_smaps_rollup().
*/
- curr_vma = find_vma(curr_task->mm, info->prev_vm_end - 1);
+ curr_vma = find_vma(curr_mm, info->prev_vm_end - 1);
/* case 1) and 4.2) above just use curr_vma */
/* check for case 2) or case 4.1) above */
if (curr_vma &&
curr_vma->vm_start == info->prev_vm_start &&
curr_vma->vm_end == info->prev_vm_end)
- curr_vma = curr_vma->vm_next;
+ curr_vma = find_vma(curr_mm, curr_vma->vm_end);
break;
}
if (!curr_vma) {
/* case 3) above, or case 2) 4.1) with vma->next == NULL */
- mmap_read_unlock(curr_task->mm);
+ mmap_read_unlock(curr_mm);
+ mmput(curr_mm);
goto next_task;
}
info->task = curr_task;
info->vma = curr_vma;
+ info->mm = curr_mm;
return curr_vma;
next_task:
+ if (info->common.type == BPF_TASK_ITER_TID)
+ goto finish;
+
put_task_struct(curr_task);
info->task = NULL;
- curr_tid++;
+ info->mm = NULL;
+ info->tid++;
goto again;
finish:
@@ -441,6 +570,7 @@ finish:
put_task_struct(curr_task);
info->task = NULL;
info->vma = NULL;
+ info->mm = NULL;
return NULL;
}
@@ -512,7 +642,9 @@ static void task_vma_seq_stop(struct seq_file *seq, void *v)
*/
info->prev_vm_start = ~0UL;
info->prev_vm_end = info->vma->vm_end;
- mmap_read_unlock(info->task->mm);
+ mmap_read_unlock(info->mm);
+ mmput(info->mm);
+ info->mm = NULL;
put_task_struct(info->task);
info->task = NULL;
}
@@ -532,15 +664,42 @@ static const struct bpf_iter_seq_info task_seq_info = {
.seq_priv_size = sizeof(struct bpf_iter_seq_task_info),
};
+static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info)
+{
+ switch (aux->task.type) {
+ case BPF_TASK_ITER_TID:
+ info->iter.task.tid = aux->task.pid;
+ break;
+ case BPF_TASK_ITER_TGID:
+ info->iter.task.pid = aux->task.pid;
+ break;
+ default:
+ break;
+ }
+ return 0;
+}
+
+static void bpf_iter_task_show_fdinfo(const struct bpf_iter_aux_info *aux, struct seq_file *seq)
+{
+ seq_printf(seq, "task_type:\t%s\n", iter_task_type_names[aux->task.type]);
+ if (aux->task.type == BPF_TASK_ITER_TID)
+ seq_printf(seq, "tid:\t%u\n", aux->task.pid);
+ else if (aux->task.type == BPF_TASK_ITER_TGID)
+ seq_printf(seq, "pid:\t%u\n", aux->task.pid);
+}
+
static struct bpf_iter_reg task_reg_info = {
.target = "task",
+ .attach_target = bpf_iter_attach_task,
.feature = BPF_ITER_RESCHED,
.ctx_arg_info_size = 1,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__task, task),
- PTR_TO_BTF_ID_OR_NULL },
+ PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
},
.seq_info = &task_seq_info,
+ .fill_link_info = bpf_iter_fill_link_info,
+ .show_fdinfo = bpf_iter_task_show_fdinfo,
};
static const struct bpf_iter_seq_info task_file_seq_info = {
@@ -552,6 +711,7 @@ static const struct bpf_iter_seq_info task_file_seq_info = {
static struct bpf_iter_reg task_file_reg_info = {
.target = "task_file",
+ .attach_target = bpf_iter_attach_task,
.feature = BPF_ITER_RESCHED,
.ctx_arg_info_size = 2,
.ctx_arg_info = {
@@ -561,6 +721,8 @@ static struct bpf_iter_reg task_file_reg_info = {
PTR_TO_BTF_ID_OR_NULL },
},
.seq_info = &task_file_seq_info,
+ .fill_link_info = bpf_iter_fill_link_info,
+ .show_fdinfo = bpf_iter_task_show_fdinfo,
};
static const struct bpf_iter_seq_info task_vma_seq_info = {
@@ -572,6 +734,7 @@ static const struct bpf_iter_seq_info task_vma_seq_info = {
static struct bpf_iter_reg task_vma_reg_info = {
.target = "task_vma",
+ .attach_target = bpf_iter_attach_task,
.feature = BPF_ITER_RESCHED,
.ctx_arg_info_size = 2,
.ctx_arg_info = {
@@ -581,6 +744,8 @@ static struct bpf_iter_reg task_vma_reg_info = {
PTR_TO_BTF_ID_OR_NULL },
},
.seq_info = &task_vma_seq_info,
+ .fill_link_info = bpf_iter_fill_link_info,
+ .show_fdinfo = bpf_iter_task_show_fdinfo,
};
BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
@@ -629,6 +794,241 @@ const struct bpf_func_proto bpf_find_vma_proto = {
.arg5_type = ARG_ANYTHING,
};
+struct bpf_iter_task_vma_kern_data {
+ struct task_struct *task;
+ struct mm_struct *mm;
+ struct mmap_unlock_irq_work *work;
+ struct vma_iterator vmi;
+};
+
+struct bpf_iter_task_vma {
+ /* opaque iterator state; having __u64 here allows to preserve correct
+ * alignment requirements in vmlinux.h, generated from BTF
+ */
+ __u64 __opaque[1];
+} __attribute__((aligned(8)));
+
+/* Non-opaque version of bpf_iter_task_vma */
+struct bpf_iter_task_vma_kern {
+ struct bpf_iter_task_vma_kern_data *data;
+} __attribute__((aligned(8)));
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
+ struct task_struct *task, u64 addr)
+{
+ struct bpf_iter_task_vma_kern *kit = (void *)it;
+ bool irq_work_busy = false;
+ int err;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma));
+
+ /* is_iter_reg_valid_uninit guarantees that kit hasn't been initialized
+ * before, so non-NULL kit->data doesn't point to previously
+ * bpf_mem_alloc'd bpf_iter_task_vma_kern_data
+ */
+ kit->data = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_iter_task_vma_kern_data));
+ if (!kit->data)
+ return -ENOMEM;
+
+ kit->data->task = get_task_struct(task);
+ kit->data->mm = task->mm;
+ if (!kit->data->mm) {
+ err = -ENOENT;
+ goto err_cleanup_iter;
+ }
+
+ /* kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work */
+ irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work);
+ if (irq_work_busy || !mmap_read_trylock(kit->data->mm)) {
+ err = -EBUSY;
+ goto err_cleanup_iter;
+ }
+
+ vma_iter_init(&kit->data->vmi, kit->data->mm, addr);
+ return 0;
+
+err_cleanup_iter:
+ if (kit->data->task)
+ put_task_struct(kit->data->task);
+ bpf_mem_free(&bpf_global_ma, kit->data);
+ /* NULL kit->data signals failed bpf_iter_task_vma initialization */
+ kit->data = NULL;
+ return err;
+}
+
+__bpf_kfunc struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it)
+{
+ struct bpf_iter_task_vma_kern *kit = (void *)it;
+
+ if (!kit->data) /* bpf_iter_task_vma_new failed */
+ return NULL;
+ return vma_next(&kit->data->vmi);
+}
+
+__bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
+{
+ struct bpf_iter_task_vma_kern *kit = (void *)it;
+
+ if (kit->data) {
+ bpf_mmap_unlock_mm(kit->data->work, kit->data->mm);
+ put_task_struct(kit->data->task);
+ bpf_mem_free(&bpf_global_ma, kit->data);
+ }
+}
+
+__bpf_kfunc_end_defs();
+
+#ifdef CONFIG_CGROUPS
+
+struct bpf_iter_css_task {
+ __u64 __opaque[1];
+} __attribute__((aligned(8)));
+
+struct bpf_iter_css_task_kern {
+ struct css_task_iter *css_it;
+} __attribute__((aligned(8)));
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_iter_css_task_new(struct bpf_iter_css_task *it,
+ struct cgroup_subsys_state *css, unsigned int flags)
+{
+ struct bpf_iter_css_task_kern *kit = (void *)it;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_css_task_kern) != sizeof(struct bpf_iter_css_task));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_css_task_kern) !=
+ __alignof__(struct bpf_iter_css_task));
+ kit->css_it = NULL;
+ switch (flags) {
+ case CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED:
+ case CSS_TASK_ITER_PROCS:
+ case 0:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ kit->css_it = bpf_mem_alloc(&bpf_global_ma, sizeof(struct css_task_iter));
+ if (!kit->css_it)
+ return -ENOMEM;
+ css_task_iter_start(css, flags, kit->css_it);
+ return 0;
+}
+
+__bpf_kfunc struct task_struct *bpf_iter_css_task_next(struct bpf_iter_css_task *it)
+{
+ struct bpf_iter_css_task_kern *kit = (void *)it;
+
+ if (!kit->css_it)
+ return NULL;
+ return css_task_iter_next(kit->css_it);
+}
+
+__bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it)
+{
+ struct bpf_iter_css_task_kern *kit = (void *)it;
+
+ if (!kit->css_it)
+ return;
+ css_task_iter_end(kit->css_it);
+ bpf_mem_free(&bpf_global_ma, kit->css_it);
+}
+
+__bpf_kfunc_end_defs();
+
+#endif /* CONFIG_CGROUPS */
+
+struct bpf_iter_task {
+ __u64 __opaque[3];
+} __attribute__((aligned(8)));
+
+struct bpf_iter_task_kern {
+ struct task_struct *task;
+ struct task_struct *pos;
+ unsigned int flags;
+} __attribute__((aligned(8)));
+
+enum {
+ /* all process in the system */
+ BPF_TASK_ITER_ALL_PROCS,
+ /* all threads in the system */
+ BPF_TASK_ITER_ALL_THREADS,
+ /* all threads of a specific process */
+ BPF_TASK_ITER_PROC_THREADS
+};
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it,
+ struct task_struct *task__nullable, unsigned int flags)
+{
+ struct bpf_iter_task_kern *kit = (void *)it;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_task_kern) > sizeof(struct bpf_iter_task));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_task_kern) !=
+ __alignof__(struct bpf_iter_task));
+
+ kit->pos = NULL;
+
+ switch (flags) {
+ case BPF_TASK_ITER_ALL_THREADS:
+ case BPF_TASK_ITER_ALL_PROCS:
+ break;
+ case BPF_TASK_ITER_PROC_THREADS:
+ if (!task__nullable)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (flags == BPF_TASK_ITER_PROC_THREADS)
+ kit->task = task__nullable;
+ else
+ kit->task = &init_task;
+ kit->pos = kit->task;
+ kit->flags = flags;
+ return 0;
+}
+
+__bpf_kfunc struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it)
+{
+ struct bpf_iter_task_kern *kit = (void *)it;
+ struct task_struct *pos;
+ unsigned int flags;
+
+ flags = kit->flags;
+ pos = kit->pos;
+
+ if (!pos)
+ return pos;
+
+ if (flags == BPF_TASK_ITER_ALL_PROCS)
+ goto get_next_task;
+
+ kit->pos = __next_thread(kit->pos);
+ if (kit->pos || flags == BPF_TASK_ITER_PROC_THREADS)
+ return pos;
+
+get_next_task:
+ kit->task = next_task(kit->task);
+ if (kit->task == &init_task)
+ kit->pos = NULL;
+ else
+ kit->pos = kit->task;
+
+ return pos;
+}
+
+__bpf_kfunc void bpf_iter_task_destroy(struct bpf_iter_task *it)
+{
+}
+
+__bpf_kfunc_end_defs();
+
DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
static void do_mmap_read_unlock(struct irq_work *entry)
diff --git a/kernel/bpf/tcx.c b/kernel/bpf/tcx.c
new file mode 100644
index 000000000000..efd987ea6872
--- /dev/null
+++ b/kernel/bpf/tcx.c
@@ -0,0 +1,346 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Isovalent */
+
+#include <linux/bpf.h>
+#include <linux/bpf_mprog.h>
+#include <linux/netdevice.h>
+
+#include <net/tcx.h>
+
+int tcx_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ bool created, ingress = attr->attach_type == BPF_TCX_INGRESS;
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_mprog_entry *entry, *entry_new;
+ struct bpf_prog *replace_prog = NULL;
+ struct net_device *dev;
+ int ret;
+
+ rtnl_lock();
+ dev = __dev_get_by_index(net, attr->target_ifindex);
+ if (!dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+ if (attr->attach_flags & BPF_F_REPLACE) {
+ replace_prog = bpf_prog_get_type(attr->replace_bpf_fd,
+ prog->type);
+ if (IS_ERR(replace_prog)) {
+ ret = PTR_ERR(replace_prog);
+ replace_prog = NULL;
+ goto out;
+ }
+ }
+ entry = tcx_entry_fetch_or_create(dev, ingress, &created);
+ if (!entry) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = bpf_mprog_attach(entry, &entry_new, prog, NULL, replace_prog,
+ attr->attach_flags, attr->relative_fd,
+ attr->expected_revision);
+ if (!ret) {
+ if (entry != entry_new) {
+ tcx_entry_update(dev, entry_new, ingress);
+ tcx_entry_sync();
+ tcx_skeys_inc(ingress);
+ }
+ bpf_mprog_commit(entry);
+ } else if (created) {
+ tcx_entry_free(entry);
+ }
+out:
+ if (replace_prog)
+ bpf_prog_put(replace_prog);
+ rtnl_unlock();
+ return ret;
+}
+
+int tcx_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ bool ingress = attr->attach_type == BPF_TCX_INGRESS;
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_mprog_entry *entry, *entry_new;
+ struct net_device *dev;
+ int ret;
+
+ rtnl_lock();
+ dev = __dev_get_by_index(net, attr->target_ifindex);
+ if (!dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+ entry = tcx_entry_fetch(dev, ingress);
+ if (!entry) {
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = bpf_mprog_detach(entry, &entry_new, prog, NULL, attr->attach_flags,
+ attr->relative_fd, attr->expected_revision);
+ if (!ret) {
+ if (!tcx_entry_is_active(entry_new))
+ entry_new = NULL;
+ tcx_entry_update(dev, entry_new, ingress);
+ tcx_entry_sync();
+ tcx_skeys_dec(ingress);
+ bpf_mprog_commit(entry);
+ if (!entry_new)
+ tcx_entry_free(entry);
+ }
+out:
+ rtnl_unlock();
+ return ret;
+}
+
+void tcx_uninstall(struct net_device *dev, bool ingress)
+{
+ struct bpf_mprog_entry *entry, *entry_new = NULL;
+ struct bpf_tuple tuple = {};
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+ bool active;
+
+ entry = tcx_entry_fetch(dev, ingress);
+ if (!entry)
+ return;
+ active = tcx_entry(entry)->miniq_active;
+ if (active)
+ bpf_mprog_clear_all(entry, &entry_new);
+ tcx_entry_update(dev, entry_new, ingress);
+ tcx_entry_sync();
+ bpf_mprog_foreach_tuple(entry, fp, cp, tuple) {
+ if (tuple.link)
+ tcx_link(tuple.link)->dev = NULL;
+ else
+ bpf_prog_put(tuple.prog);
+ tcx_skeys_dec(ingress);
+ }
+ if (!active)
+ tcx_entry_free(entry);
+}
+
+int tcx_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr)
+{
+ bool ingress = attr->query.attach_type == BPF_TCX_INGRESS;
+ struct net *net = current->nsproxy->net_ns;
+ struct net_device *dev;
+ int ret;
+
+ rtnl_lock();
+ dev = __dev_get_by_index(net, attr->query.target_ifindex);
+ if (!dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+ ret = bpf_mprog_query(attr, uattr, tcx_entry_fetch(dev, ingress));
+out:
+ rtnl_unlock();
+ return ret;
+}
+
+static int tcx_link_prog_attach(struct bpf_link *link, u32 flags, u32 id_or_fd,
+ u64 revision)
+{
+ struct tcx_link *tcx = tcx_link(link);
+ bool created, ingress = link->attach_type == BPF_TCX_INGRESS;
+ struct bpf_mprog_entry *entry, *entry_new;
+ struct net_device *dev = tcx->dev;
+ int ret;
+
+ ASSERT_RTNL();
+ entry = tcx_entry_fetch_or_create(dev, ingress, &created);
+ if (!entry)
+ return -ENOMEM;
+ ret = bpf_mprog_attach(entry, &entry_new, link->prog, link, NULL, flags,
+ id_or_fd, revision);
+ if (!ret) {
+ if (entry != entry_new) {
+ tcx_entry_update(dev, entry_new, ingress);
+ tcx_entry_sync();
+ tcx_skeys_inc(ingress);
+ }
+ bpf_mprog_commit(entry);
+ } else if (created) {
+ tcx_entry_free(entry);
+ }
+ return ret;
+}
+
+static void tcx_link_release(struct bpf_link *link)
+{
+ struct tcx_link *tcx = tcx_link(link);
+ bool ingress = link->attach_type == BPF_TCX_INGRESS;
+ struct bpf_mprog_entry *entry, *entry_new;
+ struct net_device *dev;
+ int ret = 0;
+
+ rtnl_lock();
+ dev = tcx->dev;
+ if (!dev)
+ goto out;
+ entry = tcx_entry_fetch(dev, ingress);
+ if (!entry) {
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = bpf_mprog_detach(entry, &entry_new, link->prog, link, 0, 0, 0);
+ if (!ret) {
+ if (!tcx_entry_is_active(entry_new))
+ entry_new = NULL;
+ tcx_entry_update(dev, entry_new, ingress);
+ tcx_entry_sync();
+ tcx_skeys_dec(ingress);
+ bpf_mprog_commit(entry);
+ if (!entry_new)
+ tcx_entry_free(entry);
+ tcx->dev = NULL;
+ }
+out:
+ WARN_ON_ONCE(ret);
+ rtnl_unlock();
+}
+
+static int tcx_link_update(struct bpf_link *link, struct bpf_prog *nprog,
+ struct bpf_prog *oprog)
+{
+ struct tcx_link *tcx = tcx_link(link);
+ bool ingress = link->attach_type == BPF_TCX_INGRESS;
+ struct bpf_mprog_entry *entry, *entry_new;
+ struct net_device *dev;
+ int ret = 0;
+
+ rtnl_lock();
+ dev = tcx->dev;
+ if (!dev) {
+ ret = -ENOLINK;
+ goto out;
+ }
+ if (oprog && link->prog != oprog) {
+ ret = -EPERM;
+ goto out;
+ }
+ oprog = link->prog;
+ if (oprog == nprog) {
+ bpf_prog_put(nprog);
+ goto out;
+ }
+ entry = tcx_entry_fetch(dev, ingress);
+ if (!entry) {
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = bpf_mprog_attach(entry, &entry_new, nprog, link, oprog,
+ BPF_F_REPLACE | BPF_F_ID,
+ link->prog->aux->id, 0);
+ if (!ret) {
+ WARN_ON_ONCE(entry != entry_new);
+ oprog = xchg(&link->prog, nprog);
+ bpf_prog_put(oprog);
+ bpf_mprog_commit(entry);
+ }
+out:
+ rtnl_unlock();
+ return ret;
+}
+
+static void tcx_link_dealloc(struct bpf_link *link)
+{
+ kfree(tcx_link(link));
+}
+
+static void tcx_link_fdinfo(const struct bpf_link *link, struct seq_file *seq)
+{
+ const struct tcx_link *tcx = tcx_link(link);
+ u32 ifindex = 0;
+
+ rtnl_lock();
+ if (tcx->dev)
+ ifindex = tcx->dev->ifindex;
+ rtnl_unlock();
+
+ seq_printf(seq, "ifindex:\t%u\n", ifindex);
+ seq_printf(seq, "attach_type:\t%u (%s)\n",
+ link->attach_type,
+ link->attach_type == BPF_TCX_INGRESS ? "ingress" : "egress");
+}
+
+static int tcx_link_fill_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ const struct tcx_link *tcx = tcx_link(link);
+ u32 ifindex = 0;
+
+ rtnl_lock();
+ if (tcx->dev)
+ ifindex = tcx->dev->ifindex;
+ rtnl_unlock();
+
+ info->tcx.ifindex = ifindex;
+ info->tcx.attach_type = link->attach_type;
+ return 0;
+}
+
+static int tcx_link_detach(struct bpf_link *link)
+{
+ tcx_link_release(link);
+ return 0;
+}
+
+static const struct bpf_link_ops tcx_link_lops = {
+ .release = tcx_link_release,
+ .detach = tcx_link_detach,
+ .dealloc = tcx_link_dealloc,
+ .update_prog = tcx_link_update,
+ .show_fdinfo = tcx_link_fdinfo,
+ .fill_link_info = tcx_link_fill_info,
+};
+
+static int tcx_link_init(struct tcx_link *tcx,
+ struct bpf_link_primer *link_primer,
+ const union bpf_attr *attr,
+ struct net_device *dev,
+ struct bpf_prog *prog)
+{
+ bpf_link_init(&tcx->link, BPF_LINK_TYPE_TCX, &tcx_link_lops, prog,
+ attr->link_create.attach_type);
+ tcx->dev = dev;
+ return bpf_link_prime(&tcx->link, link_primer);
+}
+
+int tcx_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_link_primer link_primer;
+ struct net_device *dev;
+ struct tcx_link *tcx;
+ int ret;
+
+ rtnl_lock();
+ dev = __dev_get_by_index(net, attr->link_create.target_ifindex);
+ if (!dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+ tcx = kzalloc(sizeof(*tcx), GFP_USER);
+ if (!tcx) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = tcx_link_init(tcx, &link_primer, attr, dev, prog);
+ if (ret) {
+ kfree(tcx);
+ goto out;
+ }
+ ret = tcx_link_prog_attach(&tcx->link, attr->link_create.flags,
+ attr->link_create.tcx.relative_fd,
+ attr->link_create.tcx.expected_revision);
+ if (ret) {
+ tcx->dev = NULL;
+ bpf_link_cleanup(&link_primer);
+ goto out;
+ }
+ ret = bpf_link_settle(&link_primer);
+out:
+ rtnl_unlock();
+ return ret;
+}
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index 3d7127f439a1..f8e70e9c3998 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -83,6 +83,11 @@ struct tnum tnum_sub(struct tnum a, struct tnum b)
return TNUM(dv & ~mu, mu);
}
+struct tnum tnum_neg(struct tnum a)
+{
+ return tnum_sub(TNUM(0, 0), a);
+}
+
struct tnum tnum_and(struct tnum a, struct tnum b)
{
u64 alpha, beta, v;
@@ -111,31 +116,55 @@ struct tnum tnum_xor(struct tnum a, struct tnum b)
return TNUM(v & ~mu, mu);
}
-/* Generate partial products by multiplying each bit in the multiplier (tnum a)
- * with the multiplicand (tnum b), and add the partial products after
- * appropriately bit-shifting them. Instead of directly performing tnum addition
- * on the generated partial products, equivalenty, decompose each partial
- * product into two tnums, consisting of the value-sum (acc_v) and the
- * mask-sum (acc_m) and then perform tnum addition on them. The following paper
- * explains the algorithm in more detail: https://arxiv.org/abs/2105.05398.
+/* Perform long multiplication, iterating through the bits in a using rshift:
+ * - if LSB(a) is a known 0, keep current accumulator
+ * - if LSB(a) is a known 1, add b to current accumulator
+ * - if LSB(a) is unknown, take a union of the above cases.
+ *
+ * For example:
+ *
+ * acc_0: acc_1:
+ *
+ * 11 * -> 11 * -> 11 * -> union(0011, 1001) == x0x1
+ * x1 01 11
+ * ------ ------ ------
+ * 11 11 11
+ * xx 00 11
+ * ------ ------ ------
+ * ???? 0011 1001
*/
struct tnum tnum_mul(struct tnum a, struct tnum b)
{
- u64 acc_v = a.value * b.value;
- struct tnum acc_m = TNUM(0, 0);
+ struct tnum acc = TNUM(0, 0);
while (a.value || a.mask) {
/* LSB of tnum a is a certain 1 */
if (a.value & 1)
- acc_m = tnum_add(acc_m, TNUM(0, b.mask));
+ acc = tnum_add(acc, b);
/* LSB of tnum a is uncertain */
- else if (a.mask & 1)
- acc_m = tnum_add(acc_m, TNUM(0, b.value | b.mask));
+ else if (a.mask & 1) {
+ /* acc = tnum_union(acc_0, acc_1), where acc_0 and
+ * acc_1 are partial accumulators for cases
+ * LSB(a) = certain 0 and LSB(a) = certain 1.
+ * acc_0 = acc + 0 * b = acc.
+ * acc_1 = acc + 1 * b = tnum_add(acc, b).
+ */
+
+ acc = tnum_union(acc, tnum_add(acc, b));
+ }
/* Note: no case for LSB is certain 0 */
a = tnum_rshift(a, 1);
b = tnum_lshift(b, 1);
}
- return tnum_add(TNUM(acc_v, 0), acc_m);
+ return acc;
+}
+
+bool tnum_overlap(struct tnum a, struct tnum b)
+{
+ u64 mu;
+
+ mu = ~a.mask & ~b.mask;
+ return (a.value & mu) == (b.value & mu);
}
/* Note that if a and b disagree - i.e. one has a 'known 1' where the other has
@@ -150,6 +179,19 @@ struct tnum tnum_intersect(struct tnum a, struct tnum b)
return TNUM(v & ~mu, mu);
}
+/* Returns a tnum with the uncertainty from both a and b, and in addition, new
+ * uncertainty at any position that a and b disagree. This represents a
+ * superset of the union of the concrete sets of both a and b. Despite the
+ * overapproximation, it is optimal.
+ */
+struct tnum tnum_union(struct tnum a, struct tnum b)
+{
+ u64 v = a.value & b.value;
+ u64 mu = (a.value ^ b.value) | a.mask | b.mask;
+
+ return TNUM(v & ~mu, mu);
+}
+
struct tnum tnum_cast(struct tnum a, u8 size)
{
a.value &= (1ULL << (size * 8)) - 1;
@@ -172,12 +214,6 @@ bool tnum_in(struct tnum a, struct tnum b)
return a.value == b.value;
}
-int tnum_strn(char *str, size_t size, struct tnum a)
-{
- return snprintf(str, size, "(%#llx; %#llx)", a.value, a.mask);
-}
-EXPORT_SYMBOL_GPL(tnum_strn);
-
int tnum_sbin(char *str, size_t size, struct tnum a)
{
size_t n;
@@ -208,7 +244,12 @@ struct tnum tnum_clear_subreg(struct tnum a)
return tnum_lshift(tnum_rshift(a, 32), 32);
}
+struct tnum tnum_with_subreg(struct tnum reg, struct tnum subreg)
+{
+ return tnum_or(tnum_clear_subreg(reg), tnum_subreg(subreg));
+}
+
struct tnum tnum_const_subreg(struct tnum a, u32 value)
{
- return tnum_or(tnum_clear_subreg(a), tnum_const(value));
+ return tnum_with_subreg(a, tnum_const(value));
}
diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c
new file mode 100644
index 000000000000..feecd8f4dbf9
--- /dev/null
+++ b/kernel/bpf/token.c
@@ -0,0 +1,261 @@
+#include <linux/bpf.h>
+#include <linux/vmalloc.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/idr.h>
+#include <linux/namei.h>
+#include <linux/user_namespace.h>
+#include <linux/security.h>
+
+static bool bpf_ns_capable(struct user_namespace *ns, int cap)
+{
+ return ns_capable(ns, cap) || (cap != CAP_SYS_ADMIN && ns_capable(ns, CAP_SYS_ADMIN));
+}
+
+bool bpf_token_capable(const struct bpf_token *token, int cap)
+{
+ struct user_namespace *userns;
+
+ /* BPF token allows ns_capable() level of capabilities */
+ userns = token ? token->userns : &init_user_ns;
+ if (!bpf_ns_capable(userns, cap))
+ return false;
+ if (token && security_bpf_token_capable(token, cap) < 0)
+ return false;
+ return true;
+}
+
+void bpf_token_inc(struct bpf_token *token)
+{
+ atomic64_inc(&token->refcnt);
+}
+
+static void bpf_token_free(struct bpf_token *token)
+{
+ security_bpf_token_free(token);
+ put_user_ns(token->userns);
+ kfree(token);
+}
+
+static void bpf_token_put_deferred(struct work_struct *work)
+{
+ struct bpf_token *token = container_of(work, struct bpf_token, work);
+
+ bpf_token_free(token);
+}
+
+void bpf_token_put(struct bpf_token *token)
+{
+ if (!token)
+ return;
+
+ if (!atomic64_dec_and_test(&token->refcnt))
+ return;
+
+ INIT_WORK(&token->work, bpf_token_put_deferred);
+ schedule_work(&token->work);
+}
+
+static int bpf_token_release(struct inode *inode, struct file *filp)
+{
+ struct bpf_token *token = filp->private_data;
+
+ bpf_token_put(token);
+ return 0;
+}
+
+static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp)
+{
+ struct bpf_token *token = filp->private_data;
+ u64 mask;
+
+ BUILD_BUG_ON(__MAX_BPF_CMD >= 64);
+ mask = BIT_ULL(__MAX_BPF_CMD) - 1;
+ if ((token->allowed_cmds & mask) == mask)
+ seq_printf(m, "allowed_cmds:\tany\n");
+ else
+ seq_printf(m, "allowed_cmds:\t0x%llx\n", token->allowed_cmds);
+
+ BUILD_BUG_ON(__MAX_BPF_MAP_TYPE >= 64);
+ mask = BIT_ULL(__MAX_BPF_MAP_TYPE) - 1;
+ if ((token->allowed_maps & mask) == mask)
+ seq_printf(m, "allowed_maps:\tany\n");
+ else
+ seq_printf(m, "allowed_maps:\t0x%llx\n", token->allowed_maps);
+
+ BUILD_BUG_ON(__MAX_BPF_PROG_TYPE >= 64);
+ mask = BIT_ULL(__MAX_BPF_PROG_TYPE) - 1;
+ if ((token->allowed_progs & mask) == mask)
+ seq_printf(m, "allowed_progs:\tany\n");
+ else
+ seq_printf(m, "allowed_progs:\t0x%llx\n", token->allowed_progs);
+
+ BUILD_BUG_ON(__MAX_BPF_ATTACH_TYPE >= 64);
+ mask = BIT_ULL(__MAX_BPF_ATTACH_TYPE) - 1;
+ if ((token->allowed_attachs & mask) == mask)
+ seq_printf(m, "allowed_attachs:\tany\n");
+ else
+ seq_printf(m, "allowed_attachs:\t0x%llx\n", token->allowed_attachs);
+}
+
+#define BPF_TOKEN_INODE_NAME "bpf-token"
+
+static const struct inode_operations bpf_token_iops = { };
+
+const struct file_operations bpf_token_fops = {
+ .release = bpf_token_release,
+ .show_fdinfo = bpf_token_show_fdinfo,
+};
+
+int bpf_token_create(union bpf_attr *attr)
+{
+ struct bpf_token *token __free(kfree) = NULL;
+ struct bpf_mount_opts *mnt_opts;
+ struct user_namespace *userns;
+ struct inode *inode;
+ CLASS(fd, f)(attr->token_create.bpffs_fd);
+ struct path path;
+ struct super_block *sb;
+ umode_t mode;
+ int err;
+
+ if (fd_empty(f))
+ return -EBADF;
+
+ path = fd_file(f)->f_path;
+ sb = path.dentry->d_sb;
+
+ if (path.dentry != sb->s_root)
+ return -EINVAL;
+ if (sb->s_op != &bpf_super_ops)
+ return -EINVAL;
+ err = path_permission(&path, MAY_ACCESS);
+ if (err)
+ return err;
+
+ userns = sb->s_user_ns;
+ /*
+ * Enforce that creators of BPF tokens are in the same user
+ * namespace as the BPF FS instance. This makes reasoning about
+ * permissions a lot easier and we can always relax this later.
+ */
+ if (current_user_ns() != userns)
+ return -EPERM;
+ if (!ns_capable(userns, CAP_BPF))
+ return -EPERM;
+
+ /* Creating BPF token in init_user_ns doesn't make much sense. */
+ if (current_user_ns() == &init_user_ns)
+ return -EOPNOTSUPP;
+
+ mnt_opts = sb->s_fs_info;
+ if (mnt_opts->delegate_cmds == 0 &&
+ mnt_opts->delegate_maps == 0 &&
+ mnt_opts->delegate_progs == 0 &&
+ mnt_opts->delegate_attachs == 0)
+ return -ENOENT; /* no BPF token delegation is set up */
+
+ mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
+ inode = bpf_get_inode(sb, NULL, mode);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ inode->i_op = &bpf_token_iops;
+ inode->i_fop = &bpf_token_fops;
+ clear_nlink(inode); /* make sure it is unlinked */
+
+ FD_PREPARE(fdf, O_CLOEXEC,
+ alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME,
+ O_RDWR, &bpf_token_fops));
+ if (fdf.err)
+ return fdf.err;
+
+ token = kzalloc(sizeof(*token), GFP_USER);
+ if (!token)
+ return -ENOMEM;
+
+ atomic64_set(&token->refcnt, 1);
+
+ /* remember bpffs owning userns for future ns_capable() checks. */
+ token->userns = userns;
+ token->allowed_cmds = mnt_opts->delegate_cmds;
+ token->allowed_maps = mnt_opts->delegate_maps;
+ token->allowed_progs = mnt_opts->delegate_progs;
+ token->allowed_attachs = mnt_opts->delegate_attachs;
+
+ err = security_bpf_token_create(token, attr, &path);
+ if (err)
+ return err;
+
+ get_user_ns(token->userns);
+ fd_prepare_file(fdf)->private_data = no_free_ptr(token);
+ return fd_publish(fdf);
+}
+
+int bpf_token_get_info_by_fd(struct bpf_token *token,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_token_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+ struct bpf_token_info info;
+ u32 info_len = attr->info.info_len;
+
+ info_len = min_t(u32, info_len, sizeof(info));
+ memset(&info, 0, sizeof(info));
+
+ info.allowed_cmds = token->allowed_cmds;
+ info.allowed_maps = token->allowed_maps;
+ info.allowed_progs = token->allowed_progs;
+ info.allowed_attachs = token->allowed_attachs;
+
+ if (copy_to_user(uinfo, &info, info_len) ||
+ put_user(info_len, &uattr->info.info_len))
+ return -EFAULT;
+
+ return 0;
+}
+
+struct bpf_token *bpf_token_get_from_fd(u32 ufd)
+{
+ CLASS(fd, f)(ufd);
+ struct bpf_token *token;
+
+ if (fd_empty(f))
+ return ERR_PTR(-EBADF);
+ if (fd_file(f)->f_op != &bpf_token_fops)
+ return ERR_PTR(-EINVAL);
+
+ token = fd_file(f)->private_data;
+ bpf_token_inc(token);
+
+ return token;
+}
+
+bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd)
+{
+ if (!token)
+ return false;
+ if (!(token->allowed_cmds & BIT_ULL(cmd)))
+ return false;
+ return security_bpf_token_cmd(token, cmd) == 0;
+}
+
+bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type)
+{
+ if (!token || type >= __MAX_BPF_MAP_TYPE)
+ return false;
+
+ return token->allowed_maps & BIT_ULL(type);
+}
+
+bool bpf_token_allow_prog_type(const struct bpf_token *token,
+ enum bpf_prog_type prog_type,
+ enum bpf_attach_type attach_type)
+{
+ if (!token || prog_type >= __MAX_BPF_PROG_TYPE || attach_type >= __MAX_BPF_ATTACH_TYPE)
+ return false;
+
+ return (token->allowed_progs & BIT_ULL(prog_type)) &&
+ (token->allowed_attachs & BIT_ULL(attach_type));
+}
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 4b6974a195c1..976d89011b15 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -9,8 +9,10 @@
#include <linux/btf.h>
#include <linux/rcupdate_trace.h>
#include <linux/rcupdate_wait.h>
-#include <linux/module.h>
#include <linux/static_call.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bpf_lsm.h>
+#include <linux/delay.h>
/* dummy _ops. The verifier will operate on target program's ops. */
const struct bpf_verifier_ops bpf_extension_verifier_ops = {
@@ -27,34 +29,100 @@ static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
/* serializes access to trampoline_table */
static DEFINE_MUTEX(trampoline_mutex);
-bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex);
+
+static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd cmd)
{
- enum bpf_attach_type eatype = prog->expected_attach_type;
+ struct bpf_trampoline *tr = ops->private;
+ int ret = 0;
- return eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT ||
- eatype == BPF_MODIFY_RETURN;
-}
+ if (cmd == FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF) {
+ /* This is called inside register_ftrace_direct_multi(), so
+ * tr->mutex is already locked.
+ */
+ lockdep_assert_held_once(&tr->mutex);
-void *bpf_jit_alloc_exec_page(void)
-{
- void *image;
+ /* Instead of updating the trampoline here, we propagate
+ * -EAGAIN to register_ftrace_direct(). Then we can
+ * retry register_ftrace_direct() after updating the
+ * trampoline.
+ */
+ if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) &&
+ !(tr->flags & BPF_TRAMP_F_ORIG_STACK)) {
+ if (WARN_ON_ONCE(tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY))
+ return -EBUSY;
- image = bpf_jit_alloc_exec(PAGE_SIZE);
- if (!image)
- return NULL;
+ tr->flags |= BPF_TRAMP_F_SHARE_IPMODIFY;
+ return -EAGAIN;
+ }
- set_vm_flush_reset_perms(image);
- /* Keep image as writeable. The alternative is to keep flipping ro/rw
- * everytime new program is attached or detached.
+ return 0;
+ }
+
+ /* The normal locking order is
+ * tr->mutex => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c)
+ *
+ * The following two commands are called from
+ *
+ * prepare_direct_functions_for_ipmodify
+ * cleanup_direct_functions_after_ipmodify
+ *
+ * In both cases, direct_mutex is already locked. Use
+ * mutex_trylock(&tr->mutex) to avoid deadlock in race condition
+ * (something else is making changes to this same trampoline).
*/
- set_memory_x((long)image, 1);
- return image;
+ if (!mutex_trylock(&tr->mutex)) {
+ /* sleep 1 ms to make sure whatever holding tr->mutex makes
+ * some progress.
+ */
+ msleep(1);
+ return -EAGAIN;
+ }
+
+ switch (cmd) {
+ case FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER:
+ tr->flags |= BPF_TRAMP_F_SHARE_IPMODIFY;
+
+ if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) &&
+ !(tr->flags & BPF_TRAMP_F_ORIG_STACK))
+ ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */);
+ break;
+ case FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER:
+ tr->flags &= ~BPF_TRAMP_F_SHARE_IPMODIFY;
+
+ if (tr->flags & BPF_TRAMP_F_ORIG_STACK)
+ ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ mutex_unlock(&tr->mutex);
+ return ret;
+}
+#endif
+
+bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
+{
+ enum bpf_attach_type eatype = prog->expected_attach_type;
+ enum bpf_prog_type ptype = prog->type;
+
+ return (ptype == BPF_PROG_TYPE_TRACING &&
+ (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT ||
+ eatype == BPF_MODIFY_RETURN)) ||
+ (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC);
}
-void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym)
+void bpf_image_ksym_init(void *data, unsigned int size, struct bpf_ksym *ksym)
{
ksym->start = (unsigned long) data;
- ksym->end = ksym->start + PAGE_SIZE;
+ ksym->end = ksym->start + size;
+}
+
+void bpf_image_ksym_add(struct bpf_ksym *ksym)
+{
bpf_ksym_add(ksym);
perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start,
PAGE_SIZE, false, ksym->name);
@@ -84,6 +152,16 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
tr = kzalloc(sizeof(*tr), GFP_KERNEL);
if (!tr)
goto out;
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+ tr->fops = kzalloc(sizeof(struct ftrace_ops), GFP_KERNEL);
+ if (!tr->fops) {
+ kfree(tr);
+ tr = NULL;
+ goto out;
+ }
+ tr->fops->private = tr;
+ tr->fops->ops_func = bpf_tramp_ftrace_ops_func;
+#endif
tr->key = key;
INIT_HLIST_NODE(&tr->hlist);
@@ -97,62 +175,53 @@ out:
return tr;
}
-static int bpf_trampoline_module_get(struct bpf_trampoline *tr)
+static int bpf_trampoline_update_fentry(struct bpf_trampoline *tr, u32 orig_flags,
+ void *old_addr, void *new_addr)
{
- struct module *mod;
- int err = 0;
-
- preempt_disable();
- mod = __module_text_address((unsigned long) tr->func.addr);
- if (mod && !try_module_get(mod))
- err = -ENOENT;
- preempt_enable();
- tr->mod = mod;
- return err;
-}
+ enum bpf_text_poke_type new_t = BPF_MOD_CALL, old_t = BPF_MOD_CALL;
+ void *ip = tr->func.addr;
-static void bpf_trampoline_module_put(struct bpf_trampoline *tr)
-{
- module_put(tr->mod);
- tr->mod = NULL;
-}
+ if (!new_addr)
+ new_t = BPF_MOD_NOP;
+ else if (bpf_trampoline_use_jmp(tr->flags))
+ new_t = BPF_MOD_JUMP;
-static int is_ftrace_location(void *ip)
-{
- long addr;
+ if (!old_addr)
+ old_t = BPF_MOD_NOP;
+ else if (bpf_trampoline_use_jmp(orig_flags))
+ old_t = BPF_MOD_JUMP;
- addr = ftrace_location((long)ip);
- if (!addr)
- return 0;
- if (WARN_ON_ONCE(addr != (long)ip))
- return -EFAULT;
- return 1;
+ return bpf_arch_text_poke(ip, old_t, new_t, old_addr, new_addr);
}
-static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
+static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags,
+ void *old_addr)
{
- void *ip = tr->func.addr;
int ret;
if (tr->func.ftrace_managed)
- ret = unregister_ftrace_direct((long)ip, (long)old_addr);
+ ret = unregister_ftrace_direct(tr->fops, (long)old_addr, false);
else
- ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
+ ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, NULL);
- if (!ret)
- bpf_trampoline_module_put(tr);
return ret;
}
-static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr)
+static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags,
+ void *old_addr, void *new_addr,
+ bool lock_direct_mutex)
{
- void *ip = tr->func.addr;
int ret;
- if (tr->func.ftrace_managed)
- ret = modify_ftrace_direct((long)ip, (long)old_addr, (long)new_addr);
- else
- ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr);
+ if (tr->func.ftrace_managed) {
+ if (lock_direct_mutex)
+ ret = modify_ftrace_direct(tr->fops, (long)new_addr);
+ else
+ ret = modify_ftrace_direct_nolock(tr->fops, (long)new_addr);
+ } else {
+ ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr,
+ new_addr);
+ }
return ret;
}
@@ -160,50 +229,61 @@ static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_ad
static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
{
void *ip = tr->func.addr;
+ unsigned long faddr;
int ret;
- ret = is_ftrace_location(ip);
- if (ret < 0)
- return ret;
- tr->func.ftrace_managed = ret;
-
- if (bpf_trampoline_module_get(tr))
- return -ENOENT;
+ faddr = ftrace_location((unsigned long)ip);
+ if (faddr) {
+ if (!tr->fops)
+ return -ENOTSUPP;
+ tr->func.ftrace_managed = true;
+ }
- if (tr->func.ftrace_managed)
- ret = register_ftrace_direct((long)ip, (long)new_addr);
- else
- ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
+ if (tr->func.ftrace_managed) {
+ ret = ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1);
+ if (ret)
+ return ret;
+ ret = register_ftrace_direct(tr->fops, (long)new_addr);
+ } else {
+ ret = bpf_trampoline_update_fentry(tr, 0, NULL, new_addr);
+ }
- if (ret)
- bpf_trampoline_module_put(tr);
return ret;
}
-static struct bpf_tramp_progs *
+static struct bpf_tramp_links *
bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_arg)
{
- const struct bpf_prog_aux *aux;
- struct bpf_tramp_progs *tprogs;
- struct bpf_prog **progs;
+ struct bpf_tramp_link *link;
+ struct bpf_tramp_links *tlinks;
+ struct bpf_tramp_link **links;
int kind;
*total = 0;
- tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL);
- if (!tprogs)
+ tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL);
+ if (!tlinks)
return ERR_PTR(-ENOMEM);
for (kind = 0; kind < BPF_TRAMP_MAX; kind++) {
- tprogs[kind].nr_progs = tr->progs_cnt[kind];
+ tlinks[kind].nr_links = tr->progs_cnt[kind];
*total += tr->progs_cnt[kind];
- progs = tprogs[kind].progs;
+ links = tlinks[kind].links;
- hlist_for_each_entry(aux, &tr->progs_hlist[kind], tramp_hlist) {
- *ip_arg |= aux->prog->call_get_func_ip;
- *progs++ = aux->prog;
+ hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) {
+ *ip_arg |= link->link.prog->call_get_func_ip;
+ *links++ = link;
}
}
- return tprogs;
+ return tlinks;
+}
+
+static void bpf_tramp_image_free(struct bpf_tramp_image *im)
+{
+ bpf_image_ksym_del(&im->ksym);
+ arch_free_bpf_trampoline(im->image, im->size);
+ bpf_jit_uncharge_modmem(im->size);
+ percpu_ref_exit(&im->pcref);
+ kfree_rcu(im, rcu);
}
static void __bpf_tramp_image_put_deferred(struct work_struct *work)
@@ -211,11 +291,7 @@ static void __bpf_tramp_image_put_deferred(struct work_struct *work)
struct bpf_tramp_image *im;
im = container_of(work, struct bpf_tramp_image, work);
- bpf_image_ksym_del(&im->ksym);
- bpf_jit_free_exec(im->image);
- bpf_jit_uncharge_modmem(1);
- percpu_ref_exit(&im->pcref);
- kfree_rcu(im, rcu);
+ bpf_tramp_image_free(im);
}
/* callback, fexit step 3 or fentry step 2 */
@@ -280,10 +356,11 @@ static void bpf_tramp_image_put(struct bpf_tramp_image *im)
* call_rcu_tasks() is not necessary.
*/
if (im->ip_after_call) {
- int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP,
- NULL, im->ip_epilogue);
+ int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_NOP,
+ BPF_MOD_JUMP, NULL,
+ im->ip_epilogue);
WARN_ON(err);
- if (IS_ENABLED(CONFIG_PREEMPTION))
+ if (IS_ENABLED(CONFIG_TASKS_RCU))
call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
else
percpu_ref_kill(&im->pcref);
@@ -299,7 +376,7 @@ static void bpf_tramp_image_put(struct bpf_tramp_image *im)
call_rcu_tasks_trace(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
}
-static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
+static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, int size)
{
struct bpf_tramp_image *im;
struct bpf_ksym *ksym;
@@ -310,12 +387,13 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
if (!im)
goto out;
- err = bpf_jit_charge_modmem(1);
+ err = bpf_jit_charge_modmem(size);
if (err)
goto out_free_im;
+ im->size = size;
err = -ENOMEM;
- im->image = image = bpf_jit_alloc_exec_page();
+ im->image = image = arch_alloc_bpf_trampoline(size);
if (!image)
goto out_uncharge;
@@ -325,76 +403,151 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
ksym = &im->ksym;
INIT_LIST_HEAD_RCU(&ksym->lnode);
- snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu_%u", key, idx);
- bpf_image_ksym_add(image, ksym);
+ snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", key);
+ bpf_image_ksym_init(image, size, ksym);
+ bpf_image_ksym_add(ksym);
return im;
out_free_image:
- bpf_jit_free_exec(im->image);
+ arch_free_bpf_trampoline(im->image, im->size);
out_uncharge:
- bpf_jit_uncharge_modmem(1);
+ bpf_jit_uncharge_modmem(size);
out_free_im:
kfree(im);
out:
return ERR_PTR(err);
}
-static int bpf_trampoline_update(struct bpf_trampoline *tr)
+static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex)
{
struct bpf_tramp_image *im;
- struct bpf_tramp_progs *tprogs;
- u32 flags = BPF_TRAMP_F_RESTORE_REGS;
+ struct bpf_tramp_links *tlinks;
+ u32 orig_flags = tr->flags;
bool ip_arg = false;
- int err, total;
+ int err, total, size;
- tprogs = bpf_trampoline_get_progs(tr, &total, &ip_arg);
- if (IS_ERR(tprogs))
- return PTR_ERR(tprogs);
+ tlinks = bpf_trampoline_get_progs(tr, &total, &ip_arg);
+ if (IS_ERR(tlinks))
+ return PTR_ERR(tlinks);
if (total == 0) {
- err = unregister_fentry(tr, tr->cur_image->image);
+ err = unregister_fentry(tr, orig_flags, tr->cur_image->image);
bpf_tramp_image_put(tr->cur_image);
tr->cur_image = NULL;
- tr->selector = 0;
goto out;
}
- im = bpf_tramp_image_alloc(tr->key, tr->selector);
- if (IS_ERR(im)) {
- err = PTR_ERR(im);
+ /* clear all bits except SHARE_IPMODIFY and TAIL_CALL_CTX */
+ tr->flags &= (BPF_TRAMP_F_SHARE_IPMODIFY | BPF_TRAMP_F_TAIL_CALL_CTX);
+
+ if (tlinks[BPF_TRAMP_FEXIT].nr_links ||
+ tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) {
+ /* NOTE: BPF_TRAMP_F_RESTORE_REGS and BPF_TRAMP_F_SKIP_FRAME
+ * should not be set together.
+ */
+ tr->flags |= BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
+ } else {
+ tr->flags |= BPF_TRAMP_F_RESTORE_REGS;
+ }
+
+ if (ip_arg)
+ tr->flags |= BPF_TRAMP_F_IP_ARG;
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+again:
+ if (tr->flags & BPF_TRAMP_F_CALL_ORIG) {
+ if (tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY) {
+ /* The BPF_TRAMP_F_SKIP_FRAME can be cleared in the
+ * first try, reset it in the second try.
+ */
+ tr->flags |= BPF_TRAMP_F_ORIG_STACK | BPF_TRAMP_F_SKIP_FRAME;
+ } else if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_JMP)) {
+ /* Use "jmp" instead of "call" for the trampoline
+ * in the origin call case, and we don't need to
+ * skip the frame.
+ */
+ tr->flags &= ~BPF_TRAMP_F_SKIP_FRAME;
+ }
+ }
+#endif
+
+ size = arch_bpf_trampoline_size(&tr->func.model, tr->flags,
+ tlinks, tr->func.addr);
+ if (size < 0) {
+ err = size;
goto out;
}
- if (tprogs[BPF_TRAMP_FEXIT].nr_progs ||
- tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs)
- flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
+ if (size > PAGE_SIZE) {
+ err = -E2BIG;
+ goto out;
+ }
- if (ip_arg)
- flags |= BPF_TRAMP_F_IP_ARG;
+ im = bpf_tramp_image_alloc(tr->key, size);
+ if (IS_ERR(im)) {
+ err = PTR_ERR(im);
+ goto out;
+ }
- err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE,
- &tr->func.model, flags, tprogs,
+ err = arch_prepare_bpf_trampoline(im, im->image, im->image + size,
+ &tr->func.model, tr->flags, tlinks,
tr->func.addr);
if (err < 0)
- goto out;
+ goto out_free;
+
+ err = arch_protect_bpf_trampoline(im->image, im->size);
+ if (err)
+ goto out_free;
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP
+ if (bpf_trampoline_use_jmp(tr->flags))
+ tr->fops->flags |= FTRACE_OPS_FL_JMP;
+ else
+ tr->fops->flags &= ~FTRACE_OPS_FL_JMP;
+#endif
- WARN_ON(tr->cur_image && tr->selector == 0);
- WARN_ON(!tr->cur_image && tr->selector);
+ WARN_ON(tr->cur_image && total == 0);
if (tr->cur_image)
/* progs already running at this address */
- err = modify_fentry(tr, tr->cur_image->image, im->image);
+ err = modify_fentry(tr, orig_flags, tr->cur_image->image,
+ im->image, lock_direct_mutex);
else
/* first time registering */
err = register_fentry(tr, im->image);
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+ if (err == -EAGAIN) {
+ /* -EAGAIN from bpf_tramp_ftrace_ops_func. Now
+ * BPF_TRAMP_F_SHARE_IPMODIFY is set, we can generate the
+ * trampoline again, and retry register.
+ */
+ bpf_tramp_image_free(im);
+ goto again;
+ }
+#endif
if (err)
- goto out;
+ goto out_free;
+
if (tr->cur_image)
bpf_tramp_image_put(tr->cur_image);
tr->cur_image = im;
- tr->selector++;
out:
- kfree(tprogs);
+ /* If any error happens, restore previous flags */
+ if (err) {
+ tr->flags = orig_flags;
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP
+ if (bpf_trampoline_use_jmp(tr->flags))
+ tr->fops->flags |= FTRACE_OPS_FL_JMP;
+ else
+ tr->fops->flags &= ~FTRACE_OPS_FL_JMP;
+#endif
+ }
+ kfree(tlinks);
return err;
+
+out_free:
+ bpf_tramp_image_free(im);
+ goto out;
}
static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
@@ -419,77 +572,299 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
}
}
-int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr)
+static int bpf_freplace_check_tgt_prog(struct bpf_prog *tgt_prog)
+{
+ struct bpf_prog_aux *aux = tgt_prog->aux;
+
+ guard(mutex)(&aux->ext_mutex);
+ if (aux->prog_array_member_cnt)
+ /* Program extensions can not extend target prog when the target
+ * prog has been updated to any prog_array map as tail callee.
+ * It's to prevent a potential infinite loop like:
+ * tgt prog entry -> tgt prog subprog -> freplace prog entry
+ * --tailcall-> tgt prog entry.
+ */
+ return -EBUSY;
+
+ aux->is_extended = true;
+ return 0;
+}
+
+static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link,
+ struct bpf_trampoline *tr,
+ struct bpf_prog *tgt_prog)
{
enum bpf_tramp_prog_type kind;
+ struct bpf_tramp_link *link_exiting;
int err = 0;
- int cnt;
+ int cnt = 0, i;
- kind = bpf_attach_type_to_tramp(prog);
- mutex_lock(&tr->mutex);
- if (tr->extension_prog) {
+ kind = bpf_attach_type_to_tramp(link->link.prog);
+ if (tr->extension_prog)
/* cannot attach fentry/fexit if extension prog is attached.
* cannot overwrite extension prog either.
*/
- err = -EBUSY;
- goto out;
- }
- cnt = tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT];
+ return -EBUSY;
+
+ for (i = 0; i < BPF_TRAMP_MAX; i++)
+ cnt += tr->progs_cnt[i];
+
if (kind == BPF_TRAMP_REPLACE) {
/* Cannot attach extension if fentry/fexit are in use. */
- if (cnt) {
- err = -EBUSY;
- goto out;
- }
- tr->extension_prog = prog;
- err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
- prog->bpf_func);
- goto out;
- }
- if (cnt >= BPF_MAX_TRAMP_PROGS) {
- err = -E2BIG;
- goto out;
+ if (cnt)
+ return -EBUSY;
+ err = bpf_freplace_check_tgt_prog(tgt_prog);
+ if (err)
+ return err;
+ tr->extension_prog = link->link.prog;
+ return bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP,
+ BPF_MOD_JUMP, NULL,
+ link->link.prog->bpf_func);
}
- if (!hlist_unhashed(&prog->aux->tramp_hlist)) {
+ if (cnt >= BPF_MAX_TRAMP_LINKS)
+ return -E2BIG;
+ if (!hlist_unhashed(&link->tramp_hlist))
/* prog already linked */
- err = -EBUSY;
- goto out;
+ return -EBUSY;
+ hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) {
+ if (link_exiting->link.prog != link->link.prog)
+ continue;
+ /* prog already linked */
+ return -EBUSY;
}
- hlist_add_head(&prog->aux->tramp_hlist, &tr->progs_hlist[kind]);
+
+ hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]);
tr->progs_cnt[kind]++;
- err = bpf_trampoline_update(tr);
+ err = bpf_trampoline_update(tr, true /* lock_direct_mutex */);
if (err) {
- hlist_del_init(&prog->aux->tramp_hlist);
+ hlist_del_init(&link->tramp_hlist);
tr->progs_cnt[kind]--;
}
-out:
+ return err;
+}
+
+int bpf_trampoline_link_prog(struct bpf_tramp_link *link,
+ struct bpf_trampoline *tr,
+ struct bpf_prog *tgt_prog)
+{
+ int err;
+
+ mutex_lock(&tr->mutex);
+ err = __bpf_trampoline_link_prog(link, tr, tgt_prog);
mutex_unlock(&tr->mutex);
return err;
}
-/* bpf_trampoline_unlink_prog() should never fail. */
-int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr)
+static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
+ struct bpf_trampoline *tr,
+ struct bpf_prog *tgt_prog)
{
enum bpf_tramp_prog_type kind;
int err;
- kind = bpf_attach_type_to_tramp(prog);
- mutex_lock(&tr->mutex);
+ kind = bpf_attach_type_to_tramp(link->link.prog);
if (kind == BPF_TRAMP_REPLACE) {
WARN_ON_ONCE(!tr->extension_prog);
err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP,
+ BPF_MOD_NOP,
tr->extension_prog->bpf_func, NULL);
tr->extension_prog = NULL;
- goto out;
+ guard(mutex)(&tgt_prog->aux->ext_mutex);
+ tgt_prog->aux->is_extended = false;
+ return err;
}
- hlist_del_init(&prog->aux->tramp_hlist);
+ hlist_del_init(&link->tramp_hlist);
tr->progs_cnt[kind]--;
- err = bpf_trampoline_update(tr);
-out:
+ return bpf_trampoline_update(tr, true /* lock_direct_mutex */);
+}
+
+/* bpf_trampoline_unlink_prog() should never fail. */
+int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
+ struct bpf_trampoline *tr,
+ struct bpf_prog *tgt_prog)
+{
+ int err;
+
+ mutex_lock(&tr->mutex);
+ err = __bpf_trampoline_unlink_prog(link, tr, tgt_prog);
mutex_unlock(&tr->mutex);
return err;
}
+#if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM)
+static void bpf_shim_tramp_link_release(struct bpf_link *link)
+{
+ struct bpf_shim_tramp_link *shim_link =
+ container_of(link, struct bpf_shim_tramp_link, link.link);
+
+ /* paired with 'shim_link->trampoline = tr' in bpf_trampoline_link_cgroup_shim */
+ if (!shim_link->trampoline)
+ return;
+
+ WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline, NULL));
+ bpf_trampoline_put(shim_link->trampoline);
+}
+
+static void bpf_shim_tramp_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_shim_tramp_link *shim_link =
+ container_of(link, struct bpf_shim_tramp_link, link.link);
+
+ kfree(shim_link);
+}
+
+static const struct bpf_link_ops bpf_shim_tramp_link_lops = {
+ .release = bpf_shim_tramp_link_release,
+ .dealloc = bpf_shim_tramp_link_dealloc,
+};
+
+static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog,
+ bpf_func_t bpf_func,
+ int cgroup_atype,
+ enum bpf_attach_type attach_type)
+{
+ struct bpf_shim_tramp_link *shim_link = NULL;
+ struct bpf_prog *p;
+
+ shim_link = kzalloc(sizeof(*shim_link), GFP_USER);
+ if (!shim_link)
+ return NULL;
+
+ p = bpf_prog_alloc(1, 0);
+ if (!p) {
+ kfree(shim_link);
+ return NULL;
+ }
+
+ p->jited = false;
+ p->bpf_func = bpf_func;
+
+ p->aux->cgroup_atype = cgroup_atype;
+ p->aux->attach_func_proto = prog->aux->attach_func_proto;
+ p->aux->attach_btf_id = prog->aux->attach_btf_id;
+ p->aux->attach_btf = prog->aux->attach_btf;
+ btf_get(p->aux->attach_btf);
+ p->type = BPF_PROG_TYPE_LSM;
+ p->expected_attach_type = BPF_LSM_MAC;
+ bpf_prog_inc(p);
+ bpf_link_init(&shim_link->link.link, BPF_LINK_TYPE_UNSPEC,
+ &bpf_shim_tramp_link_lops, p, attach_type);
+ bpf_cgroup_atype_get(p->aux->attach_btf_id, cgroup_atype);
+
+ return shim_link;
+}
+
+static struct bpf_shim_tramp_link *cgroup_shim_find(struct bpf_trampoline *tr,
+ bpf_func_t bpf_func)
+{
+ struct bpf_tramp_link *link;
+ int kind;
+
+ for (kind = 0; kind < BPF_TRAMP_MAX; kind++) {
+ hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) {
+ struct bpf_prog *p = link->link.prog;
+
+ if (p->bpf_func == bpf_func)
+ return container_of(link, struct bpf_shim_tramp_link, link);
+ }
+ }
+
+ return NULL;
+}
+
+int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog,
+ int cgroup_atype,
+ enum bpf_attach_type attach_type)
+{
+ struct bpf_shim_tramp_link *shim_link = NULL;
+ struct bpf_attach_target_info tgt_info = {};
+ struct bpf_trampoline *tr;
+ bpf_func_t bpf_func;
+ u64 key;
+ int err;
+
+ err = bpf_check_attach_target(NULL, prog, NULL,
+ prog->aux->attach_btf_id,
+ &tgt_info);
+ if (err)
+ return err;
+
+ key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf,
+ prog->aux->attach_btf_id);
+
+ bpf_lsm_find_cgroup_shim(prog, &bpf_func);
+ tr = bpf_trampoline_get(key, &tgt_info);
+ if (!tr)
+ return -ENOMEM;
+
+ mutex_lock(&tr->mutex);
+
+ shim_link = cgroup_shim_find(tr, bpf_func);
+ if (shim_link) {
+ /* Reusing existing shim attached by the other program. */
+ bpf_link_inc(&shim_link->link.link);
+
+ mutex_unlock(&tr->mutex);
+ bpf_trampoline_put(tr); /* bpf_trampoline_get above */
+ return 0;
+ }
+
+ /* Allocate and install new shim. */
+
+ shim_link = cgroup_shim_alloc(prog, bpf_func, cgroup_atype, attach_type);
+ if (!shim_link) {
+ err = -ENOMEM;
+ goto err;
+ }
+
+ err = __bpf_trampoline_link_prog(&shim_link->link, tr, NULL);
+ if (err)
+ goto err;
+
+ shim_link->trampoline = tr;
+ /* note, we're still holding tr refcnt from above */
+
+ mutex_unlock(&tr->mutex);
+
+ return 0;
+err:
+ mutex_unlock(&tr->mutex);
+
+ if (shim_link)
+ bpf_link_put(&shim_link->link.link);
+
+ /* have to release tr while _not_ holding its mutex */
+ bpf_trampoline_put(tr); /* bpf_trampoline_get above */
+
+ return err;
+}
+
+void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog)
+{
+ struct bpf_shim_tramp_link *shim_link = NULL;
+ struct bpf_trampoline *tr;
+ bpf_func_t bpf_func;
+ u64 key;
+
+ key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf,
+ prog->aux->attach_btf_id);
+
+ bpf_lsm_find_cgroup_shim(prog, &bpf_func);
+ tr = bpf_trampoline_lookup(key);
+ if (WARN_ON_ONCE(!tr))
+ return;
+
+ mutex_lock(&tr->mutex);
+ shim_link = cgroup_shim_find(tr, bpf_func);
+ mutex_unlock(&tr->mutex);
+
+ if (shim_link)
+ bpf_link_put(&shim_link->link.link);
+
+ bpf_trampoline_put(tr); /* bpf_trampoline_lookup above */
+}
+#endif
+
struct bpf_trampoline *bpf_trampoline_get(u64 key,
struct bpf_attach_target_info *tgt_info)
{
@@ -512,16 +887,19 @@ out:
void bpf_trampoline_put(struct bpf_trampoline *tr)
{
+ int i;
+
if (!tr)
return;
mutex_lock(&trampoline_mutex);
if (!refcount_dec_and_test(&tr->refcnt))
goto out;
WARN_ON_ONCE(mutex_is_locked(&tr->mutex));
- if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FENTRY])))
- goto out;
- if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT])))
- goto out;
+
+ for (i = 0; i < BPF_TRAMP_MAX; i++)
+ if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[i])))
+ goto out;
+
/* This code will be executed even when the last bpf_tramp_image
* is alive. All progs are detached from the trampoline and the
* trampoline image is patched with jmp into epilogue to skip
@@ -529,6 +907,10 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
* multiple rcu callbacks.
*/
hlist_del(&tr->hlist);
+ if (tr->fops) {
+ ftrace_free_filter(tr->fops);
+ kfree(tr->fops);
+ }
kfree(tr);
out:
mutex_unlock(&trampoline_mutex);
@@ -547,16 +929,6 @@ static __always_inline u64 notrace bpf_prog_start_time(void)
return start;
}
-static void notrace inc_misses_counter(struct bpf_prog *prog)
-{
- struct bpf_prog_stats *stats;
-
- stats = this_cpu_ptr(prog->stats);
- u64_stats_update_begin(&stats->syncp);
- u64_stats_inc(&stats->misses);
- u64_stats_update_end(&stats->syncp);
-}
-
/* The logic is similar to bpf_prog_run(), but with an explicit
* rcu_read_lock() and migrate_disable() which are required
* for the trampoline. The macro is split into
@@ -570,69 +942,156 @@ static void notrace inc_misses_counter(struct bpf_prog *prog)
* [2..MAX_U64] - execute bpf prog and record execution time.
* This is start time.
*/
-u64 notrace __bpf_prog_enter(struct bpf_prog *prog)
+static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx)
__acquires(RCU)
{
- rcu_read_lock();
- migrate_disable();
- if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
- inc_misses_counter(prog);
+ rcu_read_lock_dont_migrate();
+
+ run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+ if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+ bpf_prog_inc_misses_counter(prog);
+ if (prog->aux->recursion_detected)
+ prog->aux->recursion_detected(prog);
return 0;
}
return bpf_prog_start_time();
}
-static void notrace update_prog_stats(struct bpf_prog *prog,
- u64 start)
+static void notrace __update_prog_stats(struct bpf_prog *prog, u64 start)
{
struct bpf_prog_stats *stats;
+ unsigned long flags;
+ u64 duration;
- if (static_branch_unlikely(&bpf_stats_enabled_key) &&
- /* static_key could be enabled in __bpf_prog_enter*
- * and disabled in __bpf_prog_exit*.
- * And vice versa.
- * Hence check that 'start' is valid.
- */
- start > NO_START_TIME) {
- unsigned long flags;
-
- stats = this_cpu_ptr(prog->stats);
- flags = u64_stats_update_begin_irqsave(&stats->syncp);
- u64_stats_inc(&stats->cnt);
- u64_stats_add(&stats->nsecs, sched_clock() - start);
- u64_stats_update_end_irqrestore(&stats->syncp, flags);
- }
+ /*
+ * static_key could be enabled in __bpf_prog_enter* and disabled in
+ * __bpf_prog_exit*. And vice versa. Check that 'start' is valid.
+ */
+ if (start <= NO_START_TIME)
+ return;
+
+ duration = sched_clock() - start;
+ stats = this_cpu_ptr(prog->stats);
+ flags = u64_stats_update_begin_irqsave(&stats->syncp);
+ u64_stats_inc(&stats->cnt);
+ u64_stats_add(&stats->nsecs, duration);
+ u64_stats_update_end_irqrestore(&stats->syncp, flags);
}
-void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
+static __always_inline void notrace update_prog_stats(struct bpf_prog *prog,
+ u64 start)
+{
+ if (static_branch_unlikely(&bpf_stats_enabled_key))
+ __update_prog_stats(prog, start);
+}
+
+static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
__releases(RCU)
{
+ bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
update_prog_stats(prog, start);
- __this_cpu_dec(*(prog->active));
- migrate_enable();
- rcu_read_unlock();
+ this_cpu_dec(*(prog->active));
+ rcu_read_unlock_migrate();
+}
+
+static u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog,
+ struct bpf_tramp_run_ctx *run_ctx)
+ __acquires(RCU)
+{
+ /* Runtime stats are exported via actual BPF_LSM_CGROUP
+ * programs, not the shims.
+ */
+ rcu_read_lock_dont_migrate();
+
+ run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+ return NO_START_TIME;
+}
+
+static void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
+ __releases(RCU)
+{
+ bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
+ rcu_read_unlock_migrate();
}
-u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog)
+u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
+ struct bpf_tramp_run_ctx *run_ctx)
{
rcu_read_lock_trace();
migrate_disable();
might_fault();
- if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
- inc_misses_counter(prog);
+
+ run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+ if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+ bpf_prog_inc_misses_counter(prog);
+ if (prog->aux->recursion_detected)
+ prog->aux->recursion_detected(prog);
return 0;
}
return bpf_prog_start_time();
}
-void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start)
+void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
+{
+ bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
+ update_prog_stats(prog, start);
+ this_cpu_dec(*(prog->active));
+ migrate_enable();
+ rcu_read_unlock_trace();
+}
+
+static u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog,
+ struct bpf_tramp_run_ctx *run_ctx)
+{
+ rcu_read_lock_trace();
+ migrate_disable();
+ might_fault();
+
+ run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+ return bpf_prog_start_time();
+}
+
+static void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
{
+ bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
update_prog_stats(prog, start);
- __this_cpu_dec(*(prog->active));
migrate_enable();
rcu_read_unlock_trace();
}
+static u64 notrace __bpf_prog_enter(struct bpf_prog *prog,
+ struct bpf_tramp_run_ctx *run_ctx)
+ __acquires(RCU)
+{
+ rcu_read_lock_dont_migrate();
+
+ run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+ return bpf_prog_start_time();
+}
+
+static void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
+ __releases(RCU)
+{
+ bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
+ update_prog_stats(prog, start);
+ rcu_read_unlock_migrate();
+}
+
void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr)
{
percpu_ref_get(&tr->pcref);
@@ -643,11 +1102,74 @@ void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr)
percpu_ref_put(&tr->pcref);
}
+bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog)
+{
+ bool sleepable = prog->sleepable;
+
+ if (bpf_prog_check_recur(prog))
+ return sleepable ? __bpf_prog_enter_sleepable_recur :
+ __bpf_prog_enter_recur;
+
+ if (resolve_prog_type(prog) == BPF_PROG_TYPE_LSM &&
+ prog->expected_attach_type == BPF_LSM_CGROUP)
+ return __bpf_prog_enter_lsm_cgroup;
+
+ return sleepable ? __bpf_prog_enter_sleepable : __bpf_prog_enter;
+}
+
+bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog)
+{
+ bool sleepable = prog->sleepable;
+
+ if (bpf_prog_check_recur(prog))
+ return sleepable ? __bpf_prog_exit_sleepable_recur :
+ __bpf_prog_exit_recur;
+
+ if (resolve_prog_type(prog) == BPF_PROG_TYPE_LSM &&
+ prog->expected_attach_type == BPF_LSM_CGROUP)
+ return __bpf_prog_exit_lsm_cgroup;
+
+ return sleepable ? __bpf_prog_exit_sleepable : __bpf_prog_exit;
+}
+
int __weak
-arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end,
+arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
const struct btf_func_model *m, u32 flags,
- struct bpf_tramp_progs *tprogs,
- void *orig_call)
+ struct bpf_tramp_links *tlinks,
+ void *func_addr)
+{
+ return -ENOTSUPP;
+}
+
+void * __weak arch_alloc_bpf_trampoline(unsigned int size)
+{
+ void *image;
+
+ if (WARN_ON_ONCE(size > PAGE_SIZE))
+ return NULL;
+ image = bpf_jit_alloc_exec(PAGE_SIZE);
+ if (image)
+ set_vm_flush_reset_perms(image);
+ return image;
+}
+
+void __weak arch_free_bpf_trampoline(void *image, unsigned int size)
+{
+ WARN_ON_ONCE(size > PAGE_SIZE);
+ /* bpf_jit_free_exec doesn't need "size", but
+ * bpf_prog_pack_free() needs it.
+ */
+ bpf_jit_free_exec(image);
+}
+
+int __weak arch_protect_bpf_trampoline(void *image, unsigned int size)
+{
+ WARN_ON_ONCE(size > PAGE_SIZE);
+ return set_memory_rox((long)image, 1);
+}
+
+int __weak arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
+ struct bpf_tramp_links *tlinks, void *func_addr)
{
return -ENOTSUPP;
}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a39eedecc93a..f0ca69f888fa 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -23,6 +23,13 @@
#include <linux/error-injection.h>
#include <linux/bpf_lsm.h>
#include <linux/btf_ids.h>
+#include <linux/poison.h>
+#include <linux/module.h>
+#include <linux/cpumask.h>
+#include <linux/bpf_mem_alloc.h>
+#include <net/xdp.h>
+#include <linux/trace_events.h>
+#include <linux/kallsyms.h>
#include "disasm.h"
@@ -37,6 +44,15 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
#undef BPF_LINK_TYPE
};
+enum bpf_features {
+ BPF_FEAT_RDONLY_CAST_TO_VOID = 0,
+ BPF_FEAT_STREAMS = 1,
+ __MAX_BPF_FEAT,
+};
+
+struct bpf_mem_alloc bpf_global_percpu_ma;
+static bool bpf_global_percpu_ma_set;
+
/* bpf_check() is a static code analyzer that walks eBPF program
* instruction by instruction and updates register/stack state.
* All paths of conditional branches are analyzed until 'bpf_exit' insn.
@@ -164,7 +180,7 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
/* verifier_state + insn_idx are pushed to stack when branch is encountered */
struct bpf_verifier_stack_elem {
- /* verifer state is 'st'
+ /* verifier state is 'st'
* before processing instruction 'insn_idx'
* and after processing instruction 'prev_insn_idx'
*/
@@ -182,28 +198,37 @@ struct bpf_verifier_stack_elem {
#define BPF_MAP_KEY_POISON (1ULL << 63)
#define BPF_MAP_KEY_SEEN (1ULL << 62)
-#define BPF_MAP_PTR_UNPRIV 1UL
-#define BPF_MAP_PTR_POISON ((void *)((0xeB9FUL << 1) + \
- POISON_POINTER_DELTA))
-#define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV))
+#define BPF_GLOBAL_PERCPU_MA_MAX_SIZE 512
+
+#define BPF_PRIV_STACK_MIN_SIZE 64
+
+static int acquire_reference(struct bpf_verifier_env *env, int insn_idx);
+static int release_reference_nomark(struct bpf_verifier_state *state, int ref_obj_id);
+static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
+static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
+static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env);
+static int ref_set_non_owning(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg);
+static bool is_trusted_reg(const struct bpf_reg_state *reg);
static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
{
- return BPF_MAP_PTR(aux->map_ptr_state) == BPF_MAP_PTR_POISON;
+ return aux->map_ptr_state.poison;
}
static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux)
{
- return aux->map_ptr_state & BPF_MAP_PTR_UNPRIV;
+ return aux->map_ptr_state.unpriv;
}
static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux,
- const struct bpf_map *map, bool unpriv)
+ struct bpf_map *map,
+ bool unpriv, bool poison)
{
- BUILD_BUG_ON((unsigned long)BPF_MAP_PTR_POISON & BPF_MAP_PTR_UNPRIV);
unpriv |= bpf_map_ptr_unpriv(aux);
- aux->map_ptr_state = (unsigned long)map |
- (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL);
+ aux->map_ptr_state.unpriv = unpriv;
+ aux->map_ptr_state.poison = poison;
+ aux->map_ptr_state.map_ptr = map;
}
static bool bpf_map_key_poisoned(const struct bpf_insn_aux_data *aux)
@@ -229,6 +254,12 @@ static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state)
(poisoned ? BPF_MAP_KEY_POISON : 0ULL);
}
+static bool bpf_helper_call(const struct bpf_insn *insn)
+{
+ return insn->code == (BPF_JMP | BPF_CALL) &&
+ insn->src_reg == 0;
+}
+
static bool bpf_pseudo_call(const struct bpf_insn *insn)
{
return insn->code == (BPF_JMP | BPF_CALL) &&
@@ -245,11 +276,13 @@ struct bpf_call_arg_meta {
struct bpf_map *map_ptr;
bool raw_mode;
bool pkt_access;
+ u8 release_regno;
int regno;
int access_size;
int mem_size;
u64 msize_max_value;
int ref_obj_id;
+ int dynptr_id;
int map_uid;
int func_id;
struct btf *btf;
@@ -257,396 +290,1038 @@ struct bpf_call_arg_meta {
struct btf *ret_btf;
u32 ret_btf_id;
u32 subprogno;
+ struct btf_field *kptr_field;
+ s64 const_map_key;
+};
+
+struct bpf_kfunc_call_arg_meta {
+ /* In parameters */
+ struct btf *btf;
+ u32 func_id;
+ u32 kfunc_flags;
+ const struct btf_type *func_proto;
+ const char *func_name;
+ /* Out parameters */
+ u32 ref_obj_id;
+ u8 release_regno;
+ bool r0_rdonly;
+ u32 ret_btf_id;
+ u64 r0_size;
+ u32 subprogno;
+ struct {
+ u64 value;
+ bool found;
+ } arg_constant;
+
+ /* arg_{btf,btf_id,owning_ref} are used by kfunc-specific handling,
+ * generally to pass info about user-defined local kptr types to later
+ * verification logic
+ * bpf_obj_drop/bpf_percpu_obj_drop
+ * Record the local kptr type to be drop'd
+ * bpf_refcount_acquire (via KF_ARG_PTR_TO_REFCOUNTED_KPTR arg type)
+ * Record the local kptr type to be refcount_incr'd and use
+ * arg_owning_ref to determine whether refcount_acquire should be
+ * fallible
+ */
+ struct btf *arg_btf;
+ u32 arg_btf_id;
+ bool arg_owning_ref;
+ bool arg_prog;
+
+ struct {
+ struct btf_field *field;
+ } arg_list_head;
+ struct {
+ struct btf_field *field;
+ } arg_rbtree_root;
+ struct {
+ enum bpf_dynptr_type type;
+ u32 id;
+ u32 ref_obj_id;
+ } initialized_dynptr;
+ struct {
+ u8 spi;
+ u8 frameno;
+ } iter;
+ struct {
+ struct bpf_map *ptr;
+ int uid;
+ } map;
+ u64 mem_size;
};
struct btf *btf_vmlinux;
+static const char *btf_type_name(const struct btf *btf, u32 id)
+{
+ return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
+}
+
static DEFINE_MUTEX(bpf_verifier_lock);
+static DEFINE_MUTEX(bpf_percpu_ma_lock);
-static const struct bpf_line_info *
-find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
+__printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
{
- const struct bpf_line_info *linfo;
- const struct bpf_prog *prog;
- u32 i, nr_linfo;
+ struct bpf_verifier_env *env = private_data;
+ va_list args;
- prog = env->prog;
- nr_linfo = prog->aux->nr_linfo;
+ if (!bpf_verifier_log_needed(&env->log))
+ return;
- if (!nr_linfo || insn_off >= prog->len)
- return NULL;
+ va_start(args, fmt);
+ bpf_verifier_vlog(&env->log, fmt, args);
+ va_end(args);
+}
- linfo = prog->aux->linfo;
- for (i = 1; i < nr_linfo; i++)
- if (insn_off < linfo[i].insn_off)
- break;
+static void verbose_invalid_scalar(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ struct bpf_retval_range range, const char *ctx,
+ const char *reg_name)
+{
+ bool unknown = true;
- return &linfo[i - 1];
+ verbose(env, "%s the register %s has", ctx, reg_name);
+ if (reg->smin_value > S64_MIN) {
+ verbose(env, " smin=%lld", reg->smin_value);
+ unknown = false;
+ }
+ if (reg->smax_value < S64_MAX) {
+ verbose(env, " smax=%lld", reg->smax_value);
+ unknown = false;
+ }
+ if (unknown)
+ verbose(env, " unknown scalar value");
+ verbose(env, " should have been in [%d, %d]\n", range.minval, range.maxval);
}
-void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
- va_list args)
+static bool reg_not_null(const struct bpf_reg_state *reg)
{
- unsigned int n;
+ enum bpf_reg_type type;
- n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
+ type = reg->type;
+ if (type_may_be_null(type))
+ return false;
- WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
- "verifier log line truncated - local buffer too short\n");
+ type = base_type(type);
+ return type == PTR_TO_SOCKET ||
+ type == PTR_TO_TCP_SOCK ||
+ type == PTR_TO_MAP_VALUE ||
+ type == PTR_TO_MAP_KEY ||
+ type == PTR_TO_SOCK_COMMON ||
+ (type == PTR_TO_BTF_ID && is_trusted_reg(reg)) ||
+ (type == PTR_TO_MEM && !(reg->type & PTR_UNTRUSTED)) ||
+ type == CONST_PTR_TO_MAP;
+}
- if (log->level == BPF_LOG_KERNEL) {
- bool newline = n > 0 && log->kbuf[n - 1] == '\n';
+static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)
+{
+ struct btf_record *rec = NULL;
+ struct btf_struct_meta *meta;
- pr_err("BPF: %s%s", log->kbuf, newline ? "" : "\n");
- return;
+ if (reg->type == PTR_TO_MAP_VALUE) {
+ rec = reg->map_ptr->record;
+ } else if (type_is_ptr_alloc_obj(reg->type)) {
+ meta = btf_find_struct_meta(reg->btf, reg->btf_id);
+ if (meta)
+ rec = meta->record;
}
+ return rec;
+}
- n = min(log->len_total - log->len_used - 1, n);
- log->kbuf[n] = '\0';
- if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
- log->len_used += n;
- else
- log->ubuf = NULL;
+static bool subprog_is_global(const struct bpf_verifier_env *env, int subprog)
+{
+ struct bpf_func_info_aux *aux = env->prog->aux->func_info_aux;
+
+ return aux && aux[subprog].linkage == BTF_FUNC_GLOBAL;
}
-static void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos)
+static const char *subprog_name(const struct bpf_verifier_env *env, int subprog)
{
- char zero = 0;
+ struct bpf_func_info *info;
- if (!bpf_verifier_log_needed(log))
- return;
+ if (!env->prog->aux->func_info)
+ return "";
- log->len_used = new_pos;
- if (put_user(zero, log->ubuf + new_pos))
- log->ubuf = NULL;
+ info = &env->prog->aux->func_info[subprog];
+ return btf_type_name(env->prog->aux->btf, info->type_id);
}
-/* log_level controls verbosity level of eBPF verifier.
- * bpf_verifier_log_write() is used to dump the verification trace to the log,
- * so the user can figure out what's wrong with the program
- */
-__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
- const char *fmt, ...)
+static void mark_subprog_exc_cb(struct bpf_verifier_env *env, int subprog)
{
- va_list args;
-
- if (!bpf_verifier_log_needed(&env->log))
- return;
+ struct bpf_subprog_info *info = subprog_info(env, subprog);
- va_start(args, fmt);
- bpf_verifier_vlog(&env->log, fmt, args);
- va_end(args);
+ info->is_cb = true;
+ info->is_async_cb = true;
+ info->is_exception_cb = true;
}
-EXPORT_SYMBOL_GPL(bpf_verifier_log_write);
-__printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
+static bool subprog_is_exc_cb(struct bpf_verifier_env *env, int subprog)
{
- struct bpf_verifier_env *env = private_data;
- va_list args;
+ return subprog_info(env, subprog)->is_exception_cb;
+}
- if (!bpf_verifier_log_needed(&env->log))
- return;
+static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
+{
+ return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK);
+}
- va_start(args, fmt);
- bpf_verifier_vlog(&env->log, fmt, args);
- va_end(args);
+static bool type_is_rdonly_mem(u32 type)
+{
+ return type & MEM_RDONLY;
}
-__printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
- const char *fmt, ...)
+static bool is_acquire_function(enum bpf_func_id func_id,
+ const struct bpf_map *map)
{
- va_list args;
+ enum bpf_map_type map_type = map ? map->map_type : BPF_MAP_TYPE_UNSPEC;
- if (!bpf_verifier_log_needed(log))
- return;
+ if (func_id == BPF_FUNC_sk_lookup_tcp ||
+ func_id == BPF_FUNC_sk_lookup_udp ||
+ func_id == BPF_FUNC_skc_lookup_tcp ||
+ func_id == BPF_FUNC_ringbuf_reserve ||
+ func_id == BPF_FUNC_kptr_xchg)
+ return true;
- va_start(args, fmt);
- bpf_verifier_vlog(log, fmt, args);
- va_end(args);
+ if (func_id == BPF_FUNC_map_lookup_elem &&
+ (map_type == BPF_MAP_TYPE_SOCKMAP ||
+ map_type == BPF_MAP_TYPE_SOCKHASH))
+ return true;
+
+ return false;
+}
+
+static bool is_ptr_cast_function(enum bpf_func_id func_id)
+{
+ return func_id == BPF_FUNC_tcp_sock ||
+ func_id == BPF_FUNC_sk_fullsock ||
+ func_id == BPF_FUNC_skc_to_tcp_sock ||
+ func_id == BPF_FUNC_skc_to_tcp6_sock ||
+ func_id == BPF_FUNC_skc_to_udp6_sock ||
+ func_id == BPF_FUNC_skc_to_mptcp_sock ||
+ func_id == BPF_FUNC_skc_to_tcp_timewait_sock ||
+ func_id == BPF_FUNC_skc_to_tcp_request_sock;
}
-static const char *ltrim(const char *s)
+static bool is_dynptr_ref_function(enum bpf_func_id func_id)
{
- while (isspace(*s))
- s++;
+ return func_id == BPF_FUNC_dynptr_data;
+}
+
+static bool is_sync_callback_calling_kfunc(u32 btf_id);
+static bool is_async_callback_calling_kfunc(u32 btf_id);
+static bool is_callback_calling_kfunc(u32 btf_id);
+static bool is_bpf_throw_kfunc(struct bpf_insn *insn);
+
+static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id);
+static bool is_task_work_add_kfunc(u32 func_id);
- return s;
+static bool is_sync_callback_calling_function(enum bpf_func_id func_id)
+{
+ return func_id == BPF_FUNC_for_each_map_elem ||
+ func_id == BPF_FUNC_find_vma ||
+ func_id == BPF_FUNC_loop ||
+ func_id == BPF_FUNC_user_ringbuf_drain;
}
-__printf(3, 4) static void verbose_linfo(struct bpf_verifier_env *env,
- u32 insn_off,
- const char *prefix_fmt, ...)
+static bool is_async_callback_calling_function(enum bpf_func_id func_id)
{
- const struct bpf_line_info *linfo;
+ return func_id == BPF_FUNC_timer_set_callback;
+}
- if (!bpf_verifier_log_needed(&env->log))
- return;
+static bool is_callback_calling_function(enum bpf_func_id func_id)
+{
+ return is_sync_callback_calling_function(func_id) ||
+ is_async_callback_calling_function(func_id);
+}
- linfo = find_linfo(env, insn_off);
- if (!linfo || linfo == env->prev_linfo)
- return;
+static bool is_sync_callback_calling_insn(struct bpf_insn *insn)
+{
+ return (bpf_helper_call(insn) && is_sync_callback_calling_function(insn->imm)) ||
+ (bpf_pseudo_kfunc_call(insn) && is_sync_callback_calling_kfunc(insn->imm));
+}
- if (prefix_fmt) {
- va_list args;
+static bool is_async_callback_calling_insn(struct bpf_insn *insn)
+{
+ return (bpf_helper_call(insn) && is_async_callback_calling_function(insn->imm)) ||
+ (bpf_pseudo_kfunc_call(insn) && is_async_callback_calling_kfunc(insn->imm));
+}
- va_start(args, prefix_fmt);
- bpf_verifier_vlog(&env->log, prefix_fmt, args);
- va_end(args);
- }
+static bool is_async_cb_sleepable(struct bpf_verifier_env *env, struct bpf_insn *insn)
+{
+ /* bpf_timer callbacks are never sleepable. */
+ if (bpf_helper_call(insn) && insn->imm == BPF_FUNC_timer_set_callback)
+ return false;
- verbose(env, "%s\n",
- ltrim(btf_name_by_offset(env->prog->aux->btf,
- linfo->line_off)));
+ /* bpf_wq and bpf_task_work callbacks are always sleepable. */
+ if (bpf_pseudo_kfunc_call(insn) && insn->off == 0 &&
+ (is_bpf_wq_set_callback_impl_kfunc(insn->imm) || is_task_work_add_kfunc(insn->imm)))
+ return true;
- env->prev_linfo = linfo;
+ verifier_bug(env, "unhandled async callback in is_async_cb_sleepable");
+ return false;
}
-static void verbose_invalid_scalar(struct bpf_verifier_env *env,
- struct bpf_reg_state *reg,
- struct tnum *range, const char *ctx,
- const char *reg_name)
+static bool is_may_goto_insn(struct bpf_insn *insn)
{
- char tn_buf[48];
+ return insn->code == (BPF_JMP | BPF_JCOND) && insn->src_reg == BPF_MAY_GOTO;
+}
- verbose(env, "At %s the register %s ", ctx, reg_name);
- if (!tnum_is_unknown(reg->var_off)) {
- tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "has value %s", tn_buf);
- } else {
- verbose(env, "has unknown scalar value");
- }
- tnum_strn(tn_buf, sizeof(tn_buf), *range);
- verbose(env, " should have been in %s\n", tn_buf);
+static bool is_may_goto_insn_at(struct bpf_verifier_env *env, int insn_idx)
+{
+ return is_may_goto_insn(&env->prog->insnsi[insn_idx]);
}
-static bool type_is_pkt_pointer(enum bpf_reg_type type)
+static bool is_storage_get_function(enum bpf_func_id func_id)
{
- return type == PTR_TO_PACKET ||
- type == PTR_TO_PACKET_META;
+ return func_id == BPF_FUNC_sk_storage_get ||
+ func_id == BPF_FUNC_inode_storage_get ||
+ func_id == BPF_FUNC_task_storage_get ||
+ func_id == BPF_FUNC_cgrp_storage_get;
}
-static bool type_is_sk_pointer(enum bpf_reg_type type)
+static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id,
+ const struct bpf_map *map)
{
- return type == PTR_TO_SOCKET ||
- type == PTR_TO_SOCK_COMMON ||
- type == PTR_TO_TCP_SOCK ||
- type == PTR_TO_XDP_SOCK;
+ int ref_obj_uses = 0;
+
+ if (is_ptr_cast_function(func_id))
+ ref_obj_uses++;
+ if (is_acquire_function(func_id, map))
+ ref_obj_uses++;
+ if (is_dynptr_ref_function(func_id))
+ ref_obj_uses++;
+
+ return ref_obj_uses > 1;
}
-static bool reg_type_not_null(enum bpf_reg_type type)
+static bool is_cmpxchg_insn(const struct bpf_insn *insn)
{
- return type == PTR_TO_SOCKET ||
- type == PTR_TO_TCP_SOCK ||
- type == PTR_TO_MAP_VALUE ||
- type == PTR_TO_MAP_KEY ||
- type == PTR_TO_SOCK_COMMON;
+ return BPF_CLASS(insn->code) == BPF_STX &&
+ BPF_MODE(insn->code) == BPF_ATOMIC &&
+ insn->imm == BPF_CMPXCHG;
}
-static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
+static bool is_atomic_load_insn(const struct bpf_insn *insn)
{
- return reg->type == PTR_TO_MAP_VALUE &&
- map_value_has_spin_lock(reg->map_ptr);
+ return BPF_CLASS(insn->code) == BPF_STX &&
+ BPF_MODE(insn->code) == BPF_ATOMIC &&
+ insn->imm == BPF_LOAD_ACQ;
}
-static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type)
+static int __get_spi(s32 off)
{
- return base_type(type) == PTR_TO_SOCKET ||
- base_type(type) == PTR_TO_TCP_SOCK ||
- base_type(type) == PTR_TO_MEM;
+ return (-off - 1) / BPF_REG_SIZE;
}
-static bool type_is_rdonly_mem(u32 type)
+static struct bpf_func_state *func(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg)
{
- return type & MEM_RDONLY;
+ struct bpf_verifier_state *cur = env->cur_state;
+
+ return cur->frame[reg->frameno];
}
-static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
+static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots)
{
- return type == ARG_PTR_TO_SOCK_COMMON;
+ int allocated_slots = state->allocated_stack / BPF_REG_SIZE;
+
+ /* We need to check that slots between [spi - nr_slots + 1, spi] are
+ * within [0, allocated_stack).
+ *
+ * Please note that the spi grows downwards. For example, a dynptr
+ * takes the size of two stack slots; the first slot will be at
+ * spi and the second slot will be at spi - 1.
+ */
+ return spi - nr_slots + 1 >= 0 && spi < allocated_slots;
}
-static bool type_may_be_null(u32 type)
+static int stack_slot_obj_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ const char *obj_kind, int nr_slots)
{
- return type & PTR_MAYBE_NULL;
+ int off, spi;
+
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "%s has to be at a constant offset\n", obj_kind);
+ return -EINVAL;
+ }
+
+ off = reg->off + reg->var_off.value;
+ if (off % BPF_REG_SIZE) {
+ verbose(env, "cannot pass in %s at an offset=%d\n", obj_kind, off);
+ return -EINVAL;
+ }
+
+ spi = __get_spi(off);
+ if (spi + 1 < nr_slots) {
+ verbose(env, "cannot pass in %s at an offset=%d\n", obj_kind, off);
+ return -EINVAL;
+ }
+
+ if (!is_spi_bounds_valid(func(env, reg), spi, nr_slots))
+ return -ERANGE;
+ return spi;
}
-/* Determine whether the function releases some resources allocated by another
- * function call. The first reference type argument will be assumed to be
- * released by release_reference().
- */
-static bool is_release_function(enum bpf_func_id func_id)
+static int dynptr_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
- return func_id == BPF_FUNC_sk_release ||
- func_id == BPF_FUNC_ringbuf_submit ||
- func_id == BPF_FUNC_ringbuf_discard;
+ return stack_slot_obj_get_spi(env, reg, "dynptr", BPF_DYNPTR_NR_SLOTS);
}
-static bool may_be_acquire_function(enum bpf_func_id func_id)
+static int iter_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int nr_slots)
{
- return func_id == BPF_FUNC_sk_lookup_tcp ||
- func_id == BPF_FUNC_sk_lookup_udp ||
- func_id == BPF_FUNC_skc_lookup_tcp ||
- func_id == BPF_FUNC_map_lookup_elem ||
- func_id == BPF_FUNC_ringbuf_reserve;
+ return stack_slot_obj_get_spi(env, reg, "iter", nr_slots);
}
-static bool is_acquire_function(enum bpf_func_id func_id,
- const struct bpf_map *map)
+static int irq_flag_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
- enum bpf_map_type map_type = map ? map->map_type : BPF_MAP_TYPE_UNSPEC;
+ return stack_slot_obj_get_spi(env, reg, "irq_flag", 1);
+}
- if (func_id == BPF_FUNC_sk_lookup_tcp ||
- func_id == BPF_FUNC_sk_lookup_udp ||
- func_id == BPF_FUNC_skc_lookup_tcp ||
- func_id == BPF_FUNC_ringbuf_reserve)
- return true;
+static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
+{
+ switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
+ case DYNPTR_TYPE_LOCAL:
+ return BPF_DYNPTR_TYPE_LOCAL;
+ case DYNPTR_TYPE_RINGBUF:
+ return BPF_DYNPTR_TYPE_RINGBUF;
+ case DYNPTR_TYPE_SKB:
+ return BPF_DYNPTR_TYPE_SKB;
+ case DYNPTR_TYPE_XDP:
+ return BPF_DYNPTR_TYPE_XDP;
+ case DYNPTR_TYPE_SKB_META:
+ return BPF_DYNPTR_TYPE_SKB_META;
+ case DYNPTR_TYPE_FILE:
+ return BPF_DYNPTR_TYPE_FILE;
+ default:
+ return BPF_DYNPTR_TYPE_INVALID;
+ }
+}
- if (func_id == BPF_FUNC_map_lookup_elem &&
- (map_type == BPF_MAP_TYPE_SOCKMAP ||
- map_type == BPF_MAP_TYPE_SOCKHASH))
- return true;
+static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type)
+{
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ return DYNPTR_TYPE_LOCAL;
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ return DYNPTR_TYPE_RINGBUF;
+ case BPF_DYNPTR_TYPE_SKB:
+ return DYNPTR_TYPE_SKB;
+ case BPF_DYNPTR_TYPE_XDP:
+ return DYNPTR_TYPE_XDP;
+ case BPF_DYNPTR_TYPE_SKB_META:
+ return DYNPTR_TYPE_SKB_META;
+ case BPF_DYNPTR_TYPE_FILE:
+ return DYNPTR_TYPE_FILE;
+ default:
+ return 0;
+ }
+}
- return false;
+static bool dynptr_type_refcounted(enum bpf_dynptr_type type)
+{
+ return type == BPF_DYNPTR_TYPE_RINGBUF || type == BPF_DYNPTR_TYPE_FILE;
}
-static bool is_ptr_cast_function(enum bpf_func_id func_id)
+static void __mark_dynptr_reg(struct bpf_reg_state *reg,
+ enum bpf_dynptr_type type,
+ bool first_slot, int dynptr_id);
+
+static void __mark_reg_not_init(const struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg);
+
+static void mark_dynptr_stack_regs(struct bpf_verifier_env *env,
+ struct bpf_reg_state *sreg1,
+ struct bpf_reg_state *sreg2,
+ enum bpf_dynptr_type type)
{
- return func_id == BPF_FUNC_tcp_sock ||
- func_id == BPF_FUNC_sk_fullsock ||
- func_id == BPF_FUNC_skc_to_tcp_sock ||
- func_id == BPF_FUNC_skc_to_tcp6_sock ||
- func_id == BPF_FUNC_skc_to_udp6_sock ||
- func_id == BPF_FUNC_skc_to_tcp_timewait_sock ||
- func_id == BPF_FUNC_skc_to_tcp_request_sock;
+ int id = ++env->id_gen;
+
+ __mark_dynptr_reg(sreg1, type, true, id);
+ __mark_dynptr_reg(sreg2, type, false, id);
}
-static bool is_cmpxchg_insn(const struct bpf_insn *insn)
+static void mark_dynptr_cb_reg(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ enum bpf_dynptr_type type)
{
- return BPF_CLASS(insn->code) == BPF_STX &&
- BPF_MODE(insn->code) == BPF_ATOMIC &&
- insn->imm == BPF_CMPXCHG;
+ __mark_dynptr_reg(reg, type, true, ++env->id_gen);
}
-/* string representation of 'enum bpf_reg_type'
- *
- * Note that reg_type_str() can not appear more than once in a single verbose()
- * statement.
- */
-static const char *reg_type_str(struct bpf_verifier_env *env,
- enum bpf_reg_type type)
-{
- char postfix[16] = {0}, prefix[16] = {0};
- static const char * const str[] = {
- [NOT_INIT] = "?",
- [SCALAR_VALUE] = "inv",
- [PTR_TO_CTX] = "ctx",
- [CONST_PTR_TO_MAP] = "map_ptr",
- [PTR_TO_MAP_VALUE] = "map_value",
- [PTR_TO_STACK] = "fp",
- [PTR_TO_PACKET] = "pkt",
- [PTR_TO_PACKET_META] = "pkt_meta",
- [PTR_TO_PACKET_END] = "pkt_end",
- [PTR_TO_FLOW_KEYS] = "flow_keys",
- [PTR_TO_SOCKET] = "sock",
- [PTR_TO_SOCK_COMMON] = "sock_common",
- [PTR_TO_TCP_SOCK] = "tcp_sock",
- [PTR_TO_TP_BUFFER] = "tp_buffer",
- [PTR_TO_XDP_SOCK] = "xdp_sock",
- [PTR_TO_BTF_ID] = "ptr_",
- [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_",
- [PTR_TO_MEM] = "mem",
- [PTR_TO_BUF] = "buf",
- [PTR_TO_FUNC] = "func",
- [PTR_TO_MAP_KEY] = "map_key",
- };
+static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
+ struct bpf_func_state *state, int spi);
+
+static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ enum bpf_arg_type arg_type, int insn_idx, int clone_ref_obj_id)
+{
+ struct bpf_func_state *state = func(env, reg);
+ enum bpf_dynptr_type type;
+ int spi, i, err;
+
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+
+ /* We cannot assume both spi and spi - 1 belong to the same dynptr,
+ * hence we need to call destroy_if_dynptr_stack_slot twice for both,
+ * to ensure that for the following example:
+ * [d1][d1][d2][d2]
+ * spi 3 2 1 0
+ * So marking spi = 2 should lead to destruction of both d1 and d2. In
+ * case they do belong to same dynptr, second call won't see slot_type
+ * as STACK_DYNPTR and will simply skip destruction.
+ */
+ err = destroy_if_dynptr_stack_slot(env, state, spi);
+ if (err)
+ return err;
+ err = destroy_if_dynptr_stack_slot(env, state, spi - 1);
+ if (err)
+ return err;
+
+ for (i = 0; i < BPF_REG_SIZE; i++) {
+ state->stack[spi].slot_type[i] = STACK_DYNPTR;
+ state->stack[spi - 1].slot_type[i] = STACK_DYNPTR;
+ }
+
+ type = arg_to_dynptr_type(arg_type);
+ if (type == BPF_DYNPTR_TYPE_INVALID)
+ return -EINVAL;
- if (type & PTR_MAYBE_NULL) {
- if (base_type(type) == PTR_TO_BTF_ID ||
- base_type(type) == PTR_TO_PERCPU_BTF_ID)
- strncpy(postfix, "or_null_", 16);
+ mark_dynptr_stack_regs(env, &state->stack[spi].spilled_ptr,
+ &state->stack[spi - 1].spilled_ptr, type);
+
+ if (dynptr_type_refcounted(type)) {
+ /* The id is used to track proper releasing */
+ int id;
+
+ if (clone_ref_obj_id)
+ id = clone_ref_obj_id;
else
- strncpy(postfix, "_or_null", 16);
+ id = acquire_reference(env, insn_idx);
+
+ if (id < 0)
+ return id;
+
+ state->stack[spi].spilled_ptr.ref_obj_id = id;
+ state->stack[spi - 1].spilled_ptr.ref_obj_id = id;
}
- if (type & MEM_RDONLY)
- strncpy(prefix, "rdonly_", 16);
- if (type & MEM_ALLOC)
- strncpy(prefix, "alloc_", 16);
+ bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi));
- snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s",
- prefix, str[base_type(type)], postfix);
- return env->type_str_buf;
+ return 0;
}
-static char slot_type_char[] = {
- [STACK_INVALID] = '?',
- [STACK_SPILL] = 'r',
- [STACK_MISC] = 'm',
- [STACK_ZERO] = '0',
-};
+static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi)
+{
+ int i;
+
+ for (i = 0; i < BPF_REG_SIZE; i++) {
+ state->stack[spi].slot_type[i] = STACK_INVALID;
+ state->stack[spi - 1].slot_type[i] = STACK_INVALID;
+ }
+
+ __mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
+ __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
-static void print_liveness(struct bpf_verifier_env *env,
- enum bpf_reg_liveness live)
+ bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi));
+}
+
+static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
- if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE))
- verbose(env, "_");
- if (live & REG_LIVE_READ)
- verbose(env, "r");
- if (live & REG_LIVE_WRITTEN)
- verbose(env, "w");
- if (live & REG_LIVE_DONE)
- verbose(env, "D");
+ struct bpf_func_state *state = func(env, reg);
+ int spi, ref_obj_id, i;
+
+ /*
+ * This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot
+ * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr
+ * is safe to do directly.
+ */
+ if (reg->type == CONST_PTR_TO_DYNPTR) {
+ verifier_bug(env, "CONST_PTR_TO_DYNPTR cannot be released");
+ return -EFAULT;
+ }
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+
+ if (!dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) {
+ invalidate_dynptr(env, state, spi);
+ return 0;
+ }
+
+ ref_obj_id = state->stack[spi].spilled_ptr.ref_obj_id;
+
+ /* If the dynptr has a ref_obj_id, then we need to invalidate
+ * two things:
+ *
+ * 1) Any dynptrs with a matching ref_obj_id (clones)
+ * 2) Any slices derived from this dynptr.
+ */
+
+ /* Invalidate any slices associated with this dynptr */
+ WARN_ON_ONCE(release_reference(env, ref_obj_id));
+
+ /* Invalidate any dynptr clones */
+ for (i = 1; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].spilled_ptr.ref_obj_id != ref_obj_id)
+ continue;
+
+ /* it should always be the case that if the ref obj id
+ * matches then the stack slot also belongs to a
+ * dynptr
+ */
+ if (state->stack[i].slot_type[0] != STACK_DYNPTR) {
+ verifier_bug(env, "misconfigured ref_obj_id");
+ return -EFAULT;
+ }
+ if (state->stack[i].spilled_ptr.dynptr.first_slot)
+ invalidate_dynptr(env, state, i);
+ }
+
+ return 0;
}
-static struct bpf_func_state *func(struct bpf_verifier_env *env,
- const struct bpf_reg_state *reg)
+static void __mark_reg_unknown(const struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg);
+
+static void mark_reg_invalid(const struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
- struct bpf_verifier_state *cur = env->cur_state;
+ if (!env->allow_ptr_leaks)
+ __mark_reg_not_init(env, reg);
+ else
+ __mark_reg_unknown(env, reg);
+}
- return cur->frame[reg->frameno];
+static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
+ struct bpf_func_state *state, int spi)
+{
+ struct bpf_func_state *fstate;
+ struct bpf_reg_state *dreg;
+ int i, dynptr_id;
+
+ /* We always ensure that STACK_DYNPTR is never set partially,
+ * hence just checking for slot_type[0] is enough. This is
+ * different for STACK_SPILL, where it may be only set for
+ * 1 byte, so code has to use is_spilled_reg.
+ */
+ if (state->stack[spi].slot_type[0] != STACK_DYNPTR)
+ return 0;
+
+ /* Reposition spi to first slot */
+ if (!state->stack[spi].spilled_ptr.dynptr.first_slot)
+ spi = spi + 1;
+
+ if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) {
+ verbose(env, "cannot overwrite referenced dynptr\n");
+ return -EINVAL;
+ }
+
+ mark_stack_slot_scratched(env, spi);
+ mark_stack_slot_scratched(env, spi - 1);
+
+ /* Writing partially to one dynptr stack slot destroys both. */
+ for (i = 0; i < BPF_REG_SIZE; i++) {
+ state->stack[spi].slot_type[i] = STACK_INVALID;
+ state->stack[spi - 1].slot_type[i] = STACK_INVALID;
+ }
+
+ dynptr_id = state->stack[spi].spilled_ptr.id;
+ /* Invalidate any slices associated with this dynptr */
+ bpf_for_each_reg_in_vstate(env->cur_state, fstate, dreg, ({
+ /* Dynptr slices are only PTR_TO_MEM_OR_NULL and PTR_TO_MEM */
+ if (dreg->type != (PTR_TO_MEM | PTR_MAYBE_NULL) && dreg->type != PTR_TO_MEM)
+ continue;
+ if (dreg->dynptr_id == dynptr_id)
+ mark_reg_invalid(env, dreg);
+ }));
+
+ /* Do not release reference state, we are destroying dynptr on stack,
+ * not using some helper to release it. Just reset register.
+ */
+ __mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
+ __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
+
+ bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi));
+
+ return 0;
}
-static const char *kernel_type_name(const struct btf* btf, u32 id)
+static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
- return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
+ int spi;
+
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return false;
+
+ spi = dynptr_get_spi(env, reg);
+
+ /* -ERANGE (i.e. spi not falling into allocated stack slots) isn't an
+ * error because this just means the stack state hasn't been updated yet.
+ * We will do check_mem_access to check and update stack bounds later.
+ */
+ if (spi < 0 && spi != -ERANGE)
+ return false;
+
+ /* We don't need to check if the stack slots are marked by previous
+ * dynptr initializations because we allow overwriting existing unreferenced
+ * STACK_DYNPTR slots, see mark_stack_slots_dynptr which calls
+ * destroy_if_dynptr_stack_slot to ensure dynptr objects at the slots we are
+ * touching are completely destructed before we reinitialize them for a new
+ * one. For referenced ones, destroy_if_dynptr_stack_slot returns an error early
+ * instead of delaying it until the end where the user will get "Unreleased
+ * reference" error.
+ */
+ return true;
+}
+
+static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int i, spi;
+
+ /* This already represents first slot of initialized bpf_dynptr.
+ *
+ * CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to
+ * check_func_arg_reg_off's logic, so we don't need to check its
+ * offset and alignment.
+ */
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return true;
+
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return false;
+ if (!state->stack[spi].spilled_ptr.dynptr.first_slot)
+ return false;
+
+ for (i = 0; i < BPF_REG_SIZE; i++) {
+ if (state->stack[spi].slot_type[i] != STACK_DYNPTR ||
+ state->stack[spi - 1].slot_type[i] != STACK_DYNPTR)
+ return false;
+ }
+
+ return true;
}
-static void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno)
+static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ enum bpf_arg_type arg_type)
{
- env->scratched_regs |= 1U << regno;
+ struct bpf_func_state *state = func(env, reg);
+ enum bpf_dynptr_type dynptr_type;
+ int spi;
+
+ /* ARG_PTR_TO_DYNPTR takes any type of dynptr */
+ if (arg_type == ARG_PTR_TO_DYNPTR)
+ return true;
+
+ dynptr_type = arg_to_dynptr_type(arg_type);
+ if (reg->type == CONST_PTR_TO_DYNPTR) {
+ return reg->dynptr.type == dynptr_type;
+ } else {
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return false;
+ return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type;
+ }
}
-static void mark_stack_slot_scratched(struct bpf_verifier_env *env, u32 spi)
+static void __mark_reg_known_zero(struct bpf_reg_state *reg);
+
+static bool in_rcu_cs(struct bpf_verifier_env *env);
+
+static bool is_kfunc_rcu_protected(struct bpf_kfunc_call_arg_meta *meta);
+
+static int mark_stack_slots_iter(struct bpf_verifier_env *env,
+ struct bpf_kfunc_call_arg_meta *meta,
+ struct bpf_reg_state *reg, int insn_idx,
+ struct btf *btf, u32 btf_id, int nr_slots)
{
- env->scratched_stack_slots |= 1ULL << spi;
+ struct bpf_func_state *state = func(env, reg);
+ int spi, i, j, id;
+
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi < 0)
+ return spi;
+
+ id = acquire_reference(env, insn_idx);
+ if (id < 0)
+ return id;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_stack_state *slot = &state->stack[spi - i];
+ struct bpf_reg_state *st = &slot->spilled_ptr;
+
+ __mark_reg_known_zero(st);
+ st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
+ if (is_kfunc_rcu_protected(meta)) {
+ if (in_rcu_cs(env))
+ st->type |= MEM_RCU;
+ else
+ st->type |= PTR_UNTRUSTED;
+ }
+ st->ref_obj_id = i == 0 ? id : 0;
+ st->iter.btf = btf;
+ st->iter.btf_id = btf_id;
+ st->iter.state = BPF_ITER_STATE_ACTIVE;
+ st->iter.depth = 0;
+
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ slot->slot_type[j] = STACK_ITER;
+
+ bpf_mark_stack_write(env, state->frameno, BIT(spi - i));
+ mark_stack_slot_scratched(env, spi - i);
+ }
+
+ return 0;
+}
+
+static int unmark_stack_slots_iter(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, int nr_slots)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, i, j;
+
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi < 0)
+ return spi;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_stack_state *slot = &state->stack[spi - i];
+ struct bpf_reg_state *st = &slot->spilled_ptr;
+
+ if (i == 0)
+ WARN_ON_ONCE(release_reference(env, st->ref_obj_id));
+
+ __mark_reg_not_init(env, st);
+
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ slot->slot_type[j] = STACK_INVALID;
+
+ bpf_mark_stack_write(env, state->frameno, BIT(spi - i));
+ mark_stack_slot_scratched(env, spi - i);
+ }
+
+ return 0;
+}
+
+static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, int nr_slots)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, i, j;
+
+ /* For -ERANGE (i.e. spi not falling into allocated stack slots), we
+ * will do check_mem_access to check and update stack bounds later, so
+ * return true for that case.
+ */
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi == -ERANGE)
+ return true;
+ if (spi < 0)
+ return false;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_stack_state *slot = &state->stack[spi - i];
+
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ if (slot->slot_type[j] == STACK_ITER)
+ return false;
+ }
+
+ return true;
+}
+
+static int is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ struct btf *btf, u32 btf_id, int nr_slots)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, i, j;
+
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi < 0)
+ return -EINVAL;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_stack_state *slot = &state->stack[spi - i];
+ struct bpf_reg_state *st = &slot->spilled_ptr;
+
+ if (st->type & PTR_UNTRUSTED)
+ return -EPROTO;
+ /* only main (first) slot has ref_obj_id set */
+ if (i == 0 && !st->ref_obj_id)
+ return -EINVAL;
+ if (i != 0 && st->ref_obj_id)
+ return -EINVAL;
+ if (st->iter.btf != btf || st->iter.btf_id != btf_id)
+ return -EINVAL;
+
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ if (slot->slot_type[j] != STACK_ITER)
+ return -EINVAL;
+ }
+
+ return 0;
}
-static bool reg_scratched(const struct bpf_verifier_env *env, u32 regno)
+static int acquire_irq_state(struct bpf_verifier_env *env, int insn_idx);
+static int release_irq_state(struct bpf_verifier_state *state, int id);
+
+static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env,
+ struct bpf_kfunc_call_arg_meta *meta,
+ struct bpf_reg_state *reg, int insn_idx,
+ int kfunc_class)
{
- return (env->scratched_regs >> regno) & 1;
+ struct bpf_func_state *state = func(env, reg);
+ struct bpf_stack_state *slot;
+ struct bpf_reg_state *st;
+ int spi, i, id;
+
+ spi = irq_flag_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+
+ id = acquire_irq_state(env, insn_idx);
+ if (id < 0)
+ return id;
+
+ slot = &state->stack[spi];
+ st = &slot->spilled_ptr;
+
+ bpf_mark_stack_write(env, reg->frameno, BIT(spi));
+ __mark_reg_known_zero(st);
+ st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
+ st->ref_obj_id = id;
+ st->irq.kfunc_class = kfunc_class;
+
+ for (i = 0; i < BPF_REG_SIZE; i++)
+ slot->slot_type[i] = STACK_IRQ_FLAG;
+
+ mark_stack_slot_scratched(env, spi);
+ return 0;
}
-static bool stack_slot_scratched(const struct bpf_verifier_env *env, u64 regno)
+static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ int kfunc_class)
{
- return (env->scratched_stack_slots >> regno) & 1;
+ struct bpf_func_state *state = func(env, reg);
+ struct bpf_stack_state *slot;
+ struct bpf_reg_state *st;
+ int spi, i, err;
+
+ spi = irq_flag_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+
+ slot = &state->stack[spi];
+ st = &slot->spilled_ptr;
+
+ if (st->irq.kfunc_class != kfunc_class) {
+ const char *flag_kfunc = st->irq.kfunc_class == IRQ_NATIVE_KFUNC ? "native" : "lock";
+ const char *used_kfunc = kfunc_class == IRQ_NATIVE_KFUNC ? "native" : "lock";
+
+ verbose(env, "irq flag acquired by %s kfuncs cannot be restored with %s kfuncs\n",
+ flag_kfunc, used_kfunc);
+ return -EINVAL;
+ }
+
+ err = release_irq_state(env->cur_state, st->ref_obj_id);
+ WARN_ON_ONCE(err && err != -EACCES);
+ if (err) {
+ int insn_idx = 0;
+
+ for (int i = 0; i < env->cur_state->acquired_refs; i++) {
+ if (env->cur_state->refs[i].id == env->cur_state->active_irq_id) {
+ insn_idx = env->cur_state->refs[i].insn_idx;
+ break;
+ }
+ }
+
+ verbose(env, "cannot restore irq state out of order, expected id=%d acquired at insn_idx=%d\n",
+ env->cur_state->active_irq_id, insn_idx);
+ return err;
+ }
+
+ __mark_reg_not_init(env, st);
+
+ bpf_mark_stack_write(env, reg->frameno, BIT(spi));
+
+ for (i = 0; i < BPF_REG_SIZE; i++)
+ slot->slot_type[i] = STACK_INVALID;
+
+ mark_stack_slot_scratched(env, spi);
+ return 0;
}
-static bool verifier_state_scratched(const struct bpf_verifier_env *env)
+static bool is_irq_flag_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
- return env->scratched_regs || env->scratched_stack_slots;
+ struct bpf_func_state *state = func(env, reg);
+ struct bpf_stack_state *slot;
+ int spi, i;
+
+ /* For -ERANGE (i.e. spi not falling into allocated stack slots), we
+ * will do check_mem_access to check and update stack bounds later, so
+ * return true for that case.
+ */
+ spi = irq_flag_get_spi(env, reg);
+ if (spi == -ERANGE)
+ return true;
+ if (spi < 0)
+ return false;
+
+ slot = &state->stack[spi];
+
+ for (i = 0; i < BPF_REG_SIZE; i++)
+ if (slot->slot_type[i] == STACK_IRQ_FLAG)
+ return false;
+ return true;
}
-static void mark_verifier_state_clean(struct bpf_verifier_env *env)
+static int is_irq_flag_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
- env->scratched_regs = 0U;
- env->scratched_stack_slots = 0ULL;
+ struct bpf_func_state *state = func(env, reg);
+ struct bpf_stack_state *slot;
+ struct bpf_reg_state *st;
+ int spi, i;
+
+ spi = irq_flag_get_spi(env, reg);
+ if (spi < 0)
+ return -EINVAL;
+
+ slot = &state->stack[spi];
+ st = &slot->spilled_ptr;
+
+ if (!st->ref_obj_id)
+ return -EINVAL;
+
+ for (i = 0; i < BPF_REG_SIZE; i++)
+ if (slot->slot_type[i] != STACK_IRQ_FLAG)
+ return -EINVAL;
+ return 0;
}
-/* Used for printing the entire verifier state. */
-static void mark_verifier_state_scratched(struct bpf_verifier_env *env)
+/* Check if given stack slot is "special":
+ * - spilled register state (STACK_SPILL);
+ * - dynptr state (STACK_DYNPTR);
+ * - iter state (STACK_ITER).
+ * - irq flag state (STACK_IRQ_FLAG)
+ */
+static bool is_stack_slot_special(const struct bpf_stack_state *stack)
{
- env->scratched_regs = ~0U;
- env->scratched_stack_slots = ~0ULL;
+ enum bpf_stack_slot_type type = stack->slot_type[BPF_REG_SIZE - 1];
+
+ switch (type) {
+ case STACK_SPILL:
+ case STACK_DYNPTR:
+ case STACK_ITER:
+ case STACK_IRQ_FLAG:
+ return true;
+ case STACK_INVALID:
+ case STACK_MISC:
+ case STACK_ZERO:
+ return false;
+ default:
+ WARN_ONCE(1, "unknown stack slot type %d\n", type);
+ return true;
+ }
}
/* The reg state of a pointer or a bounded scalar was saved when
@@ -657,163 +1332,40 @@ static bool is_spilled_reg(const struct bpf_stack_state *stack)
return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL;
}
-static void scrub_spilled_slot(u8 *stype)
+static bool is_spilled_scalar_reg(const struct bpf_stack_state *stack)
{
- if (*stype != STACK_INVALID)
- *stype = STACK_MISC;
+ return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL &&
+ stack->spilled_ptr.type == SCALAR_VALUE;
}
-static void print_verifier_state(struct bpf_verifier_env *env,
- const struct bpf_func_state *state,
- bool print_all)
+static bool is_spilled_scalar_reg64(const struct bpf_stack_state *stack)
{
- const struct bpf_reg_state *reg;
- enum bpf_reg_type t;
- int i;
-
- if (state->frameno)
- verbose(env, " frame%d:", state->frameno);
- for (i = 0; i < MAX_BPF_REG; i++) {
- reg = &state->regs[i];
- t = reg->type;
- if (t == NOT_INIT)
- continue;
- if (!print_all && !reg_scratched(env, i))
- continue;
- verbose(env, " R%d", i);
- print_liveness(env, reg->live);
- verbose(env, "=%s", reg_type_str(env, t));
- if (t == SCALAR_VALUE && reg->precise)
- verbose(env, "P");
- if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
- tnum_is_const(reg->var_off)) {
- /* reg->off should be 0 for SCALAR_VALUE */
- verbose(env, "%lld", reg->var_off.value + reg->off);
- } else {
- if (base_type(t) == PTR_TO_BTF_ID ||
- base_type(t) == PTR_TO_PERCPU_BTF_ID)
- verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id));
- verbose(env, "(id=%d", reg->id);
- if (reg_type_may_be_refcounted_or_null(t))
- verbose(env, ",ref_obj_id=%d", reg->ref_obj_id);
- if (t != SCALAR_VALUE)
- verbose(env, ",off=%d", reg->off);
- if (type_is_pkt_pointer(t))
- verbose(env, ",r=%d", reg->range);
- else if (base_type(t) == CONST_PTR_TO_MAP ||
- base_type(t) == PTR_TO_MAP_KEY ||
- base_type(t) == PTR_TO_MAP_VALUE)
- verbose(env, ",ks=%d,vs=%d",
- reg->map_ptr->key_size,
- reg->map_ptr->value_size);
- if (tnum_is_const(reg->var_off)) {
- /* Typically an immediate SCALAR_VALUE, but
- * could be a pointer whose offset is too big
- * for reg->off
- */
- verbose(env, ",imm=%llx", reg->var_off.value);
- } else {
- if (reg->smin_value != reg->umin_value &&
- reg->smin_value != S64_MIN)
- verbose(env, ",smin_value=%lld",
- (long long)reg->smin_value);
- if (reg->smax_value != reg->umax_value &&
- reg->smax_value != S64_MAX)
- verbose(env, ",smax_value=%lld",
- (long long)reg->smax_value);
- if (reg->umin_value != 0)
- verbose(env, ",umin_value=%llu",
- (unsigned long long)reg->umin_value);
- if (reg->umax_value != U64_MAX)
- verbose(env, ",umax_value=%llu",
- (unsigned long long)reg->umax_value);
- if (!tnum_is_unknown(reg->var_off)) {
- char tn_buf[48];
-
- tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, ",var_off=%s", tn_buf);
- }
- if (reg->s32_min_value != reg->smin_value &&
- reg->s32_min_value != S32_MIN)
- verbose(env, ",s32_min_value=%d",
- (int)(reg->s32_min_value));
- if (reg->s32_max_value != reg->smax_value &&
- reg->s32_max_value != S32_MAX)
- verbose(env, ",s32_max_value=%d",
- (int)(reg->s32_max_value));
- if (reg->u32_min_value != reg->umin_value &&
- reg->u32_min_value != U32_MIN)
- verbose(env, ",u32_min_value=%d",
- (int)(reg->u32_min_value));
- if (reg->u32_max_value != reg->umax_value &&
- reg->u32_max_value != U32_MAX)
- verbose(env, ",u32_max_value=%d",
- (int)(reg->u32_max_value));
- }
- verbose(env, ")");
- }
- }
- for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
- char types_buf[BPF_REG_SIZE + 1];
- bool valid = false;
- int j;
-
- for (j = 0; j < BPF_REG_SIZE; j++) {
- if (state->stack[i].slot_type[j] != STACK_INVALID)
- valid = true;
- types_buf[j] = slot_type_char[
- state->stack[i].slot_type[j]];
- }
- types_buf[BPF_REG_SIZE] = 0;
- if (!valid)
- continue;
- if (!print_all && !stack_slot_scratched(env, i))
- continue;
- verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
- print_liveness(env, state->stack[i].spilled_ptr.live);
- if (is_spilled_reg(&state->stack[i])) {
- reg = &state->stack[i].spilled_ptr;
- t = reg->type;
- verbose(env, "=%s", reg_type_str(env, t));
- if (t == SCALAR_VALUE && reg->precise)
- verbose(env, "P");
- if (t == SCALAR_VALUE && tnum_is_const(reg->var_off))
- verbose(env, "%lld", reg->var_off.value + reg->off);
- } else {
- verbose(env, "=%s", types_buf);
- }
- }
- if (state->acquired_refs && state->refs[0].id) {
- verbose(env, " refs=%d", state->refs[0].id);
- for (i = 1; i < state->acquired_refs; i++)
- if (state->refs[i].id)
- verbose(env, ",%d", state->refs[i].id);
- }
- if (state->in_callback_fn)
- verbose(env, " cb");
- if (state->in_async_callback_fn)
- verbose(env, " async_cb");
- verbose(env, "\n");
- mark_verifier_state_clean(env);
+ return stack->slot_type[0] == STACK_SPILL &&
+ stack->spilled_ptr.type == SCALAR_VALUE;
}
-static inline u32 vlog_alignment(u32 pos)
+/* Mark stack slot as STACK_MISC, unless it is already STACK_INVALID, in which
+ * case they are equivalent, or it's STACK_ZERO, in which case we preserve
+ * more precise STACK_ZERO.
+ * Regardless of allow_ptr_leaks setting (i.e., privileged or unprivileged
+ * mode), we won't promote STACK_INVALID to STACK_MISC. In privileged case it is
+ * unnecessary as both are considered equivalent when loading data and pruning,
+ * in case of unprivileged mode it will be incorrect to allow reads of invalid
+ * slots.
+ */
+static void mark_stack_slot_misc(struct bpf_verifier_env *env, u8 *stype)
{
- return round_up(max(pos + BPF_LOG_MIN_ALIGNMENT / 2, BPF_LOG_ALIGNMENT),
- BPF_LOG_MIN_ALIGNMENT) - pos - 1;
+ if (*stype == STACK_ZERO)
+ return;
+ if (*stype == STACK_INVALID)
+ return;
+ *stype = STACK_MISC;
}
-static void print_insn_state(struct bpf_verifier_env *env,
- const struct bpf_func_state *state)
+static void scrub_spilled_slot(u8 *stype)
{
- if (env->prev_log_len && env->prev_log_len == env->log.len_used) {
- /* remove new line character */
- bpf_vlog_reset(&env->log, env->prev_log_len - 1);
- verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_len), ' ');
- } else {
- verbose(env, "%d:", env->insn_idx);
- }
- print_verifier_state(env, state, false);
+ if (*stype != STACK_INVALID)
+ *stype = STACK_MISC;
}
/* copy array src of length n * size bytes to dst. dst is reallocated if it's too
@@ -825,6 +1377,8 @@ static void print_insn_state(struct bpf_verifier_env *env,
*/
static void *copy_array(void *dst, const void *src, size_t n, size_t size, gfp_t flags)
{
+ size_t alloc_bytes;
+ void *orig = dst;
size_t bytes;
if (ZERO_OR_NULL_PTR(src))
@@ -833,11 +1387,11 @@ static void *copy_array(void *dst, const void *src, size_t n, size_t size, gfp_t
if (unlikely(check_mul_overflow(n, size, &bytes)))
return NULL;
- if (ksize(dst) < bytes) {
- kfree(dst);
- dst = kmalloc_track_caller(bytes, flags);
- if (!dst)
- return NULL;
+ alloc_bytes = max(ksize(orig), kmalloc_size_roundup(bytes));
+ dst = krealloc(orig, alloc_bytes, flags);
+ if (!dst) {
+ kfree(orig);
+ return NULL;
}
memcpy(dst, src, bytes);
@@ -852,12 +1406,19 @@ out:
*/
static void *realloc_array(void *arr, size_t old_n, size_t new_n, size_t size)
{
+ size_t alloc_size;
+ void *new_arr;
+
if (!new_n || old_n == new_n)
goto out;
- arr = krealloc_array(arr, new_n, size, GFP_KERNEL);
- if (!arr)
+ alloc_size = kmalloc_size_roundup(size_mul(new_n, size));
+ new_arr = krealloc(arr, alloc_size, GFP_KERNEL_ACCOUNT);
+ if (!new_arr) {
+ kfree(arr);
return NULL;
+ }
+ arr = new_arr;
if (new_n > old_n)
memset(arr + old_n * size, 0, (new_n - old_n) * size);
@@ -866,14 +1427,20 @@ out:
return arr ? arr : ZERO_SIZE_PTR;
}
-static int copy_reference_state(struct bpf_func_state *dst, const struct bpf_func_state *src)
+static int copy_reference_state(struct bpf_verifier_state *dst, const struct bpf_verifier_state *src)
{
dst->refs = copy_array(dst->refs, src->refs, src->acquired_refs,
- sizeof(struct bpf_reference_state), GFP_KERNEL);
+ sizeof(struct bpf_reference_state), GFP_KERNEL_ACCOUNT);
if (!dst->refs)
return -ENOMEM;
dst->acquired_refs = src->acquired_refs;
+ dst->active_locks = src->active_locks;
+ dst->active_preempt_locks = src->active_preempt_locks;
+ dst->active_rcu_locks = src->active_rcu_locks;
+ dst->active_irq_id = src->active_irq_id;
+ dst->active_lock_id = src->active_lock_id;
+ dst->active_lock_ptr = src->active_lock_ptr;
return 0;
}
@@ -882,7 +1449,7 @@ static int copy_stack_state(struct bpf_func_state *dst, const struct bpf_func_st
size_t n = src->allocated_stack / BPF_REG_SIZE;
dst->stack = copy_array(dst->stack, src->stack, n, sizeof(struct bpf_stack_state),
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (!dst->stack)
return -ENOMEM;
@@ -890,7 +1457,7 @@ static int copy_stack_state(struct bpf_func_state *dst, const struct bpf_func_st
return 0;
}
-static int resize_reference_state(struct bpf_func_state *state, size_t n)
+static int resize_reference_state(struct bpf_verifier_state *state, size_t n)
{
state->refs = realloc_array(state->refs, state->acquired_refs, n,
sizeof(struct bpf_reference_state));
@@ -901,9 +1468,16 @@ static int resize_reference_state(struct bpf_func_state *state, size_t n)
return 0;
}
-static int grow_stack_state(struct bpf_func_state *state, int size)
+/* Possibly update state->allocated_stack to be at least size bytes. Also
+ * possibly update the function's high-water mark in its bpf_subprog_info.
+ */
+static int grow_stack_state(struct bpf_verifier_env *env, struct bpf_func_state *state, int size)
{
- size_t old_n = state->allocated_stack / BPF_REG_SIZE, n = size / BPF_REG_SIZE;
+ size_t old_n = state->allocated_stack / BPF_REG_SIZE, n;
+
+ /* The stack size is always a multiple of BPF_REG_SIZE. */
+ size = round_up(size, BPF_REG_SIZE);
+ n = size / BPF_REG_SIZE;
if (old_n >= n)
return 0;
@@ -913,6 +1487,11 @@ static int grow_stack_state(struct bpf_func_state *state, int size)
return -ENOMEM;
state->allocated_stack = size;
+
+ /* update known max for given subprogram */
+ if (env->subprog_info[state->subprogno].stack_depth < size)
+ env->subprog_info[state->subprogno].stack_depth = size;
+
return 0;
}
@@ -921,46 +1500,171 @@ static int grow_stack_state(struct bpf_func_state *state, int size)
* On success, returns a valid pointer id to associate with the register
* On failure, returns a negative errno.
*/
-static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
+static struct bpf_reference_state *acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
{
- struct bpf_func_state *state = cur_func(env);
+ struct bpf_verifier_state *state = env->cur_state;
int new_ofs = state->acquired_refs;
- int id, err;
+ int err;
err = resize_reference_state(state, state->acquired_refs + 1);
if (err)
- return err;
- id = ++env->id_gen;
- state->refs[new_ofs].id = id;
+ return NULL;
state->refs[new_ofs].insn_idx = insn_idx;
- return id;
+ return &state->refs[new_ofs];
+}
+
+static int acquire_reference(struct bpf_verifier_env *env, int insn_idx)
+{
+ struct bpf_reference_state *s;
+
+ s = acquire_reference_state(env, insn_idx);
+ if (!s)
+ return -ENOMEM;
+ s->type = REF_TYPE_PTR;
+ s->id = ++env->id_gen;
+ return s->id;
+}
+
+static int acquire_lock_state(struct bpf_verifier_env *env, int insn_idx, enum ref_state_type type,
+ int id, void *ptr)
+{
+ struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_reference_state *s;
+
+ s = acquire_reference_state(env, insn_idx);
+ if (!s)
+ return -ENOMEM;
+ s->type = type;
+ s->id = id;
+ s->ptr = ptr;
+
+ state->active_locks++;
+ state->active_lock_id = id;
+ state->active_lock_ptr = ptr;
+ return 0;
+}
+
+static int acquire_irq_state(struct bpf_verifier_env *env, int insn_idx)
+{
+ struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_reference_state *s;
+
+ s = acquire_reference_state(env, insn_idx);
+ if (!s)
+ return -ENOMEM;
+ s->type = REF_TYPE_IRQ;
+ s->id = ++env->id_gen;
+
+ state->active_irq_id = s->id;
+ return s->id;
}
-/* release function corresponding to acquire_reference_state(). Idempotent. */
-static int release_reference_state(struct bpf_func_state *state, int ptr_id)
+static void release_reference_state(struct bpf_verifier_state *state, int idx)
{
- int i, last_idx;
+ int last_idx;
+ size_t rem;
+ /* IRQ state requires the relative ordering of elements remaining the
+ * same, since it relies on the refs array to behave as a stack, so that
+ * it can detect out-of-order IRQ restore. Hence use memmove to shift
+ * the array instead of swapping the final element into the deleted idx.
+ */
last_idx = state->acquired_refs - 1;
+ rem = state->acquired_refs - idx - 1;
+ if (last_idx && idx != last_idx)
+ memmove(&state->refs[idx], &state->refs[idx + 1], sizeof(*state->refs) * rem);
+ memset(&state->refs[last_idx], 0, sizeof(*state->refs));
+ state->acquired_refs--;
+ return;
+}
+
+static bool find_reference_state(struct bpf_verifier_state *state, int ptr_id)
+{
+ int i;
+
+ for (i = 0; i < state->acquired_refs; i++)
+ if (state->refs[i].id == ptr_id)
+ return true;
+
+ return false;
+}
+
+static int release_lock_state(struct bpf_verifier_state *state, int type, int id, void *ptr)
+{
+ void *prev_ptr = NULL;
+ u32 prev_id = 0;
+ int i;
+
+ for (i = 0; i < state->acquired_refs; i++) {
+ if (state->refs[i].type == type && state->refs[i].id == id &&
+ state->refs[i].ptr == ptr) {
+ release_reference_state(state, i);
+ state->active_locks--;
+ /* Reassign active lock (id, ptr). */
+ state->active_lock_id = prev_id;
+ state->active_lock_ptr = prev_ptr;
+ return 0;
+ }
+ if (state->refs[i].type & REF_TYPE_LOCK_MASK) {
+ prev_id = state->refs[i].id;
+ prev_ptr = state->refs[i].ptr;
+ }
+ }
+ return -EINVAL;
+}
+
+static int release_irq_state(struct bpf_verifier_state *state, int id)
+{
+ u32 prev_id = 0;
+ int i;
+
+ if (id != state->active_irq_id)
+ return -EACCES;
+
for (i = 0; i < state->acquired_refs; i++) {
- if (state->refs[i].id == ptr_id) {
- if (last_idx && i != last_idx)
- memcpy(&state->refs[i], &state->refs[last_idx],
- sizeof(*state->refs));
- memset(&state->refs[last_idx], 0, sizeof(*state->refs));
- state->acquired_refs--;
+ if (state->refs[i].type != REF_TYPE_IRQ)
+ continue;
+ if (state->refs[i].id == id) {
+ release_reference_state(state, i);
+ state->active_irq_id = prev_id;
return 0;
+ } else {
+ prev_id = state->refs[i].id;
}
}
return -EINVAL;
}
+static struct bpf_reference_state *find_lock_state(struct bpf_verifier_state *state, enum ref_state_type type,
+ int id, void *ptr)
+{
+ int i;
+
+ for (i = 0; i < state->acquired_refs; i++) {
+ struct bpf_reference_state *s = &state->refs[i];
+
+ if (!(s->type & type))
+ continue;
+
+ if (s->id == id && s->ptr == ptr)
+ return s;
+ }
+ return NULL;
+}
+
+static void update_peak_states(struct bpf_verifier_env *env)
+{
+ u32 cur_states;
+
+ cur_states = env->explored_states_size + env->free_list_size + env->num_backedges;
+ env->peak_states = max(env->peak_states, cur_states);
+}
+
static void free_func_state(struct bpf_func_state *state)
{
if (!state)
return;
- kfree(state->refs);
kfree(state->stack);
kfree(state);
}
@@ -981,23 +1685,50 @@ static void free_verifier_state(struct bpf_verifier_state *state,
free_func_state(state->frame[i]);
state->frame[i] = NULL;
}
+ kfree(state->refs);
clear_jmp_history(state);
if (free_self)
kfree(state);
}
+/* struct bpf_verifier_state->parent refers to states
+ * that are in either of env->{expored_states,free_list}.
+ * In both cases the state is contained in struct bpf_verifier_state_list.
+ */
+static struct bpf_verifier_state_list *state_parent_as_list(struct bpf_verifier_state *st)
+{
+ if (st->parent)
+ return container_of(st->parent, struct bpf_verifier_state_list, state);
+ return NULL;
+}
+
+static bool incomplete_read_marks(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st);
+
+/* A state can be freed if it is no longer referenced:
+ * - is in the env->free_list;
+ * - has no children states;
+ */
+static void maybe_free_verifier_state(struct bpf_verifier_env *env,
+ struct bpf_verifier_state_list *sl)
+{
+ if (!sl->in_free_list
+ || sl->state.branches != 0
+ || incomplete_read_marks(env, &sl->state))
+ return;
+ list_del(&sl->node);
+ free_verifier_state(&sl->state, false);
+ kfree(sl);
+ env->free_list_size--;
+}
+
/* copy verifier state from src to dst growing dst stack space
* when necessary to accommodate larger src stack
*/
static int copy_func_state(struct bpf_func_state *dst,
const struct bpf_func_state *src)
{
- int err;
-
- memcpy(dst, src, offsetof(struct bpf_func_state, acquired_refs));
- err = copy_reference_state(dst, src);
- if (err)
- return err;
+ memcpy(dst, src, offsetof(struct bpf_func_state, stack));
return copy_stack_state(dst, src);
}
@@ -1008,28 +1739,38 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
int i, err;
dst_state->jmp_history = copy_array(dst_state->jmp_history, src->jmp_history,
- src->jmp_history_cnt, sizeof(struct bpf_idx_pair),
- GFP_USER);
+ src->jmp_history_cnt, sizeof(*dst_state->jmp_history),
+ GFP_KERNEL_ACCOUNT);
if (!dst_state->jmp_history)
return -ENOMEM;
dst_state->jmp_history_cnt = src->jmp_history_cnt;
- /* if dst has more stack frames then src frame, free them */
+ /* if dst has more stack frames then src frame, free them, this is also
+ * necessary in case of exceptional exits using bpf_throw.
+ */
for (i = src->curframe + 1; i <= dst_state->curframe; i++) {
free_func_state(dst_state->frame[i]);
dst_state->frame[i] = NULL;
}
+ err = copy_reference_state(dst_state, src);
+ if (err)
+ return err;
dst_state->speculative = src->speculative;
+ dst_state->in_sleepable = src->in_sleepable;
+ dst_state->cleaned = src->cleaned;
dst_state->curframe = src->curframe;
- dst_state->active_spin_lock = src->active_spin_lock;
dst_state->branches = src->branches;
dst_state->parent = src->parent;
dst_state->first_insn_idx = src->first_insn_idx;
dst_state->last_insn_idx = src->last_insn_idx;
+ dst_state->dfs_depth = src->dfs_depth;
+ dst_state->callback_unroll_depth = src->callback_unroll_depth;
+ dst_state->may_goto_depth = src->may_goto_depth;
+ dst_state->equal_state = src->equal_state;
for (i = 0; i <= src->curframe; i++) {
dst = dst_state->frame[i];
if (!dst) {
- dst = kzalloc(sizeof(*dst), GFP_KERNEL);
+ dst = kzalloc(sizeof(*dst), GFP_KERNEL_ACCOUNT);
if (!dst)
return -ENOMEM;
dst_state->frame[i] = dst;
@@ -1041,21 +1782,291 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
return 0;
}
-static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+static u32 state_htab_size(struct bpf_verifier_env *env)
+{
+ return env->prog->len;
+}
+
+static struct list_head *explored_state(struct bpf_verifier_env *env, int idx)
+{
+ struct bpf_verifier_state *cur = env->cur_state;
+ struct bpf_func_state *state = cur->frame[cur->curframe];
+
+ return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)];
+}
+
+static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_state *b)
+{
+ int fr;
+
+ if (a->curframe != b->curframe)
+ return false;
+
+ for (fr = a->curframe; fr >= 0; fr--)
+ if (a->frame[fr]->callsite != b->frame[fr]->callsite)
+ return false;
+
+ return true;
+}
+
+/* Return IP for a given frame in a call stack */
+static u32 frame_insn_idx(struct bpf_verifier_state *st, u32 frame)
+{
+ return frame == st->curframe
+ ? st->insn_idx
+ : st->frame[frame + 1]->callsite;
+}
+
+/* For state @st look for a topmost frame with frame_insn_idx() in some SCC,
+ * if such frame exists form a corresponding @callchain as an array of
+ * call sites leading to this frame and SCC id.
+ * E.g.:
+ *
+ * void foo() { A: loop {... SCC#1 ...}; }
+ * void bar() { B: loop { C: foo(); ... SCC#2 ... }
+ * D: loop { E: foo(); ... SCC#3 ... } }
+ * void main() { F: bar(); }
+ *
+ * @callchain at (A) would be either (F,SCC#2) or (F,SCC#3) depending
+ * on @st frame call sites being (F,C,A) or (F,E,A).
+ */
+static bool compute_scc_callchain(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st,
+ struct bpf_scc_callchain *callchain)
+{
+ u32 i, scc, insn_idx;
+
+ memset(callchain, 0, sizeof(*callchain));
+ for (i = 0; i <= st->curframe; i++) {
+ insn_idx = frame_insn_idx(st, i);
+ scc = env->insn_aux_data[insn_idx].scc;
+ if (scc) {
+ callchain->scc = scc;
+ break;
+ } else if (i < st->curframe) {
+ callchain->callsites[i] = insn_idx;
+ } else {
+ return false;
+ }
+ }
+ return true;
+}
+
+/* Check if bpf_scc_visit instance for @callchain exists. */
+static struct bpf_scc_visit *scc_visit_lookup(struct bpf_verifier_env *env,
+ struct bpf_scc_callchain *callchain)
+{
+ struct bpf_scc_info *info = env->scc_info[callchain->scc];
+ struct bpf_scc_visit *visits = info->visits;
+ u32 i;
+
+ if (!info)
+ return NULL;
+ for (i = 0; i < info->num_visits; i++)
+ if (memcmp(callchain, &visits[i].callchain, sizeof(*callchain)) == 0)
+ return &visits[i];
+ return NULL;
+}
+
+/* Allocate a new bpf_scc_visit instance corresponding to @callchain.
+ * Allocated instances are alive for a duration of the do_check_common()
+ * call and are freed by free_states().
+ */
+static struct bpf_scc_visit *scc_visit_alloc(struct bpf_verifier_env *env,
+ struct bpf_scc_callchain *callchain)
+{
+ struct bpf_scc_visit *visit;
+ struct bpf_scc_info *info;
+ u32 scc, num_visits;
+ u64 new_sz;
+
+ scc = callchain->scc;
+ info = env->scc_info[scc];
+ num_visits = info ? info->num_visits : 0;
+ new_sz = sizeof(*info) + sizeof(struct bpf_scc_visit) * (num_visits + 1);
+ info = kvrealloc(env->scc_info[scc], new_sz, GFP_KERNEL_ACCOUNT);
+ if (!info)
+ return NULL;
+ env->scc_info[scc] = info;
+ info->num_visits = num_visits + 1;
+ visit = &info->visits[num_visits];
+ memset(visit, 0, sizeof(*visit));
+ memcpy(&visit->callchain, callchain, sizeof(*callchain));
+ return visit;
+}
+
+/* Form a string '(callsite#1,callsite#2,...,scc)' in env->tmp_str_buf */
+static char *format_callchain(struct bpf_verifier_env *env, struct bpf_scc_callchain *callchain)
+{
+ char *buf = env->tmp_str_buf;
+ int i, delta = 0;
+
+ delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "(");
+ for (i = 0; i < ARRAY_SIZE(callchain->callsites); i++) {
+ if (!callchain->callsites[i])
+ break;
+ delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "%u,",
+ callchain->callsites[i]);
+ }
+ delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "%u)", callchain->scc);
+ return env->tmp_str_buf;
+}
+
+/* If callchain for @st exists (@st is in some SCC), ensure that
+ * bpf_scc_visit instance for this callchain exists.
+ * If instance does not exist or is empty, assign visit->entry_state to @st.
+ */
+static int maybe_enter_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+ struct bpf_scc_callchain *callchain = &env->callchain_buf;
+ struct bpf_scc_visit *visit;
+
+ if (!compute_scc_callchain(env, st, callchain))
+ return 0;
+ visit = scc_visit_lookup(env, callchain);
+ visit = visit ?: scc_visit_alloc(env, callchain);
+ if (!visit)
+ return -ENOMEM;
+ if (!visit->entry_state) {
+ visit->entry_state = st;
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "SCC enter %s\n", format_callchain(env, callchain));
+ }
+ return 0;
+}
+
+static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visit *visit);
+
+/* If callchain for @st exists (@st is in some SCC), make it empty:
+ * - set visit->entry_state to NULL;
+ * - flush accumulated backedges.
+ */
+static int maybe_exit_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+ struct bpf_scc_callchain *callchain = &env->callchain_buf;
+ struct bpf_scc_visit *visit;
+
+ if (!compute_scc_callchain(env, st, callchain))
+ return 0;
+ visit = scc_visit_lookup(env, callchain);
+ if (!visit) {
+ /*
+ * If path traversal stops inside an SCC, corresponding bpf_scc_visit
+ * must exist for non-speculative paths. For non-speculative paths
+ * traversal stops when:
+ * a. Verification error is found, maybe_exit_scc() is not called.
+ * b. Top level BPF_EXIT is reached. Top level BPF_EXIT is not a member
+ * of any SCC.
+ * c. A checkpoint is reached and matched. Checkpoints are created by
+ * is_state_visited(), which calls maybe_enter_scc(), which allocates
+ * bpf_scc_visit instances for checkpoints within SCCs.
+ * (c) is the only case that can reach this point.
+ */
+ if (!st->speculative) {
+ verifier_bug(env, "scc exit: no visit info for call chain %s",
+ format_callchain(env, callchain));
+ return -EFAULT;
+ }
+ return 0;
+ }
+ if (visit->entry_state != st)
+ return 0;
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "SCC exit %s\n", format_callchain(env, callchain));
+ visit->entry_state = NULL;
+ env->num_backedges -= visit->num_backedges;
+ visit->num_backedges = 0;
+ update_peak_states(env);
+ return propagate_backedges(env, visit);
+}
+
+/* Lookup an bpf_scc_visit instance corresponding to @st callchain
+ * and add @backedge to visit->backedges. @st callchain must exist.
+ */
+static int add_scc_backedge(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st,
+ struct bpf_scc_backedge *backedge)
+{
+ struct bpf_scc_callchain *callchain = &env->callchain_buf;
+ struct bpf_scc_visit *visit;
+
+ if (!compute_scc_callchain(env, st, callchain)) {
+ verifier_bug(env, "add backedge: no SCC in verification path, insn_idx %d",
+ st->insn_idx);
+ return -EFAULT;
+ }
+ visit = scc_visit_lookup(env, callchain);
+ if (!visit) {
+ verifier_bug(env, "add backedge: no visit info for call chain %s",
+ format_callchain(env, callchain));
+ return -EFAULT;
+ }
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "SCC backedge %s\n", format_callchain(env, callchain));
+ backedge->next = visit->backedges;
+ visit->backedges = backedge;
+ visit->num_backedges++;
+ env->num_backedges++;
+ update_peak_states(env);
+ return 0;
+}
+
+/* bpf_reg_state->live marks for registers in a state @st are incomplete,
+ * if state @st is in some SCC and not all execution paths starting at this
+ * SCC are fully explored.
+ */
+static bool incomplete_read_marks(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st)
{
+ struct bpf_scc_callchain *callchain = &env->callchain_buf;
+ struct bpf_scc_visit *visit;
+
+ if (!compute_scc_callchain(env, st, callchain))
+ return false;
+ visit = scc_visit_lookup(env, callchain);
+ if (!visit)
+ return false;
+ return !!visit->backedges;
+}
+
+static void free_backedges(struct bpf_scc_visit *visit)
+{
+ struct bpf_scc_backedge *backedge, *next;
+
+ for (backedge = visit->backedges; backedge; backedge = next) {
+ free_verifier_state(&backedge->state, false);
+ next = backedge->next;
+ kfree(backedge);
+ }
+ visit->backedges = NULL;
+}
+
+static int update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+ struct bpf_verifier_state_list *sl = NULL, *parent_sl;
+ struct bpf_verifier_state *parent;
+ int err;
+
while (st) {
u32 br = --st->branches;
- /* WARN_ON(br > 1) technically makes sense here,
+ /* verifier_bug_if(br > 1, ...) technically makes sense here,
* but see comment in push_stack(), hence:
*/
- WARN_ONCE((int)br < 0,
- "BUG update_branch_counts:branches_to_explore=%d\n",
- br);
+ verifier_bug_if((int)br < 0, env, "%s:branches_to_explore=%d", __func__, br);
if (br)
break;
- st = st->parent;
+ err = maybe_exit_scc(env, st);
+ if (err)
+ return err;
+ parent = st->parent;
+ parent_sl = state_parent_as_list(st);
+ if (sl)
+ maybe_free_verifier_state(env, sl);
+ st = parent;
+ sl = parent_sl;
}
+ return 0;
}
static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
@@ -1087,6 +2098,18 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
return 0;
}
+static bool error_recoverable_with_nospec(int err)
+{
+ /* Should only return true for non-fatal errors that are allowed to
+ * occur during speculative verification. For these we can insert a
+ * nospec and the program might still be accepted. Do not include
+ * something like ENOMEM because it is likely to re-occur for the next
+ * architectural path once it has been recovered-from in all speculative
+ * paths.
+ */
+ return err == -EPERM || err == -EACCES || err == -EINVAL;
+}
+
static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
int insn_idx, int prev_insn_idx,
bool speculative)
@@ -1095,24 +2118,24 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
struct bpf_verifier_stack_elem *elem;
int err;
- elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
+ elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL_ACCOUNT);
if (!elem)
- goto err;
+ return ERR_PTR(-ENOMEM);
elem->insn_idx = insn_idx;
elem->prev_insn_idx = prev_insn_idx;
elem->next = env->head;
- elem->log_pos = env->log.len_used;
+ elem->log_pos = env->log.end_pos;
env->head = elem;
env->stack_size++;
err = copy_verifier_state(&elem->st, cur);
if (err)
- goto err;
+ return ERR_PTR(-ENOMEM);
elem->st.speculative |= speculative;
if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) {
verbose(env, "The sequence of %d jumps is too complex.\n",
env->stack_size);
- goto err;
+ return ERR_PTR(-E2BIG);
}
if (elem->st.parent) {
++elem->st.parent->branches;
@@ -1127,12 +2150,6 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
*/
}
return &elem->st;
-err:
- free_verifier_state(env->cur_state, true);
- env->cur_state = NULL;
- /* pop all elements and return */
- while (!pop_stack(env, NULL, NULL, false));
- return NULL;
}
#define CALLER_SAVED_REGS 6
@@ -1140,9 +2157,6 @@ static const int caller_saved[CALLER_SAVED_REGS] = {
BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
};
-static void __mark_reg_not_init(const struct bpf_verifier_env *env,
- struct bpf_reg_state *reg);
-
/* This helper doesn't clear reg->id */
static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm)
{
@@ -1163,9 +2177,11 @@ static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm)
*/
static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm)
{
- /* Clear id, off, and union(map_ptr, range) */
+ /* Clear off and union(map_ptr, range) */
memset(((u8 *)reg) + sizeof(reg->type), 0,
offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type));
+ reg->id = 0;
+ reg->ref_obj_id = 0;
___mark_reg_known(reg, imm);
}
@@ -1186,10 +2202,14 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg)
__mark_reg_known(reg, 0);
}
-static void __mark_reg_const_zero(struct bpf_reg_state *reg)
+static void __mark_reg_const_zero(const struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
__mark_reg_known(reg, 0);
reg->type = SCALAR_VALUE;
+ /* all scalars are assumed imprecise initially (unless unprivileged,
+ * in which case everything is forced to be precise)
+ */
+ reg->precise = !env->bpf_capable;
}
static void mark_reg_known_zero(struct bpf_verifier_env *env,
@@ -1205,6 +2225,21 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env,
__mark_reg_known_zero(regs + regno);
}
+static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type,
+ bool first_slot, int dynptr_id)
+{
+ /* reg->type has no meaning for STACK_DYNPTR, but when we set reg for
+ * callback arguments, it does need to be CONST_PTR_TO_DYNPTR, so simply
+ * set it unconditionally as it is ignored for STACK_DYNPTR anyway.
+ */
+ __mark_reg_known_zero(reg);
+ reg->type = CONST_PTR_TO_DYNPTR;
+ /* Give each dynptr a unique id to uniquely associate slices to it. */
+ reg->id = dynptr_id;
+ reg->dynptr.type = type;
+ reg->dynptr.first_slot = first_slot;
+}
+
static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
{
if (base_type(reg->type) == PTR_TO_MAP_VALUE) {
@@ -1216,8 +2251,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
/* transfer reg's id which is unique for every map_lookup_elem
* as UID of the inner map.
*/
- if (map_value_has_timer(map->inner_map_meta))
+ if (btf_record_has_field(map->inner_map_meta->record,
+ BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) {
reg->map_uid = reg->id;
+ }
} else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
reg->type = PTR_TO_XDP_SOCK;
} else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
@@ -1232,6 +2269,16 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
reg->type &= ~PTR_MAYBE_NULL;
}
+static void mark_reg_graph_node(struct bpf_reg_state *regs, u32 regno,
+ struct btf_field_graph_root *ds_head)
+{
+ __mark_reg_known_zero(&regs[regno]);
+ regs[regno].type = PTR_TO_BTF_ID | MEM_ALLOC;
+ regs[regno].btf = ds_head->btf;
+ regs[regno].btf_id = ds_head->value_btf_id;
+ regs[regno].off = ds_head->node_offset;
+}
+
static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)
{
return type_is_pkt_pointer(reg->type);
@@ -1243,6 +2290,13 @@ static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg)
reg->type == PTR_TO_PACKET_END;
}
+static bool reg_is_dynptr_slice_pkt(const struct bpf_reg_state *reg)
+{
+ return base_type(reg->type) == PTR_TO_MEM &&
+ (reg->type &
+ (DYNPTR_TYPE_SKB | DYNPTR_TYPE_XDP | DYNPTR_TYPE_SKB_META));
+}
+
/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */
static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg,
enum bpf_reg_type which)
@@ -1324,69 +2378,290 @@ static void __update_reg_bounds(struct bpf_reg_state *reg)
/* Uses signed min/max values to inform unsigned, and vice-versa */
static void __reg32_deduce_bounds(struct bpf_reg_state *reg)
{
- /* Learn sign from signed bounds.
- * If we cannot cross the sign boundary, then signed and unsigned bounds
- * are the same, so combine. This works even in the negative case, e.g.
- * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff.
+ /* If upper 32 bits of u64/s64 range don't change, we can use lower 32
+ * bits to improve our u32/s32 boundaries.
+ *
+ * E.g., the case where we have upper 32 bits as zero ([10, 20] in
+ * u64) is pretty trivial, it's obvious that in u32 we'll also have
+ * [10, 20] range. But this property holds for any 64-bit range as
+ * long as upper 32 bits in that entire range of values stay the same.
+ *
+ * E.g., u64 range [0x10000000A, 0x10000000F] ([4294967306, 4294967311]
+ * in decimal) has the same upper 32 bits throughout all the values in
+ * that range. As such, lower 32 bits form a valid [0xA, 0xF] ([10, 15])
+ * range.
+ *
+ * Note also, that [0xA, 0xF] is a valid range both in u32 and in s32,
+ * following the rules outlined below about u64/s64 correspondence
+ * (which equally applies to u32 vs s32 correspondence). In general it
+ * depends on actual hexadecimal values of 32-bit range. They can form
+ * only valid u32, or only valid s32 ranges in some cases.
+ *
+ * So we use all these insights to derive bounds for subregisters here.
*/
- if (reg->s32_min_value >= 0 || reg->s32_max_value < 0) {
- reg->s32_min_value = reg->u32_min_value =
- max_t(u32, reg->s32_min_value, reg->u32_min_value);
- reg->s32_max_value = reg->u32_max_value =
- min_t(u32, reg->s32_max_value, reg->u32_max_value);
- return;
+ if ((reg->umin_value >> 32) == (reg->umax_value >> 32)) {
+ /* u64 to u32 casting preserves validity of low 32 bits as
+ * a range, if upper 32 bits are the same
+ */
+ reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->umin_value);
+ reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->umax_value);
+
+ if ((s32)reg->umin_value <= (s32)reg->umax_value) {
+ reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value);
+ reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value);
+ }
+ }
+ if ((reg->smin_value >> 32) == (reg->smax_value >> 32)) {
+ /* low 32 bits should form a proper u32 range */
+ if ((u32)reg->smin_value <= (u32)reg->smax_value) {
+ reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->smin_value);
+ reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->smax_value);
+ }
+ /* low 32 bits should form a proper s32 range */
+ if ((s32)reg->smin_value <= (s32)reg->smax_value) {
+ reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value);
+ reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value);
+ }
+ }
+ /* Special case where upper bits form a small sequence of two
+ * sequential numbers (in 32-bit unsigned space, so 0xffffffff to
+ * 0x00000000 is also valid), while lower bits form a proper s32 range
+ * going from negative numbers to positive numbers. E.g., let's say we
+ * have s64 range [-1, 1] ([0xffffffffffffffff, 0x0000000000000001]).
+ * Possible s64 values are {-1, 0, 1} ({0xffffffffffffffff,
+ * 0x0000000000000000, 0x00000000000001}). Ignoring upper 32 bits,
+ * we still get a valid s32 range [-1, 1] ([0xffffffff, 0x00000001]).
+ * Note that it doesn't have to be 0xffffffff going to 0x00000000 in
+ * upper 32 bits. As a random example, s64 range
+ * [0xfffffff0fffffff0; 0xfffffff100000010], forms a valid s32 range
+ * [-16, 16] ([0xfffffff0; 0x00000010]) in its 32 bit subregister.
+ */
+ if ((u32)(reg->umin_value >> 32) + 1 == (u32)(reg->umax_value >> 32) &&
+ (s32)reg->umin_value < 0 && (s32)reg->umax_value >= 0) {
+ reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value);
+ reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value);
+ }
+ if ((u32)(reg->smin_value >> 32) + 1 == (u32)(reg->smax_value >> 32) &&
+ (s32)reg->smin_value < 0 && (s32)reg->smax_value >= 0) {
+ reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value);
+ reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value);
+ }
+ /* if u32 range forms a valid s32 range (due to matching sign bit),
+ * try to learn from that
+ */
+ if ((s32)reg->u32_min_value <= (s32)reg->u32_max_value) {
+ reg->s32_min_value = max_t(s32, reg->s32_min_value, reg->u32_min_value);
+ reg->s32_max_value = min_t(s32, reg->s32_max_value, reg->u32_max_value);
}
- /* Learn sign from unsigned bounds. Signed bounds cross the sign
- * boundary, so we must be careful.
+ /* If we cannot cross the sign boundary, then signed and unsigned bounds
+ * are the same, so combine. This works even in the negative case, e.g.
+ * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff.
*/
- if ((s32)reg->u32_max_value >= 0) {
- /* Positive. We can't learn anything from the smin, but smax
- * is positive, hence safe.
- */
- reg->s32_min_value = reg->u32_min_value;
- reg->s32_max_value = reg->u32_max_value =
- min_t(u32, reg->s32_max_value, reg->u32_max_value);
- } else if ((s32)reg->u32_min_value < 0) {
- /* Negative. We can't learn anything from the smax, but smin
- * is negative, hence safe.
- */
- reg->s32_min_value = reg->u32_min_value =
- max_t(u32, reg->s32_min_value, reg->u32_min_value);
- reg->s32_max_value = reg->u32_max_value;
+ if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) {
+ reg->u32_min_value = max_t(u32, reg->s32_min_value, reg->u32_min_value);
+ reg->u32_max_value = min_t(u32, reg->s32_max_value, reg->u32_max_value);
}
}
static void __reg64_deduce_bounds(struct bpf_reg_state *reg)
{
- /* Learn sign from signed bounds.
- * If we cannot cross the sign boundary, then signed and unsigned bounds
+ /* If u64 range forms a valid s64 range (due to matching sign bit),
+ * try to learn from that. Let's do a bit of ASCII art to see when
+ * this is happening. Let's take u64 range first:
+ *
+ * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX
+ * |-------------------------------|--------------------------------|
+ *
+ * Valid u64 range is formed when umin and umax are anywhere in the
+ * range [0, U64_MAX], and umin <= umax. u64 case is simple and
+ * straightforward. Let's see how s64 range maps onto the same range
+ * of values, annotated below the line for comparison:
+ *
+ * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX
+ * |-------------------------------|--------------------------------|
+ * 0 S64_MAX S64_MIN -1
+ *
+ * So s64 values basically start in the middle and they are logically
+ * contiguous to the right of it, wrapping around from -1 to 0, and
+ * then finishing as S64_MAX (0x7fffffffffffffff) right before
+ * S64_MIN. We can try drawing the continuity of u64 vs s64 values
+ * more visually as mapped to sign-agnostic range of hex values.
+ *
+ * u64 start u64 end
+ * _______________________________________________________________
+ * / \
+ * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX
+ * |-------------------------------|--------------------------------|
+ * 0 S64_MAX S64_MIN -1
+ * / \
+ * >------------------------------ ------------------------------->
+ * s64 continues... s64 end s64 start s64 "midpoint"
+ *
+ * What this means is that, in general, we can't always derive
+ * something new about u64 from any random s64 range, and vice versa.
+ *
+ * But we can do that in two particular cases. One is when entire
+ * u64/s64 range is *entirely* contained within left half of the above
+ * diagram or when it is *entirely* contained in the right half. I.e.:
+ *
+ * |-------------------------------|--------------------------------|
+ * ^ ^ ^ ^
+ * A B C D
+ *
+ * [A, B] and [C, D] are contained entirely in their respective halves
+ * and form valid contiguous ranges as both u64 and s64 values. [A, B]
+ * will be non-negative both as u64 and s64 (and in fact it will be
+ * identical ranges no matter the signedness). [C, D] treated as s64
+ * will be a range of negative values, while in u64 it will be
+ * non-negative range of values larger than 0x8000000000000000.
+ *
+ * Now, any other range here can't be represented in both u64 and s64
+ * simultaneously. E.g., [A, C], [A, D], [B, C], [B, D] are valid
+ * contiguous u64 ranges, but they are discontinuous in s64. [B, C]
+ * in s64 would be properly presented as [S64_MIN, C] and [B, S64_MAX],
+ * for example. Similarly, valid s64 range [D, A] (going from negative
+ * to positive values), would be two separate [D, U64_MAX] and [0, A]
+ * ranges as u64. Currently reg_state can't represent two segments per
+ * numeric domain, so in such situations we can only derive maximal
+ * possible range ([0, U64_MAX] for u64, and [S64_MIN, S64_MAX] for s64).
+ *
+ * So we use these facts to derive umin/umax from smin/smax and vice
+ * versa only if they stay within the same "half". This is equivalent
+ * to checking sign bit: lower half will have sign bit as zero, upper
+ * half have sign bit 1. Below in code we simplify this by just
+ * casting umin/umax as smin/smax and checking if they form valid
+ * range, and vice versa. Those are equivalent checks.
+ */
+ if ((s64)reg->umin_value <= (s64)reg->umax_value) {
+ reg->smin_value = max_t(s64, reg->smin_value, reg->umin_value);
+ reg->smax_value = min_t(s64, reg->smax_value, reg->umax_value);
+ }
+ /* If we cannot cross the sign boundary, then signed and unsigned bounds
* are the same, so combine. This works even in the negative case, e.g.
* -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff.
*/
- if (reg->smin_value >= 0 || reg->smax_value < 0) {
- reg->smin_value = reg->umin_value = max_t(u64, reg->smin_value,
- reg->umin_value);
- reg->smax_value = reg->umax_value = min_t(u64, reg->smax_value,
- reg->umax_value);
- return;
+ if ((u64)reg->smin_value <= (u64)reg->smax_value) {
+ reg->umin_value = max_t(u64, reg->smin_value, reg->umin_value);
+ reg->umax_value = min_t(u64, reg->smax_value, reg->umax_value);
+ } else {
+ /* If the s64 range crosses the sign boundary, then it's split
+ * between the beginning and end of the U64 domain. In that
+ * case, we can derive new bounds if the u64 range overlaps
+ * with only one end of the s64 range.
+ *
+ * In the following example, the u64 range overlaps only with
+ * positive portion of the s64 range.
+ *
+ * 0 U64_MAX
+ * | [xxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxx] |
+ * |----------------------------|----------------------------|
+ * |xxxxx s64 range xxxxxxxxx] [xxxxxxx|
+ * 0 S64_MAX S64_MIN -1
+ *
+ * We can thus derive the following new s64 and u64 ranges.
+ *
+ * 0 U64_MAX
+ * | [xxxxxx u64 range xxxxx] |
+ * |----------------------------|----------------------------|
+ * | [xxxxxx s64 range xxxxx] |
+ * 0 S64_MAX S64_MIN -1
+ *
+ * If they overlap in two places, we can't derive anything
+ * because reg_state can't represent two ranges per numeric
+ * domain.
+ *
+ * 0 U64_MAX
+ * | [xxxxxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxxxxx] |
+ * |----------------------------|----------------------------|
+ * |xxxxx s64 range xxxxxxxxx] [xxxxxxxxxx|
+ * 0 S64_MAX S64_MIN -1
+ *
+ * The first condition below corresponds to the first diagram
+ * above.
+ */
+ if (reg->umax_value < (u64)reg->smin_value) {
+ reg->smin_value = (s64)reg->umin_value;
+ reg->umax_value = min_t(u64, reg->umax_value, reg->smax_value);
+ } else if ((u64)reg->smax_value < reg->umin_value) {
+ /* This second condition considers the case where the u64 range
+ * overlaps with the negative portion of the s64 range:
+ *
+ * 0 U64_MAX
+ * | [xxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxx] |
+ * |----------------------------|----------------------------|
+ * |xxxxxxxxx] [xxxxxxxxxxxx s64 range |
+ * 0 S64_MAX S64_MIN -1
+ */
+ reg->smax_value = (s64)reg->umax_value;
+ reg->umin_value = max_t(u64, reg->umin_value, reg->smin_value);
+ }
}
- /* Learn sign from unsigned bounds. Signed bounds cross the sign
- * boundary, so we must be careful.
+}
+
+static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg)
+{
+ /* Try to tighten 64-bit bounds from 32-bit knowledge, using 32-bit
+ * values on both sides of 64-bit range in hope to have tighter range.
+ * E.g., if r1 is [0x1'00000000, 0x3'80000000], and we learn from
+ * 32-bit signed > 0 operation that s32 bounds are now [1; 0x7fffffff].
+ * With this, we can substitute 1 as low 32-bits of _low_ 64-bit bound
+ * (0x100000000 -> 0x100000001) and 0x7fffffff as low 32-bits of
+ * _high_ 64-bit bound (0x380000000 -> 0x37fffffff) and arrive at a
+ * better overall bounds for r1 as [0x1'000000001; 0x3'7fffffff].
+ * We just need to make sure that derived bounds we are intersecting
+ * with are well-formed ranges in respective s64 or u64 domain, just
+ * like we do with similar kinds of 32-to-64 or 64-to-32 adjustments.
*/
- if ((s64)reg->umax_value >= 0) {
- /* Positive. We can't learn anything from the smin, but smax
- * is positive, hence safe.
- */
- reg->smin_value = reg->umin_value;
- reg->smax_value = reg->umax_value = min_t(u64, reg->smax_value,
- reg->umax_value);
- } else if ((s64)reg->umin_value < 0) {
- /* Negative. We can't learn anything from the smax, but smin
- * is negative, hence safe.
- */
- reg->smin_value = reg->umin_value = max_t(u64, reg->smin_value,
- reg->umin_value);
- reg->smax_value = reg->umax_value;
+ __u64 new_umin, new_umax;
+ __s64 new_smin, new_smax;
+
+ /* u32 -> u64 tightening, it's always well-formed */
+ new_umin = (reg->umin_value & ~0xffffffffULL) | reg->u32_min_value;
+ new_umax = (reg->umax_value & ~0xffffffffULL) | reg->u32_max_value;
+ reg->umin_value = max_t(u64, reg->umin_value, new_umin);
+ reg->umax_value = min_t(u64, reg->umax_value, new_umax);
+ /* u32 -> s64 tightening, u32 range embedded into s64 preserves range validity */
+ new_smin = (reg->smin_value & ~0xffffffffULL) | reg->u32_min_value;
+ new_smax = (reg->smax_value & ~0xffffffffULL) | reg->u32_max_value;
+ reg->smin_value = max_t(s64, reg->smin_value, new_smin);
+ reg->smax_value = min_t(s64, reg->smax_value, new_smax);
+
+ /* Here we would like to handle a special case after sign extending load,
+ * when upper bits for a 64-bit range are all 1s or all 0s.
+ *
+ * Upper bits are all 1s when register is in a range:
+ * [0xffff_ffff_0000_0000, 0xffff_ffff_ffff_ffff]
+ * Upper bits are all 0s when register is in a range:
+ * [0x0000_0000_0000_0000, 0x0000_0000_ffff_ffff]
+ * Together this forms are continuous range:
+ * [0xffff_ffff_0000_0000, 0x0000_0000_ffff_ffff]
+ *
+ * Now, suppose that register range is in fact tighter:
+ * [0xffff_ffff_8000_0000, 0x0000_0000_ffff_ffff] (R)
+ * Also suppose that it's 32-bit range is positive,
+ * meaning that lower 32-bits of the full 64-bit register
+ * are in the range:
+ * [0x0000_0000, 0x7fff_ffff] (W)
+ *
+ * If this happens, then any value in a range:
+ * [0xffff_ffff_0000_0000, 0xffff_ffff_7fff_ffff]
+ * is smaller than a lowest bound of the range (R):
+ * 0xffff_ffff_8000_0000
+ * which means that upper bits of the full 64-bit register
+ * can't be all 1s, when lower bits are in range (W).
+ *
+ * Note that:
+ * - 0xffff_ffff_8000_0000 == (s64)S32_MIN
+ * - 0x0000_0000_7fff_ffff == (s64)S32_MAX
+ * These relations are used in the conditions below.
+ */
+ if (reg->s32_min_value >= 0 && reg->smin_value >= S32_MIN && reg->smax_value <= S32_MAX) {
+ reg->smin_value = reg->s32_min_value;
+ reg->smax_value = reg->s32_max_value;
+ reg->umin_value = reg->s32_min_value;
+ reg->umax_value = reg->s32_max_value;
+ reg->var_off = tnum_intersect(reg->var_off,
+ tnum_range(reg->smin_value, reg->smax_value));
}
}
@@ -1394,6 +2669,7 @@ static void __reg_deduce_bounds(struct bpf_reg_state *reg)
{
__reg32_deduce_bounds(reg);
__reg64_deduce_bounds(reg);
+ __reg_deduce_mixed_bounds(reg);
}
/* Attempts to improve var_off based on unsigned min/max information */
@@ -1402,13 +2678,80 @@ static void __reg_bound_offset(struct bpf_reg_state *reg)
struct tnum var64_off = tnum_intersect(reg->var_off,
tnum_range(reg->umin_value,
reg->umax_value));
- struct tnum var32_off = tnum_intersect(tnum_subreg(reg->var_off),
- tnum_range(reg->u32_min_value,
- reg->u32_max_value));
+ struct tnum var32_off = tnum_intersect(tnum_subreg(var64_off),
+ tnum_range(reg->u32_min_value,
+ reg->u32_max_value));
reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off);
}
+static void reg_bounds_sync(struct bpf_reg_state *reg)
+{
+ /* We might have learned new bounds from the var_off. */
+ __update_reg_bounds(reg);
+ /* We might have learned something about the sign bit. */
+ __reg_deduce_bounds(reg);
+ __reg_deduce_bounds(reg);
+ __reg_deduce_bounds(reg);
+ /* We might have learned some bits from the bounds. */
+ __reg_bound_offset(reg);
+ /* Intersecting with the old var_off might have improved our bounds
+ * slightly, e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
+ * then new var_off is (0; 0x7f...fc) which improves our umax.
+ */
+ __update_reg_bounds(reg);
+}
+
+static int reg_bounds_sanity_check(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, const char *ctx)
+{
+ const char *msg;
+
+ if (reg->umin_value > reg->umax_value ||
+ reg->smin_value > reg->smax_value ||
+ reg->u32_min_value > reg->u32_max_value ||
+ reg->s32_min_value > reg->s32_max_value) {
+ msg = "range bounds violation";
+ goto out;
+ }
+
+ if (tnum_is_const(reg->var_off)) {
+ u64 uval = reg->var_off.value;
+ s64 sval = (s64)uval;
+
+ if (reg->umin_value != uval || reg->umax_value != uval ||
+ reg->smin_value != sval || reg->smax_value != sval) {
+ msg = "const tnum out of sync with range bounds";
+ goto out;
+ }
+ }
+
+ if (tnum_subreg_is_const(reg->var_off)) {
+ u32 uval32 = tnum_subreg(reg->var_off).value;
+ s32 sval32 = (s32)uval32;
+
+ if (reg->u32_min_value != uval32 || reg->u32_max_value != uval32 ||
+ reg->s32_min_value != sval32 || reg->s32_max_value != sval32) {
+ msg = "const subreg tnum out of sync with range bounds";
+ goto out;
+ }
+ }
+
+ return 0;
+out:
+ verifier_bug(env, "REG INVARIANTS VIOLATION (%s): %s u64=[%#llx, %#llx] "
+ "s64=[%#llx, %#llx] u32=[%#x, %#x] s32=[%#x, %#x] var_off=(%#llx, %#llx)",
+ ctx, msg, reg->umin_value, reg->umax_value,
+ reg->smin_value, reg->smax_value,
+ reg->u32_min_value, reg->u32_max_value,
+ reg->s32_min_value, reg->s32_max_value,
+ reg->var_off.value, reg->var_off.mask);
+ if (env->test_reg_invariants)
+ return -EFAULT;
+ __mark_reg_unbounded(reg);
+ return 0;
+}
+
static bool __reg32_bound_s64(s32 a)
{
return a >= 0 && a <= S32_MAX;
@@ -1433,83 +2776,33 @@ static void __reg_assign_32_into_64(struct bpf_reg_state *reg)
}
}
-static void __reg_combine_32_into_64(struct bpf_reg_state *reg)
-{
- /* special case when 64-bit register has upper 32-bit register
- * zeroed. Typically happens after zext or <<32, >>32 sequence
- * allowing us to use 32-bit bounds directly,
- */
- if (tnum_equals_const(tnum_clear_subreg(reg->var_off), 0)) {
- __reg_assign_32_into_64(reg);
- } else {
- /* Otherwise the best we can do is push lower 32bit known and
- * unknown bits into register (var_off set from jmp logic)
- * then learn as much as possible from the 64-bit tnum
- * known and unknown bits. The previous smin/smax bounds are
- * invalid here because of jmp32 compare so mark them unknown
- * so they do not impact tnum bounds calculation.
- */
- __mark_reg64_unbounded(reg);
- __update_reg_bounds(reg);
- }
-
- /* Intersecting with the old var_off might have improved our bounds
- * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
- * then new var_off is (0; 0x7f...fc) which improves our umax.
- */
- __reg_deduce_bounds(reg);
- __reg_bound_offset(reg);
- __update_reg_bounds(reg);
-}
-
-static bool __reg64_bound_s32(s64 a)
-{
- return a >= S32_MIN && a <= S32_MAX;
-}
-
-static bool __reg64_bound_u32(u64 a)
-{
- return a >= U32_MIN && a <= U32_MAX;
-}
-
-static void __reg_combine_64_into_32(struct bpf_reg_state *reg)
-{
- __mark_reg32_unbounded(reg);
-
- if (__reg64_bound_s32(reg->smin_value) && __reg64_bound_s32(reg->smax_value)) {
- reg->s32_min_value = (s32)reg->smin_value;
- reg->s32_max_value = (s32)reg->smax_value;
- }
- if (__reg64_bound_u32(reg->umin_value) && __reg64_bound_u32(reg->umax_value)) {
- reg->u32_min_value = (u32)reg->umin_value;
- reg->u32_max_value = (u32)reg->umax_value;
- }
-
- /* Intersecting with the old var_off might have improved our bounds
- * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
- * then new var_off is (0; 0x7f...fc) which improves our umax.
- */
- __reg_deduce_bounds(reg);
- __reg_bound_offset(reg);
- __update_reg_bounds(reg);
-}
-
/* Mark a register as having a completely unknown (scalar) value. */
-static void __mark_reg_unknown(const struct bpf_verifier_env *env,
- struct bpf_reg_state *reg)
+static void __mark_reg_unknown_imprecise(struct bpf_reg_state *reg)
{
/*
- * Clear type, id, off, and union(map_ptr, range) and
+ * Clear type, off, and union(map_ptr, range) and
* padding between 'type' and union
*/
memset(reg, 0, offsetof(struct bpf_reg_state, var_off));
reg->type = SCALAR_VALUE;
+ reg->id = 0;
+ reg->ref_obj_id = 0;
reg->var_off = tnum_unknown;
reg->frameno = 0;
- reg->precise = env->subprog_cnt > 1 || !env->bpf_capable;
+ reg->precise = false;
__mark_reg_unbounded(reg);
}
+/* Mark a register as having a completely unknown (scalar) value,
+ * initialize .precise as true when not bpf capable.
+ */
+static void __mark_reg_unknown(const struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg)
+{
+ __mark_reg_unknown_imprecise(reg);
+ reg->precise = !env->bpf_capable;
+}
+
static void mark_reg_unknown(struct bpf_verifier_env *env,
struct bpf_reg_state *regs, u32 regno)
{
@@ -1523,6 +2816,25 @@ static void mark_reg_unknown(struct bpf_verifier_env *env,
__mark_reg_unknown(env, regs + regno);
}
+static int __mark_reg_s32_range(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs,
+ u32 regno,
+ s32 s32_min,
+ s32 s32_max)
+{
+ struct bpf_reg_state *reg = regs + regno;
+
+ reg->s32_min_value = max_t(s32, reg->s32_min_value, s32_min);
+ reg->s32_max_value = min_t(s32, reg->s32_max_value, s32_max);
+
+ reg->smin_value = max_t(s64, reg->smin_value, s32_min);
+ reg->smax_value = min_t(s64, reg->smax_value, s32_max);
+
+ reg_bounds_sync(reg);
+
+ return reg_bounds_sanity_check(env, reg, "s32_range");
+}
+
static void __mark_reg_not_init(const struct bpf_verifier_env *env,
struct bpf_reg_state *reg)
{
@@ -1543,19 +2855,33 @@ static void mark_reg_not_init(struct bpf_verifier_env *env,
__mark_reg_not_init(env, regs + regno);
}
-static void mark_btf_ld_reg(struct bpf_verifier_env *env,
- struct bpf_reg_state *regs, u32 regno,
- enum bpf_reg_type reg_type,
- struct btf *btf, u32 btf_id)
+static int mark_btf_ld_reg(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs, u32 regno,
+ enum bpf_reg_type reg_type,
+ struct btf *btf, u32 btf_id,
+ enum bpf_type_flag flag)
{
- if (reg_type == SCALAR_VALUE) {
+ switch (reg_type) {
+ case SCALAR_VALUE:
mark_reg_unknown(env, regs, regno);
- return;
+ return 0;
+ case PTR_TO_BTF_ID:
+ mark_reg_known_zero(env, regs, regno);
+ regs[regno].type = PTR_TO_BTF_ID | flag;
+ regs[regno].btf = btf;
+ regs[regno].btf_id = btf_id;
+ if (type_may_be_null(flag))
+ regs[regno].id = ++env->id_gen;
+ return 0;
+ case PTR_TO_MEM:
+ mark_reg_known_zero(env, regs, regno);
+ regs[regno].type = PTR_TO_MEM | flag;
+ regs[regno].mem_size = 0;
+ return 0;
+ default:
+ verifier_bug(env, "unexpected reg_type %d in %s\n", reg_type, __func__);
+ return -EFAULT;
}
- mark_reg_known_zero(env, regs, regno);
- regs[regno].type = PTR_TO_BTF_ID;
- regs[regno].btf = btf;
- regs[regno].btf_id = btf_id;
}
#define DEF_NOT_SUBREG (0)
@@ -1567,8 +2893,6 @@ static void init_reg_state(struct bpf_verifier_env *env,
for (i = 0; i < MAX_BPF_REG; i++) {
mark_reg_not_init(env, regs, i);
- regs[i].live = REG_LIVE_NONE;
- regs[i].parent = NULL;
regs[i].subreg_def = DEF_NOT_SUBREG;
}
@@ -1578,6 +2902,11 @@ static void init_reg_state(struct bpf_verifier_env *env,
regs[BPF_REG_FP].frameno = state->frameno;
}
+static struct bpf_retval_range retval_range(s32 minval, s32 maxval)
+{
+ return (struct bpf_retval_range){ minval, maxval };
+}
+
#define BPF_MAIN_FUNC (-1)
static void init_func_state(struct bpf_verifier_env *env,
struct bpf_func_state *state,
@@ -1586,6 +2915,7 @@ static void init_func_state(struct bpf_verifier_env *env,
state->callsite = callsite;
state->frameno = frameno;
state->subprogno = subprogno;
+ state->callback_ret_range = retval_range(0, 0);
init_reg_state(env, state);
mark_verifier_state_scratched(env);
}
@@ -1593,26 +2923,26 @@ static void init_func_state(struct bpf_verifier_env *env,
/* Similar to push_stack(), but for async callbacks */
static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
int insn_idx, int prev_insn_idx,
- int subprog)
+ int subprog, bool is_sleepable)
{
struct bpf_verifier_stack_elem *elem;
struct bpf_func_state *frame;
- elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
+ elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL_ACCOUNT);
if (!elem)
- goto err;
+ return ERR_PTR(-ENOMEM);
elem->insn_idx = insn_idx;
elem->prev_insn_idx = prev_insn_idx;
elem->next = env->head;
- elem->log_pos = env->log.len_used;
+ elem->log_pos = env->log.end_pos;
env->head = elem;
env->stack_size++;
if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) {
verbose(env,
"The sequence of %d jumps is too complex for async cb.\n",
env->stack_size);
- goto err;
+ return ERR_PTR(-E2BIG);
}
/* Unlike push_stack() do not copy_verifier_state().
* The caller state doesn't matter.
@@ -1620,21 +2950,16 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
* Initialize it similar to do_check_common().
*/
elem->st.branches = 1;
- frame = kzalloc(sizeof(*frame), GFP_KERNEL);
+ elem->st.in_sleepable = is_sleepable;
+ frame = kzalloc(sizeof(*frame), GFP_KERNEL_ACCOUNT);
if (!frame)
- goto err;
+ return ERR_PTR(-ENOMEM);
init_func_state(env, frame,
BPF_MAIN_FUNC /* callsite */,
0 /* frameno within this callchain */,
subprog /* subprog number within this prog */);
elem->st.frame[0] = frame;
return &elem->st;
-err:
- free_verifier_state(env->cur_state, true);
- env->cur_state = NULL;
- /* pop all elements and return */
- while (!pop_stack(env, NULL, NULL, false));
- return NULL;
}
@@ -1650,16 +2975,36 @@ static int cmp_subprogs(const void *a, const void *b)
((struct bpf_subprog_info *)b)->start;
}
+/* Find subprogram that contains instruction at 'off' */
+struct bpf_subprog_info *bpf_find_containing_subprog(struct bpf_verifier_env *env, int off)
+{
+ struct bpf_subprog_info *vals = env->subprog_info;
+ int l, r, m;
+
+ if (off >= env->prog->len || off < 0 || env->subprog_cnt == 0)
+ return NULL;
+
+ l = 0;
+ r = env->subprog_cnt - 1;
+ while (l < r) {
+ m = l + (r - l + 1) / 2;
+ if (vals[m].start <= off)
+ l = m;
+ else
+ r = m - 1;
+ }
+ return &vals[l];
+}
+
+/* Find subprogram that starts exactly at 'off' */
static int find_subprog(struct bpf_verifier_env *env, int off)
{
struct bpf_subprog_info *p;
- p = bsearch(&off, env->subprog_info, env->subprog_cnt,
- sizeof(env->subprog_info[0]), cmp_subprogs);
- if (!p)
+ p = bpf_find_containing_subprog(env, off);
+ if (!p || p->start != off)
return -ENOENT;
return p - env->subprog_info;
-
}
static int add_subprog(struct bpf_verifier_env *env, int off)
@@ -1685,6 +3030,68 @@ static int add_subprog(struct bpf_verifier_env *env, int off)
return env->subprog_cnt - 1;
}
+static int bpf_find_exception_callback_insn_off(struct bpf_verifier_env *env)
+{
+ struct bpf_prog_aux *aux = env->prog->aux;
+ struct btf *btf = aux->btf;
+ const struct btf_type *t;
+ u32 main_btf_id, id;
+ const char *name;
+ int ret, i;
+
+ /* Non-zero func_info_cnt implies valid btf */
+ if (!aux->func_info_cnt)
+ return 0;
+ main_btf_id = aux->func_info[0].type_id;
+
+ t = btf_type_by_id(btf, main_btf_id);
+ if (!t) {
+ verbose(env, "invalid btf id for main subprog in func_info\n");
+ return -EINVAL;
+ }
+
+ name = btf_find_decl_tag_value(btf, t, -1, "exception_callback:");
+ if (IS_ERR(name)) {
+ ret = PTR_ERR(name);
+ /* If there is no tag present, there is no exception callback */
+ if (ret == -ENOENT)
+ ret = 0;
+ else if (ret == -EEXIST)
+ verbose(env, "multiple exception callback tags for main subprog\n");
+ return ret;
+ }
+
+ ret = btf_find_by_name_kind(btf, name, BTF_KIND_FUNC);
+ if (ret < 0) {
+ verbose(env, "exception callback '%s' could not be found in BTF\n", name);
+ return ret;
+ }
+ id = ret;
+ t = btf_type_by_id(btf, id);
+ if (btf_func_linkage(t) != BTF_FUNC_GLOBAL) {
+ verbose(env, "exception callback '%s' must have global linkage\n", name);
+ return -EINVAL;
+ }
+ ret = 0;
+ for (i = 0; i < aux->func_info_cnt; i++) {
+ if (aux->func_info[i].type_id != id)
+ continue;
+ ret = aux->func_info[i].insn_off;
+ /* Further func_info and subprog checks will also happen
+ * later, so assume this is the right insn_off for now.
+ */
+ if (!ret) {
+ verbose(env, "invalid exception callback insn_off in func_info: 0\n");
+ ret = -EINVAL;
+ }
+ }
+ if (!ret) {
+ verbose(env, "exception callback type id not found in func_info\n");
+ ret = -EINVAL;
+ }
+ return ret;
+}
+
#define MAX_KFUNC_DESCS 256
#define MAX_KFUNC_BTFS 256
@@ -1693,6 +3100,7 @@ struct bpf_kfunc_desc {
u32 func_id;
s32 imm;
u16 offset;
+ unsigned long addr;
};
struct bpf_kfunc_btf {
@@ -1702,6 +3110,11 @@ struct bpf_kfunc_btf {
};
struct bpf_kfunc_desc_tab {
+ /* Sorted by func_id (BTF ID) and offset (fd_array offset) during
+ * verification. JITs do lookups by bpf_insn, where func_id may not be
+ * available, therefore at the end of verification do_misc_fixups()
+ * sorts this by imm and offset.
+ */
struct bpf_kfunc_desc descs[MAX_KFUNC_DESCS];
u32 nr_descs;
};
@@ -1711,6 +3124,9 @@ struct bpf_kfunc_btf_tab {
u32 nr_descs;
};
+static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc,
+ int insn_idx);
+
static int kfunc_desc_cmp_by_id_off(const void *a, const void *b)
{
const struct bpf_kfunc_desc *d0 = a;
@@ -1728,7 +3144,7 @@ static int kfunc_btf_cmp_by_off(const void *a, const void *b)
return d0->offset - d1->offset;
}
-static const struct bpf_kfunc_desc *
+static struct bpf_kfunc_desc *
find_kfunc_desc(const struct bpf_prog *prog, u32 func_id, u16 offset)
{
struct bpf_kfunc_desc desc = {
@@ -1742,8 +3158,21 @@ find_kfunc_desc(const struct bpf_prog *prog, u32 func_id, u16 offset)
sizeof(tab->descs[0]), kfunc_desc_cmp_by_id_off);
}
+int bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id,
+ u16 btf_fd_idx, u8 **func_addr)
+{
+ const struct bpf_kfunc_desc *desc;
+
+ desc = find_kfunc_desc(prog, func_id, btf_fd_idx);
+ if (!desc)
+ return -EFAULT;
+
+ *func_addr = (u8 *)desc->addr;
+ return 0;
+}
+
static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env,
- s16 offset, struct module **btf_modp)
+ s16 offset)
{
struct bpf_kfunc_btf kf_btf = { .offset = offset };
struct bpf_kfunc_btf_tab *tab;
@@ -1794,12 +3223,16 @@ static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env,
b->module = mod;
b->offset = offset;
+ /* sort() reorders entries by value, so b may no longer point
+ * to the right entry after this
+ */
sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
kfunc_btf_cmp_by_off, NULL);
+ } else {
+ btf = b->btf;
}
- if (btf_modp)
- *btf_modp = b->module;
- return b->btf;
+
+ return btf;
}
void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab)
@@ -1814,9 +3247,7 @@ void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab)
kfree(tab);
}
-static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env,
- u32 func_id, s16 offset,
- struct module **btf_modp)
+static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, s16 offset)
{
if (offset) {
if (offset < 0) {
@@ -1827,7 +3258,7 @@ static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env,
return ERR_PTR(-EINVAL);
}
- return __find_kfunc_desc_btf(env, offset, btf_modp);
+ return __find_kfunc_desc_btf(env, offset);
}
return btf_vmlinux ?: ERR_PTR(-ENOENT);
}
@@ -1836,6 +3267,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
{
const struct btf_type *func, *func_proto;
struct bpf_kfunc_btf_tab *btf_tab;
+ struct btf_func_model func_model;
struct bpf_kfunc_desc_tab *tab;
struct bpf_prog_aux *prog_aux;
struct bpf_kfunc_desc *desc;
@@ -1868,7 +3300,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
return -EINVAL;
}
- tab = kzalloc(sizeof(*tab), GFP_KERNEL);
+ tab = kzalloc(sizeof(*tab), GFP_KERNEL_ACCOUNT);
if (!tab)
return -ENOMEM;
prog_aux->kfunc_tab = tab;
@@ -1884,13 +3316,13 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
return 0;
if (!btf_tab && offset) {
- btf_tab = kzalloc(sizeof(*btf_tab), GFP_KERNEL);
+ btf_tab = kzalloc(sizeof(*btf_tab), GFP_KERNEL_ACCOUNT);
if (!btf_tab)
return -ENOMEM;
prog_aux->kfunc_btf_tab = btf_tab;
}
- desc_btf = find_kfunc_desc_btf(env, func_id, offset, NULL);
+ desc_btf = find_kfunc_desc_btf(env, offset);
if (IS_ERR(desc_btf)) {
verbose(env, "failed to find BTF for kernel function\n");
return PTR_ERR(desc_btf);
@@ -1925,41 +3357,77 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
return -EINVAL;
}
+ if (bpf_dev_bound_kfunc_id(func_id)) {
+ err = bpf_dev_bound_kfunc_check(&env->log, prog_aux);
+ if (err)
+ return err;
+ }
+
+ err = btf_distill_func_proto(&env->log, desc_btf,
+ func_proto, func_name,
+ &func_model);
+ if (err)
+ return err;
+
desc = &tab->descs[tab->nr_descs++];
desc->func_id = func_id;
- desc->imm = BPF_CALL_IMM(addr);
desc->offset = offset;
- err = btf_distill_func_proto(&env->log, desc_btf,
- func_proto, func_name,
- &desc->func_model);
- if (!err)
- sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
- kfunc_desc_cmp_by_id_off, NULL);
- return err;
+ desc->addr = addr;
+ desc->func_model = func_model;
+ sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
+ kfunc_desc_cmp_by_id_off, NULL);
+ return 0;
}
-static int kfunc_desc_cmp_by_imm(const void *a, const void *b)
+static int kfunc_desc_cmp_by_imm_off(const void *a, const void *b)
{
const struct bpf_kfunc_desc *d0 = a;
const struct bpf_kfunc_desc *d1 = b;
- if (d0->imm > d1->imm)
- return 1;
- else if (d0->imm < d1->imm)
- return -1;
+ if (d0->imm != d1->imm)
+ return d0->imm < d1->imm ? -1 : 1;
+ if (d0->offset != d1->offset)
+ return d0->offset < d1->offset ? -1 : 1;
return 0;
}
-static void sort_kfunc_descs_by_imm(struct bpf_prog *prog)
+static int set_kfunc_desc_imm(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc)
+{
+ unsigned long call_imm;
+
+ if (bpf_jit_supports_far_kfunc_call()) {
+ call_imm = desc->func_id;
+ } else {
+ call_imm = BPF_CALL_IMM(desc->addr);
+ /* Check whether the relative offset overflows desc->imm */
+ if ((unsigned long)(s32)call_imm != call_imm) {
+ verbose(env, "address of kernel func_id %u is out of range\n",
+ desc->func_id);
+ return -EINVAL;
+ }
+ }
+ desc->imm = call_imm;
+ return 0;
+}
+
+static int sort_kfunc_descs_by_imm_off(struct bpf_verifier_env *env)
{
struct bpf_kfunc_desc_tab *tab;
+ int i, err;
- tab = prog->aux->kfunc_tab;
+ tab = env->prog->aux->kfunc_tab;
if (!tab)
- return;
+ return 0;
+
+ for (i = 0; i < tab->nr_descs; i++) {
+ err = set_kfunc_desc_imm(env, &tab->descs[i]);
+ if (err)
+ return err;
+ }
sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
- kfunc_desc_cmp_by_imm, NULL);
+ kfunc_desc_cmp_by_imm_off, NULL);
+ return 0;
}
bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog)
@@ -1973,22 +3441,38 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
{
const struct bpf_kfunc_desc desc = {
.imm = insn->imm,
+ .offset = insn->off,
};
const struct bpf_kfunc_desc *res;
struct bpf_kfunc_desc_tab *tab;
tab = prog->aux->kfunc_tab;
res = bsearch(&desc, tab->descs, tab->nr_descs,
- sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm);
+ sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm_off);
return res ? &res->func_model : NULL;
}
+static int add_kfunc_in_insns(struct bpf_verifier_env *env,
+ struct bpf_insn *insn, int cnt)
+{
+ int i, ret;
+
+ for (i = 0; i < cnt; i++, insn++) {
+ if (bpf_pseudo_kfunc_call(insn)) {
+ ret = add_kfunc_call(env, insn->imm, insn->off);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ return 0;
+}
+
static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
{
struct bpf_subprog_info *subprog = env->subprog_info;
+ int i, ret, insn_cnt = env->prog->len, ex_cb_insn;
struct bpf_insn *insn = env->prog->insnsi;
- int i, ret, insn_cnt = env->prog->len;
/* Add entry function. */
ret = add_subprog(env, 0);
@@ -2014,6 +3498,27 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
return ret;
}
+ ret = bpf_find_exception_callback_insn_off(env);
+ if (ret < 0)
+ return ret;
+ ex_cb_insn = ret;
+
+ /* If ex_cb_insn > 0, this means that the main program has a subprog
+ * marked using BTF decl tag to serve as the exception callback.
+ */
+ if (ex_cb_insn) {
+ ret = add_subprog(env, ex_cb_insn);
+ if (ret < 0)
+ return ret;
+ for (i = 1; i < env->subprog_cnt; i++) {
+ if (env->subprog_info[i].start != ex_cb_insn)
+ continue;
+ env->exception_callback_subprog = i;
+ mark_subprog_exc_cb(env, i);
+ break;
+ }
+ }
+
/* Add a fake 'exit' subprog which could simplify subprog iteration
* logic. 'subprog_cnt' should not be increased.
*/
@@ -2040,17 +3545,23 @@ static int check_subprogs(struct bpf_verifier_env *env)
u8 code = insn[i].code;
if (code == (BPF_JMP | BPF_CALL) &&
- insn[i].imm == BPF_FUNC_tail_call &&
- insn[i].src_reg != BPF_PSEUDO_CALL)
+ insn[i].src_reg == 0 &&
+ insn[i].imm == BPF_FUNC_tail_call) {
subprog[cur_subprog].has_tail_call = true;
+ subprog[cur_subprog].tail_call_reachable = true;
+ }
if (BPF_CLASS(code) == BPF_LD &&
(BPF_MODE(code) == BPF_ABS || BPF_MODE(code) == BPF_IND))
subprog[cur_subprog].has_ld_abs = true;
if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32)
goto next;
- if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL)
+ if (BPF_OP(code) == BPF_CALL)
goto next;
- off = i + insn[i].off + 1;
+ if (BPF_OP(code) == BPF_EXIT) {
+ subprog[cur_subprog].exit_idx = i;
+ goto next;
+ }
+ off = i + bpf_jmp_offset(&insn[i]) + 1;
if (off < subprog_start || off >= subprog_end) {
verbose(env, "jump out of range from insn %d to %d\n", i, off);
return -EINVAL;
@@ -2059,9 +3570,10 @@ next:
if (i == subprog_end - 1) {
/* to avoid fall-through from one subprog into another
* the last insn of the subprog should be either exit
- * or unconditional jump back
+ * or unconditional jump back or bpf_throw call
*/
if (code != (BPF_JMP | BPF_EXIT) &&
+ code != (BPF_JMP32 | BPF_JA) &&
code != (BPF_JMP | BPF_JA)) {
verbose(env, "last insn is not an exit or jmp\n");
return -EINVAL;
@@ -2075,62 +3587,61 @@ next:
return 0;
}
-/* Parentage chain of this register (or stack slot) should take care of all
- * issues like callee-saved registers, stack slot allocation time, etc.
- */
-static int mark_reg_read(struct bpf_verifier_env *env,
- const struct bpf_reg_state *state,
- struct bpf_reg_state *parent, u8 flag)
+static int mark_stack_slot_obj_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ int spi, int nr_slots)
{
- bool writes = parent == state->parent; /* Observe write marks */
- int cnt = 0;
+ int err, i;
- while (parent) {
- /* if read wasn't screened by an earlier write ... */
- if (writes && state->live & REG_LIVE_WRITTEN)
- break;
- if (parent->live & REG_LIVE_DONE) {
- verbose(env, "verifier BUG type %s var_off %lld off %d\n",
- reg_type_str(env, parent->type),
- parent->var_off.value, parent->off);
- return -EFAULT;
- }
- /* The first condition is more likely to be true than the
- * second, checked it first.
- */
- if ((parent->live & REG_LIVE_READ) == flag ||
- parent->live & REG_LIVE_READ64)
- /* The parentage chain never changes and
- * this parent was already marked as LIVE_READ.
- * There is no need to keep walking the chain again and
- * keep re-marking all parents as LIVE_READ.
- * This case happens when the same register is read
- * multiple times without writes into it in-between.
- * Also, if parent has the stronger REG_LIVE_READ64 set,
- * then no need to set the weak REG_LIVE_READ32.
- */
- break;
- /* ... then we depend on parent's value */
- parent->live |= flag;
- /* REG_LIVE_READ64 overrides REG_LIVE_READ32. */
- if (flag == REG_LIVE_READ64)
- parent->live &= ~REG_LIVE_READ32;
- state = parent;
- parent = state->parent;
- writes = true;
- cnt++;
+ for (i = 0; i < nr_slots; i++) {
+ err = bpf_mark_stack_read(env, reg->frameno, env->insn_idx, BIT(spi - i));
+ if (err)
+ return err;
+ mark_stack_slot_scratched(env, spi - i);
}
-
- if (env->longest_mark_read_walk < cnt)
- env->longest_mark_read_walk = cnt;
return 0;
}
+static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ int spi;
+
+ /* For CONST_PTR_TO_DYNPTR, it must have already been done by
+ * check_reg_arg in check_helper_call and mark_btf_func_reg_size in
+ * check_kfunc_call.
+ */
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return 0;
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+ /* Caller ensures dynptr is valid and initialized, which means spi is in
+ * bounds and spi is the first dynptr slot. Simply mark stack slot as
+ * read.
+ */
+ return mark_stack_slot_obj_read(env, reg, spi, BPF_DYNPTR_NR_SLOTS);
+}
+
+static int mark_iter_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ int spi, int nr_slots)
+{
+ return mark_stack_slot_obj_read(env, reg, spi, nr_slots);
+}
+
+static int mark_irq_flag_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ int spi;
+
+ spi = irq_flag_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+ return mark_stack_slot_obj_read(env, reg, spi, 1);
+}
+
/* This function is supposed to be used by the following 32-bit optimization
* code only. It returns TRUE if the source or destination register operates
* on 64-bit, otherwise return FALSE.
*/
-static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
+static bool is_reg64(struct bpf_insn *insn,
u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t)
{
u8 code, class, op;
@@ -2162,8 +3673,10 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
}
+ if (class == BPF_ALU64 && op == BPF_END && (insn->imm == 16 || insn->imm == 32))
+ return false;
+
if (class == BPF_ALU64 || class == BPF_JMP ||
- /* BPF_END always use BPF_ALU class. */
(class == BPF_ALU && op == BPF_END && insn->imm == 64))
return true;
@@ -2172,13 +3685,13 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (class == BPF_LDX) {
if (t != SRC_OP)
- return BPF_SIZE(code) == BPF_DW;
+ return BPF_SIZE(code) == BPF_DW || BPF_MODE(code) == BPF_MEMSX;
/* LDX source must be ptr. */
return true;
}
if (class == BPF_STX) {
- /* BPF_STX (including atomic variants) has multiple source
+ /* BPF_STX (including atomic variants) has one or more source
* operands, one of which is a ptr. Check whether the caller is
* asking about it.
*/
@@ -2223,29 +3736,30 @@ static int insn_def_regno(const struct bpf_insn *insn)
case BPF_ST:
return -1;
case BPF_STX:
- if (BPF_MODE(insn->code) == BPF_ATOMIC &&
- (insn->imm & BPF_FETCH)) {
+ if (BPF_MODE(insn->code) == BPF_ATOMIC ||
+ BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) {
if (insn->imm == BPF_CMPXCHG)
return BPF_REG_0;
- else
+ else if (insn->imm == BPF_LOAD_ACQ)
+ return insn->dst_reg;
+ else if (insn->imm & BPF_FETCH)
return insn->src_reg;
- } else {
- return -1;
}
+ return -1;
default:
return insn->dst_reg;
}
}
/* Return TRUE if INSN has defined any 32-bit value explicitly. */
-static bool insn_has_def32(struct bpf_verifier_env *env, struct bpf_insn *insn)
+static bool insn_has_def32(struct bpf_insn *insn)
{
int dst_reg = insn_def_regno(insn);
if (dst_reg == -1)
return false;
- return !is_reg64(env, insn, dst_reg, NULL, DST_OP);
+ return !is_reg64(insn, dst_reg, NULL, DST_OP);
}
static void mark_insn_zext(struct bpf_verifier_env *env,
@@ -2261,13 +3775,11 @@ static void mark_insn_zext(struct bpf_verifier_env *env,
reg->subreg_def = DEF_NOT_SUBREG;
}
-static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
- enum reg_arg_type t)
+static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno,
+ enum reg_arg_type t)
{
- struct bpf_verifier_state *vstate = env->cur_state;
- struct bpf_func_state *state = vstate->frame[vstate->curframe];
struct bpf_insn *insn = env->prog->insnsi + env->insn_idx;
- struct bpf_reg_state *reg, *regs = state->regs;
+ struct bpf_reg_state *reg;
bool rw64;
if (regno >= MAX_BPF_REG) {
@@ -2278,7 +3790,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
mark_reg_scratched(env, regno);
reg = &regs[regno];
- rw64 = is_reg64(env, insn, regno, reg, t);
+ rw64 = is_reg64(insn, regno, reg, t);
if (t == SRC_OP) {
/* check whether register used as source operand can be read */
if (reg->type == NOT_INIT) {
@@ -2292,15 +3804,13 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
if (rw64)
mark_insn_zext(env, reg);
- return mark_reg_read(env, reg, reg->parent,
- rw64 ? REG_LIVE_READ64 : REG_LIVE_READ32);
+ return 0;
} else {
/* check whether register used as dest operand can be written to */
if (regno == BPF_REG_FP) {
verbose(env, "frame pointer is read only\n");
return -EACCES;
}
- reg->live |= REG_LIVE_WRITTEN;
reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx + 1;
if (t == DST_OP)
mark_reg_unknown(env, regs, regno);
@@ -2308,32 +3818,194 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
return 0;
}
+static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
+ enum reg_arg_type t)
+{
+ struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
+
+ return __check_reg_arg(env, state->regs, regno, t);
+}
+
+static int insn_stack_access_flags(int frameno, int spi)
+{
+ return INSN_F_STACK_ACCESS | (spi << INSN_F_SPI_SHIFT) | frameno;
+}
+
+static int insn_stack_access_spi(int insn_flags)
+{
+ return (insn_flags >> INSN_F_SPI_SHIFT) & INSN_F_SPI_MASK;
+}
+
+static int insn_stack_access_frameno(int insn_flags)
+{
+ return insn_flags & INSN_F_FRAMENO_MASK;
+}
+
+static void mark_jmp_point(struct bpf_verifier_env *env, int idx)
+{
+ env->insn_aux_data[idx].jmp_point = true;
+}
+
+static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx)
+{
+ return env->insn_aux_data[insn_idx].jmp_point;
+}
+
+#define LR_FRAMENO_BITS 3
+#define LR_SPI_BITS 6
+#define LR_ENTRY_BITS (LR_SPI_BITS + LR_FRAMENO_BITS + 1)
+#define LR_SIZE_BITS 4
+#define LR_FRAMENO_MASK ((1ull << LR_FRAMENO_BITS) - 1)
+#define LR_SPI_MASK ((1ull << LR_SPI_BITS) - 1)
+#define LR_SIZE_MASK ((1ull << LR_SIZE_BITS) - 1)
+#define LR_SPI_OFF LR_FRAMENO_BITS
+#define LR_IS_REG_OFF (LR_SPI_BITS + LR_FRAMENO_BITS)
+#define LINKED_REGS_MAX 6
+
+struct linked_reg {
+ u8 frameno;
+ union {
+ u8 spi;
+ u8 regno;
+ };
+ bool is_reg;
+};
+
+struct linked_regs {
+ int cnt;
+ struct linked_reg entries[LINKED_REGS_MAX];
+};
+
+static struct linked_reg *linked_regs_push(struct linked_regs *s)
+{
+ if (s->cnt < LINKED_REGS_MAX)
+ return &s->entries[s->cnt++];
+
+ return NULL;
+}
+
+/* Use u64 as a vector of 6 10-bit values, use first 4-bits to track
+ * number of elements currently in stack.
+ * Pack one history entry for linked registers as 10 bits in the following format:
+ * - 3-bits frameno
+ * - 6-bits spi_or_reg
+ * - 1-bit is_reg
+ */
+static u64 linked_regs_pack(struct linked_regs *s)
+{
+ u64 val = 0;
+ int i;
+
+ for (i = 0; i < s->cnt; ++i) {
+ struct linked_reg *e = &s->entries[i];
+ u64 tmp = 0;
+
+ tmp |= e->frameno;
+ tmp |= e->spi << LR_SPI_OFF;
+ tmp |= (e->is_reg ? 1 : 0) << LR_IS_REG_OFF;
+
+ val <<= LR_ENTRY_BITS;
+ val |= tmp;
+ }
+ val <<= LR_SIZE_BITS;
+ val |= s->cnt;
+ return val;
+}
+
+static void linked_regs_unpack(u64 val, struct linked_regs *s)
+{
+ int i;
+
+ s->cnt = val & LR_SIZE_MASK;
+ val >>= LR_SIZE_BITS;
+
+ for (i = 0; i < s->cnt; ++i) {
+ struct linked_reg *e = &s->entries[i];
+
+ e->frameno = val & LR_FRAMENO_MASK;
+ e->spi = (val >> LR_SPI_OFF) & LR_SPI_MASK;
+ e->is_reg = (val >> LR_IS_REG_OFF) & 0x1;
+ val >>= LR_ENTRY_BITS;
+ }
+}
+
/* for any branch, call, exit record the history of jmps in the given state */
-static int push_jmp_history(struct bpf_verifier_env *env,
- struct bpf_verifier_state *cur)
+static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur,
+ int insn_flags, u64 linked_regs)
{
u32 cnt = cur->jmp_history_cnt;
- struct bpf_idx_pair *p;
+ struct bpf_jmp_history_entry *p;
+ size_t alloc_size;
+
+ /* combine instruction flags if we already recorded this instruction */
+ if (env->cur_hist_ent) {
+ /* atomic instructions push insn_flags twice, for READ and
+ * WRITE sides, but they should agree on stack slot
+ */
+ verifier_bug_if((env->cur_hist_ent->flags & insn_flags) &&
+ (env->cur_hist_ent->flags & insn_flags) != insn_flags,
+ env, "insn history: insn_idx %d cur flags %x new flags %x",
+ env->insn_idx, env->cur_hist_ent->flags, insn_flags);
+ env->cur_hist_ent->flags |= insn_flags;
+ verifier_bug_if(env->cur_hist_ent->linked_regs != 0, env,
+ "insn history: insn_idx %d linked_regs: %#llx",
+ env->insn_idx, env->cur_hist_ent->linked_regs);
+ env->cur_hist_ent->linked_regs = linked_regs;
+ return 0;
+ }
cnt++;
- p = krealloc(cur->jmp_history, cnt * sizeof(*p), GFP_USER);
+ alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p)));
+ p = krealloc(cur->jmp_history, alloc_size, GFP_KERNEL_ACCOUNT);
if (!p)
return -ENOMEM;
- p[cnt - 1].idx = env->insn_idx;
- p[cnt - 1].prev_idx = env->prev_insn_idx;
cur->jmp_history = p;
+
+ p = &cur->jmp_history[cnt - 1];
+ p->idx = env->insn_idx;
+ p->prev_idx = env->prev_insn_idx;
+ p->flags = insn_flags;
+ p->linked_regs = linked_regs;
cur->jmp_history_cnt = cnt;
+ env->cur_hist_ent = p;
+
return 0;
}
+static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_state *st,
+ u32 hist_end, int insn_idx)
+{
+ if (hist_end > 0 && st->jmp_history[hist_end - 1].idx == insn_idx)
+ return &st->jmp_history[hist_end - 1];
+ return NULL;
+}
+
/* Backtrack one insn at a time. If idx is not at the top of recorded
* history then previous instruction came from straight line execution.
+ * Return -ENOENT if we exhausted all instructions within given state.
+ *
+ * It's legal to have a bit of a looping with the same starting and ending
+ * insn index within the same state, e.g.: 3->4->5->3, so just because current
+ * instruction index is the same as state's first_idx doesn't mean we are
+ * done. If there is still some jump history left, we should keep going. We
+ * need to take into account that we might have a jump history between given
+ * state's parent and itself, due to checkpointing. In this case, we'll have
+ * history entry recording a jump from last instruction of parent state and
+ * first instruction of given state.
*/
static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
u32 *history)
{
u32 cnt = *history;
+ if (i == st->first_insn_idx) {
+ if (cnt == 0)
+ return -ENOENT;
+ if (cnt == 1 && st->jmp_history[0].idx == i)
+ return -ENOENT;
+ }
+
if (cnt && st->jmp_history[cnt - 1].idx == i) {
i = st->jmp_history[cnt - 1].prev_idx;
(*history)--;
@@ -2351,7 +4023,7 @@ static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
if (insn->src_reg != BPF_PSEUDO_KFUNC_CALL)
return NULL;
- desc_btf = find_kfunc_desc_btf(data, insn->imm, insn->off, NULL);
+ desc_btf = find_kfunc_desc_btf(data, insn->off);
if (IS_ERR(desc_btf))
return "<error>";
@@ -2359,45 +4031,256 @@ static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
return btf_name_by_offset(desc_btf, func->name_off);
}
-/* For given verifier state backtrack_insn() is called from the last insn to
- * the first insn. Its purpose is to compute a bitmask of registers and
- * stack slots that needs precision in the parent verifier state.
- */
-static int backtrack_insn(struct bpf_verifier_env *env, int idx,
- u32 *reg_mask, u64 *stack_mask)
+static void verbose_insn(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
const struct bpf_insn_cbs cbs = {
.cb_call = disasm_kfunc_name,
.cb_print = verbose,
.private_data = env,
};
+
+ print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
+}
+
+static inline void bt_init(struct backtrack_state *bt, u32 frame)
+{
+ bt->frame = frame;
+}
+
+static inline void bt_reset(struct backtrack_state *bt)
+{
+ struct bpf_verifier_env *env = bt->env;
+
+ memset(bt, 0, sizeof(*bt));
+ bt->env = env;
+}
+
+static inline u32 bt_empty(struct backtrack_state *bt)
+{
+ u64 mask = 0;
+ int i;
+
+ for (i = 0; i <= bt->frame; i++)
+ mask |= bt->reg_masks[i] | bt->stack_masks[i];
+
+ return mask == 0;
+}
+
+static inline int bt_subprog_enter(struct backtrack_state *bt)
+{
+ if (bt->frame == MAX_CALL_FRAMES - 1) {
+ verifier_bug(bt->env, "subprog enter from frame %d", bt->frame);
+ return -EFAULT;
+ }
+ bt->frame++;
+ return 0;
+}
+
+static inline int bt_subprog_exit(struct backtrack_state *bt)
+{
+ if (bt->frame == 0) {
+ verifier_bug(bt->env, "subprog exit from frame 0");
+ return -EFAULT;
+ }
+ bt->frame--;
+ return 0;
+}
+
+static inline void bt_set_frame_reg(struct backtrack_state *bt, u32 frame, u32 reg)
+{
+ bt->reg_masks[frame] |= 1 << reg;
+}
+
+static inline void bt_clear_frame_reg(struct backtrack_state *bt, u32 frame, u32 reg)
+{
+ bt->reg_masks[frame] &= ~(1 << reg);
+}
+
+static inline void bt_set_reg(struct backtrack_state *bt, u32 reg)
+{
+ bt_set_frame_reg(bt, bt->frame, reg);
+}
+
+static inline void bt_clear_reg(struct backtrack_state *bt, u32 reg)
+{
+ bt_clear_frame_reg(bt, bt->frame, reg);
+}
+
+static inline void bt_set_frame_slot(struct backtrack_state *bt, u32 frame, u32 slot)
+{
+ bt->stack_masks[frame] |= 1ull << slot;
+}
+
+static inline void bt_clear_frame_slot(struct backtrack_state *bt, u32 frame, u32 slot)
+{
+ bt->stack_masks[frame] &= ~(1ull << slot);
+}
+
+static inline u32 bt_frame_reg_mask(struct backtrack_state *bt, u32 frame)
+{
+ return bt->reg_masks[frame];
+}
+
+static inline u32 bt_reg_mask(struct backtrack_state *bt)
+{
+ return bt->reg_masks[bt->frame];
+}
+
+static inline u64 bt_frame_stack_mask(struct backtrack_state *bt, u32 frame)
+{
+ return bt->stack_masks[frame];
+}
+
+static inline u64 bt_stack_mask(struct backtrack_state *bt)
+{
+ return bt->stack_masks[bt->frame];
+}
+
+static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg)
+{
+ return bt->reg_masks[bt->frame] & (1 << reg);
+}
+
+static inline bool bt_is_frame_reg_set(struct backtrack_state *bt, u32 frame, u32 reg)
+{
+ return bt->reg_masks[frame] & (1 << reg);
+}
+
+static inline bool bt_is_frame_slot_set(struct backtrack_state *bt, u32 frame, u32 slot)
+{
+ return bt->stack_masks[frame] & (1ull << slot);
+}
+
+/* format registers bitmask, e.g., "r0,r2,r4" for 0x15 mask */
+static void fmt_reg_mask(char *buf, ssize_t buf_sz, u32 reg_mask)
+{
+ DECLARE_BITMAP(mask, 64);
+ bool first = true;
+ int i, n;
+
+ buf[0] = '\0';
+
+ bitmap_from_u64(mask, reg_mask);
+ for_each_set_bit(i, mask, 32) {
+ n = snprintf(buf, buf_sz, "%sr%d", first ? "" : ",", i);
+ first = false;
+ buf += n;
+ buf_sz -= n;
+ if (buf_sz < 0)
+ break;
+ }
+}
+/* format stack slots bitmask, e.g., "-8,-24,-40" for 0x15 mask */
+void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask)
+{
+ DECLARE_BITMAP(mask, 64);
+ bool first = true;
+ int i, n;
+
+ buf[0] = '\0';
+
+ bitmap_from_u64(mask, stack_mask);
+ for_each_set_bit(i, mask, 64) {
+ n = snprintf(buf, buf_sz, "%s%d", first ? "" : ",", -(i + 1) * 8);
+ first = false;
+ buf += n;
+ buf_sz -= n;
+ if (buf_sz < 0)
+ break;
+ }
+}
+
+/* If any register R in hist->linked_regs is marked as precise in bt,
+ * do bt_set_frame_{reg,slot}(bt, R) for all registers in hist->linked_regs.
+ */
+static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist)
+{
+ struct linked_regs linked_regs;
+ bool some_precise = false;
+ int i;
+
+ if (!hist || hist->linked_regs == 0)
+ return;
+
+ linked_regs_unpack(hist->linked_regs, &linked_regs);
+ for (i = 0; i < linked_regs.cnt; ++i) {
+ struct linked_reg *e = &linked_regs.entries[i];
+
+ if ((e->is_reg && bt_is_frame_reg_set(bt, e->frameno, e->regno)) ||
+ (!e->is_reg && bt_is_frame_slot_set(bt, e->frameno, e->spi))) {
+ some_precise = true;
+ break;
+ }
+ }
+
+ if (!some_precise)
+ return;
+
+ for (i = 0; i < linked_regs.cnt; ++i) {
+ struct linked_reg *e = &linked_regs.entries[i];
+
+ if (e->is_reg)
+ bt_set_frame_reg(bt, e->frameno, e->regno);
+ else
+ bt_set_frame_slot(bt, e->frameno, e->spi);
+ }
+}
+
+/* For given verifier state backtrack_insn() is called from the last insn to
+ * the first insn. Its purpose is to compute a bitmask of registers and
+ * stack slots that needs precision in the parent verifier state.
+ *
+ * @idx is an index of the instruction we are currently processing;
+ * @subseq_idx is an index of the subsequent instruction that:
+ * - *would be* executed next, if jump history is viewed in forward order;
+ * - *was* processed previously during backtracking.
+ */
+static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
+ struct bpf_jmp_history_entry *hist, struct backtrack_state *bt)
+{
struct bpf_insn *insn = env->prog->insnsi + idx;
u8 class = BPF_CLASS(insn->code);
u8 opcode = BPF_OP(insn->code);
u8 mode = BPF_MODE(insn->code);
- u32 dreg = 1u << insn->dst_reg;
- u32 sreg = 1u << insn->src_reg;
- u32 spi;
+ u32 dreg = insn->dst_reg;
+ u32 sreg = insn->src_reg;
+ u32 spi, i, fr;
if (insn->code == 0)
return 0;
if (env->log.level & BPF_LOG_LEVEL2) {
- verbose(env, "regs=%x stack=%llx before ", *reg_mask, *stack_mask);
+ fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_reg_mask(bt));
+ verbose(env, "mark_precise: frame%d: regs=%s ",
+ bt->frame, env->tmp_str_buf);
+ bpf_fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt));
+ verbose(env, "stack=%s before ", env->tmp_str_buf);
verbose(env, "%d: ", idx);
- print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
+ verbose_insn(env, insn);
}
+ /* If there is a history record that some registers gained range at this insn,
+ * propagate precision marks to those registers, so that bt_is_reg_set()
+ * accounts for these registers.
+ */
+ bt_sync_linked_regs(bt, hist);
+
if (class == BPF_ALU || class == BPF_ALU64) {
- if (!(*reg_mask & dreg))
+ if (!bt_is_reg_set(bt, dreg))
+ return 0;
+ if (opcode == BPF_END || opcode == BPF_NEG) {
+ /* sreg is reserved and unused
+ * dreg still need precision before this insn
+ */
return 0;
- if (opcode == BPF_MOV) {
+ } else if (opcode == BPF_MOV) {
if (BPF_SRC(insn->code) == BPF_X) {
- /* dreg = sreg
+ /* dreg = sreg or dreg = (s8, s16, s32)sreg
* dreg needs precision after this insn
* sreg needs precision before this insn
*/
- *reg_mask &= ~dreg;
- *reg_mask |= sreg;
+ bt_clear_reg(bt, dreg);
+ if (sreg != BPF_REG_FP)
+ bt_set_reg(bt, sreg);
} else {
/* dreg = K
* dreg needs precision after this insn.
@@ -2405,7 +4288,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
* as precise=true in this verifier state.
* No further markings in parent are necessary
*/
- *reg_mask &= ~dreg;
+ bt_clear_reg(bt, dreg);
}
} else {
if (BPF_SRC(insn->code) == BPF_X) {
@@ -2413,15 +4296,16 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
* both dreg and sreg need precision
* before this insn
*/
- *reg_mask |= sreg;
+ if (sreg != BPF_REG_FP)
+ bt_set_reg(bt, sreg);
} /* else dreg += K
* dreg still needs precision before this insn
*/
}
- } else if (class == BPF_LDX) {
- if (!(*reg_mask & dreg))
+ } else if (class == BPF_LDX || is_atomic_load_insn(insn)) {
+ if (!bt_is_reg_set(bt, dreg))
return 0;
- *reg_mask &= ~dreg;
+ bt_clear_reg(bt, dreg);
/* scalars can only be spilled into stack w/o losing precision.
* Load from any other memory can be zero extended.
@@ -2429,62 +4313,206 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
* by 'precise' mark in corresponding register of this state.
* No further tracking necessary.
*/
- if (insn->src_reg != BPF_REG_FP)
+ if (!hist || !(hist->flags & INSN_F_STACK_ACCESS))
return 0;
-
/* dreg = *(u64 *)[fp - off] was a fill from the stack.
* that [fp - off] slot contains scalar that needs to be
* tracked with precision
*/
- spi = (-insn->off - 1) / BPF_REG_SIZE;
- if (spi >= 64) {
- verbose(env, "BUG spi %d\n", spi);
- WARN_ONCE(1, "verifier backtracking bug");
- return -EFAULT;
- }
- *stack_mask |= 1ull << spi;
+ spi = insn_stack_access_spi(hist->flags);
+ fr = insn_stack_access_frameno(hist->flags);
+ bt_set_frame_slot(bt, fr, spi);
} else if (class == BPF_STX || class == BPF_ST) {
- if (*reg_mask & dreg)
+ if (bt_is_reg_set(bt, dreg))
/* stx & st shouldn't be using _scalar_ dst_reg
* to access memory. It means backtracking
* encountered a case of pointer subtraction.
*/
return -ENOTSUPP;
/* scalars can only be spilled into stack */
- if (insn->dst_reg != BPF_REG_FP)
+ if (!hist || !(hist->flags & INSN_F_STACK_ACCESS))
return 0;
- spi = (-insn->off - 1) / BPF_REG_SIZE;
- if (spi >= 64) {
- verbose(env, "BUG spi %d\n", spi);
- WARN_ONCE(1, "verifier backtracking bug");
- return -EFAULT;
- }
- if (!(*stack_mask & (1ull << spi)))
+ spi = insn_stack_access_spi(hist->flags);
+ fr = insn_stack_access_frameno(hist->flags);
+ if (!bt_is_frame_slot_set(bt, fr, spi))
return 0;
- *stack_mask &= ~(1ull << spi);
+ bt_clear_frame_slot(bt, fr, spi);
if (class == BPF_STX)
- *reg_mask |= sreg;
+ bt_set_reg(bt, sreg);
} else if (class == BPF_JMP || class == BPF_JMP32) {
- if (opcode == BPF_CALL) {
- if (insn->src_reg == BPF_PSEUDO_CALL)
+ if (bpf_pseudo_call(insn)) {
+ int subprog_insn_idx, subprog;
+
+ subprog_insn_idx = idx + insn->imm + 1;
+ subprog = find_subprog(env, subprog_insn_idx);
+ if (subprog < 0)
+ return -EFAULT;
+
+ if (subprog_is_global(env, subprog)) {
+ /* check that jump history doesn't have any
+ * extra instructions from subprog; the next
+ * instruction after call to global subprog
+ * should be literally next instruction in
+ * caller program
+ */
+ verifier_bug_if(idx + 1 != subseq_idx, env,
+ "extra insn from subprog");
+ /* r1-r5 are invalidated after subprog call,
+ * so for global func call it shouldn't be set
+ * anymore
+ */
+ if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
+ verifier_bug(env, "global subprog unexpected regs %x",
+ bt_reg_mask(bt));
+ return -EFAULT;
+ }
+ /* global subprog always sets R0 */
+ bt_clear_reg(bt, BPF_REG_0);
+ return 0;
+ } else {
+ /* static subprog call instruction, which
+ * means that we are exiting current subprog,
+ * so only r1-r5 could be still requested as
+ * precise, r0 and r6-r10 or any stack slot in
+ * the current frame should be zero by now
+ */
+ if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
+ verifier_bug(env, "static subprog unexpected regs %x",
+ bt_reg_mask(bt));
+ return -EFAULT;
+ }
+ /* we are now tracking register spills correctly,
+ * so any instance of leftover slots is a bug
+ */
+ if (bt_stack_mask(bt) != 0) {
+ verifier_bug(env,
+ "static subprog leftover stack slots %llx",
+ bt_stack_mask(bt));
+ return -EFAULT;
+ }
+ /* propagate r1-r5 to the caller */
+ for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
+ if (bt_is_reg_set(bt, i)) {
+ bt_clear_reg(bt, i);
+ bt_set_frame_reg(bt, bt->frame - 1, i);
+ }
+ }
+ if (bt_subprog_exit(bt))
+ return -EFAULT;
+ return 0;
+ }
+ } else if (is_sync_callback_calling_insn(insn) && idx != subseq_idx - 1) {
+ /* exit from callback subprog to callback-calling helper or
+ * kfunc call. Use idx/subseq_idx check to discern it from
+ * straight line code backtracking.
+ * Unlike the subprog call handling above, we shouldn't
+ * propagate precision of r1-r5 (if any requested), as they are
+ * not actually arguments passed directly to callback subprogs
+ */
+ if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
+ verifier_bug(env, "callback unexpected regs %x",
+ bt_reg_mask(bt));
+ return -EFAULT;
+ }
+ if (bt_stack_mask(bt) != 0) {
+ verifier_bug(env, "callback leftover stack slots %llx",
+ bt_stack_mask(bt));
+ return -EFAULT;
+ }
+ /* clear r1-r5 in callback subprog's mask */
+ for (i = BPF_REG_1; i <= BPF_REG_5; i++)
+ bt_clear_reg(bt, i);
+ if (bt_subprog_exit(bt))
+ return -EFAULT;
+ return 0;
+ } else if (opcode == BPF_CALL) {
+ /* kfunc with imm==0 is invalid and fixup_kfunc_call will
+ * catch this error later. Make backtracking conservative
+ * with ENOTSUPP.
+ */
+ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && insn->imm == 0)
return -ENOTSUPP;
/* regular helper call sets R0 */
- *reg_mask &= ~1;
- if (*reg_mask & 0x3f) {
- /* if backtracing was looking for registers R1-R5
+ bt_clear_reg(bt, BPF_REG_0);
+ if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
+ /* if backtracking was looking for registers R1-R5
* they should have been found already.
*/
- verbose(env, "BUG regs %x\n", *reg_mask);
- WARN_ONCE(1, "verifier backtracking bug");
+ verifier_bug(env, "backtracking call unexpected regs %x",
+ bt_reg_mask(bt));
return -EFAULT;
}
+ if (insn->src_reg == BPF_REG_0 && insn->imm == BPF_FUNC_tail_call
+ && subseq_idx - idx != 1) {
+ if (bt_subprog_enter(bt))
+ return -EFAULT;
+ }
} else if (opcode == BPF_EXIT) {
- return -ENOTSUPP;
+ bool r0_precise;
+
+ /* Backtracking to a nested function call, 'idx' is a part of
+ * the inner frame 'subseq_idx' is a part of the outer frame.
+ * In case of a regular function call, instructions giving
+ * precision to registers R1-R5 should have been found already.
+ * In case of a callback, it is ok to have R1-R5 marked for
+ * backtracking, as these registers are set by the function
+ * invoking callback.
+ */
+ if (subseq_idx >= 0 && bpf_calls_callback(env, subseq_idx))
+ for (i = BPF_REG_1; i <= BPF_REG_5; i++)
+ bt_clear_reg(bt, i);
+ if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
+ verifier_bug(env, "backtracking exit unexpected regs %x",
+ bt_reg_mask(bt));
+ return -EFAULT;
+ }
+
+ /* BPF_EXIT in subprog or callback always returns
+ * right after the call instruction, so by checking
+ * whether the instruction at subseq_idx-1 is subprog
+ * call or not we can distinguish actual exit from
+ * *subprog* from exit from *callback*. In the former
+ * case, we need to propagate r0 precision, if
+ * necessary. In the former we never do that.
+ */
+ r0_precise = subseq_idx - 1 >= 0 &&
+ bpf_pseudo_call(&env->prog->insnsi[subseq_idx - 1]) &&
+ bt_is_reg_set(bt, BPF_REG_0);
+
+ bt_clear_reg(bt, BPF_REG_0);
+ if (bt_subprog_enter(bt))
+ return -EFAULT;
+
+ if (r0_precise)
+ bt_set_reg(bt, BPF_REG_0);
+ /* r6-r9 and stack slots will stay set in caller frame
+ * bitmasks until we return back from callee(s)
+ */
+ return 0;
+ } else if (BPF_SRC(insn->code) == BPF_X) {
+ if (!bt_is_reg_set(bt, dreg) && !bt_is_reg_set(bt, sreg))
+ return 0;
+ /* dreg <cond> sreg
+ * Both dreg and sreg need precision before
+ * this insn. If only sreg was marked precise
+ * before it would be equally necessary to
+ * propagate it to dreg.
+ */
+ if (!hist || !(hist->flags & INSN_F_SRC_REG_STACK))
+ bt_set_reg(bt, sreg);
+ if (!hist || !(hist->flags & INSN_F_DST_REG_STACK))
+ bt_set_reg(bt, dreg);
+ } else if (BPF_SRC(insn->code) == BPF_K) {
+ /* dreg <cond> K
+ * Only dreg still needs precision before
+ * this insn, so for the K-based conditional
+ * there is nothing new to be marked.
+ */
}
} else if (class == BPF_LD) {
- if (!(*reg_mask & dreg))
+ if (!bt_is_reg_set(bt, dreg))
return 0;
- *reg_mask &= ~dreg;
+ bt_clear_reg(bt, dreg);
/* It's ld_imm64 or ld_abs or ld_ind.
* For ld_imm64 no further tracking of precision
* into parent is necessary
@@ -2493,6 +4521,10 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
/* to be analyzed */
return -ENOTSUPP;
}
+ /* Propagate precision marks to linked registers, to account for
+ * registers marked as precise in this function.
+ */
+ bt_sync_linked_regs(bt, hist);
return 0;
}
@@ -2504,7 +4536,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
* . if (scalar cond K|scalar)
* . helper_call(.., scalar, ...) where ARG_CONST is expected
* backtrack through the verifier states and mark all registers and
- * stack slots with spilled constants that these scalar regisers
+ * stack slots with spilled constants that these scalar registers
* should be precise.
* . during state pruning two registers (or spilled stack slots)
* are equivalent if both are not precise.
@@ -2555,110 +4587,260 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env,
struct bpf_reg_state *reg;
int i, j;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "mark_precise: frame%d: falling back to forcing all scalars precise\n",
+ st->curframe);
+ }
+
/* big hammer: mark all scalars precise in this path.
* pop_stack may still get !precise scalars.
+ * We also skip current state and go straight to first parent state,
+ * because precision markings in current non-checkpointed state are
+ * not needed. See why in the comment in __mark_chain_precision below.
*/
- for (; st; st = st->parent)
+ for (st = st->parent; st; st = st->parent) {
for (i = 0; i <= st->curframe; i++) {
func = st->frame[i];
for (j = 0; j < BPF_REG_FP; j++) {
reg = &func->regs[j];
- if (reg->type != SCALAR_VALUE)
+ if (reg->type != SCALAR_VALUE || reg->precise)
continue;
reg->precise = true;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "force_precise: frame%d: forcing r%d to be precise\n",
+ i, j);
+ }
}
for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
if (!is_spilled_reg(&func->stack[j]))
continue;
reg = &func->stack[j].spilled_ptr;
- if (reg->type != SCALAR_VALUE)
+ if (reg->type != SCALAR_VALUE || reg->precise)
continue;
reg->precise = true;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "force_precise: frame%d: forcing fp%d to be precise\n",
+ i, -(j + 1) * 8);
+ }
}
}
+ }
}
-static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
- int spi)
+static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
{
- struct bpf_verifier_state *st = env->cur_state;
+ struct bpf_func_state *func;
+ struct bpf_reg_state *reg;
+ int i, j;
+
+ for (i = 0; i <= st->curframe; i++) {
+ func = st->frame[i];
+ for (j = 0; j < BPF_REG_FP; j++) {
+ reg = &func->regs[j];
+ if (reg->type != SCALAR_VALUE)
+ continue;
+ reg->precise = false;
+ }
+ for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
+ if (!is_spilled_reg(&func->stack[j]))
+ continue;
+ reg = &func->stack[j].spilled_ptr;
+ if (reg->type != SCALAR_VALUE)
+ continue;
+ reg->precise = false;
+ }
+ }
+}
+
+/*
+ * __mark_chain_precision() backtracks BPF program instruction sequence and
+ * chain of verifier states making sure that register *regno* (if regno >= 0)
+ * and/or stack slot *spi* (if spi >= 0) are marked as precisely tracked
+ * SCALARS, as well as any other registers and slots that contribute to
+ * a tracked state of given registers/stack slots, depending on specific BPF
+ * assembly instructions (see backtrack_insns() for exact instruction handling
+ * logic). This backtracking relies on recorded jmp_history and is able to
+ * traverse entire chain of parent states. This process ends only when all the
+ * necessary registers/slots and their transitive dependencies are marked as
+ * precise.
+ *
+ * One important and subtle aspect is that precise marks *do not matter* in
+ * the currently verified state (current state). It is important to understand
+ * why this is the case.
+ *
+ * First, note that current state is the state that is not yet "checkpointed",
+ * i.e., it is not yet put into env->explored_states, and it has no children
+ * states as well. It's ephemeral, and can end up either a) being discarded if
+ * compatible explored state is found at some point or BPF_EXIT instruction is
+ * reached or b) checkpointed and put into env->explored_states, branching out
+ * into one or more children states.
+ *
+ * In the former case, precise markings in current state are completely
+ * ignored by state comparison code (see regsafe() for details). Only
+ * checkpointed ("old") state precise markings are important, and if old
+ * state's register/slot is precise, regsafe() assumes current state's
+ * register/slot as precise and checks value ranges exactly and precisely. If
+ * states turn out to be compatible, current state's necessary precise
+ * markings and any required parent states' precise markings are enforced
+ * after the fact with propagate_precision() logic, after the fact. But it's
+ * important to realize that in this case, even after marking current state
+ * registers/slots as precise, we immediately discard current state. So what
+ * actually matters is any of the precise markings propagated into current
+ * state's parent states, which are always checkpointed (due to b) case above).
+ * As such, for scenario a) it doesn't matter if current state has precise
+ * markings set or not.
+ *
+ * Now, for the scenario b), checkpointing and forking into child(ren)
+ * state(s). Note that before current state gets to checkpointing step, any
+ * processed instruction always assumes precise SCALAR register/slot
+ * knowledge: if precise value or range is useful to prune jump branch, BPF
+ * verifier takes this opportunity enthusiastically. Similarly, when
+ * register's value is used to calculate offset or memory address, exact
+ * knowledge of SCALAR range is assumed, checked, and enforced. So, similar to
+ * what we mentioned above about state comparison ignoring precise markings
+ * during state comparison, BPF verifier ignores and also assumes precise
+ * markings *at will* during instruction verification process. But as verifier
+ * assumes precision, it also propagates any precision dependencies across
+ * parent states, which are not yet finalized, so can be further restricted
+ * based on new knowledge gained from restrictions enforced by their children
+ * states. This is so that once those parent states are finalized, i.e., when
+ * they have no more active children state, state comparison logic in
+ * is_state_visited() would enforce strict and precise SCALAR ranges, if
+ * required for correctness.
+ *
+ * To build a bit more intuition, note also that once a state is checkpointed,
+ * the path we took to get to that state is not important. This is crucial
+ * property for state pruning. When state is checkpointed and finalized at
+ * some instruction index, it can be correctly and safely used to "short
+ * circuit" any *compatible* state that reaches exactly the same instruction
+ * index. I.e., if we jumped to that instruction from a completely different
+ * code path than original finalized state was derived from, it doesn't
+ * matter, current state can be discarded because from that instruction
+ * forward having a compatible state will ensure we will safely reach the
+ * exit. States describe preconditions for further exploration, but completely
+ * forget the history of how we got here.
+ *
+ * This also means that even if we needed precise SCALAR range to get to
+ * finalized state, but from that point forward *that same* SCALAR register is
+ * never used in a precise context (i.e., it's precise value is not needed for
+ * correctness), it's correct and safe to mark such register as "imprecise"
+ * (i.e., precise marking set to false). This is what we rely on when we do
+ * not set precise marking in current state. If no child state requires
+ * precision for any given SCALAR register, it's safe to dictate that it can
+ * be imprecise. If any child state does require this register to be precise,
+ * we'll mark it precise later retroactively during precise markings
+ * propagation from child state to parent states.
+ *
+ * Skipping precise marking setting in current state is a mild version of
+ * relying on the above observation. But we can utilize this property even
+ * more aggressively by proactively forgetting any precise marking in the
+ * current state (which we inherited from the parent state), right before we
+ * checkpoint it and branch off into new child state. This is done by
+ * mark_all_scalars_imprecise() to hopefully get more permissive and generic
+ * finalized states which help in short circuiting more future states.
+ */
+static int __mark_chain_precision(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *starting_state,
+ int regno,
+ bool *changed)
+{
+ struct bpf_verifier_state *st = starting_state;
+ struct backtrack_state *bt = &env->bt;
int first_idx = st->first_insn_idx;
- int last_idx = env->insn_idx;
+ int last_idx = starting_state->insn_idx;
+ int subseq_idx = -1;
struct bpf_func_state *func;
+ bool tmp, skip_first = true;
struct bpf_reg_state *reg;
- u32 reg_mask = regno >= 0 ? 1u << regno : 0;
- u64 stack_mask = spi >= 0 ? 1ull << spi : 0;
- bool skip_first = true;
- bool new_marks = false;
- int i, err;
+ int i, fr, err;
if (!env->bpf_capable)
return 0;
- func = st->frame[st->curframe];
+ changed = changed ?: &tmp;
+ /* set frame number from which we are starting to backtrack */
+ bt_init(bt, starting_state->curframe);
+
+ /* Do sanity checks against current state of register and/or stack
+ * slot, but don't set precise flag in current state, as precision
+ * tracking in the current state is unnecessary.
+ */
+ func = st->frame[bt->frame];
if (regno >= 0) {
reg = &func->regs[regno];
if (reg->type != SCALAR_VALUE) {
- WARN_ONCE(1, "backtracing misuse");
+ verifier_bug(env, "backtracking misuse");
return -EFAULT;
}
- if (!reg->precise)
- new_marks = true;
- else
- reg_mask = 0;
- reg->precise = true;
+ bt_set_reg(bt, regno);
}
- while (spi >= 0) {
- if (!is_spilled_reg(&func->stack[spi])) {
- stack_mask = 0;
- break;
- }
- reg = &func->stack[spi].spilled_ptr;
- if (reg->type != SCALAR_VALUE) {
- stack_mask = 0;
- break;
- }
- if (!reg->precise)
- new_marks = true;
- else
- stack_mask = 0;
- reg->precise = true;
- break;
- }
-
- if (!new_marks)
- return 0;
- if (!reg_mask && !stack_mask)
+ if (bt_empty(bt))
return 0;
+
for (;;) {
DECLARE_BITMAP(mask, 64);
u32 history = st->jmp_history_cnt;
+ struct bpf_jmp_history_entry *hist;
+
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n",
+ bt->frame, last_idx, first_idx, subseq_idx);
+ }
+
+ if (last_idx < 0) {
+ /* we are at the entry into subprog, which
+ * is expected for global funcs, but only if
+ * requested precise registers are R1-R5
+ * (which are global func's input arguments)
+ */
+ if (st->curframe == 0 &&
+ st->frame[0]->subprogno > 0 &&
+ st->frame[0]->callsite == BPF_MAIN_FUNC &&
+ bt_stack_mask(bt) == 0 &&
+ (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) == 0) {
+ bitmap_from_u64(mask, bt_reg_mask(bt));
+ for_each_set_bit(i, mask, 32) {
+ reg = &st->frame[0]->regs[i];
+ bt_clear_reg(bt, i);
+ if (reg->type == SCALAR_VALUE) {
+ reg->precise = true;
+ *changed = true;
+ }
+ }
+ return 0;
+ }
+
+ verifier_bug(env, "backtracking func entry subprog %d reg_mask %x stack_mask %llx",
+ st->frame[0]->subprogno, bt_reg_mask(bt), bt_stack_mask(bt));
+ return -EFAULT;
+ }
- if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx);
for (i = last_idx;;) {
if (skip_first) {
err = 0;
skip_first = false;
} else {
- err = backtrack_insn(env, i, &reg_mask, &stack_mask);
+ hist = get_jmp_hist_entry(st, history, i);
+ err = backtrack_insn(env, i, subseq_idx, hist, bt);
}
if (err == -ENOTSUPP) {
- mark_all_scalars_precise(env, st);
+ mark_all_scalars_precise(env, starting_state);
+ bt_reset(bt);
return 0;
} else if (err) {
return err;
}
- if (!reg_mask && !stack_mask)
+ if (bt_empty(bt))
/* Found assignment(s) into tracked register in this state.
* Since this state is already marked, just return.
* Nothing to be tracked further in the parent state.
*/
return 0;
- if (i == first_idx)
- break;
+ subseq_idx = i;
i = get_prev_insn_idx(st, i, &history);
+ if (i == -ENOENT)
+ break;
if (i >= env->prog->len) {
/* This can happen if backtracking reached insn 0
* and there are still reg_mask or stack_mask
@@ -2666,8 +4848,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
* It means the backtracking missed the spot where
* particular register was initialized with a constant.
*/
- verbose(env, "BUG backtracking idx %d\n", i);
- WARN_ONCE(1, "verifier backtracking bug");
+ verifier_bug(env, "backtracking idx %d", i);
return -EFAULT;
}
}
@@ -2675,79 +4856,86 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
if (!st)
break;
- new_marks = false;
- func = st->frame[st->curframe];
- bitmap_from_u64(mask, reg_mask);
- for_each_set_bit(i, mask, 32) {
- reg = &func->regs[i];
- if (reg->type != SCALAR_VALUE) {
- reg_mask &= ~(1u << i);
- continue;
- }
- if (!reg->precise)
- new_marks = true;
- reg->precise = true;
- }
-
- bitmap_from_u64(mask, stack_mask);
- for_each_set_bit(i, mask, 64) {
- if (i >= func->allocated_stack / BPF_REG_SIZE) {
- /* the sequence of instructions:
- * 2: (bf) r3 = r10
- * 3: (7b) *(u64 *)(r3 -8) = r0
- * 4: (79) r4 = *(u64 *)(r10 -8)
- * doesn't contain jmps. It's backtracked
- * as a single block.
- * During backtracking insn 3 is not recognized as
- * stack access, so at the end of backtracking
- * stack slot fp-8 is still marked in stack_mask.
- * However the parent state may not have accessed
- * fp-8 and it's "unallocated" stack space.
- * In such case fallback to conservative.
- */
- mark_all_scalars_precise(env, st);
- return 0;
+ for (fr = bt->frame; fr >= 0; fr--) {
+ func = st->frame[fr];
+ bitmap_from_u64(mask, bt_frame_reg_mask(bt, fr));
+ for_each_set_bit(i, mask, 32) {
+ reg = &func->regs[i];
+ if (reg->type != SCALAR_VALUE) {
+ bt_clear_frame_reg(bt, fr, i);
+ continue;
+ }
+ if (reg->precise) {
+ bt_clear_frame_reg(bt, fr, i);
+ } else {
+ reg->precise = true;
+ *changed = true;
+ }
}
- if (!is_spilled_reg(&func->stack[i])) {
- stack_mask &= ~(1ull << i);
- continue;
+ bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr));
+ for_each_set_bit(i, mask, 64) {
+ if (verifier_bug_if(i >= func->allocated_stack / BPF_REG_SIZE,
+ env, "stack slot %d, total slots %d",
+ i, func->allocated_stack / BPF_REG_SIZE))
+ return -EFAULT;
+
+ if (!is_spilled_scalar_reg(&func->stack[i])) {
+ bt_clear_frame_slot(bt, fr, i);
+ continue;
+ }
+ reg = &func->stack[i].spilled_ptr;
+ if (reg->precise) {
+ bt_clear_frame_slot(bt, fr, i);
+ } else {
+ reg->precise = true;
+ *changed = true;
+ }
}
- reg = &func->stack[i].spilled_ptr;
- if (reg->type != SCALAR_VALUE) {
- stack_mask &= ~(1ull << i);
- continue;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN,
+ bt_frame_reg_mask(bt, fr));
+ verbose(env, "mark_precise: frame%d: parent state regs=%s ",
+ fr, env->tmp_str_buf);
+ bpf_fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN,
+ bt_frame_stack_mask(bt, fr));
+ verbose(env, "stack=%s: ", env->tmp_str_buf);
+ print_verifier_state(env, st, fr, true);
}
- if (!reg->precise)
- new_marks = true;
- reg->precise = true;
- }
- if (env->log.level & BPF_LOG_LEVEL2) {
- verbose(env, "parent %s regs=%x stack=%llx marks:",
- new_marks ? "didn't have" : "already had",
- reg_mask, stack_mask);
- print_verifier_state(env, func, true);
}
- if (!reg_mask && !stack_mask)
- break;
- if (!new_marks)
- break;
+ if (bt_empty(bt))
+ return 0;
+ subseq_idx = first_idx;
last_idx = st->last_insn_idx;
first_idx = st->first_insn_idx;
}
+
+ /* if we still have requested precise regs or slots, we missed
+ * something (e.g., stack access through non-r10 register), so
+ * fallback to marking all precise
+ */
+ if (!bt_empty(bt)) {
+ mark_all_scalars_precise(env, starting_state);
+ bt_reset(bt);
+ }
+
return 0;
}
-static int mark_chain_precision(struct bpf_verifier_env *env, int regno)
+int mark_chain_precision(struct bpf_verifier_env *env, int regno)
{
- return __mark_chain_precision(env, regno, -1);
+ return __mark_chain_precision(env, env->cur_state, regno, NULL);
}
-static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi)
+/* mark_chain_precision_batch() assumes that env->bt is set in the caller to
+ * desired reg and stack masks across all relevant frames
+ */
+static int mark_chain_precision_batch(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *starting_state)
{
- return __mark_chain_precision(env, -1, spi);
+ return __mark_chain_precision(env, starting_state, -1, NULL);
}
static bool is_spillable_regtype(enum bpf_reg_type type)
@@ -2767,10 +4955,10 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_XDP_SOCK:
case PTR_TO_BTF_ID:
case PTR_TO_BUF:
- case PTR_TO_PERCPU_BTF_ID:
case PTR_TO_MEM:
case PTR_TO_FUNC:
case PTR_TO_MAP_KEY:
+ case PTR_TO_ARENA:
return true;
default:
return false;
@@ -2783,23 +4971,17 @@ static bool register_is_null(struct bpf_reg_state *reg)
return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0);
}
-static bool register_is_const(struct bpf_reg_state *reg)
-{
- return reg->type == SCALAR_VALUE && tnum_is_const(reg->var_off);
-}
-
-static bool __is_scalar_unbounded(struct bpf_reg_state *reg)
+/* check if register is a constant scalar value */
+static bool is_reg_const(struct bpf_reg_state *reg, bool subreg32)
{
- return tnum_is_unknown(reg->var_off) &&
- reg->smin_value == S64_MIN && reg->smax_value == S64_MAX &&
- reg->umin_value == 0 && reg->umax_value == U64_MAX &&
- reg->s32_min_value == S32_MIN && reg->s32_max_value == S32_MAX &&
- reg->u32_min_value == 0 && reg->u32_max_value == U32_MAX;
+ return reg->type == SCALAR_VALUE &&
+ tnum_is_const(subreg32 ? tnum_subreg(reg->var_off) : reg->var_off);
}
-static bool register_is_bounded(struct bpf_reg_state *reg)
+/* assuming is_reg_const() is true, return constant value of a register */
+static u64 reg_const_value(struct bpf_reg_state *reg, bool subreg32)
{
- return reg->type == SCALAR_VALUE && !__is_scalar_unbounded(reg);
+ return subreg32 ? tnum_subreg(reg->var_off).value : reg->var_off.value;
}
static bool __is_pointer_value(bool allow_ptr_leaks,
@@ -2811,22 +4993,86 @@ static bool __is_pointer_value(bool allow_ptr_leaks,
return reg->type != SCALAR_VALUE;
}
-static void save_register_state(struct bpf_func_state *state,
+static void assign_scalar_id_before_mov(struct bpf_verifier_env *env,
+ struct bpf_reg_state *src_reg)
+{
+ if (src_reg->type != SCALAR_VALUE)
+ return;
+
+ if (src_reg->id & BPF_ADD_CONST) {
+ /*
+ * The verifier is processing rX = rY insn and
+ * rY->id has special linked register already.
+ * Cleared it, since multiple rX += const are not supported.
+ */
+ src_reg->id = 0;
+ src_reg->off = 0;
+ }
+
+ if (!src_reg->id && !tnum_is_const(src_reg->var_off))
+ /* Ensure that src_reg has a valid ID that will be copied to
+ * dst_reg and then will be used by sync_linked_regs() to
+ * propagate min/max range.
+ */
+ src_reg->id = ++env->id_gen;
+}
+
+/* Copy src state preserving dst->parent and dst->live fields */
+static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src)
+{
+ *dst = *src;
+}
+
+static void save_register_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *state,
int spi, struct bpf_reg_state *reg,
int size)
{
int i;
- state->stack[spi].spilled_ptr = *reg;
- if (size == BPF_REG_SIZE)
- state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
+ copy_register_state(&state->stack[spi].spilled_ptr, reg);
for (i = BPF_REG_SIZE; i > BPF_REG_SIZE - size; i--)
state->stack[spi].slot_type[i - 1] = STACK_SPILL;
/* size < 8 bytes spill */
for (; i; i--)
- scrub_spilled_slot(&state->stack[spi].slot_type[i - 1]);
+ mark_stack_slot_misc(env, &state->stack[spi].slot_type[i - 1]);
+}
+
+static bool is_bpf_st_mem(struct bpf_insn *insn)
+{
+ return BPF_CLASS(insn->code) == BPF_ST && BPF_MODE(insn->code) == BPF_MEM;
+}
+
+static int get_reg_width(struct bpf_reg_state *reg)
+{
+ return fls64(reg->umax_value);
+}
+
+/* See comment for mark_fastcall_pattern_for_call() */
+static void check_fastcall_stack_contract(struct bpf_verifier_env *env,
+ struct bpf_func_state *state, int insn_idx, int off)
+{
+ struct bpf_subprog_info *subprog = &env->subprog_info[state->subprogno];
+ struct bpf_insn_aux_data *aux = env->insn_aux_data;
+ int i;
+
+ if (subprog->fastcall_stack_off <= off || aux[insn_idx].fastcall_pattern)
+ return;
+ /* access to the region [max_stack_depth .. fastcall_stack_off)
+ * from something that is not a part of the fastcall pattern,
+ * disable fastcall rewrites for current subprogram by setting
+ * fastcall_stack_off to a value smaller than any possible offset.
+ */
+ subprog->fastcall_stack_off = S16_MIN;
+ /* reset fastcall aux flags within subprogram,
+ * happens at most once per subprogram
+ */
+ for (i = subprog->start; i < (subprog + 1)->start; ++i) {
+ aux[i].fastcall_spills_num = 0;
+ aux[i].fastcall_pattern = 0;
+ }
}
/* check_stack_{read,write}_fixed_off functions track spill/fill of registers,
@@ -2840,17 +5086,16 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
{
struct bpf_func_state *cur; /* state of the current function */
int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
- u32 dst_reg = env->prog->insnsi[insn_idx].dst_reg;
+ struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
struct bpf_reg_state *reg = NULL;
+ int insn_flags = insn_stack_access_flags(state->frameno, spi);
- err = grow_stack_state(state, round_up(slot + 1, BPF_REG_SIZE));
- if (err)
- return err;
/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
* so it's aligned access and [off, off + size) are within stack limits
*/
if (!env->allow_ptr_leaks &&
- state->stack[spi].slot_type[0] == STACK_SPILL &&
+ is_spilled_reg(&state->stack[spi]) &&
+ !is_spilled_scalar_reg(&state->stack[spi]) &&
size != BPF_REG_SIZE) {
verbose(env, "attempt to corrupt spilled pointer on stack\n");
return -EACCES;
@@ -2863,31 +5108,55 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
bool sanitize = reg && is_spillable_regtype(reg->type);
for (i = 0; i < size; i++) {
- if (state->stack[spi].slot_type[i] == STACK_INVALID) {
+ u8 type = state->stack[spi].slot_type[i];
+
+ if (type != STACK_MISC && type != STACK_ZERO) {
sanitize = true;
break;
}
}
if (sanitize)
- env->insn_aux_data[insn_idx].sanitize_stack_spill = true;
+ env->insn_aux_data[insn_idx].nospec_result = true;
}
+ err = destroy_if_dynptr_stack_slot(env, state, spi);
+ if (err)
+ return err;
+
+ if (!(off % BPF_REG_SIZE) && size == BPF_REG_SIZE) {
+ /* only mark the slot as written if all 8 bytes were written
+ * otherwise read propagation may incorrectly stop too soon
+ * when stack slots are partially written.
+ * This heuristic means that read propagation will be
+ * conservative, since it will add reg_live_read marks
+ * to stack slots all the way to first state when programs
+ * writes+reads less than 8 bytes
+ */
+ bpf_mark_stack_write(env, state->frameno, BIT(spi));
+ }
+
+ check_fastcall_stack_contract(env, state, insn_idx, off);
mark_stack_slot_scratched(env, spi);
- if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) &&
- !register_is_null(reg) && env->bpf_capable) {
- if (dst_reg != BPF_REG_FP) {
- /* The backtracking logic can only recognize explicit
- * stack slot address like [fp - 8]. Other spill of
- * scalar via different register has to be conservative.
- * Backtrack from here and mark all registers as precise
- * that contributed into 'reg' being a constant.
- */
- err = mark_chain_precision(env, value_regno);
- if (err)
- return err;
- }
- save_register_state(state, spi, reg, size);
+ if (reg && !(off % BPF_REG_SIZE) && reg->type == SCALAR_VALUE && env->bpf_capable) {
+ bool reg_value_fits;
+
+ reg_value_fits = get_reg_width(reg) <= BITS_PER_BYTE * size;
+ /* Make sure that reg had an ID to build a relation on spill. */
+ if (reg_value_fits)
+ assign_scalar_id_before_mov(env, reg);
+ save_register_state(env, state, spi, reg, size);
+ /* Break the relation on a narrowing spill. */
+ if (!reg_value_fits)
+ state->stack[spi].spilled_ptr.id = 0;
+ } else if (!reg && !(off % BPF_REG_SIZE) && is_bpf_st_mem(insn) &&
+ env->bpf_capable) {
+ struct bpf_reg_state *tmp_reg = &env->fake_reg[0];
+
+ memset(tmp_reg, 0, sizeof(*tmp_reg));
+ __mark_reg_known(tmp_reg, insn->imm);
+ tmp_reg->type = SCALAR_VALUE;
+ save_register_state(env, state, spi, tmp_reg, size);
} else if (reg && is_spillable_regtype(reg->type)) {
/* register containing pointer is being spilled into stack */
if (size != BPF_REG_SIZE) {
@@ -2899,31 +5168,26 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
verbose(env, "cannot spill pointers to stack into stack frame of the caller\n");
return -EINVAL;
}
- save_register_state(state, spi, reg, size);
+ save_register_state(env, state, spi, reg, size);
} else {
u8 type = STACK_MISC;
/* regular write of data into stack destroys any spilled ptr */
state->stack[spi].spilled_ptr.type = NOT_INIT;
- /* Mark slots as STACK_MISC if they belonged to spilled ptr. */
- if (is_spilled_reg(&state->stack[spi]))
+ /* Mark slots as STACK_MISC if they belonged to spilled ptr/dynptr/iter. */
+ if (is_stack_slot_special(&state->stack[spi]))
for (i = 0; i < BPF_REG_SIZE; i++)
scrub_spilled_slot(&state->stack[spi].slot_type[i]);
- /* only mark the slot as written if all 8 bytes were written
- * otherwise read propagation may incorrectly stop too soon
- * when stack slots are partially written.
- * This heuristic means that read propagation will be
- * conservative, since it will add reg_live_read marks
- * to stack slots all the way to first state when programs
- * writes+reads less than 8 bytes
- */
- if (size == BPF_REG_SIZE)
- state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
-
/* when we zero initialize stack slots mark them as such */
- if (reg && register_is_null(reg)) {
- /* backtracking doesn't work for STACK_ZERO yet. */
+ if ((reg && register_is_null(reg)) ||
+ (!reg && is_bpf_st_mem(insn) && insn->imm == 0)) {
+ /* STACK_ZERO case happened because register spill
+ * wasn't properly aligned at the stack slot boundary,
+ * so it's not a register spill anymore; force
+ * originating register to be precise to make
+ * STACK_ZERO correct for subsequent states
+ */
err = mark_chain_precision(env, value_regno);
if (err)
return err;
@@ -2932,9 +5196,12 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
/* Mark slots affected by this stack write. */
for (i = 0; i < size; i++)
- state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] =
- type;
+ state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = type;
+ insn_flags = 0; /* not a register spill */
}
+
+ if (insn_flags)
+ return push_jmp_history(env, env->cur_state, insn_flags, 0);
return 0;
}
@@ -2967,6 +5234,7 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
int min_off, max_off;
int i, err;
struct bpf_reg_state *ptr_reg = NULL, *value_reg = NULL;
+ struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
bool writing_zero = false;
/* set if the fact that we're writing a zero is used to let any
* stack slots remain STACK_ZERO
@@ -2979,14 +5247,20 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
max_off = ptr_reg->smax_value + off + size;
if (value_regno >= 0)
value_reg = &cur->regs[value_regno];
- if (value_reg && register_is_null(value_reg))
+ if ((value_reg && register_is_null(value_reg)) ||
+ (!value_reg && is_bpf_st_mem(insn) && insn->imm == 0))
writing_zero = true;
- err = grow_stack_state(state, round_up(-min_off, BPF_REG_SIZE));
- if (err)
- return err;
+ for (i = min_off; i < max_off; i++) {
+ int spi;
+ spi = __get_spi(i);
+ err = destroy_if_dynptr_stack_slot(env, state, spi);
+ if (err)
+ return err;
+ }
+ check_fastcall_stack_contract(env, state, insn_idx, min_off);
/* Variable offset writes destroy any spilled pointers in range. */
for (i = min_off; i < max_off; i++) {
u8 new_type, *stype;
@@ -2997,21 +5271,37 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
mark_stack_slot_scratched(env, spi);
- if (!env->allow_ptr_leaks
- && *stype != NOT_INIT
- && *stype != SCALAR_VALUE) {
- /* Reject the write if there's are spilled pointers in
- * range. If we didn't reject here, the ptr status
- * would be erased below (even though not all slots are
- * actually overwritten), possibly opening the door to
- * leaks.
+ if (!env->allow_ptr_leaks && *stype != STACK_MISC && *stype != STACK_ZERO) {
+ /* Reject the write if range we may write to has not
+ * been initialized beforehand. If we didn't reject
+ * here, the ptr status would be erased below (even
+ * though not all slots are actually overwritten),
+ * possibly opening the door to leaks.
+ *
+ * We do however catch STACK_INVALID case below, and
+ * only allow reading possibly uninitialized memory
+ * later for CAP_PERFMON, as the write may not happen to
+ * that slot.
*/
verbose(env, "spilled ptr in range of var-offset stack write; insn %d, ptr off: %d",
insn_idx, i);
return -EINVAL;
}
- /* Erase all spilled pointers. */
+ /* If writing_zero and the spi slot contains a spill of value 0,
+ * maintain the spill type.
+ */
+ if (writing_zero && *stype == STACK_SPILL &&
+ is_spilled_scalar_reg(&state->stack[spi])) {
+ struct bpf_reg_state *spill_reg = &state->stack[spi].spilled_ptr;
+
+ if (tnum_is_const(spill_reg->var_off) && spill_reg->var_off.value == 0) {
+ zero_used = true;
+ continue;
+ }
+ }
+
+ /* Erase all other spilled pointers. */
state->stack[spi].spilled_ptr.type = NOT_INIT;
/* Update the slot type. */
@@ -3066,32 +5356,21 @@ static void mark_reg_stack_read(struct bpf_verifier_env *env,
for (i = min_off; i < max_off; i++) {
slot = -i - 1;
spi = slot / BPF_REG_SIZE;
+ mark_stack_slot_scratched(env, spi);
stype = ptr_state->stack[spi].slot_type;
if (stype[slot % BPF_REG_SIZE] != STACK_ZERO)
break;
zeros++;
}
if (zeros == max_off - min_off) {
- /* any access_size read into register is zero extended,
- * so the whole register == const_zero
- */
- __mark_reg_const_zero(&state->regs[dst_regno]);
- /* backtracking doesn't support STACK_ZERO yet,
- * so mark it precise here, so that later
- * backtracking can stop here.
- * Backtracking may not need this if this register
- * doesn't participate in pointer adjustment.
- * Forward propagation of precise flag is not
- * necessary either. This mark is only to stop
- * backtracking. Any register that contributed
- * to const 0 was marked precise before spill.
+ /* Any access_size read into register is zero extended,
+ * so the whole register == const_zero.
*/
- state->regs[dst_regno].precise = true;
+ __mark_reg_const_zero(env, &state->regs[dst_regno]);
} else {
/* have read misc data from the stack */
mark_reg_unknown(env, state->regs, dst_regno);
}
- state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
}
/* Read the stack at 'off' and put the results into the register indicated by
@@ -3113,10 +5392,18 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
int i, slot = -off - 1, spi = slot / BPF_REG_SIZE;
struct bpf_reg_state *reg;
u8 *stype, type;
+ int insn_flags = insn_stack_access_flags(reg_state->frameno, spi);
+ int err;
stype = reg_state->stack[spi].slot_type;
reg = &reg_state->stack[spi].spilled_ptr;
+ mark_stack_slot_scratched(env, spi);
+ check_fastcall_stack_contract(env, state, env->insn_idx, off);
+ err = bpf_mark_stack_read(env, reg_state->frameno, env->insn_idx, BIT(spi));
+ if (err)
+ return err;
+
if (is_spilled_reg(&reg_state->stack[spi])) {
u8 spill_size = 1;
@@ -3130,43 +5417,66 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
return -EACCES;
}
- mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
if (dst_regno < 0)
return 0;
- if (!(off % BPF_REG_SIZE) && size == spill_size) {
+ if (size <= spill_size &&
+ bpf_stack_narrow_access_ok(off, size, spill_size)) {
/* The earlier check_reg_arg() has decided the
* subreg_def for this insn. Save it first.
*/
s32 subreg_def = state->regs[dst_regno].subreg_def;
- state->regs[dst_regno] = *reg;
+ copy_register_state(&state->regs[dst_regno], reg);
state->regs[dst_regno].subreg_def = subreg_def;
+
+ /* Break the relation on a narrowing fill.
+ * coerce_reg_to_size will adjust the boundaries.
+ */
+ if (get_reg_width(reg) > size * BITS_PER_BYTE)
+ state->regs[dst_regno].id = 0;
} else {
+ int spill_cnt = 0, zero_cnt = 0;
+
for (i = 0; i < size; i++) {
type = stype[(slot - i) % BPF_REG_SIZE];
- if (type == STACK_SPILL)
+ if (type == STACK_SPILL) {
+ spill_cnt++;
continue;
+ }
if (type == STACK_MISC)
continue;
+ if (type == STACK_ZERO) {
+ zero_cnt++;
+ continue;
+ }
+ if (type == STACK_INVALID && env->allow_uninit_stack)
+ continue;
verbose(env, "invalid read from stack off %d+%d size %d\n",
off, i, size);
return -EACCES;
}
- mark_reg_unknown(env, state->regs, dst_regno);
- }
- state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
- return 0;
- }
- if (dst_regno >= 0) {
+ if (spill_cnt == size &&
+ tnum_is_const(reg->var_off) && reg->var_off.value == 0) {
+ __mark_reg_const_zero(env, &state->regs[dst_regno]);
+ /* this IS register fill, so keep insn_flags */
+ } else if (zero_cnt == size) {
+ /* similarly to mark_reg_stack_read(), preserve zeroes */
+ __mark_reg_const_zero(env, &state->regs[dst_regno]);
+ insn_flags = 0; /* not restoring original register state */
+ } else {
+ mark_reg_unknown(env, state->regs, dst_regno);
+ insn_flags = 0; /* not restoring original register state */
+ }
+ }
+ } else if (dst_regno >= 0) {
/* restore register state from stack */
- state->regs[dst_regno] = *reg;
+ copy_register_state(&state->regs[dst_regno], reg);
/* mark reg as written since spilled pointer state likely
* has its liveness marks cleared by is_state_visited()
* which resets stack/reg liveness for state transitions
*/
- state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
} else if (__is_pointer_value(env->allow_ptr_leaks, reg)) {
/* If dst_regno==-1, the caller is asking us whether
* it is acceptable to use this value as a SCALAR_VALUE
@@ -3178,7 +5488,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
off);
return -EACCES;
}
- mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
} else {
for (i = 0; i < size; i++) {
type = stype[(slot - i) % BPF_REG_SIZE];
@@ -3186,18 +5495,22 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
continue;
if (type == STACK_ZERO)
continue;
+ if (type == STACK_INVALID && env->allow_uninit_stack)
+ continue;
verbose(env, "invalid read from stack off %d+%d size %d\n",
off, i, size);
return -EACCES;
}
- mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
if (dst_regno >= 0)
mark_reg_stack_read(env, reg_state, off, off + size, dst_regno);
+ insn_flags = 0; /* we are not restoring spilled register */
}
+ if (insn_flags)
+ return push_jmp_history(env, env->cur_state, insn_flags, 0);
return 0;
}
-enum stack_access_src {
+enum bpf_access_src {
ACCESS_DIRECT = 1, /* the access is performed by an instruction */
ACCESS_HELPER = 2, /* the access is performed by a helper */
};
@@ -3205,7 +5518,7 @@ enum stack_access_src {
static int check_stack_range_initialized(struct bpf_verifier_env *env,
int regno, int off, int access_size,
bool zero_size_allowed,
- enum stack_access_src type,
+ enum bpf_access_type type,
struct bpf_call_arg_meta *meta);
static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno)
@@ -3238,13 +5551,14 @@ static int check_stack_read_var_off(struct bpf_verifier_env *env,
/* Note that we pass a NULL meta, so raw access will not be permitted.
*/
err = check_stack_range_initialized(env, ptr_regno, off, size,
- false, ACCESS_DIRECT, NULL);
+ false, BPF_READ, NULL);
if (err)
return err;
min_off = reg->smin_value + off;
max_off = reg->smax_value + off;
mark_reg_stack_read(env, ptr_state, min_off, max_off + size, dst_regno);
+ check_fastcall_stack_contract(env, ptr_state, env->insn_idx, min_off);
return 0;
}
@@ -3281,17 +5595,13 @@ static int check_stack_read(struct bpf_verifier_env *env,
}
/* Variable offset is prohibited for unprivileged mode for simplicity
* since it requires corresponding support in Spectre masking for stack
- * ALU. See also retrieve_ptr_limit().
+ * ALU. See also retrieve_ptr_limit(). The check in
+ * check_stack_access_for_ptr_arithmetic() called by
+ * adjust_ptr_min_max_vals() prevents users from creating stack pointers
+ * with variable offsets, therefore no check is required here. Further,
+ * just checking it here would be insufficient as speculative stack
+ * writes could still lead to unsafe speculative behaviour.
*/
- if (!env->bypass_spec_v1 && var_off) {
- char tn_buf[48];
-
- tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n",
- ptr_regno, tn_buf);
- return -EACCES;
- }
-
if (!var_off) {
off += reg->var_off.value;
err = check_stack_read_fixed_off(env, state, off, size,
@@ -3455,54 +5765,364 @@ static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno,
return 0;
}
+static int __check_ptr_off_reg(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg, int regno,
+ bool fixed_off_ok)
+{
+ /* Access to this pointer-typed register or passing it to a helper
+ * is only allowed in its original, unmodified form.
+ */
+
+ if (reg->off < 0) {
+ verbose(env, "negative offset %s ptr R%d off=%d disallowed\n",
+ reg_type_str(env, reg->type), regno, reg->off);
+ return -EACCES;
+ }
+
+ if (!fixed_off_ok && reg->off) {
+ verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n",
+ reg_type_str(env, reg->type), regno, reg->off);
+ return -EACCES;
+ }
+
+ if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "variable %s access var_off=%s disallowed\n",
+ reg_type_str(env, reg->type), tn_buf);
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+static int check_ptr_off_reg(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg, int regno)
+{
+ return __check_ptr_off_reg(env, reg, regno, false);
+}
+
+static int map_kptr_match_type(struct bpf_verifier_env *env,
+ struct btf_field *kptr_field,
+ struct bpf_reg_state *reg, u32 regno)
+{
+ const char *targ_name = btf_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id);
+ int perm_flags;
+ const char *reg_name = "";
+
+ if (btf_is_kernel(reg->btf)) {
+ perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU;
+
+ /* Only unreferenced case accepts untrusted pointers */
+ if (kptr_field->type == BPF_KPTR_UNREF)
+ perm_flags |= PTR_UNTRUSTED;
+ } else {
+ perm_flags = PTR_MAYBE_NULL | MEM_ALLOC;
+ if (kptr_field->type == BPF_KPTR_PERCPU)
+ perm_flags |= MEM_PERCPU;
+ }
+
+ if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags))
+ goto bad_type;
+
+ /* We need to verify reg->type and reg->btf, before accessing reg->btf */
+ reg_name = btf_type_name(reg->btf, reg->btf_id);
+
+ /* For ref_ptr case, release function check should ensure we get one
+ * referenced PTR_TO_BTF_ID, and that its fixed offset is 0. For the
+ * normal store of unreferenced kptr, we must ensure var_off is zero.
+ * Since ref_ptr cannot be accessed directly by BPF insns, checks for
+ * reg->off and reg->ref_obj_id are not needed here.
+ */
+ if (__check_ptr_off_reg(env, reg, regno, true))
+ return -EACCES;
+
+ /* A full type match is needed, as BTF can be vmlinux, module or prog BTF, and
+ * we also need to take into account the reg->off.
+ *
+ * We want to support cases like:
+ *
+ * struct foo {
+ * struct bar br;
+ * struct baz bz;
+ * };
+ *
+ * struct foo *v;
+ * v = func(); // PTR_TO_BTF_ID
+ * val->foo = v; // reg->off is zero, btf and btf_id match type
+ * val->bar = &v->br; // reg->off is still zero, but we need to retry with
+ * // first member type of struct after comparison fails
+ * val->baz = &v->bz; // reg->off is non-zero, so struct needs to be walked
+ * // to match type
+ *
+ * In the kptr_ref case, check_func_arg_reg_off already ensures reg->off
+ * is zero. We must also ensure that btf_struct_ids_match does not walk
+ * the struct to match type against first member of struct, i.e. reject
+ * second case from above. Hence, when type is BPF_KPTR_REF, we set
+ * strict mode to true for type match.
+ */
+ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
+ kptr_field->kptr.btf, kptr_field->kptr.btf_id,
+ kptr_field->type != BPF_KPTR_UNREF))
+ goto bad_type;
+ return 0;
+bad_type:
+ verbose(env, "invalid kptr access, R%d type=%s%s ", regno,
+ reg_type_str(env, reg->type), reg_name);
+ verbose(env, "expected=%s%s", reg_type_str(env, PTR_TO_BTF_ID), targ_name);
+ if (kptr_field->type == BPF_KPTR_UNREF)
+ verbose(env, " or %s%s\n", reg_type_str(env, PTR_TO_BTF_ID | PTR_UNTRUSTED),
+ targ_name);
+ else
+ verbose(env, "\n");
+ return -EINVAL;
+}
+
+static bool in_sleepable(struct bpf_verifier_env *env)
+{
+ return env->cur_state->in_sleepable;
+}
+
+/* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock()
+ * can dereference RCU protected pointers and result is PTR_TRUSTED.
+ */
+static bool in_rcu_cs(struct bpf_verifier_env *env)
+{
+ return env->cur_state->active_rcu_locks ||
+ env->cur_state->active_locks ||
+ !in_sleepable(env);
+}
+
+/* Once GCC supports btf_type_tag the following mechanism will be replaced with tag check */
+BTF_SET_START(rcu_protected_types)
+#ifdef CONFIG_NET
+BTF_ID(struct, prog_test_ref_kfunc)
+#endif
+#ifdef CONFIG_CGROUPS
+BTF_ID(struct, cgroup)
+#endif
+#ifdef CONFIG_BPF_JIT
+BTF_ID(struct, bpf_cpumask)
+#endif
+BTF_ID(struct, task_struct)
+#ifdef CONFIG_CRYPTO
+BTF_ID(struct, bpf_crypto_ctx)
+#endif
+BTF_SET_END(rcu_protected_types)
+
+static bool rcu_protected_object(const struct btf *btf, u32 btf_id)
+{
+ if (!btf_is_kernel(btf))
+ return true;
+ return btf_id_set_contains(&rcu_protected_types, btf_id);
+}
+
+static struct btf_record *kptr_pointee_btf_record(struct btf_field *kptr_field)
+{
+ struct btf_struct_meta *meta;
+
+ if (btf_is_kernel(kptr_field->kptr.btf))
+ return NULL;
+
+ meta = btf_find_struct_meta(kptr_field->kptr.btf,
+ kptr_field->kptr.btf_id);
+
+ return meta ? meta->record : NULL;
+}
+
+static bool rcu_safe_kptr(const struct btf_field *field)
+{
+ const struct btf_field_kptr *kptr = &field->kptr;
+
+ return field->type == BPF_KPTR_PERCPU ||
+ (field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id));
+}
+
+static u32 btf_ld_kptr_type(struct bpf_verifier_env *env, struct btf_field *kptr_field)
+{
+ struct btf_record *rec;
+ u32 ret;
+
+ ret = PTR_MAYBE_NULL;
+ if (rcu_safe_kptr(kptr_field) && in_rcu_cs(env)) {
+ ret |= MEM_RCU;
+ if (kptr_field->type == BPF_KPTR_PERCPU)
+ ret |= MEM_PERCPU;
+ else if (!btf_is_kernel(kptr_field->kptr.btf))
+ ret |= MEM_ALLOC;
+
+ rec = kptr_pointee_btf_record(kptr_field);
+ if (rec && btf_record_has_field(rec, BPF_GRAPH_NODE))
+ ret |= NON_OWN_REF;
+ } else {
+ ret |= PTR_UNTRUSTED;
+ }
+
+ return ret;
+}
+
+static int mark_uptr_ld_reg(struct bpf_verifier_env *env, u32 regno,
+ struct btf_field *field)
+{
+ struct bpf_reg_state *reg;
+ const struct btf_type *t;
+
+ t = btf_type_by_id(field->kptr.btf, field->kptr.btf_id);
+ mark_reg_known_zero(env, cur_regs(env), regno);
+ reg = reg_state(env, regno);
+ reg->type = PTR_TO_MEM | PTR_MAYBE_NULL;
+ reg->mem_size = t->size;
+ reg->id = ++env->id_gen;
+
+ return 0;
+}
+
+static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
+ int value_regno, int insn_idx,
+ struct btf_field *kptr_field)
+{
+ struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
+ int class = BPF_CLASS(insn->code);
+ struct bpf_reg_state *val_reg;
+ int ret;
+
+ /* Things we already checked for in check_map_access and caller:
+ * - Reject cases where variable offset may touch kptr
+ * - size of access (must be BPF_DW)
+ * - tnum_is_const(reg->var_off)
+ * - kptr_field->offset == off + reg->var_off.value
+ */
+ /* Only BPF_[LDX,STX,ST] | BPF_MEM | BPF_DW is supported */
+ if (BPF_MODE(insn->code) != BPF_MEM) {
+ verbose(env, "kptr in map can only be accessed using BPF_MEM instruction mode\n");
+ return -EACCES;
+ }
+
+ /* We only allow loading referenced kptr, since it will be marked as
+ * untrusted, similar to unreferenced kptr.
+ */
+ if (class != BPF_LDX &&
+ (kptr_field->type == BPF_KPTR_REF || kptr_field->type == BPF_KPTR_PERCPU)) {
+ verbose(env, "store to referenced kptr disallowed\n");
+ return -EACCES;
+ }
+ if (class != BPF_LDX && kptr_field->type == BPF_UPTR) {
+ verbose(env, "store to uptr disallowed\n");
+ return -EACCES;
+ }
+
+ if (class == BPF_LDX) {
+ if (kptr_field->type == BPF_UPTR)
+ return mark_uptr_ld_reg(env, value_regno, kptr_field);
+
+ /* We can simply mark the value_regno receiving the pointer
+ * value from map as PTR_TO_BTF_ID, with the correct type.
+ */
+ ret = mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID,
+ kptr_field->kptr.btf, kptr_field->kptr.btf_id,
+ btf_ld_kptr_type(env, kptr_field));
+ if (ret < 0)
+ return ret;
+ } else if (class == BPF_STX) {
+ val_reg = reg_state(env, value_regno);
+ if (!register_is_null(val_reg) &&
+ map_kptr_match_type(env, kptr_field, val_reg, value_regno))
+ return -EACCES;
+ } else if (class == BPF_ST) {
+ if (insn->imm) {
+ verbose(env, "BPF_ST imm must be 0 when storing to kptr at off=%u\n",
+ kptr_field->offset);
+ return -EACCES;
+ }
+ } else {
+ verbose(env, "kptr in map can only be accessed using BPF_LDX/BPF_STX/BPF_ST\n");
+ return -EACCES;
+ }
+ return 0;
+}
+
+/*
+ * Return the size of the memory region accessible from a pointer to map value.
+ * For INSN_ARRAY maps whole bpf_insn_array->ips array is accessible.
+ */
+static u32 map_mem_size(const struct bpf_map *map)
+{
+ if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY)
+ return map->max_entries * sizeof(long);
+
+ return map->value_size;
+}
+
/* check read/write into a map element with possible variable offset */
static int check_map_access(struct bpf_verifier_env *env, u32 regno,
- int off, int size, bool zero_size_allowed)
+ int off, int size, bool zero_size_allowed,
+ enum bpf_access_src src)
{
struct bpf_verifier_state *vstate = env->cur_state;
struct bpf_func_state *state = vstate->frame[vstate->curframe];
struct bpf_reg_state *reg = &state->regs[regno];
struct bpf_map *map = reg->map_ptr;
- int err;
+ u32 mem_size = map_mem_size(map);
+ struct btf_record *rec;
+ int err, i;
- err = check_mem_region_access(env, regno, off, size, map->value_size,
- zero_size_allowed);
+ err = check_mem_region_access(env, regno, off, size, mem_size, zero_size_allowed);
if (err)
return err;
- if (map_value_has_spin_lock(map)) {
- u32 lock = map->spin_lock_off;
+ if (IS_ERR_OR_NULL(map->record))
+ return 0;
+ rec = map->record;
+ for (i = 0; i < rec->cnt; i++) {
+ struct btf_field *field = &rec->fields[i];
+ u32 p = field->offset;
- /* if any part of struct bpf_spin_lock can be touched by
- * load/store reject this program.
- * To check that [x1, x2) overlaps with [y1, y2)
+ /* If any part of a field can be touched by load/store, reject
+ * this program. To check that [x1, x2) overlaps with [y1, y2),
* it is sufficient to check x1 < y2 && y1 < x2.
*/
- if (reg->smin_value + off < lock + sizeof(struct bpf_spin_lock) &&
- lock < reg->umax_value + off + size) {
- verbose(env, "bpf_spin_lock cannot be accessed directly by load/store\n");
- return -EACCES;
- }
- }
- if (map_value_has_timer(map)) {
- u32 t = map->timer_off;
-
- if (reg->smin_value + off < t + sizeof(struct bpf_timer) &&
- t < reg->umax_value + off + size) {
- verbose(env, "bpf_timer cannot be accessed directly by load/store\n");
- return -EACCES;
+ if (reg->smin_value + off < p + field->size &&
+ p < reg->umax_value + off + size) {
+ switch (field->type) {
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
+ case BPF_UPTR:
+ if (src != ACCESS_DIRECT) {
+ verbose(env, "%s cannot be accessed indirectly by helper\n",
+ btf_field_type_name(field->type));
+ return -EACCES;
+ }
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "%s access cannot have variable offset\n",
+ btf_field_type_name(field->type));
+ return -EACCES;
+ }
+ if (p != off + reg->var_off.value) {
+ verbose(env, "%s access misaligned expected=%u off=%llu\n",
+ btf_field_type_name(field->type),
+ p, off + reg->var_off.value);
+ return -EACCES;
+ }
+ if (size != bpf_size_to_bytes(BPF_DW)) {
+ verbose(env, "%s access size must be BPF_DW\n",
+ btf_field_type_name(field->type));
+ return -EACCES;
+ }
+ break;
+ default:
+ verbose(env, "%s cannot be accessed directly by load/store\n",
+ btf_field_type_name(field->type));
+ return -EACCES;
+ }
}
}
- return err;
+ return 0;
}
#define MAX_PACKET_OFF 0xffff
-static enum bpf_prog_type resolve_prog_type(struct bpf_prog *prog)
-{
- return prog->aux->dst_prog ? prog->aux->dst_prog->type : prog->type;
-}
-
static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
const struct bpf_call_arg_meta *meta,
enum bpf_access_type t)
@@ -3589,16 +6209,10 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
/* check access to 'struct bpf_context' fields. Supports fixed offsets only */
static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
- enum bpf_access_type t, enum bpf_reg_type *reg_type,
- struct btf **btf, u32 *btf_id)
+ enum bpf_access_type t, struct bpf_insn_access_aux *info)
{
- struct bpf_insn_access_aux info = {
- .reg_type = *reg_type,
- .log = &env->log,
- };
-
if (env->ops->is_valid_access &&
- env->ops->is_valid_access(off, size, t, env->prog, &info)) {
+ env->ops->is_valid_access(off, size, t, env->prog, info)) {
/* A non zero info.ctx_field_size indicates that this field is a
* candidate for later verifier transformation to load the whole
* field and then apply a mask when accessed with a narrower
@@ -3606,13 +6220,15 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
* will only allow for whole field access and rejects any other
* type of narrower access.
*/
- *reg_type = info.reg_type;
-
- if (base_type(*reg_type) == PTR_TO_BTF_ID) {
- *btf = info.btf;
- *btf_id = info.btf_id;
+ if (base_type(info->reg_type) == PTR_TO_BTF_ID) {
+ if (info->ref_obj_id &&
+ !find_reference_state(env->cur_state, info->ref_obj_id)) {
+ verbose(env, "invalid bpf_context access off=%d. Reference may already be released\n",
+ off);
+ return -EACCES;
+ }
} else {
- env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
+ env->insn_aux_data[insn_idx].ctx_field_size = info->ctx_field_size;
}
/* remember the offset of last byte accessed in ctx */
if (env->prog->aux->max_ctx_offset < off + size)
@@ -3715,6 +6331,76 @@ static bool is_flow_key_reg(struct bpf_verifier_env *env, int regno)
return reg->type == PTR_TO_FLOW_KEYS;
}
+static bool is_arena_reg(struct bpf_verifier_env *env, int regno)
+{
+ const struct bpf_reg_state *reg = reg_state(env, regno);
+
+ return reg->type == PTR_TO_ARENA;
+}
+
+/* Return false if @regno contains a pointer whose type isn't supported for
+ * atomic instruction @insn.
+ */
+static bool atomic_ptr_type_ok(struct bpf_verifier_env *env, int regno,
+ struct bpf_insn *insn)
+{
+ if (is_ctx_reg(env, regno))
+ return false;
+ if (is_pkt_reg(env, regno))
+ return false;
+ if (is_flow_key_reg(env, regno))
+ return false;
+ if (is_sk_reg(env, regno))
+ return false;
+ if (is_arena_reg(env, regno))
+ return bpf_jit_supports_insn(insn, true);
+
+ return true;
+}
+
+static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
+#ifdef CONFIG_NET
+ [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
+ [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
+ [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
+#endif
+ [CONST_PTR_TO_MAP] = btf_bpf_map_id,
+};
+
+static bool is_trusted_reg(const struct bpf_reg_state *reg)
+{
+ /* A referenced register is always trusted. */
+ if (reg->ref_obj_id)
+ return true;
+
+ /* Types listed in the reg2btf_ids are always trusted */
+ if (reg2btf_ids[base_type(reg->type)] &&
+ !bpf_type_has_unsafe_modifiers(reg->type))
+ return true;
+
+ /* If a register is not referenced, it is trusted if it has the
+ * MEM_ALLOC or PTR_TRUSTED type modifiers, and no others. Some of the
+ * other type modifiers may be safe, but we elect to take an opt-in
+ * approach here as some (e.g. PTR_UNTRUSTED and PTR_MAYBE_NULL) are
+ * not.
+ *
+ * Eventually, we should make PTR_TRUSTED the single source of truth
+ * for whether a register is trusted.
+ */
+ return type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS &&
+ !bpf_type_has_unsafe_modifiers(reg->type);
+}
+
+static bool is_rcu_reg(const struct bpf_reg_state *reg)
+{
+ return reg->type & MEM_RCU;
+}
+
+static void clear_trusted_flags(enum bpf_type_flag *flag)
+{
+ *flag &= ~(BPF_REG_TRUSTED_MODIFIERS | MEM_RCU);
+}
+
static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg,
int off, int size, bool strict)
@@ -3796,6 +6482,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
break;
case PTR_TO_MAP_VALUE:
pointer_desc = "value ";
+ if (reg->map_ptr->map_type == BPF_MAP_TYPE_INSN_ARRAY)
+ strict = true;
break;
case PTR_TO_CTX:
pointer_desc = "context ";
@@ -3820,6 +6508,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
case PTR_TO_XDP_SOCK:
pointer_desc = "xdp_sock ";
break;
+ case PTR_TO_ARENA:
+ return 0;
default:
break;
}
@@ -3827,18 +6517,43 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
strict);
}
-static int update_stack_depth(struct bpf_verifier_env *env,
- const struct bpf_func_state *func,
- int off)
+static enum priv_stack_mode bpf_enable_priv_stack(struct bpf_prog *prog)
{
- u16 stack = env->subprog_info[func->subprogno].stack_depth;
+ if (!bpf_jit_supports_private_stack())
+ return NO_PRIV_STACK;
- if (stack >= -off)
- return 0;
+ /* bpf_prog_check_recur() checks all prog types that use bpf trampoline
+ * while kprobe/tp/perf_event/raw_tp don't use trampoline hence checked
+ * explicitly.
+ */
+ switch (prog->type) {
+ case BPF_PROG_TYPE_KPROBE:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ case BPF_PROG_TYPE_PERF_EVENT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT:
+ return PRIV_STACK_ADAPTIVE;
+ case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_LSM:
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ if (prog->aux->priv_stack_requested || bpf_prog_check_recur(prog))
+ return PRIV_STACK_ADAPTIVE;
+ fallthrough;
+ default:
+ break;
+ }
- /* update known max for given subprogram */
- env->subprog_info[func->subprogno].stack_depth = -off;
- return 0;
+ return NO_PRIV_STACK;
+}
+
+static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
+{
+ if (env->prog->jit_requested)
+ return round_up(stack_depth, 16);
+
+ /* round up to 32-bytes, since this is granularity
+ * of interpreter stack size
+ */
+ return round_up(max_t(u32, stack_depth, 1), 32);
}
/* starting from main bpf function walk all instructions of the function
@@ -3847,16 +6562,20 @@ static int update_stack_depth(struct bpf_verifier_env *env,
* Since recursion is prevented by check_cfg() this algorithm
* only needs a local stack of MAX_CALL_FRAMES to remember callsites
*/
-static int check_max_stack_depth(struct bpf_verifier_env *env)
+static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx,
+ bool priv_stack_supported)
{
- int depth = 0, frame = 0, idx = 0, i = 0, subprog_end;
struct bpf_subprog_info *subprog = env->subprog_info;
struct bpf_insn *insn = env->prog->insnsi;
+ int depth = 0, frame = 0, i, subprog_end, subprog_depth;
bool tail_call_reachable = false;
int ret_insn[MAX_CALL_FRAMES];
int ret_prog[MAX_CALL_FRAMES];
int j;
+ i = subprog[idx].start;
+ if (!priv_stack_supported)
+ subprog[idx].priv_stack_mode = NO_PRIV_STACK;
process_func:
/* protect against potential stack overflow that might happen when
* bpf2bpf calls get combined with tailcalls. Limit the caller's stack
@@ -3883,19 +6602,57 @@ process_func:
depth);
return -EACCES;
}
- /* round up to 32-bytes, since this is granularity
- * of interpreter stack size
- */
- depth += round_up(max_t(u32, subprog[idx].stack_depth, 1), 32);
- if (depth > MAX_BPF_STACK) {
- verbose(env, "combined stack size of %d calls is %d. Too large\n",
- frame + 1, depth);
- return -EACCES;
+
+ subprog_depth = round_up_stack_depth(env, subprog[idx].stack_depth);
+ if (priv_stack_supported) {
+ /* Request private stack support only if the subprog stack
+ * depth is no less than BPF_PRIV_STACK_MIN_SIZE. This is to
+ * avoid jit penalty if the stack usage is small.
+ */
+ if (subprog[idx].priv_stack_mode == PRIV_STACK_UNKNOWN &&
+ subprog_depth >= BPF_PRIV_STACK_MIN_SIZE)
+ subprog[idx].priv_stack_mode = PRIV_STACK_ADAPTIVE;
+ }
+
+ if (subprog[idx].priv_stack_mode == PRIV_STACK_ADAPTIVE) {
+ if (subprog_depth > MAX_BPF_STACK) {
+ verbose(env, "stack size of subprog %d is %d. Too large\n",
+ idx, subprog_depth);
+ return -EACCES;
+ }
+ } else {
+ depth += subprog_depth;
+ if (depth > MAX_BPF_STACK) {
+ verbose(env, "combined stack size of %d calls is %d. Too large\n",
+ frame + 1, depth);
+ return -EACCES;
+ }
}
continue_func:
subprog_end = subprog[idx + 1].start;
for (; i < subprog_end; i++) {
- int next_insn;
+ int next_insn, sidx;
+
+ if (bpf_pseudo_kfunc_call(insn + i) && !insn[i].off) {
+ bool err = false;
+
+ if (!is_bpf_throw_kfunc(insn + i))
+ continue;
+ if (subprog[idx].is_cb)
+ err = true;
+ for (int c = 0; c < frame && !err; c++) {
+ if (subprog[ret_prog[c]].is_cb) {
+ err = true;
+ break;
+ }
+ }
+ if (!err)
+ continue;
+ verbose(env,
+ "bpf_throw kfunc (insn %d) cannot be called from callback subprog %d\n",
+ i, idx);
+ return -EINVAL;
+ }
if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
continue;
@@ -3905,21 +6662,26 @@ continue_func:
/* find the callee */
next_insn = i + insn[i].imm + 1;
- idx = find_subprog(env, next_insn);
- if (idx < 0) {
- WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
- next_insn);
+ sidx = find_subprog(env, next_insn);
+ if (verifier_bug_if(sidx < 0, env, "callee not found at insn %d", next_insn))
return -EFAULT;
- }
- if (subprog[idx].is_async_cb) {
- if (subprog[idx].has_tail_call) {
- verbose(env, "verifier bug. subprog has tail_call and async cb\n");
+ if (subprog[sidx].is_async_cb) {
+ if (subprog[sidx].has_tail_call) {
+ verifier_bug(env, "subprog has tail_call and async cb");
return -EFAULT;
}
- /* async callbacks don't increase bpf prog stack size */
- continue;
+ /* async callbacks don't increase bpf prog stack size unless called directly */
+ if (!bpf_pseudo_call(insn + i))
+ continue;
+ if (subprog[sidx].is_exception_cb) {
+ verbose(env, "insn %d cannot call exception cb directly", i);
+ return -EINVAL;
+ }
}
i = next_insn;
+ idx = sidx;
+ if (!priv_stack_supported)
+ subprog[idx].priv_stack_mode = NO_PRIV_STACK;
if (subprog[idx].has_tail_call)
tail_call_reachable = true;
@@ -3938,8 +6700,13 @@ continue_func:
* tail call counter throughout bpf2bpf calls combined with tailcalls
*/
if (tail_call_reachable)
- for (j = 0; j < frame; j++)
+ for (j = 0; j < frame; j++) {
+ if (subprog[ret_prog[j]].is_exception_cb) {
+ verbose(env, "cannot tail call within exception cb\n");
+ return -EINVAL;
+ }
subprog[ret_prog[j]].tail_call_reachable = true;
+ }
if (subprog[0].tail_call_reachable)
env->prog->aux->tail_call_reachable = true;
@@ -3948,60 +6715,70 @@ continue_func:
*/
if (frame == 0)
return 0;
- depth -= round_up(max_t(u32, subprog[idx].stack_depth, 1), 32);
+ if (subprog[idx].priv_stack_mode != PRIV_STACK_ADAPTIVE)
+ depth -= round_up_stack_depth(env, subprog[idx].stack_depth);
frame--;
i = ret_insn[frame];
idx = ret_prog[frame];
goto continue_func;
}
-#ifndef CONFIG_BPF_JIT_ALWAYS_ON
-static int get_callee_stack_depth(struct bpf_verifier_env *env,
- const struct bpf_insn *insn, int idx)
+static int check_max_stack_depth(struct bpf_verifier_env *env)
{
- int start = idx + insn->imm + 1, subprog;
+ enum priv_stack_mode priv_stack_mode = PRIV_STACK_UNKNOWN;
+ struct bpf_subprog_info *si = env->subprog_info;
+ bool priv_stack_supported;
+ int ret;
- subprog = find_subprog(env, start);
- if (subprog < 0) {
- WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
- start);
- return -EFAULT;
+ for (int i = 0; i < env->subprog_cnt; i++) {
+ if (si[i].has_tail_call) {
+ priv_stack_mode = NO_PRIV_STACK;
+ break;
+ }
}
- return env->subprog_info[subprog].stack_depth;
-}
-#endif
-static int __check_ptr_off_reg(struct bpf_verifier_env *env,
- const struct bpf_reg_state *reg, int regno,
- bool fixed_off_ok)
-{
- /* Access to this pointer-typed register or passing it to a helper
- * is only allowed in its original, unmodified form.
- */
+ if (priv_stack_mode == PRIV_STACK_UNKNOWN)
+ priv_stack_mode = bpf_enable_priv_stack(env->prog);
- if (!fixed_off_ok && reg->off) {
- verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n",
- reg_type_str(env, reg->type), regno, reg->off);
- return -EACCES;
+ /* All async_cb subprogs use normal kernel stack. If a particular
+ * subprog appears in both main prog and async_cb subtree, that
+ * subprog will use normal kernel stack to avoid potential nesting.
+ * The reverse subprog traversal ensures when main prog subtree is
+ * checked, the subprogs appearing in async_cb subtrees are already
+ * marked as using normal kernel stack, so stack size checking can
+ * be done properly.
+ */
+ for (int i = env->subprog_cnt - 1; i >= 0; i--) {
+ if (!i || si[i].is_async_cb) {
+ priv_stack_supported = !i && priv_stack_mode == PRIV_STACK_ADAPTIVE;
+ ret = check_max_stack_depth_subprog(env, i, priv_stack_supported);
+ if (ret < 0)
+ return ret;
+ }
}
- if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
- char tn_buf[48];
-
- tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "variable %s access var_off=%s disallowed\n",
- reg_type_str(env, reg->type), tn_buf);
- return -EACCES;
+ for (int i = 0; i < env->subprog_cnt; i++) {
+ if (si[i].priv_stack_mode == PRIV_STACK_ADAPTIVE) {
+ env->prog->aux->jits_use_priv_stack = true;
+ break;
+ }
}
return 0;
}
-int check_ptr_off_reg(struct bpf_verifier_env *env,
- const struct bpf_reg_state *reg, int regno)
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
+static int get_callee_stack_depth(struct bpf_verifier_env *env,
+ const struct bpf_insn *insn, int idx)
{
- return __check_ptr_off_reg(env, reg, regno, false);
+ int start = idx + insn->imm + 1, subprog;
+
+ subprog = find_subprog(env, start);
+ if (verifier_bug_if(subprog < 0, env, "get stack depth: no program at insn %d", start))
+ return -EFAULT;
+ return env->subprog_info[subprog].stack_depth;
}
+#endif
static int __check_buffer_access(struct bpf_verifier_env *env,
const char *buf_info,
@@ -4047,9 +6824,9 @@ static int check_buffer_access(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg,
int regno, int off, int size,
bool zero_size_allowed,
- const char *buf_info,
u32 *max_access)
{
+ const char *buf_info = type_is_rdonly_mem(reg->type) ? "rdonly" : "rdwr";
int err;
err = __check_buffer_access(env, buf_info, reg, regno, off, size);
@@ -4095,9 +6872,153 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
* values are also truncated so we push 64-bit bounds into
* 32-bit bounds. Above were truncated < 32-bits already.
*/
- if (size >= 4)
+ if (size < 4)
+ __mark_reg32_unbounded(reg);
+
+ reg_bounds_sync(reg);
+}
+
+static void set_sext64_default_val(struct bpf_reg_state *reg, int size)
+{
+ if (size == 1) {
+ reg->smin_value = reg->s32_min_value = S8_MIN;
+ reg->smax_value = reg->s32_max_value = S8_MAX;
+ } else if (size == 2) {
+ reg->smin_value = reg->s32_min_value = S16_MIN;
+ reg->smax_value = reg->s32_max_value = S16_MAX;
+ } else {
+ /* size == 4 */
+ reg->smin_value = reg->s32_min_value = S32_MIN;
+ reg->smax_value = reg->s32_max_value = S32_MAX;
+ }
+ reg->umin_value = reg->u32_min_value = 0;
+ reg->umax_value = U64_MAX;
+ reg->u32_max_value = U32_MAX;
+ reg->var_off = tnum_unknown;
+}
+
+static void coerce_reg_to_size_sx(struct bpf_reg_state *reg, int size)
+{
+ s64 init_s64_max, init_s64_min, s64_max, s64_min, u64_cval;
+ u64 top_smax_value, top_smin_value;
+ u64 num_bits = size * 8;
+
+ if (tnum_is_const(reg->var_off)) {
+ u64_cval = reg->var_off.value;
+ if (size == 1)
+ reg->var_off = tnum_const((s8)u64_cval);
+ else if (size == 2)
+ reg->var_off = tnum_const((s16)u64_cval);
+ else
+ /* size == 4 */
+ reg->var_off = tnum_const((s32)u64_cval);
+
+ u64_cval = reg->var_off.value;
+ reg->smax_value = reg->smin_value = u64_cval;
+ reg->umax_value = reg->umin_value = u64_cval;
+ reg->s32_max_value = reg->s32_min_value = u64_cval;
+ reg->u32_max_value = reg->u32_min_value = u64_cval;
+ return;
+ }
+
+ top_smax_value = ((u64)reg->smax_value >> num_bits) << num_bits;
+ top_smin_value = ((u64)reg->smin_value >> num_bits) << num_bits;
+
+ if (top_smax_value != top_smin_value)
+ goto out;
+
+ /* find the s64_min and s64_min after sign extension */
+ if (size == 1) {
+ init_s64_max = (s8)reg->smax_value;
+ init_s64_min = (s8)reg->smin_value;
+ } else if (size == 2) {
+ init_s64_max = (s16)reg->smax_value;
+ init_s64_min = (s16)reg->smin_value;
+ } else {
+ init_s64_max = (s32)reg->smax_value;
+ init_s64_min = (s32)reg->smin_value;
+ }
+
+ s64_max = max(init_s64_max, init_s64_min);
+ s64_min = min(init_s64_max, init_s64_min);
+
+ /* both of s64_max/s64_min positive or negative */
+ if ((s64_max >= 0) == (s64_min >= 0)) {
+ reg->s32_min_value = reg->smin_value = s64_min;
+ reg->s32_max_value = reg->smax_value = s64_max;
+ reg->u32_min_value = reg->umin_value = s64_min;
+ reg->u32_max_value = reg->umax_value = s64_max;
+ reg->var_off = tnum_range(s64_min, s64_max);
+ return;
+ }
+
+out:
+ set_sext64_default_val(reg, size);
+}
+
+static void set_sext32_default_val(struct bpf_reg_state *reg, int size)
+{
+ if (size == 1) {
+ reg->s32_min_value = S8_MIN;
+ reg->s32_max_value = S8_MAX;
+ } else {
+ /* size == 2 */
+ reg->s32_min_value = S16_MIN;
+ reg->s32_max_value = S16_MAX;
+ }
+ reg->u32_min_value = 0;
+ reg->u32_max_value = U32_MAX;
+ reg->var_off = tnum_subreg(tnum_unknown);
+}
+
+static void coerce_subreg_to_size_sx(struct bpf_reg_state *reg, int size)
+{
+ s32 init_s32_max, init_s32_min, s32_max, s32_min, u32_val;
+ u32 top_smax_value, top_smin_value;
+ u32 num_bits = size * 8;
+
+ if (tnum_is_const(reg->var_off)) {
+ u32_val = reg->var_off.value;
+ if (size == 1)
+ reg->var_off = tnum_const((s8)u32_val);
+ else
+ reg->var_off = tnum_const((s16)u32_val);
+
+ u32_val = reg->var_off.value;
+ reg->s32_min_value = reg->s32_max_value = u32_val;
+ reg->u32_min_value = reg->u32_max_value = u32_val;
+ return;
+ }
+
+ top_smax_value = ((u32)reg->s32_max_value >> num_bits) << num_bits;
+ top_smin_value = ((u32)reg->s32_min_value >> num_bits) << num_bits;
+
+ if (top_smax_value != top_smin_value)
+ goto out;
+
+ /* find the s32_min and s32_min after sign extension */
+ if (size == 1) {
+ init_s32_max = (s8)reg->s32_max_value;
+ init_s32_min = (s8)reg->s32_min_value;
+ } else {
+ /* size == 2 */
+ init_s32_max = (s16)reg->s32_max_value;
+ init_s32_min = (s16)reg->s32_min_value;
+ }
+ s32_max = max(init_s32_max, init_s32_min);
+ s32_min = min(init_s32_max, init_s32_min);
+
+ if ((s32_min >= 0) == (s32_max >= 0)) {
+ reg->s32_min_value = s32_min;
+ reg->s32_max_value = s32_max;
+ reg->u32_min_value = (u32)s32_min;
+ reg->u32_max_value = (u32)s32_max;
+ reg->var_off = tnum_subreg(tnum_range(s32_min, s32_max));
return;
- __reg_combine_64_into_32(reg);
+ }
+
+out:
+ set_sext32_default_val(reg, size);
}
static bool bpf_map_is_rdonly(const struct bpf_map *map)
@@ -4120,7 +7041,8 @@ static bool bpf_map_is_rdonly(const struct bpf_map *map)
!bpf_map_write_active(map);
}
-static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)
+static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val,
+ bool is_ldsx)
{
void *ptr;
u64 addr;
@@ -4133,13 +7055,13 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)
switch (size) {
case sizeof(u8):
- *val = (u64)*(u8 *)ptr;
+ *val = is_ldsx ? (s64)*(s8 *)ptr : (u64)*(u8 *)ptr;
break;
case sizeof(u16):
- *val = (u64)*(u16 *)ptr;
+ *val = is_ldsx ? (s64)*(s16 *)ptr : (u64)*(u16 *)ptr;
break;
case sizeof(u32):
- *val = (u64)*(u32 *)ptr;
+ *val = is_ldsx ? (s64)*(s32 *)ptr : (u64)*(u32 *)ptr;
break;
case sizeof(u64):
*val = *(u64 *)ptr;
@@ -4150,6 +7072,135 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)
return 0;
}
+#define BTF_TYPE_SAFE_RCU(__type) __PASTE(__type, __safe_rcu)
+#define BTF_TYPE_SAFE_RCU_OR_NULL(__type) __PASTE(__type, __safe_rcu_or_null)
+#define BTF_TYPE_SAFE_TRUSTED(__type) __PASTE(__type, __safe_trusted)
+#define BTF_TYPE_SAFE_TRUSTED_OR_NULL(__type) __PASTE(__type, __safe_trusted_or_null)
+
+/*
+ * Allow list few fields as RCU trusted or full trusted.
+ * This logic doesn't allow mix tagging and will be removed once GCC supports
+ * btf_type_tag.
+ */
+
+/* RCU trusted: these fields are trusted in RCU CS and never NULL */
+BTF_TYPE_SAFE_RCU(struct task_struct) {
+ const cpumask_t *cpus_ptr;
+ struct css_set __rcu *cgroups;
+ struct task_struct __rcu *real_parent;
+ struct task_struct *group_leader;
+};
+
+BTF_TYPE_SAFE_RCU(struct cgroup) {
+ /* cgrp->kn is always accessible as documented in kernel/cgroup/cgroup.c */
+ struct kernfs_node *kn;
+};
+
+BTF_TYPE_SAFE_RCU(struct css_set) {
+ struct cgroup *dfl_cgrp;
+};
+
+BTF_TYPE_SAFE_RCU(struct cgroup_subsys_state) {
+ struct cgroup *cgroup;
+};
+
+/* RCU trusted: these fields are trusted in RCU CS and can be NULL */
+BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct) {
+ struct file __rcu *exe_file;
+#ifdef CONFIG_MEMCG
+ struct task_struct __rcu *owner;
+#endif
+};
+
+/* skb->sk, req->sk are not RCU protected, but we mark them as such
+ * because bpf prog accessible sockets are SOCK_RCU_FREE.
+ */
+BTF_TYPE_SAFE_RCU_OR_NULL(struct sk_buff) {
+ struct sock *sk;
+};
+
+BTF_TYPE_SAFE_RCU_OR_NULL(struct request_sock) {
+ struct sock *sk;
+};
+
+/* full trusted: these fields are trusted even outside of RCU CS and never NULL */
+BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta) {
+ struct seq_file *seq;
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task) {
+ struct bpf_iter_meta *meta;
+ struct task_struct *task;
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct linux_binprm) {
+ struct file *file;
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct file) {
+ struct inode *f_inode;
+};
+
+BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry) {
+ struct inode *d_inode;
+};
+
+BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket) {
+ struct sock *sk;
+};
+
+BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct) {
+ struct mm_struct *vm_mm;
+ struct file *vm_file;
+};
+
+static bool type_is_rcu(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ const char *field_name, u32 btf_id)
+{
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct task_struct));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct cgroup));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct css_set));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct cgroup_subsys_state));
+
+ return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu");
+}
+
+static bool type_is_rcu_or_null(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ const char *field_name, u32 btf_id)
+{
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct sk_buff));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct request_sock));
+
+ return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu_or_null");
+}
+
+static bool type_is_trusted(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ const char *field_name, u32 btf_id)
+{
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct linux_binprm));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct file));
+
+ return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted");
+}
+
+static bool type_is_trusted_or_null(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ const char *field_name, u32 btf_id)
+{
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct));
+
+ return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id,
+ "__safe_trusted_or_null");
+}
+
static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
struct bpf_reg_state *regs,
int regno, int off, int size,
@@ -4159,9 +7210,23 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
struct bpf_reg_state *reg = regs + regno;
const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id);
const char *tname = btf_name_by_offset(reg->btf, t->name_off);
- u32 btf_id;
+ const char *field_name = NULL;
+ enum bpf_type_flag flag = 0;
+ u32 btf_id = 0;
int ret;
+ if (!env->allow_ptr_leaks) {
+ verbose(env,
+ "'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n",
+ tname);
+ return -EPERM;
+ }
+ if (!env->prog->gpl_compatible && btf_is_kernel(reg->btf)) {
+ verbose(env,
+ "Cannot access kernel 'struct %s' from non-GPL compatible program\n",
+ tname);
+ return -EINVAL;
+ }
if (off < 0) {
verbose(env,
"R%d is ptr_%s invalid negative access: off=%d\n",
@@ -4178,24 +7243,113 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
return -EACCES;
}
- if (env->ops->btf_struct_access) {
- ret = env->ops->btf_struct_access(&env->log, reg->btf, t,
- off, size, atype, &btf_id);
+ if (reg->type & MEM_USER) {
+ verbose(env,
+ "R%d is ptr_%s access user memory: off=%d\n",
+ regno, tname, off);
+ return -EACCES;
+ }
+
+ if (reg->type & MEM_PERCPU) {
+ verbose(env,
+ "R%d is ptr_%s access percpu memory: off=%d\n",
+ regno, tname, off);
+ return -EACCES;
+ }
+
+ if (env->ops->btf_struct_access && !type_is_alloc(reg->type) && atype == BPF_WRITE) {
+ if (!btf_is_kernel(reg->btf)) {
+ verifier_bug(env, "reg->btf must be kernel btf");
+ return -EFAULT;
+ }
+ ret = env->ops->btf_struct_access(&env->log, reg, off, size);
} else {
- if (atype != BPF_READ) {
+ /* Writes are permitted with default btf_struct_access for
+ * program allocated objects (which always have ref_obj_id > 0),
+ * but not for untrusted PTR_TO_BTF_ID | MEM_ALLOC.
+ */
+ if (atype != BPF_READ && !type_is_ptr_alloc_obj(reg->type)) {
verbose(env, "only read is supported\n");
return -EACCES;
}
- ret = btf_struct_access(&env->log, reg->btf, t, off, size,
- atype, &btf_id);
+ if (type_is_alloc(reg->type) && !type_is_non_owning_ref(reg->type) &&
+ !(reg->type & MEM_RCU) && !reg->ref_obj_id) {
+ verifier_bug(env, "ref_obj_id for allocated object must be non-zero");
+ return -EFAULT;
+ }
+
+ ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag, &field_name);
}
if (ret < 0)
return ret;
- if (atype == BPF_READ && value_regno >= 0)
- mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id);
+ if (ret != PTR_TO_BTF_ID) {
+ /* just mark; */
+
+ } else if (type_flag(reg->type) & PTR_UNTRUSTED) {
+ /* If this is an untrusted pointer, all pointers formed by walking it
+ * also inherit the untrusted flag.
+ */
+ flag = PTR_UNTRUSTED;
+
+ } else if (is_trusted_reg(reg) || is_rcu_reg(reg)) {
+ /* By default any pointer obtained from walking a trusted pointer is no
+ * longer trusted, unless the field being accessed has explicitly been
+ * marked as inheriting its parent's state of trust (either full or RCU).
+ * For example:
+ * 'cgroups' pointer is untrusted if task->cgroups dereference
+ * happened in a sleepable program outside of bpf_rcu_read_lock()
+ * section. In a non-sleepable program it's trusted while in RCU CS (aka MEM_RCU).
+ * Note bpf_rcu_read_unlock() converts MEM_RCU pointers to PTR_UNTRUSTED.
+ *
+ * A regular RCU-protected pointer with __rcu tag can also be deemed
+ * trusted if we are in an RCU CS. Such pointer can be NULL.
+ */
+ if (type_is_trusted(env, reg, field_name, btf_id)) {
+ flag |= PTR_TRUSTED;
+ } else if (type_is_trusted_or_null(env, reg, field_name, btf_id)) {
+ flag |= PTR_TRUSTED | PTR_MAYBE_NULL;
+ } else if (in_rcu_cs(env) && !type_may_be_null(reg->type)) {
+ if (type_is_rcu(env, reg, field_name, btf_id)) {
+ /* ignore __rcu tag and mark it MEM_RCU */
+ flag |= MEM_RCU;
+ } else if (flag & MEM_RCU ||
+ type_is_rcu_or_null(env, reg, field_name, btf_id)) {
+ /* __rcu tagged pointers can be NULL */
+ flag |= MEM_RCU | PTR_MAYBE_NULL;
+
+ /* We always trust them */
+ if (type_is_rcu_or_null(env, reg, field_name, btf_id) &&
+ flag & PTR_UNTRUSTED)
+ flag &= ~PTR_UNTRUSTED;
+ } else if (flag & (MEM_PERCPU | MEM_USER)) {
+ /* keep as-is */
+ } else {
+ /* walking unknown pointers yields old deprecated PTR_TO_BTF_ID */
+ clear_trusted_flags(&flag);
+ }
+ } else {
+ /*
+ * If not in RCU CS or MEM_RCU pointer can be NULL then
+ * aggressively mark as untrusted otherwise such
+ * pointers will be plain PTR_TO_BTF_ID without flags
+ * and will be allowed to be passed into helpers for
+ * compat reasons.
+ */
+ flag = PTR_UNTRUSTED;
+ }
+ } else {
+ /* Old compat. Deprecated */
+ clear_trusted_flags(&flag);
+ }
+
+ if (atype == BPF_READ && value_regno >= 0) {
+ ret = mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag);
+ if (ret < 0)
+ return ret;
+ }
return 0;
}
@@ -4208,6 +7362,8 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
{
struct bpf_reg_state *reg = regs + regno;
struct bpf_map *map = reg->map_ptr;
+ struct bpf_reg_state map_reg;
+ enum bpf_type_flag flag = 0;
const struct btf_type *t;
const char *tname;
u32 btf_id;
@@ -4227,9 +7383,9 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
t = btf_type_by_id(btf_vmlinux, *map->ops->map_btf_id);
tname = btf_name_by_offset(btf_vmlinux, t->name_off);
- if (!env->allow_ptr_to_map_access) {
+ if (!env->allow_ptr_leaks) {
verbose(env,
- "%s access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n",
+ "'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n",
tname);
return -EPERM;
}
@@ -4245,12 +7401,21 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
return -EACCES;
}
- ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id);
+ /* Simulate access to a PTR_TO_BTF_ID */
+ memset(&map_reg, 0, sizeof(map_reg));
+ ret = mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID,
+ btf_vmlinux, *map->ops->map_btf_id, 0);
+ if (ret < 0)
+ return ret;
+ ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag, NULL);
if (ret < 0)
return ret;
- if (value_regno >= 0)
- mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id);
+ if (value_regno >= 0) {
+ ret = mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id, flag);
+ if (ret < 0)
+ return ret;
+ }
return 0;
}
@@ -4261,13 +7426,14 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
* The minimum valid offset is -MAX_BPF_STACK for writes, and
* -state->allocated_stack for reads.
*/
-static int check_stack_slot_within_bounds(int off,
- struct bpf_func_state *state,
- enum bpf_access_type t)
+static int check_stack_slot_within_bounds(struct bpf_verifier_env *env,
+ s64 off,
+ struct bpf_func_state *state,
+ enum bpf_access_type t)
{
int min_valid_off;
- if (t == BPF_WRITE)
+ if (t == BPF_WRITE || env->allow_uninit_stack)
min_valid_off = -MAX_BPF_STACK;
else
min_valid_off = -state->allocated_stack;
@@ -4285,29 +7451,23 @@ static int check_stack_slot_within_bounds(int off,
static int check_stack_access_within_bounds(
struct bpf_verifier_env *env,
int regno, int off, int access_size,
- enum stack_access_src src, enum bpf_access_type type)
+ enum bpf_access_type type)
{
struct bpf_reg_state *regs = cur_regs(env);
struct bpf_reg_state *reg = regs + regno;
struct bpf_func_state *state = func(env, reg);
- int min_off, max_off;
+ s64 min_off, max_off;
int err;
char *err_extra;
- if (src == ACCESS_HELPER)
- /* We don't know if helpers are reading or writing (or both). */
- err_extra = " indirect access to";
- else if (type == BPF_READ)
+ if (type == BPF_READ)
err_extra = " read from";
else
err_extra = " write to";
if (tnum_is_const(reg->var_off)) {
- min_off = reg->var_off.value + off;
- if (access_size > 0)
- max_off = min_off + access_size - 1;
- else
- max_off = min_off;
+ min_off = (s64)reg->var_off.value + off;
+ max_off = min_off + access_size;
} else {
if (reg->smax_value >= BPF_MAX_VAR_OFF ||
reg->smin_value <= -BPF_MAX_VAR_OFF) {
@@ -4316,15 +7476,17 @@ static int check_stack_access_within_bounds(
return -EACCES;
}
min_off = reg->smin_value + off;
- if (access_size > 0)
- max_off = reg->smax_value + off + access_size - 1;
- else
- max_off = min_off;
+ max_off = reg->smax_value + off + access_size;
}
- err = check_stack_slot_within_bounds(min_off, state, type);
- if (!err)
- err = check_stack_slot_within_bounds(max_off, state, type);
+ err = check_stack_slot_within_bounds(env, min_off, state, type);
+ if (!err && max_off > 0)
+ err = -EINVAL; /* out of stack access into non-negative offsets */
+ if (!err && access_size < 0)
+ /* access_size should not be negative (or overflow an int); others checks
+ * along the way should have prevented such an access.
+ */
+ err = -EFAULT; /* invalid negative access size; integer overflow? */
if (err) {
if (tnum_is_const(reg->var_off)) {
@@ -4334,11 +7496,27 @@ static int check_stack_access_within_bounds(
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "invalid variable-offset%s stack R%d var_off=%s size=%d\n",
- err_extra, regno, tn_buf, access_size);
+ verbose(env, "invalid variable-offset%s stack R%d var_off=%s off=%d size=%d\n",
+ err_extra, regno, tn_buf, off, access_size);
}
+ return err;
}
- return err;
+
+ /* Note that there is no stack access with offset zero, so the needed stack
+ * size is -min_off, not -min_off+1.
+ */
+ return grow_stack_state(env, state, -min_off /* size */);
+}
+
+static bool get_func_retval_range(struct bpf_prog *prog,
+ struct bpf_retval_range *range)
+{
+ if (prog->type == BPF_PROG_TYPE_LSM &&
+ prog->expected_attach_type == BPF_LSM_MAC &&
+ !bpf_lsm_get_retval_range(prog, range)) {
+ return true;
+ }
+ return false;
}
/* check whether memory at (regno + off) is accessible for t = (read | write)
@@ -4349,11 +7527,10 @@ static int check_stack_access_within_bounds(
*/
static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno,
int off, int bpf_size, enum bpf_access_type t,
- int value_regno, bool strict_alignment_once)
+ int value_regno, bool strict_alignment_once, bool is_ldsx)
{
struct bpf_reg_state *regs = cur_regs(env);
struct bpf_reg_state *reg = regs + regno;
- struct bpf_func_state *state;
int size, err = 0;
size = bpf_size_to_bytes(bpf_size);
@@ -4381,6 +7558,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (value_regno >= 0)
mark_reg_unknown(env, regs, value_regno);
} else if (reg->type == PTR_TO_MAP_VALUE) {
+ struct btf_field *kptr_field = NULL;
+
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
verbose(env, "R%d leaks addr into map\n", value_regno);
@@ -4389,30 +7568,50 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
err = check_map_access_type(env, regno, off, size, t);
if (err)
return err;
- err = check_map_access(env, regno, off, size, false);
- if (!err && t == BPF_READ && value_regno >= 0) {
+ err = check_map_access(env, regno, off, size, false, ACCESS_DIRECT);
+ if (err)
+ return err;
+ if (tnum_is_const(reg->var_off))
+ kptr_field = btf_record_find(reg->map_ptr->record,
+ off + reg->var_off.value, BPF_KPTR | BPF_UPTR);
+ if (kptr_field) {
+ err = check_map_kptr_access(env, regno, value_regno, insn_idx, kptr_field);
+ } else if (t == BPF_READ && value_regno >= 0) {
struct bpf_map *map = reg->map_ptr;
- /* if map is read-only, track its contents as scalars */
+ /*
+ * If map is read-only, track its contents as scalars,
+ * unless it is an insn array (see the special case below)
+ */
if (tnum_is_const(reg->var_off) &&
bpf_map_is_rdonly(map) &&
- map->ops->map_direct_value_addr) {
+ map->ops->map_direct_value_addr &&
+ map->map_type != BPF_MAP_TYPE_INSN_ARRAY) {
int map_off = off + reg->var_off.value;
u64 val = 0;
err = bpf_map_direct_read(map, map_off, size,
- &val);
+ &val, is_ldsx);
if (err)
return err;
regs[value_regno].type = SCALAR_VALUE;
__mark_reg_known(&regs[value_regno], val);
+ } else if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) {
+ if (bpf_size != BPF_DW) {
+ verbose(env, "Invalid read of %d bytes from insn_array\n",
+ size);
+ return -EACCES;
+ }
+ copy_register_state(&regs[value_regno], reg);
+ regs[value_regno].type = PTR_TO_INSN;
} else {
mark_reg_unknown(env, regs, value_regno);
}
}
} else if (base_type(reg->type) == PTR_TO_MEM) {
bool rdonly_mem = type_is_rdonly_mem(reg->type);
+ bool rdonly_untrusted = rdonly_mem && (reg->type & PTR_UNTRUSTED);
if (type_may_be_null(reg->type)) {
verbose(env, "R%d invalid mem access '%s'\n", regno,
@@ -4432,14 +7631,22 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
return -EACCES;
}
- err = check_mem_region_access(env, regno, off, size,
- reg->mem_size, false);
+ /*
+ * Accesses to untrusted PTR_TO_MEM are done through probe
+ * instructions, hence no need to check bounds in that case.
+ */
+ if (!rdonly_untrusted)
+ err = check_mem_region_access(env, regno, off, size,
+ reg->mem_size, false);
if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem))
mark_reg_unknown(env, regs, value_regno);
} else if (reg->type == PTR_TO_CTX) {
- enum bpf_reg_type reg_type = SCALAR_VALUE;
- struct btf *btf = NULL;
- u32 btf_id = 0;
+ struct bpf_retval_range range;
+ struct bpf_insn_access_aux info = {
+ .reg_type = SCALAR_VALUE,
+ .is_ldsx = is_ldsx,
+ .log = &env->log,
+ };
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
@@ -4451,7 +7658,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (err < 0)
return err;
- err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf, &btf_id);
+ err = check_ctx_access(env, insn_idx, off, size, t, &info);
if (err)
verbose_linfo(env, insn_idx, "; ");
if (!err && t == BPF_READ && value_regno >= 0) {
@@ -4459,12 +7666,19 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
* PTR_TO_PACKET[_META,_END]. In the latter
* case, we know the offset is zero.
*/
- if (reg_type == SCALAR_VALUE) {
- mark_reg_unknown(env, regs, value_regno);
+ if (info.reg_type == SCALAR_VALUE) {
+ if (info.is_retval && get_func_retval_range(env->prog, &range)) {
+ err = __mark_reg_s32_range(env, regs, value_regno,
+ range.minval, range.maxval);
+ if (err)
+ return err;
+ } else {
+ mark_reg_unknown(env, regs, value_regno);
+ }
} else {
mark_reg_known_zero(env, regs,
value_regno);
- if (type_may_be_null(reg_type))
+ if (type_may_be_null(info.reg_type))
regs[value_regno].id = ++env->id_gen;
/* A load of ctx field could have different
* actual load size with the one encoded in the
@@ -4472,22 +7686,18 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
* a sub-register.
*/
regs[value_regno].subreg_def = DEF_NOT_SUBREG;
- if (base_type(reg_type) == PTR_TO_BTF_ID) {
- regs[value_regno].btf = btf;
- regs[value_regno].btf_id = btf_id;
+ if (base_type(info.reg_type) == PTR_TO_BTF_ID) {
+ regs[value_regno].btf = info.btf;
+ regs[value_regno].btf_id = info.btf_id;
+ regs[value_regno].ref_obj_id = info.ref_obj_id;
}
}
- regs[value_regno].type = reg_type;
+ regs[value_regno].type = info.reg_type;
}
} else if (reg->type == PTR_TO_STACK) {
/* Basic bounds checks. */
- err = check_stack_access_within_bounds(env, regno, off, size, ACCESS_DIRECT, t);
- if (err)
- return err;
-
- state = func(env, reg);
- err = update_stack_depth(env, state, off);
+ err = check_stack_access_within_bounds(env, regno, off, size, t);
if (err)
return err;
@@ -4535,7 +7745,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
err = check_tp_buffer_access(env, reg, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown(env, regs, value_regno);
- } else if (reg->type == PTR_TO_BTF_ID) {
+ } else if (base_type(reg->type) == PTR_TO_BTF_ID &&
+ !type_may_be_null(reg->type)) {
err = check_ptr_to_btf_access(env, regs, regno, off, size, t,
value_regno);
} else if (reg->type == CONST_PTR_TO_MAP) {
@@ -4543,7 +7754,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
value_regno);
} else if (base_type(reg->type) == PTR_TO_BUF) {
bool rdonly_mem = type_is_rdonly_mem(reg->type);
- const char *buf_info;
u32 *max_access;
if (rdonly_mem) {
@@ -4552,18 +7762,19 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
regno, reg_type_str(env, reg->type));
return -EACCES;
}
- buf_info = "rdonly";
max_access = &env->prog->aux->max_rdonly_access;
} else {
- buf_info = "rdwr";
max_access = &env->prog->aux->max_rdwr_access;
}
err = check_buffer_access(env, reg, regno, off, size, false,
- buf_info, max_access);
+ max_access);
if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ))
mark_reg_unknown(env, regs, value_regno);
+ } else if (reg->type == PTR_TO_ARENA) {
+ if (t == BPF_READ && value_regno >= 0)
+ mark_reg_unknown(env, regs, value_regno);
} else {
verbose(env, "R%d invalid mem access '%s'\n", regno,
reg_type_str(env, reg->type));
@@ -4572,33 +7783,84 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ &&
regs[value_regno].type == SCALAR_VALUE) {
- /* b/h/w load zero-extends, mark upper bits as known 0 */
- coerce_reg_to_size(&regs[value_regno], size);
+ if (!is_ldsx)
+ /* b/h/w load zero-extends, mark upper bits as known 0 */
+ coerce_reg_to_size(&regs[value_regno], size);
+ else
+ coerce_reg_to_size_sx(&regs[value_regno], size);
}
return err;
}
-static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn)
+static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type,
+ bool allow_trust_mismatch);
+
+static int check_load_mem(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ bool strict_alignment_once, bool is_ldsx,
+ bool allow_trust_mismatch, const char *ctx)
{
- int load_reg;
+ struct bpf_reg_state *regs = cur_regs(env);
+ enum bpf_reg_type src_reg_type;
int err;
- switch (insn->imm) {
- case BPF_ADD:
- case BPF_ADD | BPF_FETCH:
- case BPF_AND:
- case BPF_AND | BPF_FETCH:
- case BPF_OR:
- case BPF_OR | BPF_FETCH:
- case BPF_XOR:
- case BPF_XOR | BPF_FETCH:
- case BPF_XCHG:
- case BPF_CMPXCHG:
- break;
- default:
- verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm);
- return -EINVAL;
- }
+ /* check src operand */
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* check dst operand */
+ err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
+ if (err)
+ return err;
+
+ src_reg_type = regs[insn->src_reg].type;
+
+ /* Check if (src_reg + off) is readable. The state of dst_reg will be
+ * updated by this call.
+ */
+ err = check_mem_access(env, env->insn_idx, insn->src_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_READ, insn->dst_reg,
+ strict_alignment_once, is_ldsx);
+ err = err ?: save_aux_ptr_type(env, src_reg_type,
+ allow_trust_mismatch);
+ err = err ?: reg_bounds_sanity_check(env, &regs[insn->dst_reg], ctx);
+
+ return err;
+}
+
+static int check_store_reg(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ bool strict_alignment_once)
+{
+ struct bpf_reg_state *regs = cur_regs(env);
+ enum bpf_reg_type dst_reg_type;
+ int err;
+
+ /* check src1 operand */
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* check src2 operand */
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ dst_reg_type = regs[insn->dst_reg].type;
+
+ /* Check if (dst_reg + off) is writeable. */
+ err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_WRITE, insn->src_reg,
+ strict_alignment_once, false);
+ err = err ?: save_aux_ptr_type(env, dst_reg_type, false);
+
+ return err;
+}
+
+static int check_atomic_rmw(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
+{
+ int load_reg;
+ int err;
if (BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) {
verbose(env, "invalid atomic operand size\n");
@@ -4634,10 +7896,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
return -EACCES;
}
- if (is_ctx_reg(env, insn->dst_reg) ||
- is_pkt_reg(env, insn->dst_reg) ||
- is_flow_key_reg(env, insn->dst_reg) ||
- is_sk_reg(env, insn->dst_reg)) {
+ if (!atomic_ptr_type_ok(env, insn->dst_reg, insn)) {
verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n",
insn->dst_reg,
reg_type_str(env, reg_state(env, insn->dst_reg)->type));
@@ -4664,27 +7923,105 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
/* Check whether we can read the memory, with second call for fetch
* case to simulate the register fill.
*/
- err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
- BPF_SIZE(insn->code), BPF_READ, -1, true);
+ err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_READ, -1, true, false);
if (!err && load_reg >= 0)
- err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
- BPF_SIZE(insn->code), BPF_READ, load_reg,
- true);
+ err = check_mem_access(env, env->insn_idx, insn->dst_reg,
+ insn->off, BPF_SIZE(insn->code),
+ BPF_READ, load_reg, true, false);
if (err)
return err;
+ if (is_arena_reg(env, insn->dst_reg)) {
+ err = save_aux_ptr_type(env, PTR_TO_ARENA, false);
+ if (err)
+ return err;
+ }
/* Check whether we can write into the same memory. */
- err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
- BPF_SIZE(insn->code), BPF_WRITE, -1, true);
+ err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_WRITE, -1, true, false);
+ if (err)
+ return err;
+ return 0;
+}
+
+static int check_atomic_load(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
+{
+ int err;
+
+ err = check_load_mem(env, insn, true, false, false, "atomic_load");
if (err)
return err;
+ if (!atomic_ptr_type_ok(env, insn->src_reg, insn)) {
+ verbose(env, "BPF_ATOMIC loads from R%d %s is not allowed\n",
+ insn->src_reg,
+ reg_type_str(env, reg_state(env, insn->src_reg)->type));
+ return -EACCES;
+ }
+
return 0;
}
+static int check_atomic_store(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
+{
+ int err;
+
+ err = check_store_reg(env, insn, true);
+ if (err)
+ return err;
+
+ if (!atomic_ptr_type_ok(env, insn->dst_reg, insn)) {
+ verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n",
+ insn->dst_reg,
+ reg_type_str(env, reg_state(env, insn->dst_reg)->type));
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+static int check_atomic(struct bpf_verifier_env *env, struct bpf_insn *insn)
+{
+ switch (insn->imm) {
+ case BPF_ADD:
+ case BPF_ADD | BPF_FETCH:
+ case BPF_AND:
+ case BPF_AND | BPF_FETCH:
+ case BPF_OR:
+ case BPF_OR | BPF_FETCH:
+ case BPF_XOR:
+ case BPF_XOR | BPF_FETCH:
+ case BPF_XCHG:
+ case BPF_CMPXCHG:
+ return check_atomic_rmw(env, insn);
+ case BPF_LOAD_ACQ:
+ if (BPF_SIZE(insn->code) == BPF_DW && BITS_PER_LONG != 64) {
+ verbose(env,
+ "64-bit load-acquires are only supported on 64-bit arches\n");
+ return -EOPNOTSUPP;
+ }
+ return check_atomic_load(env, insn);
+ case BPF_STORE_REL:
+ if (BPF_SIZE(insn->code) == BPF_DW && BITS_PER_LONG != 64) {
+ verbose(env,
+ "64-bit store-releases are only supported on 64-bit arches\n");
+ return -EOPNOTSUPP;
+ }
+ return check_atomic_store(env, insn);
+ default:
+ verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n",
+ insn->imm);
+ return -EINVAL;
+ }
+}
+
/* When register 'regno' is used to read the stack (either directly or through
* a helper function) make sure that it's within stack boundary and, depending
- * on the access type, that all elements of the stack are initialized.
+ * on the access type and privileges, that all elements of the stack are
+ * initialized.
*
* 'off' includes 'regno->off', but not its dynamic part (if any).
*
@@ -4694,13 +8031,11 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
static int check_stack_range_initialized(
struct bpf_verifier_env *env, int regno, int off,
int access_size, bool zero_size_allowed,
- enum stack_access_src type, struct bpf_call_arg_meta *meta)
+ enum bpf_access_type type, struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *reg = reg_state(env, regno);
struct bpf_func_state *state = func(env, reg);
int err, min_off, max_off, i, j, slot, spi;
- char *err_extra = type == ACCESS_HELPER ? " indirect" : "";
- enum bpf_access_type bounds_check_type;
/* Some accesses can write anything into the stack, others are
* read-only.
*/
@@ -4711,18 +8046,10 @@ static int check_stack_range_initialized(
return -EACCES;
}
- if (type == ACCESS_HELPER) {
- /* The bounds checks for writes are more permissive than for
- * reads. However, if raw_mode is not set, we'll do extra
- * checks below.
- */
- bounds_check_type = BPF_WRITE;
+ if (type == BPF_WRITE)
clobber = true;
- } else {
- bounds_check_type = BPF_READ;
- }
- err = check_stack_access_within_bounds(env, regno, off, access_size,
- type, bounds_check_type);
+
+ err = check_stack_access_within_bounds(env, regno, off, access_size, type);
if (err)
return err;
@@ -4739,8 +8066,8 @@ static int check_stack_range_initialized(
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "R%d%s variable offset stack access prohibited for !root, var_off=%s\n",
- regno, err_extra, tn_buf);
+ verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n",
+ regno, tn_buf);
return -EACCES;
}
/* Only initialized buffer on stack is allowed to be accessed
@@ -4757,6 +8084,31 @@ static int check_stack_range_initialized(
}
if (meta && meta->raw_mode) {
+ /* Ensure we won't be overwriting dynptrs when simulating byte
+ * by byte access in check_helper_call using meta.access_size.
+ * This would be a problem if we have a helper in the future
+ * which takes:
+ *
+ * helper(uninit_mem, len, dynptr)
+ *
+ * Now, uninint_mem may overlap with dynptr pointer. Hence, it
+ * may end up writing to dynptr itself when touching memory from
+ * arg 1. This can be relaxed on a case by case basis for known
+ * safe cases, but reject due to the possibilitiy of aliasing by
+ * default.
+ */
+ for (i = min_off; i < max_off + access_size; i++) {
+ int stack_off = -i - 1;
+
+ spi = __get_spi(i);
+ /* raw_mode may write past allocated_stack */
+ if (state->allocated_stack <= stack_off)
+ continue;
+ if (state->stack[spi].slot_type[stack_off % BPF_REG_SIZE] == STACK_DYNPTR) {
+ verbose(env, "potential write to dynptr at off=%d disallowed\n", i);
+ return -EACCES;
+ }
+ }
meta->access_size = access_size;
meta->regno = regno;
return 0;
@@ -4767,12 +8119,16 @@ static int check_stack_range_initialized(
slot = -i - 1;
spi = slot / BPF_REG_SIZE;
- if (state->allocated_stack <= slot)
- goto err;
+ if (state->allocated_stack <= slot) {
+ verbose(env, "allocated_stack too small\n");
+ return -EFAULT;
+ }
+
stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
if (*stype == STACK_MISC)
goto mark;
- if (*stype == STACK_ZERO) {
+ if ((*stype == STACK_ZERO) ||
+ (*stype == STACK_INVALID && env->allow_uninit_stack)) {
if (clobber) {
/* helper can write anything into the stack */
*stype = STACK_MISC;
@@ -4781,10 +8137,6 @@ static int check_stack_range_initialized(
}
if (is_spilled_reg(&state->stack[spi]) &&
- state->stack[spi].spilled_ptr.type == PTR_TO_BTF_ID)
- goto mark;
-
- if (is_spilled_reg(&state->stack[spi]) &&
(state->stack[spi].spilled_ptr.type == SCALAR_VALUE ||
env->allow_ptr_leaks)) {
if (clobber) {
@@ -4795,35 +8147,39 @@ static int check_stack_range_initialized(
goto mark;
}
-err:
if (tnum_is_const(reg->var_off)) {
- verbose(env, "invalid%s read from stack R%d off %d+%d size %d\n",
- err_extra, regno, min_off, i - min_off, access_size);
+ verbose(env, "invalid read from stack R%d off %d+%d size %d\n",
+ regno, min_off, i - min_off, access_size);
} else {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "invalid%s read from stack R%d var_off %s+%d size %d\n",
- err_extra, regno, tn_buf, i - min_off, access_size);
+ verbose(env, "invalid read from stack R%d var_off %s+%d size %d\n",
+ regno, tn_buf, i - min_off, access_size);
}
return -EACCES;
mark:
/* reading any byte out of 8-byte 'spill_slot' will cause
* the whole slot to be marked as 'read'
*/
- mark_reg_read(env, &state->stack[spi].spilled_ptr,
- state->stack[spi].spilled_ptr.parent,
- REG_LIVE_READ64);
+ err = bpf_mark_stack_read(env, reg->frameno, env->insn_idx, BIT(spi));
+ if (err)
+ return err;
+ /* We do not call bpf_mark_stack_write(), as we can not
+ * be sure that whether stack slot is written to or not. Hence,
+ * we must still conservatively propagate reads upwards even if
+ * helper may write to the entire memory range.
+ */
}
- return update_stack_depth(env, state, min_off);
+ return 0;
}
static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
- int access_size, bool zero_size_allowed,
+ int access_size, enum bpf_access_type access_type,
+ bool zero_size_allowed,
struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
- const char *buf_info;
u32 *max_access;
switch (base_type(reg->type)) {
@@ -4832,38 +8188,70 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
return check_packet_access(env, regno, reg->off, access_size,
zero_size_allowed);
case PTR_TO_MAP_KEY:
+ if (access_type == BPF_WRITE) {
+ verbose(env, "R%d cannot write into %s\n", regno,
+ reg_type_str(env, reg->type));
+ return -EACCES;
+ }
return check_mem_region_access(env, regno, reg->off, access_size,
reg->map_ptr->key_size, false);
case PTR_TO_MAP_VALUE:
- if (check_map_access_type(env, regno, reg->off, access_size,
- meta && meta->raw_mode ? BPF_WRITE :
- BPF_READ))
+ if (check_map_access_type(env, regno, reg->off, access_size, access_type))
return -EACCES;
return check_map_access(env, regno, reg->off, access_size,
- zero_size_allowed);
+ zero_size_allowed, ACCESS_HELPER);
case PTR_TO_MEM:
+ if (type_is_rdonly_mem(reg->type)) {
+ if (access_type == BPF_WRITE) {
+ verbose(env, "R%d cannot write into %s\n", regno,
+ reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+ }
return check_mem_region_access(env, regno, reg->off,
access_size, reg->mem_size,
zero_size_allowed);
case PTR_TO_BUF:
if (type_is_rdonly_mem(reg->type)) {
- if (meta && meta->raw_mode)
+ if (access_type == BPF_WRITE) {
+ verbose(env, "R%d cannot write into %s\n", regno,
+ reg_type_str(env, reg->type));
return -EACCES;
+ }
- buf_info = "rdonly";
max_access = &env->prog->aux->max_rdonly_access;
} else {
- buf_info = "rdwr";
max_access = &env->prog->aux->max_rdwr_access;
}
return check_buffer_access(env, reg, regno, reg->off,
access_size, zero_size_allowed,
- buf_info, max_access);
+ max_access);
case PTR_TO_STACK:
return check_stack_range_initialized(
env,
regno, reg->off, access_size,
- zero_size_allowed, ACCESS_HELPER, meta);
+ zero_size_allowed, access_type, meta);
+ case PTR_TO_BTF_ID:
+ return check_ptr_to_btf_access(env, regs, regno, reg->off,
+ access_size, BPF_READ, -1);
+ case PTR_TO_CTX:
+ /* in case the function doesn't know how to access the context,
+ * (because we are in a program of type SYSCALL for example), we
+ * can not statically check its size.
+ * Dynamically check it now.
+ */
+ if (!env->ops->convert_ctx_access) {
+ int offset = access_size - 1;
+
+ /* Allow zero-byte read from PTR_TO_CTX */
+ if (access_size == 0)
+ return zero_size_allowed ? 0 : -EACCES;
+
+ return check_mem_access(env, env->insn_idx, regno, offset, BPF_B,
+ access_type, -1, false, false);
+ }
+
+ fallthrough;
default: /* scalar_value or invalid ptr */
/* Allow zero-byte read from NULL, regardless of pointer type */
if (zero_size_allowed && access_size == 0 &&
@@ -4877,162 +8265,872 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
}
}
-int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
- u32 regno, u32 mem_size)
+/* verify arguments to helpers or kfuncs consisting of a pointer and an access
+ * size.
+ *
+ * @regno is the register containing the access size. regno-1 is the register
+ * containing the pointer.
+ */
+static int check_mem_size_reg(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ enum bpf_access_type access_type,
+ bool zero_size_allowed,
+ struct bpf_call_arg_meta *meta)
+{
+ int err;
+
+ /* This is used to refine r0 return value bounds for helpers
+ * that enforce this value as an upper bound on return values.
+ * See do_refine_retval_range() for helpers that can refine
+ * the return value. C type of helper is u32 so we pull register
+ * bound from umax_value however, if negative verifier errors
+ * out. Only upper bounds can be learned because retval is an
+ * int type and negative retvals are allowed.
+ */
+ meta->msize_max_value = reg->umax_value;
+
+ /* The register is SCALAR_VALUE; the access check happens using
+ * its boundaries. For unprivileged variable accesses, disable
+ * raw mode so that the program is required to initialize all
+ * the memory that the helper could just partially fill up.
+ */
+ if (!tnum_is_const(reg->var_off))
+ meta = NULL;
+
+ if (reg->smin_value < 0) {
+ verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
+ regno);
+ return -EACCES;
+ }
+
+ if (reg->umin_value == 0 && !zero_size_allowed) {
+ verbose(env, "R%d invalid zero-sized read: u64=[%lld,%lld]\n",
+ regno, reg->umin_value, reg->umax_value);
+ return -EACCES;
+ }
+
+ if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
+ verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
+ regno);
+ return -EACCES;
+ }
+ err = check_helper_mem_access(env, regno - 1, reg->umax_value,
+ access_type, zero_size_allowed, meta);
+ if (!err)
+ err = mark_chain_precision(env, regno);
+ return err;
+}
+
+static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ u32 regno, u32 mem_size)
{
+ bool may_be_null = type_may_be_null(reg->type);
+ struct bpf_reg_state saved_reg;
+ int err;
+
if (register_is_null(reg))
return 0;
- if (type_may_be_null(reg->type)) {
- /* Assuming that the register contains a value check if the memory
- * access is safe. Temporarily save and restore the register's state as
- * the conversion shouldn't be visible to a caller.
- */
- const struct bpf_reg_state saved_reg = *reg;
- int rv;
-
+ /* Assuming that the register contains a value check if the memory
+ * access is safe. Temporarily save and restore the register's state as
+ * the conversion shouldn't be visible to a caller.
+ */
+ if (may_be_null) {
+ saved_reg = *reg;
mark_ptr_not_null_reg(reg);
- rv = check_helper_mem_access(env, regno, mem_size, true, NULL);
+ }
+
+ err = check_helper_mem_access(env, regno, mem_size, BPF_READ, true, NULL);
+ err = err ?: check_helper_mem_access(env, regno, mem_size, BPF_WRITE, true, NULL);
+
+ if (may_be_null)
*reg = saved_reg;
- return rv;
+
+ return err;
+}
+
+static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ u32 regno)
+{
+ struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1];
+ bool may_be_null = type_may_be_null(mem_reg->type);
+ struct bpf_reg_state saved_reg;
+ struct bpf_call_arg_meta meta;
+ int err;
+
+ WARN_ON_ONCE(regno < BPF_REG_2 || regno > BPF_REG_5);
+
+ memset(&meta, 0, sizeof(meta));
+
+ if (may_be_null) {
+ saved_reg = *mem_reg;
+ mark_ptr_not_null_reg(mem_reg);
}
- return check_helper_mem_access(env, regno, mem_size, true, NULL);
+ err = check_mem_size_reg(env, reg, regno, BPF_READ, true, &meta);
+ err = err ?: check_mem_size_reg(env, reg, regno, BPF_WRITE, true, &meta);
+
+ if (may_be_null)
+ *mem_reg = saved_reg;
+
+ return err;
}
+enum {
+ PROCESS_SPIN_LOCK = (1 << 0),
+ PROCESS_RES_LOCK = (1 << 1),
+ PROCESS_LOCK_IRQ = (1 << 2),
+};
+
/* Implementation details:
- * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL
+ * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL.
+ * bpf_obj_new returns PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL.
* Two bpf_map_lookups (even with the same key) will have different reg->id.
- * For traditional PTR_TO_MAP_VALUE the verifier clears reg->id after
- * value_or_null->value transition, since the verifier only cares about
- * the range of access to valid map value pointer and doesn't care about actual
- * address of the map element.
+ * Two separate bpf_obj_new will also have different reg->id.
+ * For traditional PTR_TO_MAP_VALUE or PTR_TO_BTF_ID | MEM_ALLOC, the verifier
+ * clears reg->id after value_or_null->value transition, since the verifier only
+ * cares about the range of access to valid map value pointer and doesn't care
+ * about actual address of the map element.
* For maps with 'struct bpf_spin_lock' inside map value the verifier keeps
* reg->id > 0 after value_or_null->value transition. By doing so
* two bpf_map_lookups will be considered two different pointers that
- * point to different bpf_spin_locks.
+ * point to different bpf_spin_locks. Likewise for pointers to allocated objects
+ * returned from bpf_obj_new.
* The verifier allows taking only one bpf_spin_lock at a time to avoid
* dead-locks.
* Since only one bpf_spin_lock is allowed the checks are simpler than
* reg_is_refcounted() logic. The verifier needs to remember only
* one spin_lock instead of array of acquired_refs.
- * cur_state->active_spin_lock remembers which map value element got locked
- * and clears it after bpf_spin_unlock.
+ * env->cur_state->active_locks remembers which map value element or allocated
+ * object got locked and clears it after bpf_spin_unlock.
*/
-static int process_spin_lock(struct bpf_verifier_env *env, int regno,
- bool is_lock)
+static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags)
{
+ bool is_lock = flags & PROCESS_SPIN_LOCK, is_res_lock = flags & PROCESS_RES_LOCK;
+ const char *lock_str = is_res_lock ? "bpf_res_spin" : "bpf_spin";
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
struct bpf_verifier_state *cur = env->cur_state;
bool is_const = tnum_is_const(reg->var_off);
- struct bpf_map *map = reg->map_ptr;
+ bool is_irq = flags & PROCESS_LOCK_IRQ;
u64 val = reg->var_off.value;
+ struct bpf_map *map = NULL;
+ struct btf *btf = NULL;
+ struct btf_record *rec;
+ u32 spin_lock_off;
+ int err;
if (!is_const) {
verbose(env,
- "R%d doesn't have constant offset. bpf_spin_lock has to be at the constant offset\n",
- regno);
+ "R%d doesn't have constant offset. %s_lock has to be at the constant offset\n",
+ regno, lock_str);
return -EINVAL;
}
- if (!map->btf) {
- verbose(env,
- "map '%s' has to have BTF in order to use bpf_spin_lock\n",
- map->name);
- return -EINVAL;
- }
- if (!map_value_has_spin_lock(map)) {
- if (map->spin_lock_off == -E2BIG)
+ if (reg->type == PTR_TO_MAP_VALUE) {
+ map = reg->map_ptr;
+ if (!map->btf) {
verbose(env,
- "map '%s' has more than one 'struct bpf_spin_lock'\n",
- map->name);
- else if (map->spin_lock_off == -ENOENT)
- verbose(env,
- "map '%s' doesn't have 'struct bpf_spin_lock'\n",
- map->name);
- else
- verbose(env,
- "map '%s' is not a struct type or bpf_spin_lock is mangled\n",
- map->name);
+ "map '%s' has to have BTF in order to use %s_lock\n",
+ map->name, lock_str);
+ return -EINVAL;
+ }
+ } else {
+ btf = reg->btf;
+ }
+
+ rec = reg_btf_record(reg);
+ if (!btf_record_has_field(rec, is_res_lock ? BPF_RES_SPIN_LOCK : BPF_SPIN_LOCK)) {
+ verbose(env, "%s '%s' has no valid %s_lock\n", map ? "map" : "local",
+ map ? map->name : "kptr", lock_str);
return -EINVAL;
}
- if (map->spin_lock_off != val + reg->off) {
- verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock'\n",
- val + reg->off);
+ spin_lock_off = is_res_lock ? rec->res_spin_lock_off : rec->spin_lock_off;
+ if (spin_lock_off != val + reg->off) {
+ verbose(env, "off %lld doesn't point to 'struct %s_lock' that is at %d\n",
+ val + reg->off, lock_str, spin_lock_off);
return -EINVAL;
}
if (is_lock) {
- if (cur->active_spin_lock) {
- verbose(env,
- "Locking two bpf_spin_locks are not allowed\n");
- return -EINVAL;
+ void *ptr;
+ int type;
+
+ if (map)
+ ptr = map;
+ else
+ ptr = btf;
+
+ if (!is_res_lock && cur->active_locks) {
+ if (find_lock_state(env->cur_state, REF_TYPE_LOCK, 0, NULL)) {
+ verbose(env,
+ "Locking two bpf_spin_locks are not allowed\n");
+ return -EINVAL;
+ }
+ } else if (is_res_lock && cur->active_locks) {
+ if (find_lock_state(env->cur_state, REF_TYPE_RES_LOCK | REF_TYPE_RES_LOCK_IRQ, reg->id, ptr)) {
+ verbose(env, "Acquiring the same lock again, AA deadlock detected\n");
+ return -EINVAL;
+ }
+ }
+
+ if (is_res_lock && is_irq)
+ type = REF_TYPE_RES_LOCK_IRQ;
+ else if (is_res_lock)
+ type = REF_TYPE_RES_LOCK;
+ else
+ type = REF_TYPE_LOCK;
+ err = acquire_lock_state(env, env->insn_idx, type, reg->id, ptr);
+ if (err < 0) {
+ verbose(env, "Failed to acquire lock state\n");
+ return err;
}
- cur->active_spin_lock = reg->id;
} else {
- if (!cur->active_spin_lock) {
- verbose(env, "bpf_spin_unlock without taking a lock\n");
+ void *ptr;
+ int type;
+
+ if (map)
+ ptr = map;
+ else
+ ptr = btf;
+
+ if (!cur->active_locks) {
+ verbose(env, "%s_unlock without taking a lock\n", lock_str);
+ return -EINVAL;
+ }
+
+ if (is_res_lock && is_irq)
+ type = REF_TYPE_RES_LOCK_IRQ;
+ else if (is_res_lock)
+ type = REF_TYPE_RES_LOCK;
+ else
+ type = REF_TYPE_LOCK;
+ if (!find_lock_state(cur, type, reg->id, ptr)) {
+ verbose(env, "%s_unlock of different lock\n", lock_str);
+ return -EINVAL;
+ }
+ if (reg->id != cur->active_lock_id || ptr != cur->active_lock_ptr) {
+ verbose(env, "%s_unlock cannot be out of order\n", lock_str);
return -EINVAL;
}
- if (cur->active_spin_lock != reg->id) {
- verbose(env, "bpf_spin_unlock of different lock\n");
+ if (release_lock_state(cur, type, reg->id, ptr)) {
+ verbose(env, "%s_unlock of different lock\n", lock_str);
return -EINVAL;
}
- cur->active_spin_lock = 0;
+
+ invalidate_non_owning_refs(env);
}
return 0;
}
-static int process_timer_func(struct bpf_verifier_env *env, int regno,
- struct bpf_call_arg_meta *meta)
+/* Check if @regno is a pointer to a specific field in a map value */
+static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno,
+ enum btf_field_type field_type)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
bool is_const = tnum_is_const(reg->var_off);
struct bpf_map *map = reg->map_ptr;
u64 val = reg->var_off.value;
+ const char *struct_name = btf_field_type_name(field_type);
+ int field_off = -1;
if (!is_const) {
verbose(env,
- "R%d doesn't have constant offset. bpf_timer has to be at the constant offset\n",
- regno);
+ "R%d doesn't have constant offset. %s has to be at the constant offset\n",
+ regno, struct_name);
return -EINVAL;
}
if (!map->btf) {
- verbose(env, "map '%s' has to have BTF in order to use bpf_timer\n",
- map->name);
+ verbose(env, "map '%s' has to have BTF in order to use %s\n", map->name,
+ struct_name);
return -EINVAL;
}
- if (!map_value_has_timer(map)) {
- if (map->timer_off == -E2BIG)
- verbose(env,
- "map '%s' has more than one 'struct bpf_timer'\n",
- map->name);
- else if (map->timer_off == -ENOENT)
- verbose(env,
- "map '%s' doesn't have 'struct bpf_timer'\n",
- map->name);
- else
- verbose(env,
- "map '%s' is not a struct type or bpf_timer is mangled\n",
- map->name);
+ if (!btf_record_has_field(map->record, field_type)) {
+ verbose(env, "map '%s' has no valid %s\n", map->name, struct_name);
+ return -EINVAL;
+ }
+ switch (field_type) {
+ case BPF_TIMER:
+ field_off = map->record->timer_off;
+ break;
+ case BPF_TASK_WORK:
+ field_off = map->record->task_work_off;
+ break;
+ case BPF_WORKQUEUE:
+ field_off = map->record->wq_off;
+ break;
+ default:
+ verifier_bug(env, "unsupported BTF field type: %s\n", struct_name);
return -EINVAL;
}
- if (map->timer_off != val + reg->off) {
- verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n",
- val + reg->off, map->timer_off);
+ if (field_off != val + reg->off) {
+ verbose(env, "off %lld doesn't point to 'struct %s' that is at %d\n",
+ val + reg->off, struct_name, field_off);
return -EINVAL;
}
+ return 0;
+}
+
+static int process_timer_func(struct bpf_verifier_env *env, int regno,
+ struct bpf_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_map *map = reg->map_ptr;
+ int err;
+
+ err = check_map_field_pointer(env, regno, BPF_TIMER);
+ if (err)
+ return err;
+
if (meta->map_ptr) {
- verbose(env, "verifier bug. Two map pointers in a timer helper\n");
+ verifier_bug(env, "Two map pointers in a timer helper");
return -EFAULT;
}
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n");
+ return -EOPNOTSUPP;
+ }
meta->map_uid = reg->map_uid;
meta->map_ptr = map;
return 0;
}
-static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
+static int process_wq_func(struct bpf_verifier_env *env, int regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_map *map = reg->map_ptr;
+ int err;
+
+ err = check_map_field_pointer(env, regno, BPF_WORKQUEUE);
+ if (err)
+ return err;
+
+ if (meta->map.ptr) {
+ verifier_bug(env, "Two map pointers in a bpf_wq helper");
+ return -EFAULT;
+ }
+
+ meta->map.uid = reg->map_uid;
+ meta->map.ptr = map;
+ return 0;
+}
+
+static int process_task_work_func(struct bpf_verifier_env *env, int regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_map *map = reg->map_ptr;
+ int err;
+
+ err = check_map_field_pointer(env, regno, BPF_TASK_WORK);
+ if (err)
+ return err;
+
+ if (meta->map.ptr) {
+ verifier_bug(env, "Two map pointers in a bpf_task_work helper");
+ return -EFAULT;
+ }
+ meta->map.uid = reg->map_uid;
+ meta->map.ptr = map;
+ return 0;
+}
+
+static int process_kptr_func(struct bpf_verifier_env *env, int regno,
+ struct bpf_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct btf_field *kptr_field;
+ struct bpf_map *map_ptr;
+ struct btf_record *rec;
+ u32 kptr_off;
+
+ if (type_is_ptr_alloc_obj(reg->type)) {
+ rec = reg_btf_record(reg);
+ } else { /* PTR_TO_MAP_VALUE */
+ map_ptr = reg->map_ptr;
+ if (!map_ptr->btf) {
+ verbose(env, "map '%s' has to have BTF in order to use bpf_kptr_xchg\n",
+ map_ptr->name);
+ return -EINVAL;
+ }
+ rec = map_ptr->record;
+ meta->map_ptr = map_ptr;
+ }
+
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env,
+ "R%d doesn't have constant offset. kptr has to be at the constant offset\n",
+ regno);
+ return -EINVAL;
+ }
+
+ if (!btf_record_has_field(rec, BPF_KPTR)) {
+ verbose(env, "R%d has no valid kptr\n", regno);
+ return -EINVAL;
+ }
+
+ kptr_off = reg->off + reg->var_off.value;
+ kptr_field = btf_record_find(rec, kptr_off, BPF_KPTR);
+ if (!kptr_field) {
+ verbose(env, "off=%d doesn't point to kptr\n", kptr_off);
+ return -EACCES;
+ }
+ if (kptr_field->type != BPF_KPTR_REF && kptr_field->type != BPF_KPTR_PERCPU) {
+ verbose(env, "off=%d kptr isn't referenced kptr\n", kptr_off);
+ return -EACCES;
+ }
+ meta->kptr_field = kptr_field;
+ return 0;
+}
+
+/* There are two register types representing a bpf_dynptr, one is PTR_TO_STACK
+ * which points to a stack slot, and the other is CONST_PTR_TO_DYNPTR.
+ *
+ * In both cases we deal with the first 8 bytes, but need to mark the next 8
+ * bytes as STACK_DYNPTR in case of PTR_TO_STACK. In case of
+ * CONST_PTR_TO_DYNPTR, we are guaranteed to get the beginning of the object.
+ *
+ * Mutability of bpf_dynptr is at two levels, one is at the level of struct
+ * bpf_dynptr itself, i.e. whether the helper is receiving a pointer to struct
+ * bpf_dynptr or pointer to const struct bpf_dynptr. In the former case, it can
+ * mutate the view of the dynptr and also possibly destroy it. In the latter
+ * case, it cannot mutate the bpf_dynptr itself but it can still mutate the
+ * memory that dynptr points to.
+ *
+ * The verifier will keep track both levels of mutation (bpf_dynptr's in
+ * reg->type and the memory's in reg->dynptr.type), but there is no support for
+ * readonly dynptr view yet, hence only the first case is tracked and checked.
+ *
+ * This is consistent with how C applies the const modifier to a struct object,
+ * where the pointer itself inside bpf_dynptr becomes const but not what it
+ * points to.
+ *
+ * Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument
+ * type, and declare it as 'const struct bpf_dynptr *' in their prototype.
+ */
+static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn_idx,
+ enum bpf_arg_type arg_type, int clone_ref_obj_id)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ int err;
+
+ if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) {
+ verbose(env,
+ "arg#%d expected pointer to stack or const struct bpf_dynptr\n",
+ regno - 1);
+ return -EINVAL;
+ }
+
+ /* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an
+ * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*):
+ */
+ if ((arg_type & (MEM_UNINIT | MEM_RDONLY)) == (MEM_UNINIT | MEM_RDONLY)) {
+ verifier_bug(env, "misconfigured dynptr helper type flags");
+ return -EFAULT;
+ }
+
+ /* MEM_UNINIT - Points to memory that is an appropriate candidate for
+ * constructing a mutable bpf_dynptr object.
+ *
+ * Currently, this is only possible with PTR_TO_STACK
+ * pointing to a region of at least 16 bytes which doesn't
+ * contain an existing bpf_dynptr.
+ *
+ * MEM_RDONLY - Points to a initialized bpf_dynptr that will not be
+ * mutated or destroyed. However, the memory it points to
+ * may be mutated.
+ *
+ * None - Points to a initialized dynptr that can be mutated and
+ * destroyed, including mutation of the memory it points
+ * to.
+ */
+ if (arg_type & MEM_UNINIT) {
+ int i;
+
+ if (!is_dynptr_reg_valid_uninit(env, reg)) {
+ verbose(env, "Dynptr has to be an uninitialized dynptr\n");
+ return -EINVAL;
+ }
+
+ /* we write BPF_DW bits (8 bytes) at a time */
+ for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) {
+ err = check_mem_access(env, insn_idx, regno,
+ i, BPF_DW, BPF_WRITE, -1, false, false);
+ if (err)
+ return err;
+ }
+
+ err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx, clone_ref_obj_id);
+ } else /* MEM_RDONLY and None case from above */ {
+ /* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */
+ if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) {
+ verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n");
+ return -EINVAL;
+ }
+
+ if (!is_dynptr_reg_valid_init(env, reg)) {
+ verbose(env,
+ "Expected an initialized dynptr as arg #%d\n",
+ regno - 1);
+ return -EINVAL;
+ }
+
+ /* Fold modifiers (in this case, MEM_RDONLY) when checking expected type */
+ if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) {
+ verbose(env,
+ "Expected a dynptr of type %s as arg #%d\n",
+ dynptr_type_str(arg_to_dynptr_type(arg_type)), regno - 1);
+ return -EINVAL;
+ }
+
+ err = mark_dynptr_read(env, reg);
+ }
+ return err;
+}
+
+static u32 iter_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi)
+{
+ struct bpf_func_state *state = func(env, reg);
+
+ return state->stack[spi].spilled_ptr.ref_obj_id;
+}
+
+static bool is_iter_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY);
+}
+
+static bool is_iter_new_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_ITER_NEW;
+}
+
+static bool is_iter_next_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_ITER_NEXT;
+}
+
+static bool is_iter_destroy_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_ITER_DESTROY;
+}
+
+static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg_idx,
+ const struct btf_param *arg)
+{
+ /* btf_check_iter_kfuncs() guarantees that first argument of any iter
+ * kfunc is iter state pointer
+ */
+ if (is_iter_kfunc(meta))
+ return arg_idx == 0;
+
+ /* iter passed as an argument to a generic kfunc */
+ return btf_param_match_suffix(meta->btf, arg, "__iter");
+}
+
+static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ const struct btf_type *t;
+ int spi, err, i, nr_slots, btf_id;
+
+ if (reg->type != PTR_TO_STACK) {
+ verbose(env, "arg#%d expected pointer to an iterator on stack\n", regno - 1);
+ return -EINVAL;
+ }
+
+ /* For iter_{new,next,destroy} functions, btf_check_iter_kfuncs()
+ * ensures struct convention, so we wouldn't need to do any BTF
+ * validation here. But given iter state can be passed as a parameter
+ * to any kfunc, if arg has "__iter" suffix, we need to be a bit more
+ * conservative here.
+ */
+ btf_id = btf_check_iter_arg(meta->btf, meta->func_proto, regno - 1);
+ if (btf_id < 0) {
+ verbose(env, "expected valid iter pointer as arg #%d\n", regno - 1);
+ return -EINVAL;
+ }
+ t = btf_type_by_id(meta->btf, btf_id);
+ nr_slots = t->size / BPF_REG_SIZE;
+
+ if (is_iter_new_kfunc(meta)) {
+ /* bpf_iter_<type>_new() expects pointer to uninit iter state */
+ if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) {
+ verbose(env, "expected uninitialized iter_%s as arg #%d\n",
+ iter_type_str(meta->btf, btf_id), regno - 1);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < nr_slots * 8; i += BPF_REG_SIZE) {
+ err = check_mem_access(env, insn_idx, regno,
+ i, BPF_DW, BPF_WRITE, -1, false, false);
+ if (err)
+ return err;
+ }
+
+ err = mark_stack_slots_iter(env, meta, reg, insn_idx, meta->btf, btf_id, nr_slots);
+ if (err)
+ return err;
+ } else {
+ /* iter_next() or iter_destroy(), as well as any kfunc
+ * accepting iter argument, expect initialized iter state
+ */
+ err = is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots);
+ switch (err) {
+ case 0:
+ break;
+ case -EINVAL:
+ verbose(env, "expected an initialized iter_%s as arg #%d\n",
+ iter_type_str(meta->btf, btf_id), regno - 1);
+ return err;
+ case -EPROTO:
+ verbose(env, "expected an RCU CS when using %s\n", meta->func_name);
+ return err;
+ default:
+ return err;
+ }
+
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi < 0)
+ return spi;
+
+ err = mark_iter_read(env, reg, spi, nr_slots);
+ if (err)
+ return err;
+
+ /* remember meta->iter info for process_iter_next_call() */
+ meta->iter.spi = spi;
+ meta->iter.frameno = reg->frameno;
+ meta->ref_obj_id = iter_ref_obj_id(env, reg, spi);
+
+ if (is_iter_destroy_kfunc(meta)) {
+ err = unmark_stack_slots_iter(env, reg, nr_slots);
+ if (err)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+/* Look for a previous loop entry at insn_idx: nearest parent state
+ * stopped at insn_idx with callsites matching those in cur->frame.
+ */
+static struct bpf_verifier_state *find_prev_entry(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *cur,
+ int insn_idx)
+{
+ struct bpf_verifier_state_list *sl;
+ struct bpf_verifier_state *st;
+ struct list_head *pos, *head;
+
+ /* Explored states are pushed in stack order, most recent states come first */
+ head = explored_state(env, insn_idx);
+ list_for_each(pos, head) {
+ sl = container_of(pos, struct bpf_verifier_state_list, node);
+ /* If st->branches != 0 state is a part of current DFS verification path,
+ * hence cur & st for a loop.
+ */
+ st = &sl->state;
+ if (st->insn_idx == insn_idx && st->branches && same_callsites(st, cur) &&
+ st->dfs_depth < cur->dfs_depth)
+ return st;
+ }
+
+ return NULL;
+}
+
+static void reset_idmap_scratch(struct bpf_verifier_env *env);
+static bool regs_exact(const struct bpf_reg_state *rold,
+ const struct bpf_reg_state *rcur,
+ struct bpf_idmap *idmap);
+
+static void maybe_widen_reg(struct bpf_verifier_env *env,
+ struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
+ struct bpf_idmap *idmap)
+{
+ if (rold->type != SCALAR_VALUE)
+ return;
+ if (rold->type != rcur->type)
+ return;
+ if (rold->precise || rcur->precise || regs_exact(rold, rcur, idmap))
+ return;
+ __mark_reg_unknown(env, rcur);
+}
+
+static int widen_imprecise_scalars(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *old,
+ struct bpf_verifier_state *cur)
+{
+ struct bpf_func_state *fold, *fcur;
+ int i, fr, num_slots;
+
+ reset_idmap_scratch(env);
+ for (fr = old->curframe; fr >= 0; fr--) {
+ fold = old->frame[fr];
+ fcur = cur->frame[fr];
+
+ for (i = 0; i < MAX_BPF_REG; i++)
+ maybe_widen_reg(env,
+ &fold->regs[i],
+ &fcur->regs[i],
+ &env->idmap_scratch);
+
+ num_slots = min(fold->allocated_stack / BPF_REG_SIZE,
+ fcur->allocated_stack / BPF_REG_SIZE);
+ for (i = 0; i < num_slots; i++) {
+ if (!is_spilled_reg(&fold->stack[i]) ||
+ !is_spilled_reg(&fcur->stack[i]))
+ continue;
+
+ maybe_widen_reg(env,
+ &fold->stack[i].spilled_ptr,
+ &fcur->stack[i].spilled_ptr,
+ &env->idmap_scratch);
+ }
+ }
+ return 0;
+}
+
+static struct bpf_reg_state *get_iter_from_state(struct bpf_verifier_state *cur_st,
+ struct bpf_kfunc_call_arg_meta *meta)
{
- return base_type(type) == ARG_PTR_TO_MEM ||
- base_type(type) == ARG_PTR_TO_UNINIT_MEM;
+ int iter_frameno = meta->iter.frameno;
+ int iter_spi = meta->iter.spi;
+
+ return &cur_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
+}
+
+/* process_iter_next_call() is called when verifier gets to iterator's next
+ * "method" (e.g., bpf_iter_num_next() for numbers iterator) call. We'll refer
+ * to it as just "iter_next()" in comments below.
+ *
+ * BPF verifier relies on a crucial contract for any iter_next()
+ * implementation: it should *eventually* return NULL, and once that happens
+ * it should keep returning NULL. That is, once iterator exhausts elements to
+ * iterate, it should never reset or spuriously return new elements.
+ *
+ * With the assumption of such contract, process_iter_next_call() simulates
+ * a fork in the verifier state to validate loop logic correctness and safety
+ * without having to simulate infinite amount of iterations.
+ *
+ * In current state, we first assume that iter_next() returned NULL and
+ * iterator state is set to DRAINED (BPF_ITER_STATE_DRAINED). In such
+ * conditions we should not form an infinite loop and should eventually reach
+ * exit.
+ *
+ * Besides that, we also fork current state and enqueue it for later
+ * verification. In a forked state we keep iterator state as ACTIVE
+ * (BPF_ITER_STATE_ACTIVE) and assume non-NULL return from iter_next(). We
+ * also bump iteration depth to prevent erroneous infinite loop detection
+ * later on (see iter_active_depths_differ() comment for details). In this
+ * state we assume that we'll eventually loop back to another iter_next()
+ * calls (it could be in exactly same location or in some other instruction,
+ * it doesn't matter, we don't make any unnecessary assumptions about this,
+ * everything revolves around iterator state in a stack slot, not which
+ * instruction is calling iter_next()). When that happens, we either will come
+ * to iter_next() with equivalent state and can conclude that next iteration
+ * will proceed in exactly the same way as we just verified, so it's safe to
+ * assume that loop converges. If not, we'll go on another iteration
+ * simulation with a different input state, until all possible starting states
+ * are validated or we reach maximum number of instructions limit.
+ *
+ * This way, we will either exhaustively discover all possible input states
+ * that iterator loop can start with and eventually will converge, or we'll
+ * effectively regress into bounded loop simulation logic and either reach
+ * maximum number of instructions if loop is not provably convergent, or there
+ * is some statically known limit on number of iterations (e.g., if there is
+ * an explicit `if n > 100 then break;` statement somewhere in the loop).
+ *
+ * Iteration convergence logic in is_state_visited() relies on exact
+ * states comparison, which ignores read and precision marks.
+ * This is necessary because read and precision marks are not finalized
+ * while in the loop. Exact comparison might preclude convergence for
+ * simple programs like below:
+ *
+ * i = 0;
+ * while(iter_next(&it))
+ * i++;
+ *
+ * At each iteration step i++ would produce a new distinct state and
+ * eventually instruction processing limit would be reached.
+ *
+ * To avoid such behavior speculatively forget (widen) range for
+ * imprecise scalar registers, if those registers were not precise at the
+ * end of the previous iteration and do not match exactly.
+ *
+ * This is a conservative heuristic that allows to verify wide range of programs,
+ * however it precludes verification of programs that conjure an
+ * imprecise value on the first loop iteration and use it as precise on a second.
+ * For example, the following safe program would fail to verify:
+ *
+ * struct bpf_num_iter it;
+ * int arr[10];
+ * int i = 0, a = 0;
+ * bpf_iter_num_new(&it, 0, 10);
+ * while (bpf_iter_num_next(&it)) {
+ * if (a == 0) {
+ * a = 1;
+ * i = 7; // Because i changed verifier would forget
+ * // it's range on second loop entry.
+ * } else {
+ * arr[i] = 42; // This would fail to verify.
+ * }
+ * }
+ * bpf_iter_num_destroy(&it);
+ */
+static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st;
+ struct bpf_func_state *cur_fr = cur_st->frame[cur_st->curframe], *queued_fr;
+ struct bpf_reg_state *cur_iter, *queued_iter;
+
+ BTF_TYPE_EMIT(struct bpf_iter);
+
+ cur_iter = get_iter_from_state(cur_st, meta);
+
+ if (cur_iter->iter.state != BPF_ITER_STATE_ACTIVE &&
+ cur_iter->iter.state != BPF_ITER_STATE_DRAINED) {
+ verifier_bug(env, "unexpected iterator state %d (%s)",
+ cur_iter->iter.state, iter_state_str(cur_iter->iter.state));
+ return -EFAULT;
+ }
+
+ if (cur_iter->iter.state == BPF_ITER_STATE_ACTIVE) {
+ /* Because iter_next() call is a checkpoint is_state_visitied()
+ * should guarantee parent state with same call sites and insn_idx.
+ */
+ if (!cur_st->parent || cur_st->parent->insn_idx != insn_idx ||
+ !same_callsites(cur_st->parent, cur_st)) {
+ verifier_bug(env, "bad parent state for iter next call");
+ return -EFAULT;
+ }
+ /* Note cur_st->parent in the call below, it is necessary to skip
+ * checkpoint created for cur_st by is_state_visited()
+ * right at this instruction.
+ */
+ prev_st = find_prev_entry(env, cur_st->parent, insn_idx);
+ /* branch out active iter state */
+ queued_st = push_stack(env, insn_idx + 1, insn_idx, false);
+ if (IS_ERR(queued_st))
+ return PTR_ERR(queued_st);
+
+ queued_iter = get_iter_from_state(queued_st, meta);
+ queued_iter->iter.state = BPF_ITER_STATE_ACTIVE;
+ queued_iter->iter.depth++;
+ if (prev_st)
+ widen_imprecise_scalars(env, prev_st, queued_st);
+
+ queued_fr = queued_st->frame[queued_st->curframe];
+ mark_ptr_not_null_reg(&queued_fr->regs[BPF_REG_0]);
+ }
+
+ /* switch to DRAINED state, but keep the depth unchanged */
+ /* mark current iter state as drained and assume returned NULL */
+ cur_iter->iter.state = BPF_ITER_STATE_DRAINED;
+ __mark_reg_const_zero(env, &cur_fr->regs[BPF_REG_0]);
+
+ return 0;
}
static bool arg_type_is_mem_size(enum bpf_arg_type type)
@@ -5041,25 +9139,20 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type)
type == ARG_CONST_SIZE_OR_ZERO;
}
-static bool arg_type_is_alloc_size(enum bpf_arg_type type)
+static bool arg_type_is_raw_mem(enum bpf_arg_type type)
{
- return type == ARG_CONST_ALLOC_SIZE_OR_ZERO;
+ return base_type(type) == ARG_PTR_TO_MEM &&
+ type & MEM_UNINIT;
}
-static bool arg_type_is_int_ptr(enum bpf_arg_type type)
+static bool arg_type_is_release(enum bpf_arg_type type)
{
- return type == ARG_PTR_TO_INT ||
- type == ARG_PTR_TO_LONG;
+ return type & OBJ_RELEASE;
}
-static int int_ptr_type_to_size(enum bpf_arg_type type)
+static bool arg_type_is_dynptr(enum bpf_arg_type type)
{
- if (type == ARG_PTR_TO_INT)
- return sizeof(u32);
- else if (type == ARG_PTR_TO_LONG)
- return sizeof(u64);
-
- return -EINVAL;
+ return base_type(type) == ARG_PTR_TO_DYNPTR;
}
static int resolve_map_arg_type(struct bpf_verifier_env *env,
@@ -5068,8 +9161,8 @@ static int resolve_map_arg_type(struct bpf_verifier_env *env,
{
if (!meta->map_ptr) {
/* kernel subsystem misconfigured verifier */
- verbose(env, "invalid map_ptr to access map->type\n");
- return -EACCES;
+ verifier_bug(env, "invalid map_ptr to access map->type");
+ return -EFAULT;
}
switch (meta->map_ptr->map_type) {
@@ -5097,16 +9190,6 @@ struct bpf_reg_types {
u32 *btf_id;
};
-static const struct bpf_reg_types map_key_value_types = {
- .types = {
- PTR_TO_STACK,
- PTR_TO_PACKET,
- PTR_TO_PACKET_META,
- PTR_TO_MAP_KEY,
- PTR_TO_MAP_VALUE,
- },
-};
-
static const struct bpf_reg_types sock_types = {
.types = {
PTR_TO_SOCK_COMMON,
@@ -5124,6 +9207,7 @@ static const struct bpf_reg_types btf_id_sock_common_types = {
PTR_TO_TCP_SOCK,
PTR_TO_XDP_SOCK,
PTR_TO_BTF_ID,
+ PTR_TO_BTF_ID | PTR_TRUSTED,
},
.btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
};
@@ -5137,38 +9221,58 @@ static const struct bpf_reg_types mem_types = {
PTR_TO_MAP_KEY,
PTR_TO_MAP_VALUE,
PTR_TO_MEM,
- PTR_TO_MEM | MEM_ALLOC,
+ PTR_TO_MEM | MEM_RINGBUF,
PTR_TO_BUF,
+ PTR_TO_BTF_ID | PTR_TRUSTED,
},
};
-static const struct bpf_reg_types int_ptr_types = {
+static const struct bpf_reg_types spin_lock_types = {
.types = {
- PTR_TO_STACK,
- PTR_TO_PACKET,
- PTR_TO_PACKET_META,
- PTR_TO_MAP_KEY,
PTR_TO_MAP_VALUE,
- },
+ PTR_TO_BTF_ID | MEM_ALLOC,
+ }
};
static const struct bpf_reg_types fullsock_types = { .types = { PTR_TO_SOCKET } };
static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } };
static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } };
-static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM | MEM_ALLOC } };
+static const struct bpf_reg_types ringbuf_mem_types = { .types = { PTR_TO_MEM | MEM_RINGBUF } };
static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } };
-static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } };
-static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } };
-static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } };
+static const struct bpf_reg_types btf_ptr_types = {
+ .types = {
+ PTR_TO_BTF_ID,
+ PTR_TO_BTF_ID | PTR_TRUSTED,
+ PTR_TO_BTF_ID | MEM_RCU,
+ },
+};
+static const struct bpf_reg_types percpu_btf_ptr_types = {
+ .types = {
+ PTR_TO_BTF_ID | MEM_PERCPU,
+ PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU,
+ PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED,
+ }
+};
static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } };
+static const struct bpf_reg_types kptr_xchg_dest_types = {
+ .types = {
+ PTR_TO_MAP_VALUE,
+ PTR_TO_BTF_ID | MEM_ALLOC
+ }
+};
+static const struct bpf_reg_types dynptr_types = {
+ .types = {
+ PTR_TO_STACK,
+ CONST_PTR_TO_DYNPTR,
+ }
+};
static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
- [ARG_PTR_TO_MAP_KEY] = &map_key_value_types,
- [ARG_PTR_TO_MAP_VALUE] = &map_key_value_types,
- [ARG_PTR_TO_UNINIT_MAP_VALUE] = &map_key_value_types,
+ [ARG_PTR_TO_MAP_KEY] = &mem_types,
+ [ARG_PTR_TO_MAP_VALUE] = &mem_types,
[ARG_CONST_SIZE] = &scalar_types,
[ARG_CONST_SIZE_OR_ZERO] = &scalar_types,
[ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types,
@@ -5182,20 +9286,20 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_BTF_ID] = &btf_ptr_types,
[ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types,
[ARG_PTR_TO_MEM] = &mem_types,
- [ARG_PTR_TO_UNINIT_MEM] = &mem_types,
- [ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types,
- [ARG_PTR_TO_INT] = &int_ptr_types,
- [ARG_PTR_TO_LONG] = &int_ptr_types,
+ [ARG_PTR_TO_RINGBUF_MEM] = &ringbuf_mem_types,
[ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types,
[ARG_PTR_TO_FUNC] = &func_ptr_types,
[ARG_PTR_TO_STACK] = &stack_ptr_types,
[ARG_PTR_TO_CONST_STR] = &const_str_ptr_types,
[ARG_PTR_TO_TIMER] = &timer_types,
+ [ARG_KPTR_XCHG_DEST] = &kptr_xchg_dest_types,
+ [ARG_PTR_TO_DYNPTR] = &dynptr_types,
};
static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
enum bpf_arg_type arg_type,
- const u32 *arg_btf_id)
+ const u32 *arg_btf_id,
+ struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
enum bpf_reg_type expected, type = reg->type;
@@ -5204,7 +9308,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
compatible = compatible_reg_types[base_type(arg_type)];
if (!compatible) {
- verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type);
+ verifier_bug(env, "unsupported arg type %d", arg_type);
return -EFAULT;
}
@@ -5216,12 +9320,22 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
* ARG_PTR_TO_MEM + MAYBE_NULL is compatible with PTR_TO_MEM and PTR_TO_MEM + MAYBE_NULL,
* but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM but NOT with PTR_TO_MEM + MAYBE_NULL
*
+ * ARG_PTR_TO_MEM is compatible with PTR_TO_MEM that is tagged with a dynptr type.
+ *
* Therefore we fold these flags depending on the arg_type before comparison.
*/
if (arg_type & MEM_RDONLY)
type &= ~MEM_RDONLY;
if (arg_type & PTR_MAYBE_NULL)
type &= ~PTR_MAYBE_NULL;
+ if (base_type(arg_type) == ARG_PTR_TO_MEM)
+ type &= ~DYNPTR_TYPE_FLAG_MASK;
+
+ /* Local kptr types are allowed as the source argument of bpf_kptr_xchg */
+ if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type) && regno == BPF_REG_2) {
+ type &= ~MEM_ALLOC;
+ type &= ~MEM_PERCPU;
+ }
for (i = 0; i < ARRAY_SIZE(compatible->types); i++) {
expected = compatible->types[i];
@@ -5239,35 +9353,372 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
return -EACCES;
found:
- if (reg->type == PTR_TO_BTF_ID) {
+ if (base_type(reg->type) != PTR_TO_BTF_ID)
+ return 0;
+
+ if (compatible == &mem_types) {
+ if (!(arg_type & MEM_RDONLY)) {
+ verbose(env,
+ "%s() may write into memory pointed by R%d type=%s\n",
+ func_id_name(meta->func_id),
+ regno, reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+ return 0;
+ }
+
+ switch ((int)reg->type) {
+ case PTR_TO_BTF_ID:
+ case PTR_TO_BTF_ID | PTR_TRUSTED:
+ case PTR_TO_BTF_ID | PTR_TRUSTED | PTR_MAYBE_NULL:
+ case PTR_TO_BTF_ID | MEM_RCU:
+ case PTR_TO_BTF_ID | PTR_MAYBE_NULL:
+ case PTR_TO_BTF_ID | PTR_MAYBE_NULL | MEM_RCU:
+ {
+ /* For bpf_sk_release, it needs to match against first member
+ * 'struct sock_common', hence make an exception for it. This
+ * allows bpf_sk_release to work for multiple socket types.
+ */
+ bool strict_type_match = arg_type_is_release(arg_type) &&
+ meta->func_id != BPF_FUNC_sk_release;
+
+ if (type_may_be_null(reg->type) &&
+ (!type_may_be_null(arg_type) || arg_type_is_release(arg_type))) {
+ verbose(env, "Possibly NULL pointer passed to helper arg%d\n", regno);
+ return -EACCES;
+ }
+
if (!arg_btf_id) {
if (!compatible->btf_id) {
- verbose(env, "verifier internal error: missing arg compatible BTF ID\n");
+ verifier_bug(env, "missing arg compatible BTF ID");
return -EFAULT;
}
arg_btf_id = compatible->btf_id;
}
- if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
- btf_vmlinux, *arg_btf_id)) {
- verbose(env, "R%d is of type %s but %s is expected\n",
- regno, kernel_type_name(reg->btf, reg->btf_id),
- kernel_type_name(btf_vmlinux, *arg_btf_id));
- return -EACCES;
+ if (meta->func_id == BPF_FUNC_kptr_xchg) {
+ if (map_kptr_match_type(env, meta->kptr_field, reg, regno))
+ return -EACCES;
+ } else {
+ if (arg_btf_id == BPF_PTR_POISON) {
+ verbose(env, "verifier internal error:");
+ verbose(env, "R%d has non-overwritten BPF_PTR_POISON type\n",
+ regno);
+ return -EACCES;
+ }
+
+ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
+ btf_vmlinux, *arg_btf_id,
+ strict_type_match)) {
+ verbose(env, "R%d is of type %s but %s is expected\n",
+ regno, btf_type_name(reg->btf, reg->btf_id),
+ btf_type_name(btf_vmlinux, *arg_btf_id));
+ return -EACCES;
+ }
+ }
+ break;
+ }
+ case PTR_TO_BTF_ID | MEM_ALLOC:
+ case PTR_TO_BTF_ID | MEM_PERCPU | MEM_ALLOC:
+ if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock &&
+ meta->func_id != BPF_FUNC_kptr_xchg) {
+ verifier_bug(env, "unimplemented handling of MEM_ALLOC");
+ return -EFAULT;
+ }
+ /* Check if local kptr in src arg matches kptr in dst arg */
+ if (meta->func_id == BPF_FUNC_kptr_xchg && regno == BPF_REG_2) {
+ if (map_kptr_match_type(env, meta->kptr_field, reg, regno))
+ return -EACCES;
}
+ break;
+ case PTR_TO_BTF_ID | MEM_PERCPU:
+ case PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU:
+ case PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED:
+ /* Handled by helper specific checks */
+ break;
+ default:
+ verifier_bug(env, "invalid PTR_TO_BTF_ID register for type match");
+ return -EFAULT;
+ }
+ return 0;
+}
+
+static struct btf_field *
+reg_find_field_offset(const struct bpf_reg_state *reg, s32 off, u32 fields)
+{
+ struct btf_field *field;
+ struct btf_record *rec;
+
+ rec = reg_btf_record(reg);
+ if (!rec)
+ return NULL;
+
+ field = btf_record_find(rec, off, fields);
+ if (!field)
+ return NULL;
+
+ return field;
+}
+
+static int check_func_arg_reg_off(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg, int regno,
+ enum bpf_arg_type arg_type)
+{
+ u32 type = reg->type;
+
+ /* When referenced register is passed to release function, its fixed
+ * offset must be 0.
+ *
+ * We will check arg_type_is_release reg has ref_obj_id when storing
+ * meta->release_regno.
+ */
+ if (arg_type_is_release(arg_type)) {
+ /* ARG_PTR_TO_DYNPTR with OBJ_RELEASE is a bit special, as it
+ * may not directly point to the object being released, but to
+ * dynptr pointing to such object, which might be at some offset
+ * on the stack. In that case, we simply to fallback to the
+ * default handling.
+ */
+ if (arg_type_is_dynptr(arg_type) && type == PTR_TO_STACK)
+ return 0;
+
+ /* Doing check_ptr_off_reg check for the offset will catch this
+ * because fixed_off_ok is false, but checking here allows us
+ * to give the user a better error message.
+ */
+ if (reg->off) {
+ verbose(env, "R%d must have zero offset when passed to release func or trusted arg to kfunc\n",
+ regno);
+ return -EINVAL;
+ }
+ return __check_ptr_off_reg(env, reg, regno, false);
+ }
+
+ switch (type) {
+ /* Pointer types where both fixed and variable offset is explicitly allowed: */
+ case PTR_TO_STACK:
+ case PTR_TO_PACKET:
+ case PTR_TO_PACKET_META:
+ case PTR_TO_MAP_KEY:
+ case PTR_TO_MAP_VALUE:
+ case PTR_TO_MEM:
+ case PTR_TO_MEM | MEM_RDONLY:
+ case PTR_TO_MEM | MEM_RINGBUF:
+ case PTR_TO_BUF:
+ case PTR_TO_BUF | MEM_RDONLY:
+ case PTR_TO_ARENA:
+ case SCALAR_VALUE:
+ return 0;
+ /* All the rest must be rejected, except PTR_TO_BTF_ID which allows
+ * fixed offset.
+ */
+ case PTR_TO_BTF_ID:
+ case PTR_TO_BTF_ID | MEM_ALLOC:
+ case PTR_TO_BTF_ID | PTR_TRUSTED:
+ case PTR_TO_BTF_ID | MEM_RCU:
+ case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF:
+ case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU:
+ /* When referenced PTR_TO_BTF_ID is passed to release function,
+ * its fixed offset must be 0. In the other cases, fixed offset
+ * can be non-zero. This was already checked above. So pass
+ * fixed_off_ok as true to allow fixed offset for all other
+ * cases. var_off always must be 0 for PTR_TO_BTF_ID, hence we
+ * still need to do checks instead of returning.
+ */
+ return __check_ptr_off_reg(env, reg, regno, true);
+ default:
+ return __check_ptr_off_reg(env, reg, regno, false);
+ }
+}
+
+static struct bpf_reg_state *get_dynptr_arg_reg(struct bpf_verifier_env *env,
+ const struct bpf_func_proto *fn,
+ struct bpf_reg_state *regs)
+{
+ struct bpf_reg_state *state = NULL;
+ int i;
+
+ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++)
+ if (arg_type_is_dynptr(fn->arg_type[i])) {
+ if (state) {
+ verbose(env, "verifier internal error: multiple dynptr args\n");
+ return NULL;
+ }
+ state = &regs[BPF_REG_1 + i];
+ }
+
+ if (!state)
+ verbose(env, "verifier internal error: no dynptr arg found\n");
+
+ return state;
+}
+
+static int dynptr_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi;
+
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return reg->id;
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+ return state->stack[spi].spilled_ptr.id;
+}
+
+static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi;
+
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return reg->ref_obj_id;
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+ return state->stack[spi].spilled_ptr.ref_obj_id;
+}
+
+static enum bpf_dynptr_type dynptr_get_type(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi;
+
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return reg->dynptr.type;
+
+ spi = __get_spi(reg->off);
+ if (spi < 0) {
+ verbose(env, "verifier internal error: invalid spi when querying dynptr type\n");
+ return BPF_DYNPTR_TYPE_INVALID;
+ }
+
+ return state->stack[spi].spilled_ptr.dynptr.type;
+}
+
+static int check_reg_const_str(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno)
+{
+ struct bpf_map *map = reg->map_ptr;
+ int err;
+ int map_off;
+ u64 map_addr;
+ char *str_ptr;
+
+ if (reg->type != PTR_TO_MAP_VALUE)
+ return -EINVAL;
+
+ if (!bpf_map_is_rdonly(map)) {
+ verbose(env, "R%d does not point to a readonly map'\n", regno);
+ return -EACCES;
+ }
+
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "R%d is not a constant address'\n", regno);
+ return -EACCES;
+ }
+
+ if (!map->ops->map_direct_value_addr) {
+ verbose(env, "no direct value access support for this map type\n");
+ return -EACCES;
+ }
+
+ err = check_map_access(env, regno, reg->off,
+ map->value_size - reg->off, false,
+ ACCESS_HELPER);
+ if (err)
+ return err;
+
+ map_off = reg->off + reg->var_off.value;
+ err = map->ops->map_direct_value_addr(map, &map_addr, map_off);
+ if (err) {
+ verbose(env, "direct value access on string failed\n");
+ return err;
+ }
+
+ str_ptr = (char *)(long)(map_addr);
+ if (!strnchr(str_ptr + map_off, map->value_size - map_off, 0)) {
+ verbose(env, "string is not zero-terminated\n");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/* Returns constant key value in `value` if possible, else negative error */
+static int get_constant_map_key(struct bpf_verifier_env *env,
+ struct bpf_reg_state *key,
+ u32 key_size,
+ s64 *value)
+{
+ struct bpf_func_state *state = func(env, key);
+ struct bpf_reg_state *reg;
+ int slot, spi, off;
+ int spill_size = 0;
+ int zero_size = 0;
+ int stack_off;
+ int i, err;
+ u8 *stype;
+
+ if (!env->bpf_capable)
+ return -EOPNOTSUPP;
+ if (key->type != PTR_TO_STACK)
+ return -EOPNOTSUPP;
+ if (!tnum_is_const(key->var_off))
+ return -EOPNOTSUPP;
+
+ stack_off = key->off + key->var_off.value;
+ slot = -stack_off - 1;
+ spi = slot / BPF_REG_SIZE;
+ off = slot % BPF_REG_SIZE;
+ stype = state->stack[spi].slot_type;
+
+ /* First handle precisely tracked STACK_ZERO */
+ for (i = off; i >= 0 && stype[i] == STACK_ZERO; i--)
+ zero_size++;
+ if (zero_size >= key_size) {
+ *value = 0;
+ return 0;
}
+ /* Check that stack contains a scalar spill of expected size */
+ if (!is_spilled_scalar_reg(&state->stack[spi]))
+ return -EOPNOTSUPP;
+ for (i = off; i >= 0 && stype[i] == STACK_SPILL; i--)
+ spill_size++;
+ if (spill_size != key_size)
+ return -EOPNOTSUPP;
+
+ reg = &state->stack[spi].spilled_ptr;
+ if (!tnum_is_const(reg->var_off))
+ /* Stack value not statically known */
+ return -EOPNOTSUPP;
+
+ /* We are relying on a constant value. So mark as precise
+ * to prevent pruning on it.
+ */
+ bt_set_frame_slot(&env->bt, key->frameno, spi);
+ err = mark_chain_precision_batch(env, env->cur_state);
+ if (err < 0)
+ return err;
+
+ *value = reg->var_off.value;
return 0;
}
+static bool can_elide_value_nullness(enum bpf_map_type type);
+
static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
struct bpf_call_arg_meta *meta,
- const struct bpf_func_proto *fn)
+ const struct bpf_func_proto *fn,
+ int insn_idx)
{
u32 regno = BPF_REG_1 + arg;
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
enum bpf_arg_type arg_type = fn->arg_type[arg];
enum bpf_reg_type type = reg->type;
+ u32 *arg_btf_id = NULL;
+ u32 key_size;
int err = 0;
if (arg_type == ARG_DONTCARE)
@@ -5292,8 +9743,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
return -EACCES;
}
- if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE ||
- base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) {
+ if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE) {
err = resolve_map_arg_type(env, meta, &arg_type);
if (err)
return err;
@@ -5305,51 +9755,64 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
*/
goto skip_type_check;
- err = check_reg_type(env, regno, arg_type, fn->arg_btf_id[arg]);
+ /* arg_btf_id and arg_size are in a union. */
+ if (base_type(arg_type) == ARG_PTR_TO_BTF_ID ||
+ base_type(arg_type) == ARG_PTR_TO_SPIN_LOCK)
+ arg_btf_id = fn->arg_btf_id[arg];
+
+ err = check_reg_type(env, regno, arg_type, arg_btf_id, meta);
if (err)
return err;
- switch ((u32)type) {
- case SCALAR_VALUE:
- /* Pointer types where reg offset is explicitly allowed: */
- case PTR_TO_PACKET:
- case PTR_TO_PACKET_META:
- case PTR_TO_MAP_KEY:
- case PTR_TO_MAP_VALUE:
- case PTR_TO_MEM:
- case PTR_TO_MEM | MEM_RDONLY:
- case PTR_TO_MEM | MEM_ALLOC:
- case PTR_TO_BUF:
- case PTR_TO_BUF | MEM_RDONLY:
- case PTR_TO_STACK:
- /* Some of the argument types nevertheless require a
- * zero register offset.
- */
- if (arg_type == ARG_PTR_TO_ALLOC_MEM)
- goto force_off_check;
- break;
- /* All the rest must be rejected: */
- default:
-force_off_check:
- err = __check_ptr_off_reg(env, reg, regno,
- type == PTR_TO_BTF_ID);
- if (err < 0)
- return err;
- break;
- }
+ err = check_func_arg_reg_off(env, reg, regno, arg_type);
+ if (err)
+ return err;
skip_type_check:
- if (reg->ref_obj_id) {
+ if (arg_type_is_release(arg_type)) {
+ if (arg_type_is_dynptr(arg_type)) {
+ struct bpf_func_state *state = func(env, reg);
+ int spi;
+
+ /* Only dynptr created on stack can be released, thus
+ * the get_spi and stack state checks for spilled_ptr
+ * should only be done before process_dynptr_func for
+ * PTR_TO_STACK.
+ */
+ if (reg->type == PTR_TO_STACK) {
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0 || !state->stack[spi].spilled_ptr.ref_obj_id) {
+ verbose(env, "arg %d is an unacquired reference\n", regno);
+ return -EINVAL;
+ }
+ } else {
+ verbose(env, "cannot release unowned const bpf_dynptr\n");
+ return -EINVAL;
+ }
+ } else if (!reg->ref_obj_id && !register_is_null(reg)) {
+ verbose(env, "R%d must be referenced when passed to release function\n",
+ regno);
+ return -EINVAL;
+ }
+ if (meta->release_regno) {
+ verifier_bug(env, "more than one release argument");
+ return -EFAULT;
+ }
+ meta->release_regno = regno;
+ }
+
+ if (reg->ref_obj_id && base_type(arg_type) != ARG_KPTR_XCHG_DEST) {
if (meta->ref_obj_id) {
- verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
+ verbose(env, "more than one arg with ref_obj_id R%d %u %u",
regno, reg->ref_obj_id,
meta->ref_obj_id);
- return -EFAULT;
+ return -EACCES;
}
meta->ref_obj_id = reg->ref_obj_id;
}
- if (arg_type == ARG_CONST_MAP_PTR) {
+ switch (base_type(arg_type)) {
+ case ARG_CONST_MAP_PTR:
/* bpf_map_xxx(map_ptr) call: remember that map_ptr */
if (meta->map_ptr) {
/* Use map_uid (which is unique id of inner map) to reject:
@@ -5374,7 +9837,8 @@ skip_type_check:
}
meta->map_ptr = reg->map_ptr;
meta->map_uid = reg->map_uid;
- } else if (arg_type == ARG_PTR_TO_MAP_KEY) {
+ break;
+ case ARG_PTR_TO_MAP_KEY:
/* bpf_map_xxx(..., map_ptr, ..., key) call:
* check that [key, key + map->key_size) are within
* stack limits and initialized
@@ -5385,14 +9849,25 @@ skip_type_check:
* we have to check map_key here. Otherwise it means
* that kernel subsystem misconfigured verifier
*/
- verbose(env, "invalid map_ptr to access map->key\n");
- return -EACCES;
+ verifier_bug(env, "invalid map_ptr to access map->key");
+ return -EFAULT;
+ }
+ key_size = meta->map_ptr->key_size;
+ err = check_helper_mem_access(env, regno, key_size, BPF_READ, false, NULL);
+ if (err)
+ return err;
+ if (can_elide_value_nullness(meta->map_ptr->map_type)) {
+ err = get_constant_map_key(env, reg, key_size, &meta->const_map_key);
+ if (err < 0) {
+ meta->const_map_key = -1;
+ if (err == -EOPNOTSUPP)
+ err = 0;
+ else
+ return err;
+ }
}
- err = check_helper_mem_access(env, regno,
- meta->map_ptr->key_size, false,
- NULL);
- } else if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE ||
- base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) {
+ break;
+ case ARG_PTR_TO_MAP_VALUE:
if (type_may_be_null(arg_type) && register_is_null(reg))
return 0;
@@ -5401,141 +9876,103 @@ skip_type_check:
*/
if (!meta->map_ptr) {
/* kernel subsystem misconfigured verifier */
- verbose(env, "invalid map_ptr to access map->value\n");
- return -EACCES;
+ verifier_bug(env, "invalid map_ptr to access map->value");
+ return -EFAULT;
}
- meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE);
- err = check_helper_mem_access(env, regno,
- meta->map_ptr->value_size, false,
- meta);
- } else if (arg_type == ARG_PTR_TO_PERCPU_BTF_ID) {
+ meta->raw_mode = arg_type & MEM_UNINIT;
+ err = check_helper_mem_access(env, regno, meta->map_ptr->value_size,
+ arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ,
+ false, meta);
+ break;
+ case ARG_PTR_TO_PERCPU_BTF_ID:
if (!reg->btf_id) {
verbose(env, "Helper has invalid btf_id in R%d\n", regno);
return -EACCES;
}
meta->ret_btf = reg->btf;
meta->ret_btf_id = reg->btf_id;
- } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {
+ break;
+ case ARG_PTR_TO_SPIN_LOCK:
+ if (in_rbtree_lock_required_cb(env)) {
+ verbose(env, "can't spin_{lock,unlock} in rbtree cb\n");
+ return -EACCES;
+ }
if (meta->func_id == BPF_FUNC_spin_lock) {
- if (process_spin_lock(env, regno, true))
- return -EACCES;
+ err = process_spin_lock(env, regno, PROCESS_SPIN_LOCK);
+ if (err)
+ return err;
} else if (meta->func_id == BPF_FUNC_spin_unlock) {
- if (process_spin_lock(env, regno, false))
- return -EACCES;
+ err = process_spin_lock(env, regno, 0);
+ if (err)
+ return err;
} else {
- verbose(env, "verifier internal error\n");
+ verifier_bug(env, "spin lock arg on unexpected helper");
return -EFAULT;
}
- } else if (arg_type == ARG_PTR_TO_TIMER) {
- if (process_timer_func(env, regno, meta))
- return -EACCES;
- } else if (arg_type == ARG_PTR_TO_FUNC) {
+ break;
+ case ARG_PTR_TO_TIMER:
+ err = process_timer_func(env, regno, meta);
+ if (err)
+ return err;
+ break;
+ case ARG_PTR_TO_FUNC:
meta->subprogno = reg->subprogno;
- } else if (arg_type_is_mem_ptr(arg_type)) {
+ break;
+ case ARG_PTR_TO_MEM:
/* The access to this pointer is only checked when we hit the
* next is_mem_size argument below.
*/
- meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MEM);
- } else if (arg_type_is_mem_size(arg_type)) {
- bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
-
- /* This is used to refine r0 return value bounds for helpers
- * that enforce this value as an upper bound on return values.
- * See do_refine_retval_range() for helpers that can refine
- * the return value. C type of helper is u32 so we pull register
- * bound from umax_value however, if negative verifier errors
- * out. Only upper bounds can be learned because retval is an
- * int type and negative retvals are allowed.
- */
- meta->msize_max_value = reg->umax_value;
-
- /* The register is SCALAR_VALUE; the access check
- * happens using its boundaries.
- */
- if (!tnum_is_const(reg->var_off))
- /* For unprivileged variable accesses, disable raw
- * mode so that the program is required to
- * initialize all the memory that the helper could
- * just partially fill up.
- */
- meta = NULL;
-
- if (reg->smin_value < 0) {
- verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
- regno);
- return -EACCES;
- }
-
- if (reg->umin_value == 0) {
- err = check_helper_mem_access(env, regno - 1, 0,
- zero_size_allowed,
- meta);
+ meta->raw_mode = arg_type & MEM_UNINIT;
+ if (arg_type & MEM_FIXED_SIZE) {
+ err = check_helper_mem_access(env, regno, fn->arg_size[arg],
+ arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ,
+ false, meta);
if (err)
return err;
+ if (arg_type & MEM_ALIGNED)
+ err = check_ptr_alignment(env, reg, 0, fn->arg_size[arg], true);
}
-
- if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
- verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
- regno);
- return -EACCES;
- }
- err = check_helper_mem_access(env, regno - 1,
- reg->umax_value,
- zero_size_allowed, meta);
- if (!err)
- err = mark_chain_precision(env, regno);
- } else if (arg_type_is_alloc_size(arg_type)) {
+ break;
+ case ARG_CONST_SIZE:
+ err = check_mem_size_reg(env, reg, regno,
+ fn->arg_type[arg - 1] & MEM_WRITE ?
+ BPF_WRITE : BPF_READ,
+ false, meta);
+ break;
+ case ARG_CONST_SIZE_OR_ZERO:
+ err = check_mem_size_reg(env, reg, regno,
+ fn->arg_type[arg - 1] & MEM_WRITE ?
+ BPF_WRITE : BPF_READ,
+ true, meta);
+ break;
+ case ARG_PTR_TO_DYNPTR:
+ err = process_dynptr_func(env, regno, insn_idx, arg_type, 0);
+ if (err)
+ return err;
+ break;
+ case ARG_CONST_ALLOC_SIZE_OR_ZERO:
if (!tnum_is_const(reg->var_off)) {
verbose(env, "R%d is not a known constant'\n",
regno);
return -EACCES;
}
meta->mem_size = reg->var_off.value;
- } else if (arg_type_is_int_ptr(arg_type)) {
- int size = int_ptr_type_to_size(arg_type);
-
- err = check_helper_mem_access(env, regno, size, false, meta);
+ err = mark_chain_precision(env, regno);
if (err)
return err;
- err = check_ptr_alignment(env, reg, 0, size, true);
- } else if (arg_type == ARG_PTR_TO_CONST_STR) {
- struct bpf_map *map = reg->map_ptr;
- int map_off;
- u64 map_addr;
- char *str_ptr;
-
- if (!bpf_map_is_rdonly(map)) {
- verbose(env, "R%d does not point to a readonly map'\n", regno);
- return -EACCES;
- }
-
- if (!tnum_is_const(reg->var_off)) {
- verbose(env, "R%d is not a constant address'\n", regno);
- return -EACCES;
- }
-
- if (!map->ops->map_direct_value_addr) {
- verbose(env, "no direct value access support for this map type\n");
- return -EACCES;
- }
-
- err = check_map_access(env, regno, reg->off,
- map->value_size - reg->off, false);
+ break;
+ case ARG_PTR_TO_CONST_STR:
+ {
+ err = check_reg_const_str(env, reg, regno);
if (err)
return err;
-
- map_off = reg->off + reg->var_off.value;
- err = map->ops->map_direct_value_addr(map, &map_addr, map_off);
- if (err) {
- verbose(env, "direct value access on string failed\n");
+ break;
+ }
+ case ARG_KPTR_XCHG_DEST:
+ err = process_kptr_func(env, regno, meta);
+ if (err)
return err;
- }
-
- str_ptr = (char *)(long)(map_addr);
- if (!strnchr(str_ptr + map_off, map->value_size - map_off, 0)) {
- verbose(env, "string is not zero-terminated\n");
- return -EINVAL;
- }
+ break;
}
return err;
@@ -5546,7 +9983,8 @@ static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id)
enum bpf_attach_type eatype = env->prog->expected_attach_type;
enum bpf_prog_type type = resolve_prog_type(env->prog);
- if (func_id != BPF_FUNC_map_update_elem)
+ if (func_id != BPF_FUNC_map_update_elem &&
+ func_id != BPF_FUNC_map_delete_elem)
return false;
/* It's not possible to get access to a locked struct sock in these
@@ -5557,6 +9995,11 @@ static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id)
if (eatype == BPF_TRACE_ITER)
return true;
break;
+ case BPF_PROG_TYPE_SOCK_OPS:
+ /* map_update allowed only via dedicated helpers with event type checks */
+ if (func_id == BPF_FUNC_map_delete_elem)
+ return true;
+ break;
case BPF_PROG_TYPE_SOCKET_FILTER:
case BPF_PROG_TYPE_SCHED_CLS:
case BPF_PROG_TYPE_SCHED_ACT:
@@ -5575,7 +10018,8 @@ static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id)
static bool allow_tail_call_in_subprogs(struct bpf_verifier_env *env)
{
- return env->prog->jit_requested && IS_ENABLED(CONFIG_X86_64);
+ return env->prog->jit_requested &&
+ bpf_jit_supports_subprog_tailcalls();
}
static int check_map_func_compatibility(struct bpf_verifier_env *env,
@@ -5601,7 +10045,14 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_RINGBUF:
if (func_id != BPF_FUNC_ringbuf_output &&
func_id != BPF_FUNC_ringbuf_reserve &&
- func_id != BPF_FUNC_ringbuf_query)
+ func_id != BPF_FUNC_ringbuf_query &&
+ func_id != BPF_FUNC_ringbuf_reserve_dynptr &&
+ func_id != BPF_FUNC_ringbuf_submit_dynptr &&
+ func_id != BPF_FUNC_ringbuf_discard_dynptr)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_USER_RINGBUF:
+ if (func_id != BPF_FUNC_user_ringbuf_drain)
goto error;
break;
case BPF_MAP_TYPE_STACK_TRACE:
@@ -5644,7 +10095,6 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_SOCKMAP:
if (func_id != BPF_FUNC_sk_redirect_map &&
func_id != BPF_FUNC_sock_map_update &&
- func_id != BPF_FUNC_map_delete_elem &&
func_id != BPF_FUNC_msg_redirect_map &&
func_id != BPF_FUNC_sk_select_reuseport &&
func_id != BPF_FUNC_map_lookup_elem &&
@@ -5654,7 +10104,6 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_SOCKHASH:
if (func_id != BPF_FUNC_sk_redirect_hash &&
func_id != BPF_FUNC_sock_hash_update &&
- func_id != BPF_FUNC_map_delete_elem &&
func_id != BPF_FUNC_msg_redirect_hash &&
func_id != BPF_FUNC_sk_select_reuseport &&
func_id != BPF_FUNC_map_lookup_elem &&
@@ -5674,17 +10123,26 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
break;
case BPF_MAP_TYPE_SK_STORAGE:
if (func_id != BPF_FUNC_sk_storage_get &&
- func_id != BPF_FUNC_sk_storage_delete)
+ func_id != BPF_FUNC_sk_storage_delete &&
+ func_id != BPF_FUNC_kptr_xchg)
goto error;
break;
case BPF_MAP_TYPE_INODE_STORAGE:
if (func_id != BPF_FUNC_inode_storage_get &&
- func_id != BPF_FUNC_inode_storage_delete)
+ func_id != BPF_FUNC_inode_storage_delete &&
+ func_id != BPF_FUNC_kptr_xchg)
goto error;
break;
case BPF_MAP_TYPE_TASK_STORAGE:
if (func_id != BPF_FUNC_task_storage_get &&
- func_id != BPF_FUNC_task_storage_delete)
+ func_id != BPF_FUNC_task_storage_delete &&
+ func_id != BPF_FUNC_kptr_xchg)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_CGRP_STORAGE:
+ if (func_id != BPF_FUNC_cgrp_storage_get &&
+ func_id != BPF_FUNC_cgrp_storage_delete &&
+ func_id != BPF_FUNC_kptr_xchg)
goto error;
break;
case BPF_MAP_TYPE_BLOOM_FILTER:
@@ -5692,6 +10150,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_map_push_elem)
goto error;
break;
+ case BPF_MAP_TYPE_INSN_ARRAY:
+ goto error;
default:
break;
}
@@ -5702,7 +10162,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
goto error;
if (env->subprog_cnt > 1 && !allow_tail_call_in_subprogs(env)) {
- verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n");
+ verbose(env, "mixing of tail_calls and bpf-to-bpf calls is not supported\n");
return -EINVAL;
}
break;
@@ -5717,9 +10177,16 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
case BPF_FUNC_ringbuf_output:
case BPF_FUNC_ringbuf_reserve:
case BPF_FUNC_ringbuf_query:
+ case BPF_FUNC_ringbuf_reserve_dynptr:
+ case BPF_FUNC_ringbuf_submit_dynptr:
+ case BPF_FUNC_ringbuf_discard_dynptr:
if (map->map_type != BPF_MAP_TYPE_RINGBUF)
goto error;
break;
+ case BPF_FUNC_user_ringbuf_drain:
+ if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF)
+ goto error;
+ break;
case BPF_FUNC_get_stackid:
if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
goto error;
@@ -5771,6 +10238,12 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
map->map_type != BPF_MAP_TYPE_BLOOM_FILTER)
goto error;
break;
+ case BPF_FUNC_map_lookup_percpu_elem:
+ if (map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH)
+ goto error;
+ break;
case BPF_FUNC_sk_storage_get:
case BPF_FUNC_sk_storage_delete:
if (map->map_type != BPF_MAP_TYPE_SK_STORAGE)
@@ -5786,6 +10259,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE)
goto error;
break;
+ case BPF_FUNC_cgrp_storage_get:
+ case BPF_FUNC_cgrp_storage_delete:
+ if (map->map_type != BPF_MAP_TYPE_CGRP_STORAGE)
+ goto error;
+ break;
default:
break;
}
@@ -5801,15 +10279,15 @@ static bool check_raw_mode_ok(const struct bpf_func_proto *fn)
{
int count = 0;
- if (fn->arg1_type == ARG_PTR_TO_UNINIT_MEM)
+ if (arg_type_is_raw_mem(fn->arg1_type))
count++;
- if (fn->arg2_type == ARG_PTR_TO_UNINIT_MEM)
+ if (arg_type_is_raw_mem(fn->arg2_type))
count++;
- if (fn->arg3_type == ARG_PTR_TO_UNINIT_MEM)
+ if (arg_type_is_raw_mem(fn->arg3_type))
count++;
- if (fn->arg4_type == ARG_PTR_TO_UNINIT_MEM)
+ if (arg_type_is_raw_mem(fn->arg4_type))
count++;
- if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM)
+ if (arg_type_is_raw_mem(fn->arg5_type))
count++;
/* We only support one arg being in raw mode at the moment,
@@ -5819,13 +10297,19 @@ static bool check_raw_mode_ok(const struct bpf_func_proto *fn)
return count <= 1;
}
-static bool check_args_pair_invalid(enum bpf_arg_type arg_curr,
- enum bpf_arg_type arg_next)
+static bool check_args_pair_invalid(const struct bpf_func_proto *fn, int arg)
{
- return (arg_type_is_mem_ptr(arg_curr) &&
- !arg_type_is_mem_size(arg_next)) ||
- (!arg_type_is_mem_ptr(arg_curr) &&
- arg_type_is_mem_size(arg_next));
+ bool is_fixed = fn->arg_type[arg] & MEM_FIXED_SIZE;
+ bool has_size = fn->arg_size[arg] != 0;
+ bool is_next_size = false;
+
+ if (arg + 1 < ARRAY_SIZE(fn->arg_type))
+ is_next_size = arg_type_is_mem_size(fn->arg_type[arg + 1]);
+
+ if (base_type(fn->arg_type[arg]) != ARG_PTR_TO_MEM)
+ return is_next_size;
+
+ return has_size == is_next_size || is_next_size == is_fixed;
}
static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
@@ -5836,52 +10320,29 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
* helper function specification.
*/
if (arg_type_is_mem_size(fn->arg1_type) ||
- arg_type_is_mem_ptr(fn->arg5_type) ||
- check_args_pair_invalid(fn->arg1_type, fn->arg2_type) ||
- check_args_pair_invalid(fn->arg2_type, fn->arg3_type) ||
- check_args_pair_invalid(fn->arg3_type, fn->arg4_type) ||
- check_args_pair_invalid(fn->arg4_type, fn->arg5_type))
+ check_args_pair_invalid(fn, 0) ||
+ check_args_pair_invalid(fn, 1) ||
+ check_args_pair_invalid(fn, 2) ||
+ check_args_pair_invalid(fn, 3) ||
+ check_args_pair_invalid(fn, 4))
return false;
return true;
}
-static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id)
-{
- int count = 0;
-
- if (arg_type_may_be_refcounted(fn->arg1_type))
- count++;
- if (arg_type_may_be_refcounted(fn->arg2_type))
- count++;
- if (arg_type_may_be_refcounted(fn->arg3_type))
- count++;
- if (arg_type_may_be_refcounted(fn->arg4_type))
- count++;
- if (arg_type_may_be_refcounted(fn->arg5_type))
- count++;
-
- /* A reference acquiring function cannot acquire
- * another refcounted ptr.
- */
- if (may_be_acquire_function(func_id) && count)
- return false;
-
- /* We only support one arg being unreferenced at the moment,
- * which is sufficient for the helper functions we have right now.
- */
- return count <= 1;
-}
-
static bool check_btf_id_ok(const struct bpf_func_proto *fn)
{
int i;
for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) {
- if (fn->arg_type[i] == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i])
- return false;
-
- if (fn->arg_type[i] != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i])
+ if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID)
+ return !!fn->arg_btf_id[i];
+ if (base_type(fn->arg_type[i]) == ARG_PTR_TO_SPIN_LOCK)
+ return fn->arg_btf_id[i] == BPF_PTR_POISON;
+ if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i] &&
+ /* arg_btf_id and arg_size are in a union. */
+ (base_type(fn->arg_type[i]) != ARG_PTR_TO_MEM ||
+ !(fn->arg_type[i] & MEM_FIXED_SIZE)))
return false;
}
@@ -5892,38 +10353,24 @@ static int check_func_proto(const struct bpf_func_proto *fn, int func_id)
{
return check_raw_mode_ok(fn) &&
check_arg_pair_ok(fn) &&
- check_btf_id_ok(fn) &&
- check_refcount_ok(fn, func_id) ? 0 : -EINVAL;
+ check_btf_id_ok(fn) ? 0 : -EINVAL;
}
/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
* are now invalid, so turn them into unknown SCALAR_VALUE.
+ *
+ * This also applies to dynptr slices belonging to skb and xdp dynptrs,
+ * since these slices point to packet data.
*/
-static void __clear_all_pkt_pointers(struct bpf_verifier_env *env,
- struct bpf_func_state *state)
-{
- struct bpf_reg_state *regs = state->regs, *reg;
- int i;
-
- for (i = 0; i < MAX_BPF_REG; i++)
- if (reg_is_pkt_pointer_any(&regs[i]))
- mark_reg_unknown(env, regs, i);
-
- bpf_for_each_spilled_reg(i, state, reg) {
- if (!reg)
- continue;
- if (reg_is_pkt_pointer_any(reg))
- __mark_reg_unknown(env, reg);
- }
-}
-
static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
{
- struct bpf_verifier_state *vstate = env->cur_state;
- int i;
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
- for (i = 0; i <= vstate->curframe; i++)
- __clear_all_pkt_pointers(env, vstate->frame[i]);
+ bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
+ if (reg_is_pkt_pointer_any(reg) || reg_is_dynptr_slice_pkt(reg))
+ mark_reg_invalid(env, reg);
+ }));
}
enum {
@@ -5952,45 +10399,56 @@ static void mark_pkt_end(struct bpf_verifier_state *vstate, int regn, bool range
reg->range = AT_PKT_END;
}
-static void release_reg_references(struct bpf_verifier_env *env,
- struct bpf_func_state *state,
- int ref_obj_id)
+static int release_reference_nomark(struct bpf_verifier_state *state, int ref_obj_id)
{
- struct bpf_reg_state *regs = state->regs, *reg;
int i;
- for (i = 0; i < MAX_BPF_REG; i++)
- if (regs[i].ref_obj_id == ref_obj_id)
- mark_reg_unknown(env, regs, i);
-
- bpf_for_each_spilled_reg(i, state, reg) {
- if (!reg)
+ for (i = 0; i < state->acquired_refs; i++) {
+ if (state->refs[i].type != REF_TYPE_PTR)
continue;
- if (reg->ref_obj_id == ref_obj_id)
- __mark_reg_unknown(env, reg);
+ if (state->refs[i].id == ref_obj_id) {
+ release_reference_state(state, i);
+ return 0;
+ }
}
+ return -EINVAL;
}
/* The pointer with the specified id has released its reference to kernel
* resources. Identify all copies of the same pointer and clear the reference.
+ *
+ * This is the release function corresponding to acquire_reference(). Idempotent.
*/
-static int release_reference(struct bpf_verifier_env *env,
- int ref_obj_id)
+static int release_reference(struct bpf_verifier_env *env, int ref_obj_id)
{
struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
int err;
- int i;
- err = release_reference_state(cur_func(env), ref_obj_id);
+ err = release_reference_nomark(vstate, ref_obj_id);
if (err)
return err;
- for (i = 0; i <= vstate->curframe; i++)
- release_reg_references(env, vstate->frame[i], ref_obj_id);
+ bpf_for_each_reg_in_vstate(vstate, state, reg, ({
+ if (reg->ref_obj_id == ref_obj_id)
+ mark_reg_invalid(env, reg);
+ }));
return 0;
}
+static void invalidate_non_owning_refs(struct bpf_verifier_env *env)
+{
+ struct bpf_func_state *unused;
+ struct bpf_reg_state *reg;
+
+ bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({
+ if (type_is_non_owning_ref(reg->type))
+ mark_reg_invalid(env, reg);
+ }));
+}
+
static void clear_caller_saved_regs(struct bpf_verifier_env *env,
struct bpf_reg_state *regs)
{
@@ -5999,7 +10457,7 @@ static void clear_caller_saved_regs(struct bpf_verifier_env *env,
/* after the call registers r0 - r5 were scratched */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
mark_reg_not_init(env, regs, caller_saved[i]);
- check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
+ __check_reg_arg(env, regs, caller_saved[i], DST_OP_NO_MARK);
}
}
@@ -6008,15 +10466,16 @@ typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env,
struct bpf_func_state *callee,
int insn_idx);
-static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
- int *insn_idx, int subprog,
- set_callee_state_fn set_callee_state_cb)
+static int set_callee_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee, int insn_idx);
+
+static int setup_func_entry(struct bpf_verifier_env *env, int subprog, int callsite,
+ set_callee_state_fn set_callee_state_cb,
+ struct bpf_verifier_state *state)
{
- struct bpf_verifier_state *state = env->cur_state;
- struct bpf_func_info_aux *func_info_aux;
struct bpf_func_state *caller, *callee;
int err;
- bool is_global = false;
if (state->curframe + 1 >= MAX_CALL_FRAMES) {
verbose(env, "the call stack of %d frames is too deep\n",
@@ -6024,104 +10483,318 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return -E2BIG;
}
- caller = state->frame[state->curframe];
if (state->frame[state->curframe + 1]) {
- verbose(env, "verifier bug. Frame %d already allocated\n",
- state->curframe + 1);
+ verifier_bug(env, "Frame %d already allocated", state->curframe + 1);
return -EFAULT;
}
- func_info_aux = env->prog->aux->func_info_aux;
- if (func_info_aux)
- is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
- err = btf_check_subprog_arg_match(env, subprog, caller->regs);
- if (err == -EFAULT)
- return err;
- if (is_global) {
- if (err) {
- verbose(env, "Caller passes invalid args into func#%d\n",
- subprog);
- return err;
- } else {
- if (env->log.level & BPF_LOG_LEVEL)
- verbose(env,
- "Func#%d is global and valid. Skipping.\n",
- subprog);
- clear_caller_saved_regs(env, caller->regs);
+ caller = state->frame[state->curframe];
+ callee = kzalloc(sizeof(*callee), GFP_KERNEL_ACCOUNT);
+ if (!callee)
+ return -ENOMEM;
+ state->frame[state->curframe + 1] = callee;
+
+ /* callee cannot access r0, r6 - r9 for reading and has to write
+ * into its own stack before reading from it.
+ * callee can read/write into caller's stack
+ */
+ init_func_state(env, callee,
+ /* remember the callsite, it will be used by bpf_exit */
+ callsite,
+ state->curframe + 1 /* frameno within this callchain */,
+ subprog /* subprog number within this prog */);
+ err = set_callee_state_cb(env, caller, callee, callsite);
+ if (err)
+ goto err_out;
- /* All global functions return a 64-bit SCALAR_VALUE */
- mark_reg_unknown(env, caller->regs, BPF_REG_0);
- caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
+ /* only increment it after check_reg_arg() finished */
+ state->curframe++;
- /* continue with next insn after call */
- return 0;
+ return 0;
+
+err_out:
+ free_func_state(callee);
+ state->frame[state->curframe + 1] = NULL;
+ return err;
+}
+
+static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
+ const struct btf *btf,
+ struct bpf_reg_state *regs)
+{
+ struct bpf_subprog_info *sub = subprog_info(env, subprog);
+ struct bpf_verifier_log *log = &env->log;
+ u32 i;
+ int ret;
+
+ ret = btf_prepare_func_args(env, subprog);
+ if (ret)
+ return ret;
+
+ /* check that BTF function arguments match actual types that the
+ * verifier sees.
+ */
+ for (i = 0; i < sub->arg_cnt; i++) {
+ u32 regno = i + 1;
+ struct bpf_reg_state *reg = &regs[regno];
+ struct bpf_subprog_arg_info *arg = &sub->args[i];
+
+ if (arg->arg_type == ARG_ANYTHING) {
+ if (reg->type != SCALAR_VALUE) {
+ bpf_log(log, "R%d is not a scalar\n", regno);
+ return -EINVAL;
+ }
+ } else if (arg->arg_type & PTR_UNTRUSTED) {
+ /*
+ * Anything is allowed for untrusted arguments, as these are
+ * read-only and probe read instructions would protect against
+ * invalid memory access.
+ */
+ } else if (arg->arg_type == ARG_PTR_TO_CTX) {
+ ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE);
+ if (ret < 0)
+ return ret;
+ /* If function expects ctx type in BTF check that caller
+ * is passing PTR_TO_CTX.
+ */
+ if (reg->type != PTR_TO_CTX) {
+ bpf_log(log, "arg#%d expects pointer to ctx\n", i);
+ return -EINVAL;
+ }
+ } else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) {
+ ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE);
+ if (ret < 0)
+ return ret;
+ if (check_mem_reg(env, reg, regno, arg->mem_size))
+ return -EINVAL;
+ if (!(arg->arg_type & PTR_MAYBE_NULL) && (reg->type & PTR_MAYBE_NULL)) {
+ bpf_log(log, "arg#%d is expected to be non-NULL\n", i);
+ return -EINVAL;
+ }
+ } else if (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) {
+ /*
+ * Can pass any value and the kernel won't crash, but
+ * only PTR_TO_ARENA or SCALAR make sense. Everything
+ * else is a bug in the bpf program. Point it out to
+ * the user at the verification time instead of
+ * run-time debug nightmare.
+ */
+ if (reg->type != PTR_TO_ARENA && reg->type != SCALAR_VALUE) {
+ bpf_log(log, "R%d is not a pointer to arena or scalar.\n", regno);
+ return -EINVAL;
+ }
+ } else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) {
+ ret = check_func_arg_reg_off(env, reg, regno, ARG_PTR_TO_DYNPTR);
+ if (ret)
+ return ret;
+
+ ret = process_dynptr_func(env, regno, -1, arg->arg_type, 0);
+ if (ret)
+ return ret;
+ } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) {
+ struct bpf_call_arg_meta meta;
+ int err;
+
+ if (register_is_null(reg) && type_may_be_null(arg->arg_type))
+ continue;
+
+ memset(&meta, 0, sizeof(meta)); /* leave func_id as zero */
+ err = check_reg_type(env, regno, arg->arg_type, &arg->btf_id, &meta);
+ err = err ?: check_func_arg_reg_off(env, reg, regno, arg->arg_type);
+ if (err)
+ return err;
+ } else {
+ verifier_bug(env, "unrecognized arg#%d type %d", i, arg->arg_type);
+ return -EFAULT;
}
}
- if (insn->code == (BPF_JMP | BPF_CALL) &&
- insn->src_reg == 0 &&
- insn->imm == BPF_FUNC_timer_set_callback) {
+ return 0;
+}
+
+/* Compare BTF of a function call with given bpf_reg_state.
+ * Returns:
+ * EFAULT - there is a verifier bug. Abort verification.
+ * EINVAL - there is a type mismatch or BTF is not available.
+ * 0 - BTF matches with what bpf_reg_state expects.
+ * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
+ */
+static int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
+ struct bpf_reg_state *regs)
+{
+ struct bpf_prog *prog = env->prog;
+ struct btf *btf = prog->aux->btf;
+ u32 btf_id;
+ int err;
+
+ if (!prog->aux->func_info)
+ return -EINVAL;
+
+ btf_id = prog->aux->func_info[subprog].type_id;
+ if (!btf_id)
+ return -EFAULT;
+
+ if (prog->aux->func_info_aux[subprog].unreliable)
+ return -EINVAL;
+
+ err = btf_check_func_arg_match(env, subprog, btf, regs);
+ /* Compiler optimizations can remove arguments from static functions
+ * or mismatched type can be passed into a global function.
+ * In such cases mark the function as unreliable from BTF point of view.
+ */
+ if (err)
+ prog->aux->func_info_aux[subprog].unreliable = true;
+ return err;
+}
+
+static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int insn_idx, int subprog,
+ set_callee_state_fn set_callee_state_cb)
+{
+ struct bpf_verifier_state *state = env->cur_state, *callback_state;
+ struct bpf_func_state *caller, *callee;
+ int err;
+
+ caller = state->frame[state->curframe];
+ err = btf_check_subprog_call(env, subprog, caller->regs);
+ if (err == -EFAULT)
+ return err;
+
+ /* set_callee_state is used for direct subprog calls, but we are
+ * interested in validating only BPF helpers that can call subprogs as
+ * callbacks
+ */
+ env->subprog_info[subprog].is_cb = true;
+ if (bpf_pseudo_kfunc_call(insn) &&
+ !is_callback_calling_kfunc(insn->imm)) {
+ verifier_bug(env, "kfunc %s#%d not marked as callback-calling",
+ func_id_name(insn->imm), insn->imm);
+ return -EFAULT;
+ } else if (!bpf_pseudo_kfunc_call(insn) &&
+ !is_callback_calling_function(insn->imm)) { /* helper */
+ verifier_bug(env, "helper %s#%d not marked as callback-calling",
+ func_id_name(insn->imm), insn->imm);
+ return -EFAULT;
+ }
+
+ if (is_async_callback_calling_insn(insn)) {
struct bpf_verifier_state *async_cb;
- /* there is no real recursion here. timer callbacks are async */
+ /* there is no real recursion here. timer and workqueue callbacks are async */
env->subprog_info[subprog].is_async_cb = true;
async_cb = push_async_cb(env, env->subprog_info[subprog].start,
- *insn_idx, subprog);
- if (!async_cb)
- return -EFAULT;
+ insn_idx, subprog,
+ is_async_cb_sleepable(env, insn));
+ if (IS_ERR(async_cb))
+ return PTR_ERR(async_cb);
callee = async_cb->frame[0];
callee->async_entry_cnt = caller->async_entry_cnt + 1;
/* Convert bpf_timer_set_callback() args into timer callback args */
- err = set_callee_state_cb(env, caller, callee, *insn_idx);
+ err = set_callee_state_cb(env, caller, callee, insn_idx);
if (err)
return err;
- clear_caller_saved_regs(env, caller->regs);
- mark_reg_unknown(env, caller->regs, BPF_REG_0);
- caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
- /* continue with next insn after call */
return 0;
}
- callee = kzalloc(sizeof(*callee), GFP_KERNEL);
- if (!callee)
- return -ENOMEM;
- state->frame[state->curframe + 1] = callee;
-
- /* callee cannot access r0, r6 - r9 for reading and has to write
- * into its own stack before reading from it.
- * callee can read/write into caller's stack
+ /* for callback functions enqueue entry to callback and
+ * proceed with next instruction within current frame.
*/
- init_func_state(env, callee,
- /* remember the callsite, it will be used by bpf_exit */
- *insn_idx /* callsite */,
- state->curframe + 1 /* frameno within this callchain */,
- subprog /* subprog number within this prog */);
+ callback_state = push_stack(env, env->subprog_info[subprog].start, insn_idx, false);
+ if (IS_ERR(callback_state))
+ return PTR_ERR(callback_state);
- /* Transfer references to the callee */
- err = copy_reference_state(callee, caller);
+ err = setup_func_entry(env, subprog, insn_idx, set_callee_state_cb,
+ callback_state);
if (err)
return err;
- err = set_callee_state_cb(env, caller, callee, *insn_idx);
+ callback_state->callback_unroll_depth++;
+ callback_state->frame[callback_state->curframe - 1]->callback_depth++;
+ caller->callback_depth = 0;
+ return 0;
+}
+
+static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int *insn_idx)
+{
+ struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_func_state *caller;
+ int err, subprog, target_insn;
+
+ target_insn = *insn_idx + insn->imm + 1;
+ subprog = find_subprog(env, target_insn);
+ if (verifier_bug_if(subprog < 0, env, "target of func call at insn %d is not a program",
+ target_insn))
+ return -EFAULT;
+
+ caller = state->frame[state->curframe];
+ err = btf_check_subprog_call(env, subprog, caller->regs);
+ if (err == -EFAULT)
+ return err;
+ if (subprog_is_global(env, subprog)) {
+ const char *sub_name = subprog_name(env, subprog);
+
+ if (env->cur_state->active_locks) {
+ verbose(env, "global function calls are not allowed while holding a lock,\n"
+ "use static function instead\n");
+ return -EINVAL;
+ }
+
+ if (env->subprog_info[subprog].might_sleep &&
+ (env->cur_state->active_rcu_locks || env->cur_state->active_preempt_locks ||
+ env->cur_state->active_irq_id || !in_sleepable(env))) {
+ verbose(env, "global functions that may sleep are not allowed in non-sleepable context,\n"
+ "i.e., in a RCU/IRQ/preempt-disabled section, or in\n"
+ "a non-sleepable BPF program context\n");
+ return -EINVAL;
+ }
+
+ if (err) {
+ verbose(env, "Caller passes invalid args into func#%d ('%s')\n",
+ subprog, sub_name);
+ return err;
+ }
+
+ if (env->log.level & BPF_LOG_LEVEL)
+ verbose(env, "Func#%d ('%s') is global and assumed valid.\n",
+ subprog, sub_name);
+ if (env->subprog_info[subprog].changes_pkt_data)
+ clear_all_pkt_pointers(env);
+ /* mark global subprog for verifying after main prog */
+ subprog_aux(env, subprog)->called = true;
+ clear_caller_saved_regs(env, caller->regs);
+
+ /* All global functions return a 64-bit SCALAR_VALUE */
+ mark_reg_unknown(env, caller->regs, BPF_REG_0);
+ caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
+
+ /* continue with next insn after call */
+ return 0;
+ }
+
+ /* for regular function entry setup new frame and continue
+ * from that frame.
+ */
+ err = setup_func_entry(env, subprog, *insn_idx, set_callee_state, state);
if (err)
return err;
clear_caller_saved_regs(env, caller->regs);
- /* only increment it after check_reg_arg() finished */
- state->curframe++;
-
/* and go analyze first insn of the callee */
*insn_idx = env->subprog_info[subprog].start - 1;
+ bpf_reset_live_stack_callchain(env);
+
if (env->log.level & BPF_LOG_LEVEL) {
verbose(env, "caller:\n");
- print_verifier_state(env, caller, true);
+ print_verifier_state(env, state, caller->frameno, true);
verbose(env, "callee:\n");
- print_verifier_state(env, callee, true);
+ print_verifier_state(env, state, state->curframe, true);
}
+
return 0;
}
@@ -6166,22 +10839,6 @@ static int set_callee_state(struct bpf_verifier_env *env,
return 0;
}
-static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
- int *insn_idx)
-{
- int subprog, target_insn;
-
- target_insn = *insn_idx + insn->imm + 1;
- subprog = find_subprog(env, target_insn);
- if (subprog < 0) {
- verbose(env, "verifier bug. No program starts at insn %d\n",
- target_insn);
- return -EFAULT;
- }
-
- return __check_func_call(env, insn, insn_idx, subprog, set_callee_state);
-}
-
static int set_map_elem_callback_state(struct bpf_verifier_env *env,
struct bpf_func_state *caller,
struct bpf_func_state *callee,
@@ -6191,12 +10848,8 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env,
struct bpf_map *map;
int err;
- if (bpf_map_ptr_poisoned(insn_aux)) {
- verbose(env, "tail_call abusing map_ptr\n");
- return -EINVAL;
- }
-
- map = BPF_MAP_PTR(insn_aux->map_ptr_state);
+ /* valid map_ptr and poison value does not matter */
+ map = insn_aux->map_ptr_state.map_ptr;
if (!map->ops->map_set_for_each_callback_args ||
!map->ops->map_for_each_callback) {
verbose(env, "callback function not allowed for map\n");
@@ -6208,6 +10861,7 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env,
return err;
callee->in_callback_fn = true;
+ callee->callback_ret_range = retval_range(0, 1);
return 0;
}
@@ -6218,7 +10872,7 @@ static int set_loop_callback_state(struct bpf_verifier_env *env,
{
/* bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx,
* u64 flags);
- * callback_fn(u32 index, void *callback_ctx);
+ * callback_fn(u64 index, void *callback_ctx);
*/
callee->regs[BPF_REG_1].type = SCALAR_VALUE;
callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];
@@ -6229,6 +10883,7 @@ static int set_loop_callback_state(struct bpf_verifier_env *env,
__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_callback_fn = true;
+ callee->callback_ret_range = retval_range(0, 1);
return 0;
}
@@ -6258,6 +10913,7 @@ static int set_timer_callback_state(struct bpf_verifier_env *env,
__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_async_callback_fn = true;
+ callee->callback_ret_range = retval_range(0, 0);
return 0;
}
@@ -6276,7 +10932,7 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env,
callee->regs[BPF_REG_2].type = PTR_TO_BTF_ID;
__mark_reg_known_zero(&callee->regs[BPF_REG_2]);
callee->regs[BPF_REG_2].btf = btf_vmlinux;
- callee->regs[BPF_REG_2].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA],
+ callee->regs[BPF_REG_2].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA];
/* pointer to stack or null */
callee->regs[BPF_REG_3] = caller->regs[BPF_REG_4];
@@ -6285,16 +10941,141 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env,
__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_callback_fn = true;
+ callee->callback_ret_range = retval_range(0, 1);
return 0;
}
-static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
+static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ /* bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void
+ * callback_ctx, u64 flags);
+ * callback_fn(const struct bpf_dynptr_t* dynptr, void *callback_ctx);
+ */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_0]);
+ mark_dynptr_cb_reg(env, &callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL);
+ callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+
+ callee->in_callback_fn = true;
+ callee->callback_ret_range = retval_range(0, 1);
+ return 0;
+}
+
+static int set_rbtree_add_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ /* void bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
+ * bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b));
+ *
+ * 'struct bpf_rb_node *node' arg to bpf_rbtree_add_impl is the same PTR_TO_BTF_ID w/ offset
+ * that 'less' callback args will be receiving. However, 'node' arg was release_reference'd
+ * by this point, so look at 'root'
+ */
+ struct btf_field *field;
+
+ field = reg_find_field_offset(&caller->regs[BPF_REG_1], caller->regs[BPF_REG_1].off,
+ BPF_RB_ROOT);
+ if (!field || !field->graph_root.value_btf_id)
+ return -EFAULT;
+
+ mark_reg_graph_node(callee->regs, BPF_REG_1, &field->graph_root);
+ ref_set_non_owning(env, &callee->regs[BPF_REG_1]);
+ mark_reg_graph_node(callee->regs, BPF_REG_2, &field->graph_root);
+ ref_set_non_owning(env, &callee->regs[BPF_REG_2]);
+
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ callee->in_callback_fn = true;
+ callee->callback_ret_range = retval_range(0, 1);
+ return 0;
+}
+
+static int set_task_work_schedule_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ struct bpf_map *map_ptr = caller->regs[BPF_REG_3].map_ptr;
+
+ /*
+ * callback_fn(struct bpf_map *map, void *key, void *value);
+ */
+ callee->regs[BPF_REG_1].type = CONST_PTR_TO_MAP;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_1]);
+ callee->regs[BPF_REG_1].map_ptr = map_ptr;
+
+ callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_2]);
+ callee->regs[BPF_REG_2].map_ptr = map_ptr;
+
+ callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_3]);
+ callee->regs[BPF_REG_3].map_ptr = map_ptr;
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ callee->in_async_callback_fn = true;
+ callee->callback_ret_range = retval_range(S32_MIN, S32_MAX);
+ return 0;
+}
+
+static bool is_rbtree_lock_required_kfunc(u32 btf_id);
+
+/* Are we currently verifying the callback for a rbtree helper that must
+ * be called with lock held? If so, no need to complain about unreleased
+ * lock
+ */
+static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env)
{
struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_insn *insn = env->prog->insnsi;
+ struct bpf_func_state *callee;
+ int kfunc_btf_id;
+
+ if (!state->curframe)
+ return false;
+
+ callee = state->frame[state->curframe];
+
+ if (!callee->in_callback_fn)
+ return false;
+
+ kfunc_btf_id = insn[callee->callsite].imm;
+ return is_rbtree_lock_required_kfunc(kfunc_btf_id);
+}
+
+static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg,
+ bool return_32bit)
+{
+ if (return_32bit)
+ return range.minval <= reg->s32_min_value && reg->s32_max_value <= range.maxval;
+ else
+ return range.minval <= reg->smin_value && reg->smax_value <= range.maxval;
+}
+
+static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
+{
+ struct bpf_verifier_state *state = env->cur_state, *prev_st;
struct bpf_func_state *caller, *callee;
struct bpf_reg_state *r0;
+ bool in_callback_fn;
int err;
+ err = bpf_update_live_stack(env);
+ if (err)
+ return err;
+
callee = state->frame[state->curframe];
r0 = &callee->regs[BPF_REG_0];
if (r0->type == PTR_TO_STACK) {
@@ -6308,64 +11089,111 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
return -EINVAL;
}
- state->curframe--;
- caller = state->frame[state->curframe];
+ caller = state->frame[state->curframe - 1];
if (callee->in_callback_fn) {
- /* enforce R0 return value range [0, 1]. */
- struct tnum range = tnum_range(0, 1);
-
if (r0->type != SCALAR_VALUE) {
verbose(env, "R0 not a scalar value\n");
return -EACCES;
}
- if (!tnum_in(range, r0->var_off)) {
- verbose_invalid_scalar(env, r0, &range, "callback return", "R0");
+
+ /* we are going to rely on register's precise value */
+ err = mark_chain_precision(env, BPF_REG_0);
+ if (err)
+ return err;
+
+ /* enforce R0 return value range, and bpf_callback_t returns 64bit */
+ if (!retval_range_within(callee->callback_ret_range, r0, false)) {
+ verbose_invalid_scalar(env, r0, callee->callback_ret_range,
+ "At callback return", "R0");
return -EINVAL;
}
+ if (!bpf_calls_callback(env, callee->callsite)) {
+ verifier_bug(env, "in callback at %d, callsite %d !calls_callback",
+ *insn_idx, callee->callsite);
+ return -EFAULT;
+ }
} else {
/* return to the caller whatever r0 had in the callee */
caller->regs[BPF_REG_0] = *r0;
}
- /* Transfer references to the caller */
- err = copy_reference_state(caller, callee);
- if (err)
- return err;
+ /* for callbacks like bpf_loop or bpf_for_each_map_elem go back to callsite,
+ * there function call logic would reschedule callback visit. If iteration
+ * converges is_state_visited() would prune that visit eventually.
+ */
+ in_callback_fn = callee->in_callback_fn;
+ if (in_callback_fn)
+ *insn_idx = callee->callsite;
+ else
+ *insn_idx = callee->callsite + 1;
- *insn_idx = callee->callsite + 1;
if (env->log.level & BPF_LOG_LEVEL) {
verbose(env, "returning from callee:\n");
- print_verifier_state(env, callee, true);
+ print_verifier_state(env, state, callee->frameno, true);
verbose(env, "to caller at %d:\n", *insn_idx);
- print_verifier_state(env, caller, true);
+ print_verifier_state(env, state, caller->frameno, true);
}
- /* clear everything in the callee */
+ /* clear everything in the callee. In case of exceptional exits using
+ * bpf_throw, this will be done by copy_verifier_state for extra frames. */
free_func_state(callee);
- state->frame[state->curframe + 1] = NULL;
+ state->frame[state->curframe--] = NULL;
+
+ /* for callbacks widen imprecise scalars to make programs like below verify:
+ *
+ * struct ctx { int i; }
+ * void cb(int idx, struct ctx *ctx) { ctx->i++; ... }
+ * ...
+ * struct ctx = { .i = 0; }
+ * bpf_loop(100, cb, &ctx, 0);
+ *
+ * This is similar to what is done in process_iter_next_call() for open
+ * coded iterators.
+ */
+ prev_st = in_callback_fn ? find_prev_entry(env, state, *insn_idx) : NULL;
+ if (prev_st) {
+ err = widen_imprecise_scalars(env, prev_st, state);
+ if (err)
+ return err;
+ }
return 0;
}
-static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type,
- int func_id,
- struct bpf_call_arg_meta *meta)
+static int do_refine_retval_range(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs, int ret_type,
+ int func_id,
+ struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *ret_reg = &regs[BPF_REG_0];
- if (ret_type != RET_INTEGER ||
- (func_id != BPF_FUNC_get_stack &&
- func_id != BPF_FUNC_get_task_stack &&
- func_id != BPF_FUNC_probe_read_str &&
- func_id != BPF_FUNC_probe_read_kernel_str &&
- func_id != BPF_FUNC_probe_read_user_str))
- return;
+ if (ret_type != RET_INTEGER)
+ return 0;
+
+ switch (func_id) {
+ case BPF_FUNC_get_stack:
+ case BPF_FUNC_get_task_stack:
+ case BPF_FUNC_probe_read_str:
+ case BPF_FUNC_probe_read_kernel_str:
+ case BPF_FUNC_probe_read_user_str:
+ ret_reg->smax_value = meta->msize_max_value;
+ ret_reg->s32_max_value = meta->msize_max_value;
+ ret_reg->smin_value = -MAX_ERRNO;
+ ret_reg->s32_min_value = -MAX_ERRNO;
+ reg_bounds_sync(ret_reg);
+ break;
+ case BPF_FUNC_get_smp_processor_id:
+ ret_reg->umax_value = nr_cpu_ids - 1;
+ ret_reg->u32_max_value = nr_cpu_ids - 1;
+ ret_reg->smax_value = nr_cpu_ids - 1;
+ ret_reg->s32_max_value = nr_cpu_ids - 1;
+ ret_reg->umin_value = 0;
+ ret_reg->u32_min_value = 0;
+ ret_reg->smin_value = 0;
+ ret_reg->s32_min_value = 0;
+ reg_bounds_sync(ret_reg);
+ break;
+ }
- ret_reg->smax_value = meta->msize_max_value;
- ret_reg->s32_max_value = meta->msize_max_value;
- ret_reg->smin_value = -MAX_ERRNO;
- ret_reg->s32_min_value = -MAX_ERRNO;
- __reg_deduce_bounds(ret_reg);
- __reg_bound_offset(ret_reg);
- __update_reg_bounds(ret_reg);
+ return reg_bounds_sanity_check(env, ret_reg, "retval");
}
static int
@@ -6383,12 +11211,13 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
func_id != BPF_FUNC_map_pop_elem &&
func_id != BPF_FUNC_map_peek_elem &&
func_id != BPF_FUNC_for_each_map_elem &&
- func_id != BPF_FUNC_redirect_map)
+ func_id != BPF_FUNC_redirect_map &&
+ func_id != BPF_FUNC_map_lookup_percpu_elem)
return 0;
if (map == NULL) {
- verbose(env, "kernel subsystem misconfigured verifier\n");
- return -EINVAL;
+ verifier_bug(env, "expected map for helper call");
+ return -EFAULT;
}
/* In case of read-only, some additional restrictions
@@ -6404,12 +11233,12 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
return -EACCES;
}
- if (!BPF_MAP_PTR(aux->map_ptr_state))
+ if (!aux->map_ptr_state.map_ptr)
+ bpf_map_ptr_store(aux, meta->map_ptr,
+ !meta->map_ptr->bypass_spec_v1, false);
+ else if (aux->map_ptr_state.map_ptr != meta->map_ptr)
bpf_map_ptr_store(aux, meta->map_ptr,
- !meta->map_ptr->bypass_spec_v1);
- else if (BPF_MAP_PTR(aux->map_ptr_state) != meta->map_ptr)
- bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON,
- !meta->map_ptr->bypass_spec_v1);
+ !meta->map_ptr->bypass_spec_v1, true);
return 0;
}
@@ -6420,21 +11249,21 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
struct bpf_reg_state *regs = cur_regs(env), *reg;
struct bpf_map *map = meta->map_ptr;
- struct tnum range;
- u64 val;
+ u64 val, max;
int err;
if (func_id != BPF_FUNC_tail_call)
return 0;
if (!map || map->map_type != BPF_MAP_TYPE_PROG_ARRAY) {
- verbose(env, "kernel subsystem misconfigured verifier\n");
+ verbose(env, "expected prog array map for tail call");
return -EINVAL;
}
- range = tnum_range(0, map->max_entries - 1);
reg = &regs[BPF_REG_3];
+ val = reg->var_off.value;
+ max = map->max_entries;
- if (!register_is_const(reg) || !tnum_in(range, reg->var_off)) {
+ if (!(is_reg_const(reg, false) && val < max)) {
bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
return 0;
}
@@ -6442,8 +11271,6 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
err = mark_chain_precision(env, BPF_REG_3);
if (err)
return err;
-
- val = reg->var_off.value;
if (bpf_map_key_unseen(aux))
bpf_map_key_store(aux, val);
else if (!bpf_map_key_poisoned(aux) &&
@@ -6452,16 +11279,64 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
return 0;
}
-static int check_reference_leak(struct bpf_verifier_env *env)
+static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exit)
{
- struct bpf_func_state *state = cur_func(env);
+ struct bpf_verifier_state *state = env->cur_state;
+ enum bpf_prog_type type = resolve_prog_type(env->prog);
+ struct bpf_reg_state *reg = reg_state(env, BPF_REG_0);
+ bool refs_lingering = false;
int i;
+ if (!exception_exit && cur_func(env)->frameno)
+ return 0;
+
for (i = 0; i < state->acquired_refs; i++) {
+ if (state->refs[i].type != REF_TYPE_PTR)
+ continue;
+ /* Allow struct_ops programs to return a referenced kptr back to
+ * kernel. Type checks are performed later in check_return_code.
+ */
+ if (type == BPF_PROG_TYPE_STRUCT_OPS && !exception_exit &&
+ reg->ref_obj_id == state->refs[i].id)
+ continue;
verbose(env, "Unreleased reference id=%d alloc_insn=%d\n",
state->refs[i].id, state->refs[i].insn_idx);
+ refs_lingering = true;
}
- return state->acquired_refs ? -EINVAL : 0;
+ return refs_lingering ? -EINVAL : 0;
+}
+
+static int check_resource_leak(struct bpf_verifier_env *env, bool exception_exit, bool check_lock, const char *prefix)
+{
+ int err;
+
+ if (check_lock && env->cur_state->active_locks) {
+ verbose(env, "%s cannot be used inside bpf_spin_lock-ed region\n", prefix);
+ return -EINVAL;
+ }
+
+ err = check_reference_leak(env, exception_exit);
+ if (err) {
+ verbose(env, "%s would lead to reference leak\n", prefix);
+ return err;
+ }
+
+ if (check_lock && env->cur_state->active_irq_id) {
+ verbose(env, "%s cannot be used inside bpf_local_irq_save-ed region\n", prefix);
+ return -EINVAL;
+ }
+
+ if (check_lock && env->cur_state->active_rcu_locks) {
+ verbose(env, "%s cannot be used inside bpf_rcu_read_lock-ed region\n", prefix);
+ return -EINVAL;
+ }
+
+ if (check_lock && env->cur_state->active_preempt_locks) {
+ verbose(env, "%s cannot be used inside bpf_preempt_disable-ed region\n", prefix);
+ return -EINVAL;
+ }
+
+ return 0;
}
static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
@@ -6470,6 +11345,7 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
struct bpf_reg_state *fmt_reg = &regs[BPF_REG_3];
struct bpf_reg_state *data_len_reg = &regs[BPF_REG_5];
struct bpf_map *fmt_map = fmt_reg->map_ptr;
+ struct bpf_bprintf_data data = {};
int err, fmt_map_off, num_args;
u64 fmt_addr;
char *fmt;
@@ -6486,7 +11362,7 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
err = fmt_map->ops->map_direct_value_addr(fmt_map, &fmt_addr,
fmt_map_off);
if (err) {
- verbose(env, "verifier bug\n");
+ verbose(env, "failed to retrieve map value address\n");
return -EFAULT;
}
fmt = (char *)(long)fmt_addr + fmt_map_off;
@@ -6494,7 +11370,7 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
/* We are also guaranteed that fmt+fmt_map_off is NULL terminated, we
* can focus on validating the format specifiers.
*/
- err = bpf_bprintf_prepare(fmt, UINT_MAX, NULL, NULL, num_args);
+ err = bpf_bprintf_prepare(fmt, UINT_MAX, NULL, num_args, &data);
if (err < 0)
verbose(env, "Invalid format string\n");
@@ -6522,9 +11398,83 @@ static int check_get_func_ip(struct bpf_verifier_env *env)
return -ENOTSUPP;
}
+static struct bpf_insn_aux_data *cur_aux(const struct bpf_verifier_env *env)
+{
+ return &env->insn_aux_data[env->insn_idx];
+}
+
+static bool loop_flag_is_zero(struct bpf_verifier_env *env)
+{
+ struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_reg_state *reg = &regs[BPF_REG_4];
+ bool reg_is_null = register_is_null(reg);
+
+ if (reg_is_null)
+ mark_chain_precision(env, BPF_REG_4);
+
+ return reg_is_null;
+}
+
+static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno)
+{
+ struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state;
+
+ if (!state->initialized) {
+ state->initialized = 1;
+ state->fit_for_inline = loop_flag_is_zero(env);
+ state->callback_subprogno = subprogno;
+ return;
+ }
+
+ if (!state->fit_for_inline)
+ return;
+
+ state->fit_for_inline = (loop_flag_is_zero(env) &&
+ state->callback_subprogno == subprogno);
+}
+
+/* Returns whether or not the given map type can potentially elide
+ * lookup return value nullness check. This is possible if the key
+ * is statically known.
+ */
+static bool can_elide_value_nullness(enum bpf_map_type type)
+{
+ switch (type) {
+ case BPF_MAP_TYPE_ARRAY:
+ case BPF_MAP_TYPE_PERCPU_ARRAY:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static int get_helper_proto(struct bpf_verifier_env *env, int func_id,
+ const struct bpf_func_proto **ptr)
+{
+ if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID)
+ return -ERANGE;
+
+ if (!env->ops->get_func_proto)
+ return -EINVAL;
+
+ *ptr = env->ops->get_func_proto(func_id, env->prog);
+ return *ptr && (*ptr)->func ? 0 : -EINVAL;
+}
+
+/* Check if we're in a sleepable context. */
+static inline bool in_sleepable_context(struct bpf_verifier_env *env)
+{
+ return !env->cur_state->active_rcu_locks &&
+ !env->cur_state->active_preempt_locks &&
+ !env->cur_state->active_irq_id &&
+ in_sleepable(env);
+}
+
static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx_p)
{
+ enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
+ bool returns_cpu_specific_alloc_ptr = false;
const struct bpf_func_proto *fn = NULL;
enum bpf_return_type ret_type;
enum bpf_type_flag ret_flag;
@@ -6536,18 +11486,16 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
/* find function prototype */
func_id = insn->imm;
- if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
- verbose(env, "invalid func %s#%d\n", func_id_name(func_id),
- func_id);
+ err = get_helper_proto(env, insn->imm, &fn);
+ if (err == -ERANGE) {
+ verbose(env, "invalid func %s#%d\n", func_id_name(func_id), func_id);
return -EINVAL;
}
- if (env->ops->get_func_proto)
- fn = env->ops->get_func_proto(func_id, env->prog);
- if (!fn) {
- verbose(env, "unknown func %s#%d\n", func_id_name(func_id),
- func_id);
- return -EINVAL;
+ if (err) {
+ verbose(env, "program of this type cannot use helper %s#%d\n",
+ func_id_name(func_id), func_id);
+ return err;
}
/* eBPF programs must be GPL compatible to use GPL-ed functions */
@@ -6561,12 +11509,16 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return -EINVAL;
}
+ if (!in_sleepable(env) && fn->might_sleep) {
+ verbose(env, "helper call might sleep in a non-sleepable prog\n");
+ return -EINVAL;
+ }
+
/* With LD_ABS/IND some JITs save/restore skb from r1. */
- changes_data = bpf_helper_changes_pkt_data(fn->func);
+ changes_data = bpf_helper_changes_pkt_data(func_id);
if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) {
- verbose(env, "kernel subsystem misconfigured func %s#%d: r1 != ctx\n",
- func_id_name(func_id), func_id);
- return -EINVAL;
+ verifier_bug(env, "func %s#%d: r1 != ctx", func_id_name(func_id), func_id);
+ return -EFAULT;
}
memset(&meta, 0, sizeof(meta));
@@ -6574,15 +11526,42 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
err = check_func_proto(fn, func_id);
if (err) {
- verbose(env, "kernel subsystem misconfigured func %s#%d\n",
- func_id_name(func_id), func_id);
+ verifier_bug(env, "incorrect func proto %s#%d", func_id_name(func_id), func_id);
return err;
}
+ if (env->cur_state->active_rcu_locks) {
+ if (fn->might_sleep) {
+ verbose(env, "sleepable helper %s#%d in rcu_read_lock region\n",
+ func_id_name(func_id), func_id);
+ return -EINVAL;
+ }
+ }
+
+ if (env->cur_state->active_preempt_locks) {
+ if (fn->might_sleep) {
+ verbose(env, "sleepable helper %s#%d in non-preemptible region\n",
+ func_id_name(func_id), func_id);
+ return -EINVAL;
+ }
+ }
+
+ if (env->cur_state->active_irq_id) {
+ if (fn->might_sleep) {
+ verbose(env, "sleepable helper %s#%d in IRQ-disabled region\n",
+ func_id_name(func_id), func_id);
+ return -EINVAL;
+ }
+ }
+
+ /* Track non-sleepable context for helpers. */
+ if (!in_sleepable_context(env))
+ env->insn_aux_data[insn_idx].non_sleepable = true;
+
meta.func_id = func_id;
/* check args */
for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
- err = check_func_arg(env, i, &meta, fn);
+ err = check_func_arg(env, i, &meta, fn, insn_idx);
if (err)
return err;
}
@@ -6600,13 +11579,45 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
*/
for (i = 0; i < meta.access_size; i++) {
err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B,
- BPF_WRITE, -1, false);
+ BPF_WRITE, -1, false, false);
if (err)
return err;
}
- if (is_release_function(func_id)) {
- err = release_reference(env, meta.ref_obj_id);
+ regs = cur_regs(env);
+
+ if (meta.release_regno) {
+ err = -EINVAL;
+ if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) {
+ err = unmark_stack_slots_dynptr(env, &regs[meta.release_regno]);
+ } else if (func_id == BPF_FUNC_kptr_xchg && meta.ref_obj_id) {
+ u32 ref_obj_id = meta.ref_obj_id;
+ bool in_rcu = in_rcu_cs(env);
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
+
+ err = release_reference_nomark(env->cur_state, ref_obj_id);
+ if (!err) {
+ bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
+ if (reg->ref_obj_id == ref_obj_id) {
+ if (in_rcu && (reg->type & MEM_ALLOC) && (reg->type & MEM_PERCPU)) {
+ reg->ref_obj_id = 0;
+ reg->type &= ~MEM_ALLOC;
+ reg->type |= MEM_RCU;
+ } else {
+ mark_reg_invalid(env, reg);
+ }
+ }
+ }));
+ }
+ } else if (meta.ref_obj_id) {
+ err = release_reference(env, meta.ref_obj_id);
+ } else if (register_is_null(&regs[meta.release_regno])) {
+ /* meta.ref_obj_id can only be 0 if register that is meant to be
+ * released is NULL, which must be > R0.
+ */
+ err = 0;
+ }
if (err) {
verbose(env, "func %s#%d reference has not been acquired before\n",
func_id_name(func_id), func_id);
@@ -6614,15 +11625,11 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
}
- regs = cur_regs(env);
-
switch (func_id) {
case BPF_FUNC_tail_call:
- err = check_reference_leak(env);
- if (err) {
- verbose(env, "tail_call would lead to reference leak\n");
+ err = check_resource_leak(env, false, true, "tail_call");
+ if (err)
return err;
- }
break;
case BPF_FUNC_get_local_storage:
/* check that flags argument in get_local_storage(map, flags) is 0,
@@ -6634,23 +11641,135 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
break;
case BPF_FUNC_for_each_map_elem:
- err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
- set_map_elem_callback_state);
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_map_elem_callback_state);
break;
case BPF_FUNC_timer_set_callback:
- err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
- set_timer_callback_state);
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_timer_callback_state);
break;
case BPF_FUNC_find_vma:
- err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
- set_find_vma_callback_state);
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_find_vma_callback_state);
break;
case BPF_FUNC_snprintf:
err = check_bpf_snprintf_call(env, regs);
break;
case BPF_FUNC_loop:
- err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
- set_loop_callback_state);
+ update_loop_inline_state(env, meta.subprogno);
+ /* Verifier relies on R1 value to determine if bpf_loop() iteration
+ * is finished, thus mark it precise.
+ */
+ err = mark_chain_precision(env, BPF_REG_1);
+ if (err)
+ return err;
+ if (cur_func(env)->callback_depth < regs[BPF_REG_1].umax_value) {
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_loop_callback_state);
+ } else {
+ cur_func(env)->callback_depth = 0;
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "frame%d bpf_loop iteration limit reached\n",
+ env->cur_state->curframe);
+ }
+ break;
+ case BPF_FUNC_dynptr_from_mem:
+ if (regs[BPF_REG_1].type != PTR_TO_MAP_VALUE) {
+ verbose(env, "Unsupported reg type %s for bpf_dynptr_from_mem data\n",
+ reg_type_str(env, regs[BPF_REG_1].type));
+ return -EACCES;
+ }
+ break;
+ case BPF_FUNC_set_retval:
+ if (prog_type == BPF_PROG_TYPE_LSM &&
+ env->prog->expected_attach_type == BPF_LSM_CGROUP) {
+ if (!env->prog->aux->attach_func_proto->type) {
+ /* Make sure programs that attach to void
+ * hooks don't try to modify return value.
+ */
+ verbose(env, "BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n");
+ return -EINVAL;
+ }
+ }
+ break;
+ case BPF_FUNC_dynptr_data:
+ {
+ struct bpf_reg_state *reg;
+ int id, ref_obj_id;
+
+ reg = get_dynptr_arg_reg(env, fn, regs);
+ if (!reg)
+ return -EFAULT;
+
+
+ if (meta.dynptr_id) {
+ verifier_bug(env, "meta.dynptr_id already set");
+ return -EFAULT;
+ }
+ if (meta.ref_obj_id) {
+ verifier_bug(env, "meta.ref_obj_id already set");
+ return -EFAULT;
+ }
+
+ id = dynptr_id(env, reg);
+ if (id < 0) {
+ verifier_bug(env, "failed to obtain dynptr id");
+ return id;
+ }
+
+ ref_obj_id = dynptr_ref_obj_id(env, reg);
+ if (ref_obj_id < 0) {
+ verifier_bug(env, "failed to obtain dynptr ref_obj_id");
+ return ref_obj_id;
+ }
+
+ meta.dynptr_id = id;
+ meta.ref_obj_id = ref_obj_id;
+
+ break;
+ }
+ case BPF_FUNC_dynptr_write:
+ {
+ enum bpf_dynptr_type dynptr_type;
+ struct bpf_reg_state *reg;
+
+ reg = get_dynptr_arg_reg(env, fn, regs);
+ if (!reg)
+ return -EFAULT;
+
+ dynptr_type = dynptr_get_type(env, reg);
+ if (dynptr_type == BPF_DYNPTR_TYPE_INVALID)
+ return -EFAULT;
+
+ if (dynptr_type == BPF_DYNPTR_TYPE_SKB ||
+ dynptr_type == BPF_DYNPTR_TYPE_SKB_META)
+ /* this will trigger clear_all_pkt_pointers(), which will
+ * invalidate all dynptr slices associated with the skb
+ */
+ changes_data = true;
+
+ break;
+ }
+ case BPF_FUNC_per_cpu_ptr:
+ case BPF_FUNC_this_cpu_ptr:
+ {
+ struct bpf_reg_state *reg = &regs[BPF_REG_1];
+ const struct btf_type *type;
+
+ if (reg->type & MEM_RCU) {
+ type = btf_type_by_id(reg->btf, reg->btf_id);
+ if (!type || !btf_type_is_struct(type)) {
+ verbose(env, "Helper has invalid btf/btf_id in R1\n");
+ return -EFAULT;
+ }
+ returns_cpu_specific_alloc_ptr = true;
+ env->insn_aux_data[insn_idx].call_with_percpu_alloc_ptr = true;
+ }
+ break;
+ }
+ case BPF_FUNC_user_ringbuf_drain:
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_user_ringbuf_callback_state);
break;
}
@@ -6668,13 +11787,17 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
/* update return register (already marked as written above) */
ret_type = fn->ret_type;
- ret_flag = type_flag(fn->ret_type);
- if (ret_type == RET_INTEGER) {
+ ret_flag = type_flag(ret_type);
+
+ switch (base_type(ret_type)) {
+ case RET_INTEGER:
/* sets type to SCALAR_VALUE */
mark_reg_unknown(env, regs, BPF_REG_0);
- } else if (ret_type == RET_VOID) {
+ break;
+ case RET_VOID:
regs[BPF_REG_0].type = NOT_INIT;
- } else if (base_type(ret_type) == RET_PTR_TO_MAP_VALUE) {
+ break;
+ case RET_PTR_TO_MAP_VALUE:
/* There is no offset yet applied, variable or fixed */
mark_reg_known_zero(env, regs, BPF_REG_0);
/* remember map_ptr, so that check_map_access()
@@ -6682,31 +11805,43 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
* to map element returned from bpf_map_lookup_elem()
*/
if (meta.map_ptr == NULL) {
- verbose(env,
- "kernel subsystem misconfigured verifier\n");
- return -EINVAL;
+ verifier_bug(env, "unexpected null map_ptr");
+ return -EFAULT;
}
+
+ if (func_id == BPF_FUNC_map_lookup_elem &&
+ can_elide_value_nullness(meta.map_ptr->map_type) &&
+ meta.const_map_key >= 0 &&
+ meta.const_map_key < meta.map_ptr->max_entries)
+ ret_flag &= ~PTR_MAYBE_NULL;
+
regs[BPF_REG_0].map_ptr = meta.map_ptr;
regs[BPF_REG_0].map_uid = meta.map_uid;
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag;
- if (!type_may_be_null(ret_type) &&
- map_value_has_spin_lock(meta.map_ptr)) {
+ if (!type_may_be_null(ret_flag) &&
+ btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK)) {
regs[BPF_REG_0].id = ++env->id_gen;
}
- } else if (base_type(ret_type) == RET_PTR_TO_SOCKET) {
+ break;
+ case RET_PTR_TO_SOCKET:
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_SOCKET | ret_flag;
- } else if (base_type(ret_type) == RET_PTR_TO_SOCK_COMMON) {
+ break;
+ case RET_PTR_TO_SOCK_COMMON:
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON | ret_flag;
- } else if (base_type(ret_type) == RET_PTR_TO_TCP_SOCK) {
+ break;
+ case RET_PTR_TO_TCP_SOCK:
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag;
- } else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) {
+ break;
+ case RET_PTR_TO_MEM:
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
regs[BPF_REG_0].mem_size = meta.mem_size;
- } else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) {
+ break;
+ case RET_PTR_TO_MEM_OR_BTF_ID:
+ {
const struct btf_type *t;
mark_reg_known_zero(env, regs, BPF_REG_0);
@@ -6727,35 +11862,58 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
regs[BPF_REG_0].mem_size = tsize;
} else {
- /* MEM_RDONLY may be carried from ret_flag, but it
- * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise
- * it will confuse the check of PTR_TO_BTF_ID in
- * check_mem_access().
- */
- ret_flag &= ~MEM_RDONLY;
+ if (returns_cpu_specific_alloc_ptr) {
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC | MEM_RCU;
+ } else {
+ /* MEM_RDONLY may be carried from ret_flag, but it
+ * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise
+ * it will confuse the check of PTR_TO_BTF_ID in
+ * check_mem_access().
+ */
+ ret_flag &= ~MEM_RDONLY;
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
+ }
- regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
regs[BPF_REG_0].btf = meta.ret_btf;
regs[BPF_REG_0].btf_id = meta.ret_btf_id;
}
- } else if (base_type(ret_type) == RET_PTR_TO_BTF_ID) {
+ break;
+ }
+ case RET_PTR_TO_BTF_ID:
+ {
+ struct btf *ret_btf;
int ret_btf_id;
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
- ret_btf_id = *fn->ret_btf_id;
+ if (func_id == BPF_FUNC_kptr_xchg) {
+ ret_btf = meta.kptr_field->kptr.btf;
+ ret_btf_id = meta.kptr_field->kptr.btf_id;
+ if (!btf_is_kernel(ret_btf)) {
+ regs[BPF_REG_0].type |= MEM_ALLOC;
+ if (meta.kptr_field->type == BPF_KPTR_PERCPU)
+ regs[BPF_REG_0].type |= MEM_PERCPU;
+ }
+ } else {
+ if (fn->ret_btf_id == BPF_PTR_POISON) {
+ verifier_bug(env, "func %s has non-overwritten BPF_PTR_POISON return type",
+ func_id_name(func_id));
+ return -EFAULT;
+ }
+ ret_btf = btf_vmlinux;
+ ret_btf_id = *fn->ret_btf_id;
+ }
if (ret_btf_id == 0) {
verbose(env, "invalid return type %u of func %s#%d\n",
base_type(ret_type), func_id_name(func_id),
func_id);
return -EINVAL;
}
- /* current BPF helper definitions are only coming from
- * built-in code with type IDs from vmlinux BTF
- */
- regs[BPF_REG_0].btf = btf_vmlinux;
+ regs[BPF_REG_0].btf = ret_btf;
regs[BPF_REG_0].btf_id = ret_btf_id;
- } else {
+ break;
+ }
+ default:
verbose(env, "unknown return type %u of func %s#%d\n",
base_type(ret_type), func_id_name(func_id), func_id);
return -EINVAL;
@@ -6764,11 +11922,20 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (type_may_be_null(regs[BPF_REG_0].type))
regs[BPF_REG_0].id = ++env->id_gen;
- if (is_ptr_cast_function(func_id)) {
+ if (helper_multiple_ref_obj_use(func_id, meta.map_ptr)) {
+ verifier_bug(env, "func %s#%d sets ref_obj_id more than once",
+ func_id_name(func_id), func_id);
+ return -EFAULT;
+ }
+
+ if (is_dynptr_ref_function(func_id))
+ regs[BPF_REG_0].dynptr_id = meta.dynptr_id;
+
+ if (is_ptr_cast_function(func_id) || is_dynptr_ref_function(func_id)) {
/* For release_reference() */
regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
} else if (is_acquire_function(func_id, meta.map_ptr)) {
- int id = acquire_reference_state(env, insn_idx);
+ int id = acquire_reference(env, insn_idx);
if (id < 0)
return id;
@@ -6778,7 +11945,9 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
regs[BPF_REG_0].ref_obj_id = id;
}
- do_refine_retval_range(regs, fn->ret_type, func_id, &meta);
+ err = do_refine_retval_range(env, regs, fn->ret_type, func_id, &meta);
+ if (err)
+ return err;
err = check_map_func_compatibility(env, meta.map_ptr, func_id);
if (err)
@@ -6813,6 +11982,25 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
env->prog->call_get_func_ip = true;
}
+ if (func_id == BPF_FUNC_tail_call) {
+ if (env->cur_state->curframe) {
+ struct bpf_verifier_state *branch;
+
+ mark_reg_scratched(env, BPF_REG_0);
+ branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false);
+ if (IS_ERR(branch))
+ return PTR_ERR(branch);
+ clear_all_pkt_pointers(env);
+ mark_reg_unknown(env, regs, BPF_REG_0);
+ err = prepare_func_exit(env, &env->insn_idx);
+ if (err)
+ return err;
+ env->insn_idx--;
+ } else {
+ changes_data = false;
+ }
+ }
+
if (changes_data)
clear_all_pkt_pointers(env);
return 0;
@@ -6821,91 +12009,2270 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
/* mark_btf_func_reg_size() is used when the reg size is determined by
* the BTF func_proto's return value size and argument.
*/
-static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno,
- size_t reg_size)
+static void __mark_btf_func_reg_size(struct bpf_verifier_env *env, struct bpf_reg_state *regs,
+ u32 regno, size_t reg_size)
{
- struct bpf_reg_state *reg = &cur_regs(env)[regno];
+ struct bpf_reg_state *reg = &regs[regno];
if (regno == BPF_REG_0) {
/* Function return value */
- reg->live |= REG_LIVE_WRITTEN;
reg->subreg_def = reg_size == sizeof(u64) ?
DEF_NOT_SUBREG : env->insn_idx + 1;
- } else {
+ } else if (reg_size == sizeof(u64)) {
/* Function argument */
- if (reg_size == sizeof(u64)) {
- mark_insn_zext(env, reg);
- mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
- } else {
- mark_reg_read(env, reg, reg->parent, REG_LIVE_READ32);
+ mark_insn_zext(env, reg);
+ }
+}
+
+static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno,
+ size_t reg_size)
+{
+ return __mark_btf_func_reg_size(env, cur_regs(env), regno, reg_size);
+}
+
+static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_ACQUIRE;
+}
+
+static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_RELEASE;
+}
+
+static bool is_kfunc_trusted_args(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return (meta->kfunc_flags & KF_TRUSTED_ARGS) || is_kfunc_release(meta);
+}
+
+static bool is_kfunc_sleepable(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_SLEEPABLE;
+}
+
+static bool is_kfunc_destructive(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_DESTRUCTIVE;
+}
+
+static bool is_kfunc_rcu(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_RCU;
+}
+
+static bool is_kfunc_rcu_protected(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_RCU_PROTECTED;
+}
+
+static bool is_kfunc_arg_mem_size(const struct btf *btf,
+ const struct btf_param *arg,
+ const struct bpf_reg_state *reg)
+{
+ const struct btf_type *t;
+
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
+ return false;
+
+ return btf_param_match_suffix(btf, arg, "__sz");
+}
+
+static bool is_kfunc_arg_const_mem_size(const struct btf *btf,
+ const struct btf_param *arg,
+ const struct bpf_reg_state *reg)
+{
+ const struct btf_type *t;
+
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
+ return false;
+
+ return btf_param_match_suffix(btf, arg, "__szk");
+}
+
+static bool is_kfunc_arg_optional(const struct btf *btf, const struct btf_param *arg)
+{
+ return btf_param_match_suffix(btf, arg, "__opt");
+}
+
+static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg)
+{
+ return btf_param_match_suffix(btf, arg, "__k");
+}
+
+static bool is_kfunc_arg_ignore(const struct btf *btf, const struct btf_param *arg)
+{
+ return btf_param_match_suffix(btf, arg, "__ign");
+}
+
+static bool is_kfunc_arg_map(const struct btf *btf, const struct btf_param *arg)
+{
+ return btf_param_match_suffix(btf, arg, "__map");
+}
+
+static bool is_kfunc_arg_alloc_obj(const struct btf *btf, const struct btf_param *arg)
+{
+ return btf_param_match_suffix(btf, arg, "__alloc");
+}
+
+static bool is_kfunc_arg_uninit(const struct btf *btf, const struct btf_param *arg)
+{
+ return btf_param_match_suffix(btf, arg, "__uninit");
+}
+
+static bool is_kfunc_arg_refcounted_kptr(const struct btf *btf, const struct btf_param *arg)
+{
+ return btf_param_match_suffix(btf, arg, "__refcounted_kptr");
+}
+
+static bool is_kfunc_arg_nullable(const struct btf *btf, const struct btf_param *arg)
+{
+ return btf_param_match_suffix(btf, arg, "__nullable");
+}
+
+static bool is_kfunc_arg_const_str(const struct btf *btf, const struct btf_param *arg)
+{
+ return btf_param_match_suffix(btf, arg, "__str");
+}
+
+static bool is_kfunc_arg_irq_flag(const struct btf *btf, const struct btf_param *arg)
+{
+ return btf_param_match_suffix(btf, arg, "__irq_flag");
+}
+
+static bool is_kfunc_arg_prog(const struct btf *btf, const struct btf_param *arg)
+{
+ return btf_param_match_suffix(btf, arg, "__prog");
+}
+
+static bool is_kfunc_arg_scalar_with_name(const struct btf *btf,
+ const struct btf_param *arg,
+ const char *name)
+{
+ int len, target_len = strlen(name);
+ const char *param_name;
+
+ param_name = btf_name_by_offset(btf, arg->name_off);
+ if (str_is_empty(param_name))
+ return false;
+ len = strlen(param_name);
+ if (len != target_len)
+ return false;
+ if (strcmp(param_name, name))
+ return false;
+
+ return true;
+}
+
+enum {
+ KF_ARG_DYNPTR_ID,
+ KF_ARG_LIST_HEAD_ID,
+ KF_ARG_LIST_NODE_ID,
+ KF_ARG_RB_ROOT_ID,
+ KF_ARG_RB_NODE_ID,
+ KF_ARG_WORKQUEUE_ID,
+ KF_ARG_RES_SPIN_LOCK_ID,
+ KF_ARG_TASK_WORK_ID,
+};
+
+BTF_ID_LIST(kf_arg_btf_ids)
+BTF_ID(struct, bpf_dynptr)
+BTF_ID(struct, bpf_list_head)
+BTF_ID(struct, bpf_list_node)
+BTF_ID(struct, bpf_rb_root)
+BTF_ID(struct, bpf_rb_node)
+BTF_ID(struct, bpf_wq)
+BTF_ID(struct, bpf_res_spin_lock)
+BTF_ID(struct, bpf_task_work)
+
+static bool __is_kfunc_ptr_arg_type(const struct btf *btf,
+ const struct btf_param *arg, int type)
+{
+ const struct btf_type *t;
+ u32 res_id;
+
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!t)
+ return false;
+ if (!btf_type_is_ptr(t))
+ return false;
+ t = btf_type_skip_modifiers(btf, t->type, &res_id);
+ if (!t)
+ return false;
+ return btf_types_are_same(btf, res_id, btf_vmlinux, kf_arg_btf_ids[type]);
+}
+
+static bool is_kfunc_arg_dynptr(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_DYNPTR_ID);
+}
+
+static bool is_kfunc_arg_list_head(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_LIST_HEAD_ID);
+}
+
+static bool is_kfunc_arg_list_node(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_LIST_NODE_ID);
+}
+
+static bool is_kfunc_arg_rbtree_root(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_ROOT_ID);
+}
+
+static bool is_kfunc_arg_rbtree_node(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_NODE_ID);
+}
+
+static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID);
+}
+
+static bool is_kfunc_arg_task_work(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_TASK_WORK_ID);
+}
+
+static bool is_kfunc_arg_res_spin_lock(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RES_SPIN_LOCK_ID);
+}
+
+static bool is_rbtree_node_type(const struct btf_type *t)
+{
+ return t == btf_type_by_id(btf_vmlinux, kf_arg_btf_ids[KF_ARG_RB_NODE_ID]);
+}
+
+static bool is_list_node_type(const struct btf_type *t)
+{
+ return t == btf_type_by_id(btf_vmlinux, kf_arg_btf_ids[KF_ARG_LIST_NODE_ID]);
+}
+
+static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf,
+ const struct btf_param *arg)
+{
+ const struct btf_type *t;
+
+ t = btf_type_resolve_func_ptr(btf, arg->type, NULL);
+ if (!t)
+ return false;
+
+ return true;
+}
+
+/* Returns true if struct is composed of scalars, 4 levels of nesting allowed */
+static bool __btf_type_is_scalar_struct(struct bpf_verifier_env *env,
+ const struct btf *btf,
+ const struct btf_type *t, int rec)
+{
+ const struct btf_type *member_type;
+ const struct btf_member *member;
+ u32 i;
+
+ if (!btf_type_is_struct(t))
+ return false;
+
+ for_each_member(i, t, member) {
+ const struct btf_array *array;
+
+ member_type = btf_type_skip_modifiers(btf, member->type, NULL);
+ if (btf_type_is_struct(member_type)) {
+ if (rec >= 3) {
+ verbose(env, "max struct nesting depth exceeded\n");
+ return false;
+ }
+ if (!__btf_type_is_scalar_struct(env, btf, member_type, rec + 1))
+ return false;
+ continue;
+ }
+ if (btf_type_is_array(member_type)) {
+ array = btf_array(member_type);
+ if (!array->nelems)
+ return false;
+ member_type = btf_type_skip_modifiers(btf, array->type, NULL);
+ if (!btf_type_is_scalar(member_type))
+ return false;
+ continue;
}
+ if (!btf_type_is_scalar(member_type))
+ return false;
}
+ return true;
}
-static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn)
+enum kfunc_ptr_arg_type {
+ KF_ARG_PTR_TO_CTX,
+ KF_ARG_PTR_TO_ALLOC_BTF_ID, /* Allocated object */
+ KF_ARG_PTR_TO_REFCOUNTED_KPTR, /* Refcounted local kptr */
+ KF_ARG_PTR_TO_DYNPTR,
+ KF_ARG_PTR_TO_ITER,
+ KF_ARG_PTR_TO_LIST_HEAD,
+ KF_ARG_PTR_TO_LIST_NODE,
+ KF_ARG_PTR_TO_BTF_ID, /* Also covers reg2btf_ids conversions */
+ KF_ARG_PTR_TO_MEM,
+ KF_ARG_PTR_TO_MEM_SIZE, /* Size derived from next argument, skip it */
+ KF_ARG_PTR_TO_CALLBACK,
+ KF_ARG_PTR_TO_RB_ROOT,
+ KF_ARG_PTR_TO_RB_NODE,
+ KF_ARG_PTR_TO_NULL,
+ KF_ARG_PTR_TO_CONST_STR,
+ KF_ARG_PTR_TO_MAP,
+ KF_ARG_PTR_TO_WORKQUEUE,
+ KF_ARG_PTR_TO_IRQ_FLAG,
+ KF_ARG_PTR_TO_RES_SPIN_LOCK,
+ KF_ARG_PTR_TO_TASK_WORK,
+};
+
+enum special_kfunc_type {
+ KF_bpf_obj_new_impl,
+ KF_bpf_obj_drop_impl,
+ KF_bpf_refcount_acquire_impl,
+ KF_bpf_list_push_front_impl,
+ KF_bpf_list_push_back_impl,
+ KF_bpf_list_pop_front,
+ KF_bpf_list_pop_back,
+ KF_bpf_list_front,
+ KF_bpf_list_back,
+ KF_bpf_cast_to_kern_ctx,
+ KF_bpf_rdonly_cast,
+ KF_bpf_rcu_read_lock,
+ KF_bpf_rcu_read_unlock,
+ KF_bpf_rbtree_remove,
+ KF_bpf_rbtree_add_impl,
+ KF_bpf_rbtree_first,
+ KF_bpf_rbtree_root,
+ KF_bpf_rbtree_left,
+ KF_bpf_rbtree_right,
+ KF_bpf_dynptr_from_skb,
+ KF_bpf_dynptr_from_xdp,
+ KF_bpf_dynptr_from_skb_meta,
+ KF_bpf_xdp_pull_data,
+ KF_bpf_dynptr_slice,
+ KF_bpf_dynptr_slice_rdwr,
+ KF_bpf_dynptr_clone,
+ KF_bpf_percpu_obj_new_impl,
+ KF_bpf_percpu_obj_drop_impl,
+ KF_bpf_throw,
+ KF_bpf_wq_set_callback_impl,
+ KF_bpf_preempt_disable,
+ KF_bpf_preempt_enable,
+ KF_bpf_iter_css_task_new,
+ KF_bpf_session_cookie,
+ KF_bpf_get_kmem_cache,
+ KF_bpf_local_irq_save,
+ KF_bpf_local_irq_restore,
+ KF_bpf_iter_num_new,
+ KF_bpf_iter_num_next,
+ KF_bpf_iter_num_destroy,
+ KF_bpf_set_dentry_xattr,
+ KF_bpf_remove_dentry_xattr,
+ KF_bpf_res_spin_lock,
+ KF_bpf_res_spin_unlock,
+ KF_bpf_res_spin_lock_irqsave,
+ KF_bpf_res_spin_unlock_irqrestore,
+ KF_bpf_dynptr_from_file,
+ KF_bpf_dynptr_file_discard,
+ KF___bpf_trap,
+ KF_bpf_task_work_schedule_signal_impl,
+ KF_bpf_task_work_schedule_resume_impl,
+};
+
+BTF_ID_LIST(special_kfunc_list)
+BTF_ID(func, bpf_obj_new_impl)
+BTF_ID(func, bpf_obj_drop_impl)
+BTF_ID(func, bpf_refcount_acquire_impl)
+BTF_ID(func, bpf_list_push_front_impl)
+BTF_ID(func, bpf_list_push_back_impl)
+BTF_ID(func, bpf_list_pop_front)
+BTF_ID(func, bpf_list_pop_back)
+BTF_ID(func, bpf_list_front)
+BTF_ID(func, bpf_list_back)
+BTF_ID(func, bpf_cast_to_kern_ctx)
+BTF_ID(func, bpf_rdonly_cast)
+BTF_ID(func, bpf_rcu_read_lock)
+BTF_ID(func, bpf_rcu_read_unlock)
+BTF_ID(func, bpf_rbtree_remove)
+BTF_ID(func, bpf_rbtree_add_impl)
+BTF_ID(func, bpf_rbtree_first)
+BTF_ID(func, bpf_rbtree_root)
+BTF_ID(func, bpf_rbtree_left)
+BTF_ID(func, bpf_rbtree_right)
+#ifdef CONFIG_NET
+BTF_ID(func, bpf_dynptr_from_skb)
+BTF_ID(func, bpf_dynptr_from_xdp)
+BTF_ID(func, bpf_dynptr_from_skb_meta)
+BTF_ID(func, bpf_xdp_pull_data)
+#else
+BTF_ID_UNUSED
+BTF_ID_UNUSED
+BTF_ID_UNUSED
+BTF_ID_UNUSED
+#endif
+BTF_ID(func, bpf_dynptr_slice)
+BTF_ID(func, bpf_dynptr_slice_rdwr)
+BTF_ID(func, bpf_dynptr_clone)
+BTF_ID(func, bpf_percpu_obj_new_impl)
+BTF_ID(func, bpf_percpu_obj_drop_impl)
+BTF_ID(func, bpf_throw)
+BTF_ID(func, bpf_wq_set_callback_impl)
+BTF_ID(func, bpf_preempt_disable)
+BTF_ID(func, bpf_preempt_enable)
+#ifdef CONFIG_CGROUPS
+BTF_ID(func, bpf_iter_css_task_new)
+#else
+BTF_ID_UNUSED
+#endif
+#ifdef CONFIG_BPF_EVENTS
+BTF_ID(func, bpf_session_cookie)
+#else
+BTF_ID_UNUSED
+#endif
+BTF_ID(func, bpf_get_kmem_cache)
+BTF_ID(func, bpf_local_irq_save)
+BTF_ID(func, bpf_local_irq_restore)
+BTF_ID(func, bpf_iter_num_new)
+BTF_ID(func, bpf_iter_num_next)
+BTF_ID(func, bpf_iter_num_destroy)
+#ifdef CONFIG_BPF_LSM
+BTF_ID(func, bpf_set_dentry_xattr)
+BTF_ID(func, bpf_remove_dentry_xattr)
+#else
+BTF_ID_UNUSED
+BTF_ID_UNUSED
+#endif
+BTF_ID(func, bpf_res_spin_lock)
+BTF_ID(func, bpf_res_spin_unlock)
+BTF_ID(func, bpf_res_spin_lock_irqsave)
+BTF_ID(func, bpf_res_spin_unlock_irqrestore)
+BTF_ID(func, bpf_dynptr_from_file)
+BTF_ID(func, bpf_dynptr_file_discard)
+BTF_ID(func, __bpf_trap)
+BTF_ID(func, bpf_task_work_schedule_signal_impl)
+BTF_ID(func, bpf_task_work_schedule_resume_impl)
+
+static bool is_task_work_add_kfunc(u32 func_id)
+{
+ return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal_impl] ||
+ func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume_impl];
+}
+
+static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
+{
+ if (meta->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] &&
+ meta->arg_owning_ref) {
+ return false;
+ }
+
+ return meta->kfunc_flags & KF_RET_NULL;
+}
+
+static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta)
{
- const struct btf_type *t, *func, *func_proto, *ptr_type;
+ return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_lock];
+}
+
+static bool is_kfunc_bpf_rcu_read_unlock(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_unlock];
+}
+
+static bool is_kfunc_bpf_preempt_disable(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->func_id == special_kfunc_list[KF_bpf_preempt_disable];
+}
+
+static bool is_kfunc_bpf_preempt_enable(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->func_id == special_kfunc_list[KF_bpf_preempt_enable];
+}
+
+static bool is_kfunc_pkt_changing(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->func_id == special_kfunc_list[KF_bpf_xdp_pull_data];
+}
+
+static enum kfunc_ptr_arg_type
+get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
+ struct bpf_kfunc_call_arg_meta *meta,
+ const struct btf_type *t, const struct btf_type *ref_t,
+ const char *ref_tname, const struct btf_param *args,
+ int argno, int nargs)
+{
+ u32 regno = argno + 1;
struct bpf_reg_state *regs = cur_regs(env);
- const char *func_name, *ptr_type_name;
- u32 i, nargs, func_id, ptr_type_id;
- struct module *btf_mod = NULL;
+ struct bpf_reg_state *reg = &regs[regno];
+ bool arg_mem_size = false;
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx])
+ return KF_ARG_PTR_TO_CTX;
+
+ /* In this function, we verify the kfunc's BTF as per the argument type,
+ * leaving the rest of the verification with respect to the register
+ * type to our caller. When a set of conditions hold in the BTF type of
+ * arguments, we resolve it to a known kfunc_ptr_arg_type.
+ */
+ if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno))
+ return KF_ARG_PTR_TO_CTX;
+
+ if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg))
+ return KF_ARG_PTR_TO_NULL;
+
+ if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_ALLOC_BTF_ID;
+
+ if (is_kfunc_arg_refcounted_kptr(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_REFCOUNTED_KPTR;
+
+ if (is_kfunc_arg_dynptr(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_DYNPTR;
+
+ if (is_kfunc_arg_iter(meta, argno, &args[argno]))
+ return KF_ARG_PTR_TO_ITER;
+
+ if (is_kfunc_arg_list_head(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_LIST_HEAD;
+
+ if (is_kfunc_arg_list_node(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_LIST_NODE;
+
+ if (is_kfunc_arg_rbtree_root(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_RB_ROOT;
+
+ if (is_kfunc_arg_rbtree_node(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_RB_NODE;
+
+ if (is_kfunc_arg_const_str(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_CONST_STR;
+
+ if (is_kfunc_arg_map(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_MAP;
+
+ if (is_kfunc_arg_wq(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_WORKQUEUE;
+
+ if (is_kfunc_arg_task_work(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_TASK_WORK;
+
+ if (is_kfunc_arg_irq_flag(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_IRQ_FLAG;
+
+ if (is_kfunc_arg_res_spin_lock(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_RES_SPIN_LOCK;
+
+ if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) {
+ if (!btf_type_is_struct(ref_t)) {
+ verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n",
+ meta->func_name, argno, btf_type_str(ref_t), ref_tname);
+ return -EINVAL;
+ }
+ return KF_ARG_PTR_TO_BTF_ID;
+ }
+
+ if (is_kfunc_arg_callback(env, meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_CALLBACK;
+
+ if (argno + 1 < nargs &&
+ (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]) ||
+ is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1])))
+ arg_mem_size = true;
+
+ /* This is the catch all argument type of register types supported by
+ * check_helper_mem_access. However, we only allow when argument type is
+ * pointer to scalar, or struct composed (recursively) of scalars. When
+ * arg_mem_size is true, the pointer can be void *.
+ */
+ if (!btf_type_is_scalar(ref_t) && !__btf_type_is_scalar_struct(env, meta->btf, ref_t, 0) &&
+ (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) {
+ verbose(env, "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n",
+ argno, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : "");
+ return -EINVAL;
+ }
+ return arg_mem_size ? KF_ARG_PTR_TO_MEM_SIZE : KF_ARG_PTR_TO_MEM;
+}
+
+static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ const struct btf_type *ref_t,
+ const char *ref_tname, u32 ref_id,
+ struct bpf_kfunc_call_arg_meta *meta,
+ int argno)
+{
+ const struct btf_type *reg_ref_t;
+ bool strict_type_match = false;
+ const struct btf *reg_btf;
+ const char *reg_ref_tname;
+ bool taking_projection;
+ bool struct_same;
+ u32 reg_ref_id;
+
+ if (base_type(reg->type) == PTR_TO_BTF_ID) {
+ reg_btf = reg->btf;
+ reg_ref_id = reg->btf_id;
+ } else {
+ reg_btf = btf_vmlinux;
+ reg_ref_id = *reg2btf_ids[base_type(reg->type)];
+ }
+
+ /* Enforce strict type matching for calls to kfuncs that are acquiring
+ * or releasing a reference, or are no-cast aliases. We do _not_
+ * enforce strict matching for plain KF_TRUSTED_ARGS kfuncs by default,
+ * as we want to enable BPF programs to pass types that are bitwise
+ * equivalent without forcing them to explicitly cast with something
+ * like bpf_cast_to_kern_ctx().
+ *
+ * For example, say we had a type like the following:
+ *
+ * struct bpf_cpumask {
+ * cpumask_t cpumask;
+ * refcount_t usage;
+ * };
+ *
+ * Note that as specified in <linux/cpumask.h>, cpumask_t is typedef'ed
+ * to a struct cpumask, so it would be safe to pass a struct
+ * bpf_cpumask * to a kfunc expecting a struct cpumask *.
+ *
+ * The philosophy here is similar to how we allow scalars of different
+ * types to be passed to kfuncs as long as the size is the same. The
+ * only difference here is that we're simply allowing
+ * btf_struct_ids_match() to walk the struct at the 0th offset, and
+ * resolve types.
+ */
+ if ((is_kfunc_release(meta) && reg->ref_obj_id) ||
+ btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id))
+ strict_type_match = true;
+
+ WARN_ON_ONCE(is_kfunc_release(meta) &&
+ (reg->off || !tnum_is_const(reg->var_off) ||
+ reg->var_off.value));
+
+ reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, &reg_ref_id);
+ reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off);
+ struct_same = btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match);
+ /* If kfunc is accepting a projection type (ie. __sk_buff), it cannot
+ * actually use it -- it must cast to the underlying type. So we allow
+ * caller to pass in the underlying type.
+ */
+ taking_projection = btf_is_projection_of(ref_tname, reg_ref_tname);
+ if (!taking_projection && !struct_same) {
+ verbose(env, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
+ meta->func_name, argno, btf_type_str(ref_t), ref_tname, argno + 1,
+ btf_type_str(reg_ref_t), reg_ref_tname);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int process_irq_flag(struct bpf_verifier_env *env, int regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ int err, kfunc_class = IRQ_NATIVE_KFUNC;
+ bool irq_save;
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_save] ||
+ meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]) {
+ irq_save = true;
+ if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave])
+ kfunc_class = IRQ_LOCK_KFUNC;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_restore] ||
+ meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) {
+ irq_save = false;
+ if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore])
+ kfunc_class = IRQ_LOCK_KFUNC;
+ } else {
+ verifier_bug(env, "unknown irq flags kfunc");
+ return -EFAULT;
+ }
+
+ if (irq_save) {
+ if (!is_irq_flag_reg_valid_uninit(env, reg)) {
+ verbose(env, "expected uninitialized irq flag as arg#%d\n", regno - 1);
+ return -EINVAL;
+ }
+
+ err = check_mem_access(env, env->insn_idx, regno, 0, BPF_DW, BPF_WRITE, -1, false, false);
+ if (err)
+ return err;
+
+ err = mark_stack_slot_irq_flag(env, meta, reg, env->insn_idx, kfunc_class);
+ if (err)
+ return err;
+ } else {
+ err = is_irq_flag_reg_valid_init(env, reg);
+ if (err) {
+ verbose(env, "expected an initialized irq flag as arg#%d\n", regno - 1);
+ return err;
+ }
+
+ err = mark_irq_flag_read(env, reg);
+ if (err)
+ return err;
+
+ err = unmark_stack_slot_irq_flag(env, reg, kfunc_class);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+
+static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct btf_record *rec = reg_btf_record(reg);
+
+ if (!env->cur_state->active_locks) {
+ verifier_bug(env, "%s w/o active lock", __func__);
+ return -EFAULT;
+ }
+
+ if (type_flag(reg->type) & NON_OWN_REF) {
+ verifier_bug(env, "NON_OWN_REF already set");
+ return -EFAULT;
+ }
+
+ reg->type |= NON_OWN_REF;
+ if (rec->refcount_off >= 0)
+ reg->type |= MEM_RCU;
+
+ return 0;
+}
+
+static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_obj_id)
+{
+ struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_func_state *unused;
+ struct bpf_reg_state *reg;
+ int i;
+
+ if (!ref_obj_id) {
+ verifier_bug(env, "ref_obj_id is zero for owning -> non-owning conversion");
+ return -EFAULT;
+ }
+
+ for (i = 0; i < state->acquired_refs; i++) {
+ if (state->refs[i].id != ref_obj_id)
+ continue;
+
+ /* Clear ref_obj_id here so release_reference doesn't clobber
+ * the whole reg
+ */
+ bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({
+ if (reg->ref_obj_id == ref_obj_id) {
+ reg->ref_obj_id = 0;
+ ref_set_non_owning(env, reg);
+ }
+ }));
+ return 0;
+ }
+
+ verifier_bug(env, "ref state missing for ref_obj_id");
+ return -EFAULT;
+}
+
+/* Implementation details:
+ *
+ * Each register points to some region of memory, which we define as an
+ * allocation. Each allocation may embed a bpf_spin_lock which protects any
+ * special BPF objects (bpf_list_head, bpf_rb_root, etc.) part of the same
+ * allocation. The lock and the data it protects are colocated in the same
+ * memory region.
+ *
+ * Hence, everytime a register holds a pointer value pointing to such
+ * allocation, the verifier preserves a unique reg->id for it.
+ *
+ * The verifier remembers the lock 'ptr' and the lock 'id' whenever
+ * bpf_spin_lock is called.
+ *
+ * To enable this, lock state in the verifier captures two values:
+ * active_lock.ptr = Register's type specific pointer
+ * active_lock.id = A unique ID for each register pointer value
+ *
+ * Currently, PTR_TO_MAP_VALUE and PTR_TO_BTF_ID | MEM_ALLOC are the two
+ * supported register types.
+ *
+ * The active_lock.ptr in case of map values is the reg->map_ptr, and in case of
+ * allocated objects is the reg->btf pointer.
+ *
+ * The active_lock.id is non-unique for maps supporting direct_value_addr, as we
+ * can establish the provenance of the map value statically for each distinct
+ * lookup into such maps. They always contain a single map value hence unique
+ * IDs for each pseudo load pessimizes the algorithm and rejects valid programs.
+ *
+ * So, in case of global variables, they use array maps with max_entries = 1,
+ * hence their active_lock.ptr becomes map_ptr and id = 0 (since they all point
+ * into the same map value as max_entries is 1, as described above).
+ *
+ * In case of inner map lookups, the inner map pointer has same map_ptr as the
+ * outer map pointer (in verifier context), but each lookup into an inner map
+ * assigns a fresh reg->id to the lookup, so while lookups into distinct inner
+ * maps from the same outer map share the same map_ptr as active_lock.ptr, they
+ * will get different reg->id assigned to each lookup, hence different
+ * active_lock.id.
+ *
+ * In case of allocated objects, active_lock.ptr is the reg->btf, and the
+ * reg->id is a unique ID preserved after the NULL pointer check on the pointer
+ * returned from bpf_obj_new. Each allocation receives a new reg->id.
+ */
+static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_reference_state *s;
+ void *ptr;
+ u32 id;
+
+ switch ((int)reg->type) {
+ case PTR_TO_MAP_VALUE:
+ ptr = reg->map_ptr;
+ break;
+ case PTR_TO_BTF_ID | MEM_ALLOC:
+ ptr = reg->btf;
+ break;
+ default:
+ verifier_bug(env, "unknown reg type for lock check");
+ return -EFAULT;
+ }
+ id = reg->id;
+
+ if (!env->cur_state->active_locks)
+ return -EINVAL;
+ s = find_lock_state(env->cur_state, REF_TYPE_LOCK_MASK, id, ptr);
+ if (!s) {
+ verbose(env, "held lock and object are not in the same allocation\n");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static bool is_bpf_list_api_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
+ btf_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
+ btf_id == special_kfunc_list[KF_bpf_list_pop_front] ||
+ btf_id == special_kfunc_list[KF_bpf_list_pop_back] ||
+ btf_id == special_kfunc_list[KF_bpf_list_front] ||
+ btf_id == special_kfunc_list[KF_bpf_list_back];
+}
+
+static bool is_bpf_rbtree_api_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] ||
+ btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
+ btf_id == special_kfunc_list[KF_bpf_rbtree_first] ||
+ btf_id == special_kfunc_list[KF_bpf_rbtree_root] ||
+ btf_id == special_kfunc_list[KF_bpf_rbtree_left] ||
+ btf_id == special_kfunc_list[KF_bpf_rbtree_right];
+}
+
+static bool is_bpf_iter_num_api_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_iter_num_new] ||
+ btf_id == special_kfunc_list[KF_bpf_iter_num_next] ||
+ btf_id == special_kfunc_list[KF_bpf_iter_num_destroy];
+}
+
+static bool is_bpf_graph_api_kfunc(u32 btf_id)
+{
+ return is_bpf_list_api_kfunc(btf_id) || is_bpf_rbtree_api_kfunc(btf_id) ||
+ btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl];
+}
+
+static bool is_bpf_res_spin_lock_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_res_spin_lock] ||
+ btf_id == special_kfunc_list[KF_bpf_res_spin_unlock] ||
+ btf_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave] ||
+ btf_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore];
+}
+
+static bool kfunc_spin_allowed(u32 btf_id)
+{
+ return is_bpf_graph_api_kfunc(btf_id) || is_bpf_iter_num_api_kfunc(btf_id) ||
+ is_bpf_res_spin_lock_kfunc(btf_id);
+}
+
+static bool is_sync_callback_calling_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl];
+}
+
+static bool is_async_callback_calling_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl] ||
+ is_task_work_add_kfunc(btf_id);
+}
+
+static bool is_bpf_throw_kfunc(struct bpf_insn *insn)
+{
+ return bpf_pseudo_kfunc_call(insn) && insn->off == 0 &&
+ insn->imm == special_kfunc_list[KF_bpf_throw];
+}
+
+static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl];
+}
+
+static bool is_callback_calling_kfunc(u32 btf_id)
+{
+ return is_sync_callback_calling_kfunc(btf_id) ||
+ is_async_callback_calling_kfunc(btf_id);
+}
+
+static bool is_rbtree_lock_required_kfunc(u32 btf_id)
+{
+ return is_bpf_rbtree_api_kfunc(btf_id);
+}
+
+static bool check_kfunc_is_graph_root_api(struct bpf_verifier_env *env,
+ enum btf_field_type head_field_type,
+ u32 kfunc_btf_id)
+{
+ bool ret;
+
+ switch (head_field_type) {
+ case BPF_LIST_HEAD:
+ ret = is_bpf_list_api_kfunc(kfunc_btf_id);
+ break;
+ case BPF_RB_ROOT:
+ ret = is_bpf_rbtree_api_kfunc(kfunc_btf_id);
+ break;
+ default:
+ verbose(env, "verifier internal error: unexpected graph root argument type %s\n",
+ btf_field_type_name(head_field_type));
+ return false;
+ }
+
+ if (!ret)
+ verbose(env, "verifier internal error: %s head arg for unknown kfunc\n",
+ btf_field_type_name(head_field_type));
+ return ret;
+}
+
+static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env,
+ enum btf_field_type node_field_type,
+ u32 kfunc_btf_id)
+{
+ bool ret;
+
+ switch (node_field_type) {
+ case BPF_LIST_NODE:
+ ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
+ kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_back_impl]);
+ break;
+ case BPF_RB_NODE:
+ ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
+ kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] ||
+ kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_left] ||
+ kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_right]);
+ break;
+ default:
+ verbose(env, "verifier internal error: unexpected graph node argument type %s\n",
+ btf_field_type_name(node_field_type));
+ return false;
+ }
+
+ if (!ret)
+ verbose(env, "verifier internal error: %s node arg for unknown kfunc\n",
+ btf_field_type_name(node_field_type));
+ return ret;
+}
+
+static int
+__process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta,
+ enum btf_field_type head_field_type,
+ struct btf_field **head_field)
+{
+ const char *head_type_name;
+ struct btf_field *field;
+ struct btf_record *rec;
+ u32 head_off;
+
+ if (meta->btf != btf_vmlinux) {
+ verifier_bug(env, "unexpected btf mismatch in kfunc call");
+ return -EFAULT;
+ }
+
+ if (!check_kfunc_is_graph_root_api(env, head_field_type, meta->func_id))
+ return -EFAULT;
+
+ head_type_name = btf_field_type_name(head_field_type);
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env,
+ "R%d doesn't have constant offset. %s has to be at the constant offset\n",
+ regno, head_type_name);
+ return -EINVAL;
+ }
+
+ rec = reg_btf_record(reg);
+ head_off = reg->off + reg->var_off.value;
+ field = btf_record_find(rec, head_off, head_field_type);
+ if (!field) {
+ verbose(env, "%s not found at offset=%u\n", head_type_name, head_off);
+ return -EINVAL;
+ }
+
+ /* All functions require bpf_list_head to be protected using a bpf_spin_lock */
+ if (check_reg_allocation_locked(env, reg)) {
+ verbose(env, "bpf_spin_lock at off=%d must be held for %s\n",
+ rec->spin_lock_off, head_type_name);
+ return -EINVAL;
+ }
+
+ if (*head_field) {
+ verifier_bug(env, "repeating %s arg", head_type_name);
+ return -EFAULT;
+ }
+ *head_field = field;
+ return 0;
+}
+
+static int process_kf_arg_ptr_to_list_head(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta, BPF_LIST_HEAD,
+ &meta->arg_list_head.field);
+}
+
+static int process_kf_arg_ptr_to_rbtree_root(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta, BPF_RB_ROOT,
+ &meta->arg_rbtree_root.field);
+}
+
+static int
+__process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta,
+ enum btf_field_type head_field_type,
+ enum btf_field_type node_field_type,
+ struct btf_field **node_field)
+{
+ const char *node_type_name;
+ const struct btf_type *et, *t;
+ struct btf_field *field;
+ u32 node_off;
+
+ if (meta->btf != btf_vmlinux) {
+ verifier_bug(env, "unexpected btf mismatch in kfunc call");
+ return -EFAULT;
+ }
+
+ if (!check_kfunc_is_graph_node_api(env, node_field_type, meta->func_id))
+ return -EFAULT;
+
+ node_type_name = btf_field_type_name(node_field_type);
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env,
+ "R%d doesn't have constant offset. %s has to be at the constant offset\n",
+ regno, node_type_name);
+ return -EINVAL;
+ }
+
+ node_off = reg->off + reg->var_off.value;
+ field = reg_find_field_offset(reg, node_off, node_field_type);
+ if (!field) {
+ verbose(env, "%s not found at offset=%u\n", node_type_name, node_off);
+ return -EINVAL;
+ }
+
+ field = *node_field;
+
+ et = btf_type_by_id(field->graph_root.btf, field->graph_root.value_btf_id);
+ t = btf_type_by_id(reg->btf, reg->btf_id);
+ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, 0, field->graph_root.btf,
+ field->graph_root.value_btf_id, true)) {
+ verbose(env, "operation on %s expects arg#1 %s at offset=%d "
+ "in struct %s, but arg is at offset=%d in struct %s\n",
+ btf_field_type_name(head_field_type),
+ btf_field_type_name(node_field_type),
+ field->graph_root.node_offset,
+ btf_name_by_offset(field->graph_root.btf, et->name_off),
+ node_off, btf_name_by_offset(reg->btf, t->name_off));
+ return -EINVAL;
+ }
+ meta->arg_btf = reg->btf;
+ meta->arg_btf_id = reg->btf_id;
+
+ if (node_off != field->graph_root.node_offset) {
+ verbose(env, "arg#1 offset=%d, but expected %s at offset=%d in struct %s\n",
+ node_off, btf_field_type_name(node_field_type),
+ field->graph_root.node_offset,
+ btf_name_by_offset(field->graph_root.btf, et->name_off));
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta,
+ BPF_LIST_HEAD, BPF_LIST_NODE,
+ &meta->arg_list_head.field);
+}
+
+static int process_kf_arg_ptr_to_rbtree_node(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta,
+ BPF_RB_ROOT, BPF_RB_NODE,
+ &meta->arg_rbtree_root.field);
+}
+
+/*
+ * css_task iter allowlist is needed to avoid dead locking on css_set_lock.
+ * LSM hooks and iters (both sleepable and non-sleepable) are safe.
+ * Any sleepable progs are also safe since bpf_check_attach_target() enforce
+ * them can only be attached to some specific hook points.
+ */
+static bool check_css_task_iter_allowlist(struct bpf_verifier_env *env)
+{
+ enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
+
+ switch (prog_type) {
+ case BPF_PROG_TYPE_LSM:
+ return true;
+ case BPF_PROG_TYPE_TRACING:
+ if (env->prog->expected_attach_type == BPF_TRACE_ITER)
+ return true;
+ fallthrough;
+ default:
+ return in_sleepable(env);
+ }
+}
+
+static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta,
+ int insn_idx)
+{
+ const char *func_name = meta->func_name, *ref_tname;
+ const struct btf *btf = meta->btf;
const struct btf_param *args;
+ struct btf_record *rec;
+ u32 i, nargs;
+ int ret;
+
+ args = (const struct btf_param *)(meta->func_proto + 1);
+ nargs = btf_type_vlen(meta->func_proto);
+ if (nargs > MAX_BPF_FUNC_REG_ARGS) {
+ verbose(env, "Function %s has %d > %d args\n", func_name, nargs,
+ MAX_BPF_FUNC_REG_ARGS);
+ return -EINVAL;
+ }
+
+ /* Check that BTF function arguments match actual types that the
+ * verifier sees.
+ */
+ for (i = 0; i < nargs; i++) {
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[i + 1];
+ const struct btf_type *t, *ref_t, *resolve_ret;
+ enum bpf_arg_type arg_type = ARG_DONTCARE;
+ u32 regno = i + 1, ref_id, type_size;
+ bool is_ret_buf_sz = false;
+ int kf_arg_type;
+
+ t = btf_type_skip_modifiers(btf, args[i].type, NULL);
+
+ if (is_kfunc_arg_ignore(btf, &args[i]))
+ continue;
+
+ if (is_kfunc_arg_prog(btf, &args[i])) {
+ /* Used to reject repeated use of __prog. */
+ if (meta->arg_prog) {
+ verifier_bug(env, "Only 1 prog->aux argument supported per-kfunc");
+ return -EFAULT;
+ }
+ meta->arg_prog = true;
+ cur_aux(env)->arg_prog = regno;
+ continue;
+ }
+
+ if (btf_type_is_scalar(t)) {
+ if (reg->type != SCALAR_VALUE) {
+ verbose(env, "R%d is not a scalar\n", regno);
+ return -EINVAL;
+ }
+
+ if (is_kfunc_arg_constant(meta->btf, &args[i])) {
+ if (meta->arg_constant.found) {
+ verifier_bug(env, "only one constant argument permitted");
+ return -EFAULT;
+ }
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "R%d must be a known constant\n", regno);
+ return -EINVAL;
+ }
+ ret = mark_chain_precision(env, regno);
+ if (ret < 0)
+ return ret;
+ meta->arg_constant.found = true;
+ meta->arg_constant.value = reg->var_off.value;
+ } else if (is_kfunc_arg_scalar_with_name(btf, &args[i], "rdonly_buf_size")) {
+ meta->r0_rdonly = true;
+ is_ret_buf_sz = true;
+ } else if (is_kfunc_arg_scalar_with_name(btf, &args[i], "rdwr_buf_size")) {
+ is_ret_buf_sz = true;
+ }
+
+ if (is_ret_buf_sz) {
+ if (meta->r0_size) {
+ verbose(env, "2 or more rdonly/rdwr_buf_size parameters for kfunc");
+ return -EINVAL;
+ }
+
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "R%d is not a const\n", regno);
+ return -EINVAL;
+ }
+
+ meta->r0_size = reg->var_off.value;
+ ret = mark_chain_precision(env, regno);
+ if (ret)
+ return ret;
+ }
+ continue;
+ }
+
+ if (!btf_type_is_ptr(t)) {
+ verbose(env, "Unrecognized arg#%d type %s\n", i, btf_type_str(t));
+ return -EINVAL;
+ }
+
+ if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) &&
+ (register_is_null(reg) || type_may_be_null(reg->type)) &&
+ !is_kfunc_arg_nullable(meta->btf, &args[i])) {
+ verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i);
+ return -EACCES;
+ }
+
+ if (reg->ref_obj_id) {
+ if (is_kfunc_release(meta) && meta->ref_obj_id) {
+ verifier_bug(env, "more than one arg with ref_obj_id R%d %u %u",
+ regno, reg->ref_obj_id,
+ meta->ref_obj_id);
+ return -EFAULT;
+ }
+ meta->ref_obj_id = reg->ref_obj_id;
+ if (is_kfunc_release(meta))
+ meta->release_regno = regno;
+ }
+
+ ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
+ ref_tname = btf_name_by_offset(btf, ref_t->name_off);
+
+ kf_arg_type = get_kfunc_ptr_arg_type(env, meta, t, ref_t, ref_tname, args, i, nargs);
+ if (kf_arg_type < 0)
+ return kf_arg_type;
+
+ switch (kf_arg_type) {
+ case KF_ARG_PTR_TO_NULL:
+ continue;
+ case KF_ARG_PTR_TO_MAP:
+ if (!reg->map_ptr) {
+ verbose(env, "pointer in R%d isn't map pointer\n", regno);
+ return -EINVAL;
+ }
+ if (meta->map.ptr && (reg->map_ptr->record->wq_off >= 0 ||
+ reg->map_ptr->record->task_work_off >= 0)) {
+ /* Use map_uid (which is unique id of inner map) to reject:
+ * inner_map1 = bpf_map_lookup_elem(outer_map, key1)
+ * inner_map2 = bpf_map_lookup_elem(outer_map, key2)
+ * if (inner_map1 && inner_map2) {
+ * wq = bpf_map_lookup_elem(inner_map1);
+ * if (wq)
+ * // mismatch would have been allowed
+ * bpf_wq_init(wq, inner_map2);
+ * }
+ *
+ * Comparing map_ptr is enough to distinguish normal and outer maps.
+ */
+ if (meta->map.ptr != reg->map_ptr ||
+ meta->map.uid != reg->map_uid) {
+ if (reg->map_ptr->record->task_work_off >= 0) {
+ verbose(env,
+ "bpf_task_work pointer in R2 map_uid=%d doesn't match map pointer in R3 map_uid=%d\n",
+ meta->map.uid, reg->map_uid);
+ return -EINVAL;
+ }
+ verbose(env,
+ "workqueue pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n",
+ meta->map.uid, reg->map_uid);
+ return -EINVAL;
+ }
+ }
+ meta->map.ptr = reg->map_ptr;
+ meta->map.uid = reg->map_uid;
+ fallthrough;
+ case KF_ARG_PTR_TO_ALLOC_BTF_ID:
+ case KF_ARG_PTR_TO_BTF_ID:
+ if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta))
+ break;
+
+ if (!is_trusted_reg(reg)) {
+ if (!is_kfunc_rcu(meta)) {
+ verbose(env, "R%d must be referenced or trusted\n", regno);
+ return -EINVAL;
+ }
+ if (!is_rcu_reg(reg)) {
+ verbose(env, "R%d must be a rcu pointer\n", regno);
+ return -EINVAL;
+ }
+ }
+ fallthrough;
+ case KF_ARG_PTR_TO_CTX:
+ case KF_ARG_PTR_TO_DYNPTR:
+ case KF_ARG_PTR_TO_ITER:
+ case KF_ARG_PTR_TO_LIST_HEAD:
+ case KF_ARG_PTR_TO_LIST_NODE:
+ case KF_ARG_PTR_TO_RB_ROOT:
+ case KF_ARG_PTR_TO_RB_NODE:
+ case KF_ARG_PTR_TO_MEM:
+ case KF_ARG_PTR_TO_MEM_SIZE:
+ case KF_ARG_PTR_TO_CALLBACK:
+ case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
+ case KF_ARG_PTR_TO_CONST_STR:
+ case KF_ARG_PTR_TO_WORKQUEUE:
+ case KF_ARG_PTR_TO_TASK_WORK:
+ case KF_ARG_PTR_TO_IRQ_FLAG:
+ case KF_ARG_PTR_TO_RES_SPIN_LOCK:
+ break;
+ default:
+ verifier_bug(env, "unknown kfunc arg type %d", kf_arg_type);
+ return -EFAULT;
+ }
+
+ if (is_kfunc_release(meta) && reg->ref_obj_id)
+ arg_type |= OBJ_RELEASE;
+ ret = check_func_arg_reg_off(env, reg, regno, arg_type);
+ if (ret < 0)
+ return ret;
+
+ switch (kf_arg_type) {
+ case KF_ARG_PTR_TO_CTX:
+ if (reg->type != PTR_TO_CTX) {
+ verbose(env, "arg#%d expected pointer to ctx, but got %s\n",
+ i, reg_type_str(env, reg->type));
+ return -EINVAL;
+ }
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) {
+ ret = get_kern_ctx_btf_id(&env->log, resolve_prog_type(env->prog));
+ if (ret < 0)
+ return -EINVAL;
+ meta->ret_btf_id = ret;
+ }
+ break;
+ case KF_ARG_PTR_TO_ALLOC_BTF_ID:
+ if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ if (meta->func_id != special_kfunc_list[KF_bpf_obj_drop_impl]) {
+ verbose(env, "arg#%d expected for bpf_obj_drop_impl()\n", i);
+ return -EINVAL;
+ }
+ } else if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC | MEM_PERCPU)) {
+ if (meta->func_id != special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) {
+ verbose(env, "arg#%d expected for bpf_percpu_obj_drop_impl()\n", i);
+ return -EINVAL;
+ }
+ } else {
+ verbose(env, "arg#%d expected pointer to allocated object\n", i);
+ return -EINVAL;
+ }
+ if (!reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
+ return -EINVAL;
+ }
+ if (meta->btf == btf_vmlinux) {
+ meta->arg_btf = reg->btf;
+ meta->arg_btf_id = reg->btf_id;
+ }
+ break;
+ case KF_ARG_PTR_TO_DYNPTR:
+ {
+ enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR;
+ int clone_ref_obj_id = 0;
+
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ dynptr_arg_type |= MEM_RDONLY;
+
+ if (is_kfunc_arg_uninit(btf, &args[i]))
+ dynptr_arg_type |= MEM_UNINIT;
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) {
+ dynptr_arg_type |= DYNPTR_TYPE_SKB;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp]) {
+ dynptr_arg_type |= DYNPTR_TYPE_XDP;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb_meta]) {
+ dynptr_arg_type |= DYNPTR_TYPE_SKB_META;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) {
+ dynptr_arg_type |= DYNPTR_TYPE_FILE;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_file_discard]) {
+ dynptr_arg_type |= DYNPTR_TYPE_FILE;
+ meta->release_regno = regno;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] &&
+ (dynptr_arg_type & MEM_UNINIT)) {
+ enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type;
+
+ if (parent_type == BPF_DYNPTR_TYPE_INVALID) {
+ verifier_bug(env, "no dynptr type for parent of clone");
+ return -EFAULT;
+ }
+
+ dynptr_arg_type |= (unsigned int)get_dynptr_type_flag(parent_type);
+ clone_ref_obj_id = meta->initialized_dynptr.ref_obj_id;
+ if (dynptr_type_refcounted(parent_type) && !clone_ref_obj_id) {
+ verifier_bug(env, "missing ref obj id for parent of clone");
+ return -EFAULT;
+ }
+ }
+
+ ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type, clone_ref_obj_id);
+ if (ret < 0)
+ return ret;
+
+ if (!(dynptr_arg_type & MEM_UNINIT)) {
+ int id = dynptr_id(env, reg);
+
+ if (id < 0) {
+ verifier_bug(env, "failed to obtain dynptr id");
+ return id;
+ }
+ meta->initialized_dynptr.id = id;
+ meta->initialized_dynptr.type = dynptr_get_type(env, reg);
+ meta->initialized_dynptr.ref_obj_id = dynptr_ref_obj_id(env, reg);
+ }
+
+ break;
+ }
+ case KF_ARG_PTR_TO_ITER:
+ if (meta->func_id == special_kfunc_list[KF_bpf_iter_css_task_new]) {
+ if (!check_css_task_iter_allowlist(env)) {
+ verbose(env, "css_task_iter is only allowed in bpf_lsm, bpf_iter and sleepable progs\n");
+ return -EINVAL;
+ }
+ }
+ ret = process_iter_arg(env, regno, insn_idx, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_LIST_HEAD:
+ if (reg->type != PTR_TO_MAP_VALUE &&
+ reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d expected pointer to map value or allocated object\n", i);
+ return -EINVAL;
+ }
+ if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
+ return -EINVAL;
+ }
+ ret = process_kf_arg_ptr_to_list_head(env, reg, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_RB_ROOT:
+ if (reg->type != PTR_TO_MAP_VALUE &&
+ reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d expected pointer to map value or allocated object\n", i);
+ return -EINVAL;
+ }
+ if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
+ return -EINVAL;
+ }
+ ret = process_kf_arg_ptr_to_rbtree_root(env, reg, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_LIST_NODE:
+ if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d expected pointer to allocated object\n", i);
+ return -EINVAL;
+ }
+ if (!reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
+ return -EINVAL;
+ }
+ ret = process_kf_arg_ptr_to_list_node(env, reg, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_RB_NODE:
+ if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d expected pointer to allocated object\n", i);
+ return -EINVAL;
+ }
+ if (!reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
+ return -EINVAL;
+ }
+ } else {
+ if (!type_is_non_owning_ref(reg->type) && !reg->ref_obj_id) {
+ verbose(env, "%s can only take non-owning or refcounted bpf_rb_node pointer\n", func_name);
+ return -EINVAL;
+ }
+ if (in_rbtree_lock_required_cb(env)) {
+ verbose(env, "%s not allowed in rbtree cb\n", func_name);
+ return -EINVAL;
+ }
+ }
+
+ ret = process_kf_arg_ptr_to_rbtree_node(env, reg, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_MAP:
+ /* If argument has '__map' suffix expect 'struct bpf_map *' */
+ ref_id = *reg2btf_ids[CONST_PTR_TO_MAP];
+ ref_t = btf_type_by_id(btf_vmlinux, ref_id);
+ ref_tname = btf_name_by_offset(btf, ref_t->name_off);
+ fallthrough;
+ case KF_ARG_PTR_TO_BTF_ID:
+ /* Only base_type is checked, further checks are done here */
+ if ((base_type(reg->type) != PTR_TO_BTF_ID ||
+ (bpf_type_has_unsafe_modifiers(reg->type) && !is_rcu_reg(reg))) &&
+ !reg2btf_ids[base_type(reg->type)]) {
+ verbose(env, "arg#%d is %s ", i, reg_type_str(env, reg->type));
+ verbose(env, "expected %s or socket\n",
+ reg_type_str(env, base_type(reg->type) |
+ (type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS)));
+ return -EINVAL;
+ }
+ ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_MEM:
+ resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
+ if (IS_ERR(resolve_ret)) {
+ verbose(env, "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
+ i, btf_type_str(ref_t), ref_tname, PTR_ERR(resolve_ret));
+ return -EINVAL;
+ }
+ ret = check_mem_reg(env, reg, regno, type_size);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_MEM_SIZE:
+ {
+ struct bpf_reg_state *buff_reg = &regs[regno];
+ const struct btf_param *buff_arg = &args[i];
+ struct bpf_reg_state *size_reg = &regs[regno + 1];
+ const struct btf_param *size_arg = &args[i + 1];
+
+ if (!register_is_null(buff_reg) || !is_kfunc_arg_optional(meta->btf, buff_arg)) {
+ ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1);
+ if (ret < 0) {
+ verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1);
+ return ret;
+ }
+ }
+
+ if (is_kfunc_arg_const_mem_size(meta->btf, size_arg, size_reg)) {
+ if (meta->arg_constant.found) {
+ verifier_bug(env, "only one constant argument permitted");
+ return -EFAULT;
+ }
+ if (!tnum_is_const(size_reg->var_off)) {
+ verbose(env, "R%d must be a known constant\n", regno + 1);
+ return -EINVAL;
+ }
+ meta->arg_constant.found = true;
+ meta->arg_constant.value = size_reg->var_off.value;
+ }
+
+ /* Skip next '__sz' or '__szk' argument */
+ i++;
+ break;
+ }
+ case KF_ARG_PTR_TO_CALLBACK:
+ if (reg->type != PTR_TO_FUNC) {
+ verbose(env, "arg%d expected pointer to func\n", i);
+ return -EINVAL;
+ }
+ meta->subprogno = reg->subprogno;
+ break;
+ case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
+ if (!type_is_ptr_alloc_obj(reg->type)) {
+ verbose(env, "arg#%d is neither owning or non-owning ref\n", i);
+ return -EINVAL;
+ }
+ if (!type_is_non_owning_ref(reg->type))
+ meta->arg_owning_ref = true;
+
+ rec = reg_btf_record(reg);
+ if (!rec) {
+ verifier_bug(env, "Couldn't find btf_record");
+ return -EFAULT;
+ }
+
+ if (rec->refcount_off < 0) {
+ verbose(env, "arg#%d doesn't point to a type with bpf_refcount field\n", i);
+ return -EINVAL;
+ }
+
+ meta->arg_btf = reg->btf;
+ meta->arg_btf_id = reg->btf_id;
+ break;
+ case KF_ARG_PTR_TO_CONST_STR:
+ if (reg->type != PTR_TO_MAP_VALUE) {
+ verbose(env, "arg#%d doesn't point to a const string\n", i);
+ return -EINVAL;
+ }
+ ret = check_reg_const_str(env, reg, regno);
+ if (ret)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_WORKQUEUE:
+ if (reg->type != PTR_TO_MAP_VALUE) {
+ verbose(env, "arg#%d doesn't point to a map value\n", i);
+ return -EINVAL;
+ }
+ ret = process_wq_func(env, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_TASK_WORK:
+ if (reg->type != PTR_TO_MAP_VALUE) {
+ verbose(env, "arg#%d doesn't point to a map value\n", i);
+ return -EINVAL;
+ }
+ ret = process_task_work_func(env, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_IRQ_FLAG:
+ if (reg->type != PTR_TO_STACK) {
+ verbose(env, "arg#%d doesn't point to an irq flag on stack\n", i);
+ return -EINVAL;
+ }
+ ret = process_irq_flag(env, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_RES_SPIN_LOCK:
+ {
+ int flags = PROCESS_RES_LOCK;
+
+ if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d doesn't point to map value or allocated object\n", i);
+ return -EINVAL;
+ }
+
+ if (!is_bpf_res_spin_lock_kfunc(meta->func_id))
+ return -EFAULT;
+ if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock] ||
+ meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave])
+ flags |= PROCESS_SPIN_LOCK;
+ if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave] ||
+ meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore])
+ flags |= PROCESS_LOCK_IRQ;
+ ret = process_spin_lock(env, regno, flags);
+ if (ret < 0)
+ return ret;
+ break;
+ }
+ }
+ }
+
+ if (is_kfunc_release(meta) && !meta->release_regno) {
+ verbose(env, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n",
+ func_name);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int fetch_kfunc_meta(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ struct bpf_kfunc_call_arg_meta *meta,
+ const char **kfunc_name)
+{
+ const struct btf_type *func, *func_proto;
+ u32 func_id, *kfunc_flags;
+ const char *func_name;
struct btf *desc_btf;
- int err;
- /* skip for now, but return error when we find this in fixup_kfunc_call */
+ if (kfunc_name)
+ *kfunc_name = NULL;
+
if (!insn->imm)
- return 0;
+ return -EINVAL;
- desc_btf = find_kfunc_desc_btf(env, insn->imm, insn->off, &btf_mod);
+ desc_btf = find_kfunc_desc_btf(env, insn->off);
if (IS_ERR(desc_btf))
return PTR_ERR(desc_btf);
func_id = insn->imm;
func = btf_type_by_id(desc_btf, func_id);
func_name = btf_name_by_offset(desc_btf, func->name_off);
+ if (kfunc_name)
+ *kfunc_name = func_name;
func_proto = btf_type_by_id(desc_btf, func->type);
- if (!env->ops->check_kfunc_call ||
- !env->ops->check_kfunc_call(func_id, btf_mod)) {
- verbose(env, "calling kernel function %s is not allowed\n",
- func_name);
+ kfunc_flags = btf_kfunc_id_set_contains(desc_btf, func_id, env->prog);
+ if (!kfunc_flags) {
return -EACCES;
}
- /* Check the arguments */
- err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs);
+ memset(meta, 0, sizeof(*meta));
+ meta->btf = desc_btf;
+ meta->func_id = func_id;
+ meta->kfunc_flags = *kfunc_flags;
+ meta->func_proto = func_proto;
+ meta->func_name = func_name;
+
+ return 0;
+}
+
+/* check special kfuncs and return:
+ * 1 - not fall-through to 'else' branch, continue verification
+ * 0 - fall-through to 'else' branch
+ * < 0 - not fall-through to 'else' branch, return error
+ */
+static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta,
+ struct bpf_reg_state *regs, struct bpf_insn_aux_data *insn_aux,
+ const struct btf_type *ptr_type, struct btf *desc_btf)
+{
+ const struct btf_type *ret_t;
+ int err = 0;
+
+ if (meta->btf != btf_vmlinux)
+ return 0;
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
+ meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+ struct btf_struct_meta *struct_meta;
+ struct btf *ret_btf;
+ u32 ret_btf_id;
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set)
+ return -ENOMEM;
+
+ if (((u64)(u32)meta->arg_constant.value) != meta->arg_constant.value) {
+ verbose(env, "local type ID argument must be in range [0, U32_MAX]\n");
+ return -EINVAL;
+ }
+
+ ret_btf = env->prog->aux->btf;
+ ret_btf_id = meta->arg_constant.value;
+
+ /* This may be NULL due to user not supplying a BTF */
+ if (!ret_btf) {
+ verbose(env, "bpf_obj_new/bpf_percpu_obj_new requires prog BTF\n");
+ return -EINVAL;
+ }
+
+ ret_t = btf_type_by_id(ret_btf, ret_btf_id);
+ if (!ret_t || !__btf_type_is_struct(ret_t)) {
+ verbose(env, "bpf_obj_new/bpf_percpu_obj_new type ID argument must be of a struct\n");
+ return -EINVAL;
+ }
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+ if (ret_t->size > BPF_GLOBAL_PERCPU_MA_MAX_SIZE) {
+ verbose(env, "bpf_percpu_obj_new type size (%d) is greater than %d\n",
+ ret_t->size, BPF_GLOBAL_PERCPU_MA_MAX_SIZE);
+ return -EINVAL;
+ }
+
+ if (!bpf_global_percpu_ma_set) {
+ mutex_lock(&bpf_percpu_ma_lock);
+ if (!bpf_global_percpu_ma_set) {
+ /* Charge memory allocated with bpf_global_percpu_ma to
+ * root memcg. The obj_cgroup for root memcg is NULL.
+ */
+ err = bpf_mem_alloc_percpu_init(&bpf_global_percpu_ma, NULL);
+ if (!err)
+ bpf_global_percpu_ma_set = true;
+ }
+ mutex_unlock(&bpf_percpu_ma_lock);
+ if (err)
+ return err;
+ }
+
+ mutex_lock(&bpf_percpu_ma_lock);
+ err = bpf_mem_alloc_percpu_unit_init(&bpf_global_percpu_ma, ret_t->size);
+ mutex_unlock(&bpf_percpu_ma_lock);
+ if (err)
+ return err;
+ }
+
+ struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id);
+ if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+ if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) {
+ verbose(env, "bpf_percpu_obj_new type ID argument must be of a struct of scalars\n");
+ return -EINVAL;
+ }
+
+ if (struct_meta) {
+ verbose(env, "bpf_percpu_obj_new type ID argument must not contain special fields\n");
+ return -EINVAL;
+ }
+ }
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
+ regs[BPF_REG_0].btf = ret_btf;
+ regs[BPF_REG_0].btf_id = ret_btf_id;
+ if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl])
+ regs[BPF_REG_0].type |= MEM_PERCPU;
+
+ insn_aux->obj_new_size = ret_t->size;
+ insn_aux->kptr_struct_meta = struct_meta;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
+ regs[BPF_REG_0].btf = meta->arg_btf;
+ regs[BPF_REG_0].btf_id = meta->arg_btf_id;
+
+ insn_aux->kptr_struct_meta =
+ btf_find_struct_meta(meta->arg_btf,
+ meta->arg_btf_id);
+ } else if (is_list_node_type(ptr_type)) {
+ struct btf_field *field = meta->arg_list_head.field;
+
+ mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root);
+ } else if (is_rbtree_node_type(ptr_type)) {
+ struct btf_field *field = meta->arg_rbtree_root.field;
+
+ mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root);
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED;
+ regs[BPF_REG_0].btf = desc_btf;
+ regs[BPF_REG_0].btf_id = meta->ret_btf_id;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
+ ret_t = btf_type_by_id(desc_btf, meta->arg_constant.value);
+ if (!ret_t) {
+ verbose(env, "Unknown type ID %lld passed to kfunc bpf_rdonly_cast\n",
+ meta->arg_constant.value);
+ return -EINVAL;
+ } else if (btf_type_is_struct(ret_t)) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED;
+ regs[BPF_REG_0].btf = desc_btf;
+ regs[BPF_REG_0].btf_id = meta->arg_constant.value;
+ } else if (btf_type_is_void(ret_t)) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED;
+ regs[BPF_REG_0].mem_size = 0;
+ } else {
+ verbose(env,
+ "kfunc bpf_rdonly_cast type ID argument must be of a struct or void\n");
+ return -EINVAL;
+ }
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice] ||
+ meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) {
+ enum bpf_type_flag type_flag = get_dynptr_type_flag(meta->initialized_dynptr.type);
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+
+ if (!meta->arg_constant.found) {
+ verifier_bug(env, "bpf_dynptr_slice(_rdwr) no constant size");
+ return -EFAULT;
+ }
+
+ regs[BPF_REG_0].mem_size = meta->arg_constant.value;
+
+ /* PTR_MAYBE_NULL will be added when is_kfunc_ret_null is checked */
+ regs[BPF_REG_0].type = PTR_TO_MEM | type_flag;
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice]) {
+ regs[BPF_REG_0].type |= MEM_RDONLY;
+ } else {
+ /* this will set env->seen_direct_write to true */
+ if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) {
+ verbose(env, "the prog does not allow writes to packet data\n");
+ return -EINVAL;
+ }
+ }
+
+ if (!meta->initialized_dynptr.id) {
+ verifier_bug(env, "no dynptr id");
+ return -EFAULT;
+ }
+ regs[BPF_REG_0].dynptr_id = meta->initialized_dynptr.id;
+
+ /* we don't need to set BPF_REG_0's ref obj id
+ * because packet slices are not refcounted (see
+ * dynptr_type_refcounted)
+ */
+ } else {
+ return 0;
+ }
+
+ return 1;
+}
+
+static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name);
+
+static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int *insn_idx_p)
+{
+ bool sleepable, rcu_lock, rcu_unlock, preempt_disable, preempt_enable;
+ u32 i, nargs, ptr_type_id, release_ref_obj_id;
+ struct bpf_reg_state *regs = cur_regs(env);
+ const char *func_name, *ptr_type_name;
+ const struct btf_type *t, *ptr_type;
+ struct bpf_kfunc_call_arg_meta meta;
+ struct bpf_insn_aux_data *insn_aux;
+ int err, insn_idx = *insn_idx_p;
+ const struct btf_param *args;
+ struct btf *desc_btf;
+
+ /* skip for now, but return error when we find this in fixup_kfunc_call */
+ if (!insn->imm)
+ return 0;
+
+ err = fetch_kfunc_meta(env, insn, &meta, &func_name);
+ if (err == -EACCES && func_name)
+ verbose(env, "calling kernel function %s is not allowed\n", func_name);
if (err)
return err;
+ desc_btf = meta.btf;
+ insn_aux = &env->insn_aux_data[insn_idx];
+
+ insn_aux->is_iter_next = is_iter_next_kfunc(&meta);
+
+ if (!insn->off &&
+ (insn->imm == special_kfunc_list[KF_bpf_res_spin_lock] ||
+ insn->imm == special_kfunc_list[KF_bpf_res_spin_lock_irqsave])) {
+ struct bpf_verifier_state *branch;
+ struct bpf_reg_state *regs;
+
+ branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false);
+ if (IS_ERR(branch)) {
+ verbose(env, "failed to push state for failed lock acquisition\n");
+ return PTR_ERR(branch);
+ }
+
+ regs = branch->frame[branch->curframe]->regs;
+
+ /* Clear r0-r5 registers in forked state */
+ for (i = 0; i < CALLER_SAVED_REGS; i++)
+ mark_reg_not_init(env, regs, caller_saved[i]);
+
+ mark_reg_unknown(env, regs, BPF_REG_0);
+ err = __mark_reg_s32_range(env, regs, BPF_REG_0, -MAX_ERRNO, -1);
+ if (err) {
+ verbose(env, "failed to mark s32 range for retval in forked state for lock\n");
+ return err;
+ }
+ __mark_btf_func_reg_size(env, regs, BPF_REG_0, sizeof(u32));
+ } else if (!insn->off && insn->imm == special_kfunc_list[KF___bpf_trap]) {
+ verbose(env, "unexpected __bpf_trap() due to uninitialized variable?\n");
+ return -EFAULT;
+ }
+
+ if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) {
+ verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n");
+ return -EACCES;
+ }
+
+ sleepable = is_kfunc_sleepable(&meta);
+ if (sleepable && !in_sleepable(env)) {
+ verbose(env, "program must be sleepable to call sleepable kfunc %s\n", func_name);
+ return -EACCES;
+ }
+
+ /* Track non-sleepable context for kfuncs, same as for helpers. */
+ if (!in_sleepable_context(env))
+ insn_aux->non_sleepable = true;
+
+ /* Check the arguments */
+ err = check_kfunc_args(env, &meta, insn_idx);
+ if (err < 0)
+ return err;
+
+ if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_rbtree_add_callback_state);
+ if (err) {
+ verbose(env, "kfunc %s#%d failed callback verification\n",
+ func_name, meta.func_id);
+ return err;
+ }
+ }
+
+ if (meta.func_id == special_kfunc_list[KF_bpf_session_cookie]) {
+ meta.r0_size = sizeof(u64);
+ meta.r0_rdonly = false;
+ }
+
+ if (is_bpf_wq_set_callback_impl_kfunc(meta.func_id)) {
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_timer_callback_state);
+ if (err) {
+ verbose(env, "kfunc %s#%d failed callback verification\n",
+ func_name, meta.func_id);
+ return err;
+ }
+ }
+
+ if (is_task_work_add_kfunc(meta.func_id)) {
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_task_work_schedule_callback_state);
+ if (err) {
+ verbose(env, "kfunc %s#%d failed callback verification\n",
+ func_name, meta.func_id);
+ return err;
+ }
+ }
+
+ rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta);
+ rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta);
+
+ preempt_disable = is_kfunc_bpf_preempt_disable(&meta);
+ preempt_enable = is_kfunc_bpf_preempt_enable(&meta);
+
+ if (rcu_lock) {
+ env->cur_state->active_rcu_locks++;
+ } else if (rcu_unlock) {
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
+ u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER);
+
+ if (env->cur_state->active_rcu_locks == 0) {
+ verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name);
+ return -EINVAL;
+ }
+ if (--env->cur_state->active_rcu_locks == 0) {
+ bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, clear_mask, ({
+ if (reg->type & MEM_RCU) {
+ reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL);
+ reg->type |= PTR_UNTRUSTED;
+ }
+ }));
+ }
+ } else if (sleepable && env->cur_state->active_rcu_locks) {
+ verbose(env, "kernel func %s is sleepable within rcu_read_lock region\n", func_name);
+ return -EACCES;
+ }
+
+ if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) {
+ verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n");
+ return -EACCES;
+ }
+
+ if (env->cur_state->active_preempt_locks) {
+ if (preempt_disable) {
+ env->cur_state->active_preempt_locks++;
+ } else if (preempt_enable) {
+ env->cur_state->active_preempt_locks--;
+ } else if (sleepable) {
+ verbose(env, "kernel func %s is sleepable within non-preemptible region\n", func_name);
+ return -EACCES;
+ }
+ } else if (preempt_disable) {
+ env->cur_state->active_preempt_locks++;
+ } else if (preempt_enable) {
+ verbose(env, "unmatched attempt to enable preemption (kernel function %s)\n", func_name);
+ return -EINVAL;
+ }
+
+ if (env->cur_state->active_irq_id && sleepable) {
+ verbose(env, "kernel func %s is sleepable within IRQ-disabled region\n", func_name);
+ return -EACCES;
+ }
+
+ if (is_kfunc_rcu_protected(&meta) && !in_rcu_cs(env)) {
+ verbose(env, "kernel func %s requires RCU critical section protection\n", func_name);
+ return -EACCES;
+ }
+
+ /* In case of release function, we get register number of refcounted
+ * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now.
+ */
+ if (meta.release_regno) {
+ struct bpf_reg_state *reg = &regs[meta.release_regno];
+
+ if (meta.initialized_dynptr.ref_obj_id) {
+ err = unmark_stack_slots_dynptr(env, reg);
+ } else {
+ err = release_reference(env, reg->ref_obj_id);
+ if (err)
+ verbose(env, "kfunc %s#%d reference has not been acquired before\n",
+ func_name, meta.func_id);
+ }
+ if (err)
+ return err;
+ }
+
+ if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
+ meta.func_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
+ meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ release_ref_obj_id = regs[BPF_REG_2].ref_obj_id;
+ insn_aux->insert_off = regs[BPF_REG_2].off;
+ insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id);
+ err = ref_convert_owning_non_owning(env, release_ref_obj_id);
+ if (err) {
+ verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n",
+ func_name, meta.func_id);
+ return err;
+ }
+
+ err = release_reference(env, release_ref_obj_id);
+ if (err) {
+ verbose(env, "kfunc %s#%d reference has not been acquired before\n",
+ func_name, meta.func_id);
+ return err;
+ }
+ }
+
+ if (meta.func_id == special_kfunc_list[KF_bpf_throw]) {
+ if (!bpf_jit_supports_exceptions()) {
+ verbose(env, "JIT does not support calling kfunc %s#%d\n",
+ func_name, meta.func_id);
+ return -ENOTSUPP;
+ }
+ env->seen_exception = true;
+
+ /* In the case of the default callback, the cookie value passed
+ * to bpf_throw becomes the return value of the program.
+ */
+ if (!env->exception_callback_subprog) {
+ err = check_return_code(env, BPF_REG_1, "R1");
+ if (err < 0)
+ return err;
+ }
+ }
for (i = 0; i < CALLER_SAVED_REGS; i++)
mark_reg_not_init(env, regs, caller_saved[i]);
/* Check return type */
- t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL);
+ t = btf_type_skip_modifiers(desc_btf, meta.func_proto->type, NULL);
+
+ if (is_kfunc_acquire(&meta) && !btf_type_is_struct_ptr(meta.btf, t)) {
+ /* Only exception is bpf_obj_new_impl */
+ if (meta.btf != btf_vmlinux ||
+ (meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl] &&
+ meta.func_id != special_kfunc_list[KF_bpf_percpu_obj_new_impl] &&
+ meta.func_id != special_kfunc_list[KF_bpf_refcount_acquire_impl])) {
+ verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n");
+ return -EINVAL;
+ }
+ }
+
if (btf_type_is_scalar(t)) {
mark_reg_unknown(env, regs, BPF_REG_0);
+ if (meta.btf == btf_vmlinux && (meta.func_id == special_kfunc_list[KF_bpf_res_spin_lock] ||
+ meta.func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]))
+ __mark_reg_const_zero(env, &regs[BPF_REG_0]);
mark_btf_func_reg_size(env, BPF_REG_0, t->size);
} else if (btf_type_is_ptr(t)) {
- ptr_type = btf_type_skip_modifiers(desc_btf, t->type,
- &ptr_type_id);
- if (!btf_type_is_struct(ptr_type)) {
- ptr_type_name = btf_name_by_offset(desc_btf,
- ptr_type->name_off);
- verbose(env, "kernel function %s returns pointer type %s %s is not supported\n",
- func_name, btf_type_str(ptr_type),
- ptr_type_name);
- return -EINVAL;
+ ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id);
+ err = check_special_kfunc(env, &meta, regs, insn_aux, ptr_type, desc_btf);
+ if (err) {
+ if (err < 0)
+ return err;
+ } else if (btf_type_is_void(ptr_type)) {
+ /* kfunc returning 'void *' is equivalent to returning scalar */
+ mark_reg_unknown(env, regs, BPF_REG_0);
+ } else if (!__btf_type_is_struct(ptr_type)) {
+ if (!meta.r0_size) {
+ __u32 sz;
+
+ if (!IS_ERR(btf_resolve_size(desc_btf, ptr_type, &sz))) {
+ meta.r0_size = sz;
+ meta.r0_rdonly = true;
+ }
+ }
+ if (!meta.r0_size) {
+ ptr_type_name = btf_name_by_offset(desc_btf,
+ ptr_type->name_off);
+ verbose(env,
+ "kernel function %s returns pointer type %s %s is not supported\n",
+ func_name,
+ btf_type_str(ptr_type),
+ ptr_type_name);
+ return -EINVAL;
+ }
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_MEM;
+ regs[BPF_REG_0].mem_size = meta.r0_size;
+
+ if (meta.r0_rdonly)
+ regs[BPF_REG_0].type |= MEM_RDONLY;
+
+ /* Ensures we don't access the memory after a release_reference() */
+ if (meta.ref_obj_id)
+ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
+
+ if (is_kfunc_rcu_protected(&meta))
+ regs[BPF_REG_0].type |= MEM_RCU;
+ } else {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].btf = desc_btf;
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID;
+ regs[BPF_REG_0].btf_id = ptr_type_id;
+
+ if (meta.func_id == special_kfunc_list[KF_bpf_get_kmem_cache])
+ regs[BPF_REG_0].type |= PTR_UNTRUSTED;
+ else if (is_kfunc_rcu_protected(&meta))
+ regs[BPF_REG_0].type |= MEM_RCU;
+
+ if (is_iter_next_kfunc(&meta)) {
+ struct bpf_reg_state *cur_iter;
+
+ cur_iter = get_iter_from_state(env->cur_state, &meta);
+
+ if (cur_iter->type & MEM_RCU) /* KF_RCU_PROTECTED */
+ regs[BPF_REG_0].type |= MEM_RCU;
+ else
+ regs[BPF_REG_0].type |= PTR_TRUSTED;
+ }
+ }
+
+ if (is_kfunc_ret_null(&meta)) {
+ regs[BPF_REG_0].type |= PTR_MAYBE_NULL;
+ /* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */
+ regs[BPF_REG_0].id = ++env->id_gen;
}
- mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].btf = desc_btf;
- regs[BPF_REG_0].type = PTR_TO_BTF_ID;
- regs[BPF_REG_0].btf_id = ptr_type_id;
mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *));
- } /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */
+ if (is_kfunc_acquire(&meta)) {
+ int id = acquire_reference(env, insn_idx);
+
+ if (id < 0)
+ return id;
+ if (is_kfunc_ret_null(&meta))
+ regs[BPF_REG_0].id = id;
+ regs[BPF_REG_0].ref_obj_id = id;
+ } else if (is_rbtree_node_type(ptr_type) || is_list_node_type(ptr_type)) {
+ ref_set_non_owning(env, &regs[BPF_REG_0]);
+ }
- nargs = btf_type_vlen(func_proto);
- args = (const struct btf_param *)(func_proto + 1);
+ if (reg_may_point_to_spin_lock(&regs[BPF_REG_0]) && !regs[BPF_REG_0].id)
+ regs[BPF_REG_0].id = ++env->id_gen;
+ } else if (btf_type_is_void(t)) {
+ if (meta.btf == btf_vmlinux) {
+ if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl] ||
+ meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) {
+ insn_aux->kptr_struct_meta =
+ btf_find_struct_meta(meta.arg_btf,
+ meta.arg_btf_id);
+ }
+ }
+ }
+
+ if (is_kfunc_pkt_changing(&meta))
+ clear_all_pkt_pointers(env);
+
+ nargs = btf_type_vlen(meta.func_proto);
+ args = (const struct btf_param *)(meta.func_proto + 1);
for (i = 0; i < nargs; i++) {
u32 regno = i + 1;
@@ -6917,47 +14284,13 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn)
mark_btf_func_reg_size(env, regno, t->size);
}
- return 0;
-}
-
-static bool signed_add_overflows(s64 a, s64 b)
-{
- /* Do the add in u64, where overflow is well-defined */
- s64 res = (s64)((u64)a + (u64)b);
-
- if (b < 0)
- return res > a;
- return res < a;
-}
-
-static bool signed_add32_overflows(s32 a, s32 b)
-{
- /* Do the add in u32, where overflow is well-defined */
- s32 res = (s32)((u32)a + (u32)b);
-
- if (b < 0)
- return res > a;
- return res < a;
-}
-
-static bool signed_sub_overflows(s64 a, s64 b)
-{
- /* Do the sub in u64, where overflow is well-defined */
- s64 res = (s64)((u64)a - (u64)b);
-
- if (b < 0)
- return res < a;
- return res > a;
-}
-
-static bool signed_sub32_overflows(s32 a, s32 b)
-{
- /* Do the sub in u32, where overflow is well-defined */
- s32 res = (s32)((u32)a - (u32)b);
+ if (is_iter_next_kfunc(&meta)) {
+ err = process_iter_next_call(env, insn_idx, &meta);
+ if (err)
+ return err;
+ }
- if (b < 0)
- return res < a;
- return res > a;
+ return 0;
}
static bool check_reg_sane_offset(struct bpf_verifier_env *env,
@@ -6995,11 +14328,6 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env,
return true;
}
-static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env)
-{
- return &env->insn_aux_data[env->insn_idx];
-}
-
enum {
REASON_BOUNDS = -1,
REASON_TYPE = -2,
@@ -7042,7 +14370,9 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env,
const struct bpf_insn *insn)
{
- return env->bypass_spec_v1 || BPF_SRC(insn->code) == BPF_K;
+ return env->bypass_spec_v1 ||
+ BPF_SRC(insn->code) == BPF_K ||
+ cur_aux(env)->nospec;
}
static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux,
@@ -7083,16 +14413,15 @@ struct bpf_sanitize_info {
bool mask_to_left;
};
-static struct bpf_verifier_state *
-sanitize_speculative_path(struct bpf_verifier_env *env,
- const struct bpf_insn *insn,
- u32 next_idx, u32 curr_idx)
+static int sanitize_speculative_path(struct bpf_verifier_env *env,
+ const struct bpf_insn *insn,
+ u32 next_idx, u32 curr_idx)
{
struct bpf_verifier_state *branch;
struct bpf_reg_state *regs;
branch = push_stack(env, next_idx, curr_idx, true);
- if (branch && insn) {
+ if (!IS_ERR(branch) && insn) {
regs = branch->frame[branch->curframe]->regs;
if (BPF_SRC(insn->code) == BPF_K) {
mark_reg_unknown(env, regs, insn->dst_reg);
@@ -7101,7 +14430,7 @@ sanitize_speculative_path(struct bpf_verifier_env *env,
mark_reg_unknown(env, regs, insn->src_reg);
}
}
- return branch;
+ return PTR_ERR_OR_ZERO(branch);
}
static int sanitize_ptr_alu(struct bpf_verifier_env *env,
@@ -7120,7 +14449,6 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
u8 opcode = BPF_OP(insn->code);
u32 alu_state, alu_limit;
struct bpf_reg_state tmp;
- bool ret;
int err;
if (can_skip_alu_sanitation(env, insn))
@@ -7191,13 +14519,14 @@ do_sim:
*/
if (!ptr_is_dst_reg) {
tmp = *dst_reg;
- *dst_reg = *ptr_reg;
+ copy_register_state(dst_reg, ptr_reg);
}
- ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1,
- env->insn_idx);
- if (!ptr_is_dst_reg && ret)
+ err = sanitize_speculative_path(env, NULL, env->insn_idx + 1, env->insn_idx);
+ if (err < 0)
+ return REASON_STACK;
+ if (!ptr_is_dst_reg)
*dst_reg = tmp;
- return !ret ? REASON_STACK : 0;
+ return 0;
}
static void sanitize_mark_insn_seen(struct bpf_verifier_env *env)
@@ -7242,10 +14571,9 @@ static int sanitize_err(struct bpf_verifier_env *env,
case REASON_STACK:
verbose(env, "R%d could not be pushed for speculative verification, %s\n",
dst, err);
- break;
+ return -ENOMEM;
default:
- verbose(env, "verifier internal error: unknown reason (%d)\n",
- reason);
+ verifier_bug(env, "unknown reason (%d)", reason);
break;
}
@@ -7305,14 +14633,14 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env,
return -EACCES;
break;
case PTR_TO_MAP_VALUE:
- if (check_map_access(env, dst, dst_reg->off, 1, false)) {
+ if (check_map_access(env, dst, dst_reg->off, 1, false, ACCESS_HELPER)) {
verbose(env, "R%d pointer arithmetic of map value goes out of range, "
"prohibited for !root\n", dst);
return -EACCES;
}
break;
default:
- break;
+ return -EOPNOTSUPP;
}
return 0;
@@ -7339,7 +14667,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
struct bpf_sanitize_info info = {};
u8 opcode = BPF_OP(insn->code);
u32 dst = insn->dst_reg;
- int ret;
+ int ret, bounds_ret;
dst_reg = &regs[dst];
@@ -7371,22 +14699,40 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
return -EACCES;
}
+ /*
+ * Accesses to untrusted PTR_TO_MEM are done through probe
+ * instructions, hence no need to track offsets.
+ */
+ if (base_type(ptr_reg->type) == PTR_TO_MEM && (ptr_reg->type & PTR_UNTRUSTED))
+ return 0;
+
switch (base_type(ptr_reg->type)) {
+ case PTR_TO_CTX:
+ case PTR_TO_MAP_VALUE:
+ case PTR_TO_MAP_KEY:
+ case PTR_TO_STACK:
+ case PTR_TO_PACKET_META:
+ case PTR_TO_PACKET:
+ case PTR_TO_TP_BUFFER:
+ case PTR_TO_BTF_ID:
+ case PTR_TO_MEM:
+ case PTR_TO_BUF:
+ case PTR_TO_FUNC:
+ case CONST_PTR_TO_DYNPTR:
+ break;
+ case PTR_TO_FLOW_KEYS:
+ if (known)
+ break;
+ fallthrough;
case CONST_PTR_TO_MAP:
/* smin_val represents the known value */
if (known && smin_val == 0 && opcode == BPF_ADD)
break;
fallthrough;
- case PTR_TO_PACKET_END:
- case PTR_TO_SOCKET:
- case PTR_TO_SOCK_COMMON:
- case PTR_TO_TCP_SOCK:
- case PTR_TO_XDP_SOCK:
+ default:
verbose(env, "R%d pointer arithmetic on %s prohibited\n",
dst, reg_type_str(env, ptr_reg->type));
return -EACCES;
- default:
- break;
}
/* In case of 'scalar += pointer', dst_reg inherits pointer type and id.
@@ -7435,21 +14781,15 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
* added into the variable offset, and we copy the fixed offset
* from ptr_reg.
*/
- if (signed_add_overflows(smin_ptr, smin_val) ||
- signed_add_overflows(smax_ptr, smax_val)) {
+ if (check_add_overflow(smin_ptr, smin_val, &dst_reg->smin_value) ||
+ check_add_overflow(smax_ptr, smax_val, &dst_reg->smax_value)) {
dst_reg->smin_value = S64_MIN;
dst_reg->smax_value = S64_MAX;
- } else {
- dst_reg->smin_value = smin_ptr + smin_val;
- dst_reg->smax_value = smax_ptr + smax_val;
}
- if (umin_ptr + umin_val < umin_ptr ||
- umax_ptr + umax_val < umax_ptr) {
+ if (check_add_overflow(umin_ptr, umin_val, &dst_reg->umin_value) ||
+ check_add_overflow(umax_ptr, umax_val, &dst_reg->umax_value)) {
dst_reg->umin_value = 0;
dst_reg->umax_value = U64_MAX;
- } else {
- dst_reg->umin_value = umin_ptr + umin_val;
- dst_reg->umax_value = umax_ptr + umax_val;
}
dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off);
dst_reg->off = ptr_reg->off;
@@ -7492,14 +14832,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
/* A new variable offset is created. If the subtrahend is known
* nonnegative, then any reg->range we had before is still good.
*/
- if (signed_sub_overflows(smin_ptr, smax_val) ||
- signed_sub_overflows(smax_ptr, smin_val)) {
+ if (check_sub_overflow(smin_ptr, smax_val, &dst_reg->smin_value) ||
+ check_sub_overflow(smax_ptr, smin_val, &dst_reg->smax_value)) {
/* Overflow possible, we know nothing */
dst_reg->smin_value = S64_MIN;
dst_reg->smax_value = S64_MAX;
- } else {
- dst_reg->smin_value = smin_ptr - smax_val;
- dst_reg->smax_value = smax_ptr - smin_val;
}
if (umin_ptr < umax_val) {
/* Overflow possible, we know nothing */
@@ -7536,16 +14873,20 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type))
return -EINVAL;
-
- __update_reg_bounds(dst_reg);
- __reg_deduce_bounds(dst_reg);
- __reg_bound_offset(dst_reg);
-
- if (sanitize_check_bounds(env, insn, dst_reg) < 0)
- return -EACCES;
+ reg_bounds_sync(dst_reg);
+ bounds_ret = sanitize_check_bounds(env, insn, dst_reg);
+ if (bounds_ret == -EACCES)
+ return bounds_ret;
if (sanitize_needed(opcode)) {
ret = sanitize_ptr_alu(env, insn, dst_reg, off_reg, dst_reg,
&info, true);
+ if (verifier_bug_if(!can_skip_alu_sanitation(env, insn)
+ && !env->cur_state->speculative
+ && bounds_ret
+ && !ret,
+ env, "Pointer type unsupported by sanitize_check_bounds() not rejected by retrieve_ptr_limit() as required")) {
+ return -EFAULT;
+ }
if (ret < 0)
return sanitize_err(env, insn, ret, off_reg, dst_reg);
}
@@ -7556,172 +14897,182 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
static void scalar32_min_max_add(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s32 smin_val = src_reg->s32_min_value;
- s32 smax_val = src_reg->s32_max_value;
+ s32 *dst_smin = &dst_reg->s32_min_value;
+ s32 *dst_smax = &dst_reg->s32_max_value;
+ u32 *dst_umin = &dst_reg->u32_min_value;
+ u32 *dst_umax = &dst_reg->u32_max_value;
u32 umin_val = src_reg->u32_min_value;
u32 umax_val = src_reg->u32_max_value;
+ bool min_overflow, max_overflow;
- if (signed_add32_overflows(dst_reg->s32_min_value, smin_val) ||
- signed_add32_overflows(dst_reg->s32_max_value, smax_val)) {
- dst_reg->s32_min_value = S32_MIN;
- dst_reg->s32_max_value = S32_MAX;
- } else {
- dst_reg->s32_min_value += smin_val;
- dst_reg->s32_max_value += smax_val;
+ if (check_add_overflow(*dst_smin, src_reg->s32_min_value, dst_smin) ||
+ check_add_overflow(*dst_smax, src_reg->s32_max_value, dst_smax)) {
+ *dst_smin = S32_MIN;
+ *dst_smax = S32_MAX;
}
- if (dst_reg->u32_min_value + umin_val < umin_val ||
- dst_reg->u32_max_value + umax_val < umax_val) {
- dst_reg->u32_min_value = 0;
- dst_reg->u32_max_value = U32_MAX;
- } else {
- dst_reg->u32_min_value += umin_val;
- dst_reg->u32_max_value += umax_val;
+
+ /* If either all additions overflow or no additions overflow, then
+ * it is okay to set: dst_umin = dst_umin + src_umin, dst_umax =
+ * dst_umax + src_umax. Otherwise (some additions overflow), set
+ * the output bounds to unbounded.
+ */
+ min_overflow = check_add_overflow(*dst_umin, umin_val, dst_umin);
+ max_overflow = check_add_overflow(*dst_umax, umax_val, dst_umax);
+
+ if (!min_overflow && max_overflow) {
+ *dst_umin = 0;
+ *dst_umax = U32_MAX;
}
}
static void scalar_min_max_add(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s64 smin_val = src_reg->smin_value;
- s64 smax_val = src_reg->smax_value;
+ s64 *dst_smin = &dst_reg->smin_value;
+ s64 *dst_smax = &dst_reg->smax_value;
+ u64 *dst_umin = &dst_reg->umin_value;
+ u64 *dst_umax = &dst_reg->umax_value;
u64 umin_val = src_reg->umin_value;
u64 umax_val = src_reg->umax_value;
+ bool min_overflow, max_overflow;
- if (signed_add_overflows(dst_reg->smin_value, smin_val) ||
- signed_add_overflows(dst_reg->smax_value, smax_val)) {
- dst_reg->smin_value = S64_MIN;
- dst_reg->smax_value = S64_MAX;
- } else {
- dst_reg->smin_value += smin_val;
- dst_reg->smax_value += smax_val;
+ if (check_add_overflow(*dst_smin, src_reg->smin_value, dst_smin) ||
+ check_add_overflow(*dst_smax, src_reg->smax_value, dst_smax)) {
+ *dst_smin = S64_MIN;
+ *dst_smax = S64_MAX;
}
- if (dst_reg->umin_value + umin_val < umin_val ||
- dst_reg->umax_value + umax_val < umax_val) {
- dst_reg->umin_value = 0;
- dst_reg->umax_value = U64_MAX;
- } else {
- dst_reg->umin_value += umin_val;
- dst_reg->umax_value += umax_val;
+
+ /* If either all additions overflow or no additions overflow, then
+ * it is okay to set: dst_umin = dst_umin + src_umin, dst_umax =
+ * dst_umax + src_umax. Otherwise (some additions overflow), set
+ * the output bounds to unbounded.
+ */
+ min_overflow = check_add_overflow(*dst_umin, umin_val, dst_umin);
+ max_overflow = check_add_overflow(*dst_umax, umax_val, dst_umax);
+
+ if (!min_overflow && max_overflow) {
+ *dst_umin = 0;
+ *dst_umax = U64_MAX;
}
}
static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s32 smin_val = src_reg->s32_min_value;
- s32 smax_val = src_reg->s32_max_value;
+ s32 *dst_smin = &dst_reg->s32_min_value;
+ s32 *dst_smax = &dst_reg->s32_max_value;
+ u32 *dst_umin = &dst_reg->u32_min_value;
+ u32 *dst_umax = &dst_reg->u32_max_value;
u32 umin_val = src_reg->u32_min_value;
u32 umax_val = src_reg->u32_max_value;
+ bool min_underflow, max_underflow;
- if (signed_sub32_overflows(dst_reg->s32_min_value, smax_val) ||
- signed_sub32_overflows(dst_reg->s32_max_value, smin_val)) {
+ if (check_sub_overflow(*dst_smin, src_reg->s32_max_value, dst_smin) ||
+ check_sub_overflow(*dst_smax, src_reg->s32_min_value, dst_smax)) {
/* Overflow possible, we know nothing */
- dst_reg->s32_min_value = S32_MIN;
- dst_reg->s32_max_value = S32_MAX;
- } else {
- dst_reg->s32_min_value -= smax_val;
- dst_reg->s32_max_value -= smin_val;
+ *dst_smin = S32_MIN;
+ *dst_smax = S32_MAX;
}
- if (dst_reg->u32_min_value < umax_val) {
- /* Overflow possible, we know nothing */
- dst_reg->u32_min_value = 0;
- dst_reg->u32_max_value = U32_MAX;
- } else {
- /* Cannot overflow (as long as bounds are consistent) */
- dst_reg->u32_min_value -= umax_val;
- dst_reg->u32_max_value -= umin_val;
+
+ /* If either all subtractions underflow or no subtractions
+ * underflow, it is okay to set: dst_umin = dst_umin - src_umax,
+ * dst_umax = dst_umax - src_umin. Otherwise (some subtractions
+ * underflow), set the output bounds to unbounded.
+ */
+ min_underflow = check_sub_overflow(*dst_umin, umax_val, dst_umin);
+ max_underflow = check_sub_overflow(*dst_umax, umin_val, dst_umax);
+
+ if (min_underflow && !max_underflow) {
+ *dst_umin = 0;
+ *dst_umax = U32_MAX;
}
}
static void scalar_min_max_sub(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s64 smin_val = src_reg->smin_value;
- s64 smax_val = src_reg->smax_value;
+ s64 *dst_smin = &dst_reg->smin_value;
+ s64 *dst_smax = &dst_reg->smax_value;
+ u64 *dst_umin = &dst_reg->umin_value;
+ u64 *dst_umax = &dst_reg->umax_value;
u64 umin_val = src_reg->umin_value;
u64 umax_val = src_reg->umax_value;
+ bool min_underflow, max_underflow;
- if (signed_sub_overflows(dst_reg->smin_value, smax_val) ||
- signed_sub_overflows(dst_reg->smax_value, smin_val)) {
+ if (check_sub_overflow(*dst_smin, src_reg->smax_value, dst_smin) ||
+ check_sub_overflow(*dst_smax, src_reg->smin_value, dst_smax)) {
/* Overflow possible, we know nothing */
- dst_reg->smin_value = S64_MIN;
- dst_reg->smax_value = S64_MAX;
- } else {
- dst_reg->smin_value -= smax_val;
- dst_reg->smax_value -= smin_val;
+ *dst_smin = S64_MIN;
+ *dst_smax = S64_MAX;
}
- if (dst_reg->umin_value < umax_val) {
- /* Overflow possible, we know nothing */
- dst_reg->umin_value = 0;
- dst_reg->umax_value = U64_MAX;
- } else {
- /* Cannot overflow (as long as bounds are consistent) */
- dst_reg->umin_value -= umax_val;
- dst_reg->umax_value -= umin_val;
+
+ /* If either all subtractions underflow or no subtractions
+ * underflow, it is okay to set: dst_umin = dst_umin - src_umax,
+ * dst_umax = dst_umax - src_umin. Otherwise (some subtractions
+ * underflow), set the output bounds to unbounded.
+ */
+ min_underflow = check_sub_overflow(*dst_umin, umax_val, dst_umin);
+ max_underflow = check_sub_overflow(*dst_umax, umin_val, dst_umax);
+
+ if (min_underflow && !max_underflow) {
+ *dst_umin = 0;
+ *dst_umax = U64_MAX;
}
}
static void scalar32_min_max_mul(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s32 smin_val = src_reg->s32_min_value;
- u32 umin_val = src_reg->u32_min_value;
- u32 umax_val = src_reg->u32_max_value;
+ s32 *dst_smin = &dst_reg->s32_min_value;
+ s32 *dst_smax = &dst_reg->s32_max_value;
+ u32 *dst_umin = &dst_reg->u32_min_value;
+ u32 *dst_umax = &dst_reg->u32_max_value;
+ s32 tmp_prod[4];
- if (smin_val < 0 || dst_reg->s32_min_value < 0) {
- /* Ain't nobody got time to multiply that sign */
- __mark_reg32_unbounded(dst_reg);
- return;
- }
- /* Both values are positive, so we can work with unsigned and
- * copy the result to signed (unless it exceeds S32_MAX).
- */
- if (umax_val > U16_MAX || dst_reg->u32_max_value > U16_MAX) {
- /* Potential overflow, we know nothing */
- __mark_reg32_unbounded(dst_reg);
- return;
+ if (check_mul_overflow(*dst_umax, src_reg->u32_max_value, dst_umax) ||
+ check_mul_overflow(*dst_umin, src_reg->u32_min_value, dst_umin)) {
+ /* Overflow possible, we know nothing */
+ *dst_umin = 0;
+ *dst_umax = U32_MAX;
}
- dst_reg->u32_min_value *= umin_val;
- dst_reg->u32_max_value *= umax_val;
- if (dst_reg->u32_max_value > S32_MAX) {
+ if (check_mul_overflow(*dst_smin, src_reg->s32_min_value, &tmp_prod[0]) ||
+ check_mul_overflow(*dst_smin, src_reg->s32_max_value, &tmp_prod[1]) ||
+ check_mul_overflow(*dst_smax, src_reg->s32_min_value, &tmp_prod[2]) ||
+ check_mul_overflow(*dst_smax, src_reg->s32_max_value, &tmp_prod[3])) {
/* Overflow possible, we know nothing */
- dst_reg->s32_min_value = S32_MIN;
- dst_reg->s32_max_value = S32_MAX;
+ *dst_smin = S32_MIN;
+ *dst_smax = S32_MAX;
} else {
- dst_reg->s32_min_value = dst_reg->u32_min_value;
- dst_reg->s32_max_value = dst_reg->u32_max_value;
+ *dst_smin = min_array(tmp_prod, 4);
+ *dst_smax = max_array(tmp_prod, 4);
}
}
static void scalar_min_max_mul(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s64 smin_val = src_reg->smin_value;
- u64 umin_val = src_reg->umin_value;
- u64 umax_val = src_reg->umax_value;
+ s64 *dst_smin = &dst_reg->smin_value;
+ s64 *dst_smax = &dst_reg->smax_value;
+ u64 *dst_umin = &dst_reg->umin_value;
+ u64 *dst_umax = &dst_reg->umax_value;
+ s64 tmp_prod[4];
- if (smin_val < 0 || dst_reg->smin_value < 0) {
- /* Ain't nobody got time to multiply that sign */
- __mark_reg64_unbounded(dst_reg);
- return;
- }
- /* Both values are positive, so we can work with unsigned and
- * copy the result to signed (unless it exceeds S64_MAX).
- */
- if (umax_val > U32_MAX || dst_reg->umax_value > U32_MAX) {
- /* Potential overflow, we know nothing */
- __mark_reg64_unbounded(dst_reg);
- return;
+ if (check_mul_overflow(*dst_umax, src_reg->umax_value, dst_umax) ||
+ check_mul_overflow(*dst_umin, src_reg->umin_value, dst_umin)) {
+ /* Overflow possible, we know nothing */
+ *dst_umin = 0;
+ *dst_umax = U64_MAX;
}
- dst_reg->umin_value *= umin_val;
- dst_reg->umax_value *= umax_val;
- if (dst_reg->umax_value > S64_MAX) {
+ if (check_mul_overflow(*dst_smin, src_reg->smin_value, &tmp_prod[0]) ||
+ check_mul_overflow(*dst_smin, src_reg->smax_value, &tmp_prod[1]) ||
+ check_mul_overflow(*dst_smax, src_reg->smin_value, &tmp_prod[2]) ||
+ check_mul_overflow(*dst_smax, src_reg->smax_value, &tmp_prod[3])) {
/* Overflow possible, we know nothing */
- dst_reg->smin_value = S64_MIN;
- dst_reg->smax_value = S64_MAX;
+ *dst_smin = S64_MIN;
+ *dst_smax = S64_MAX;
} else {
- dst_reg->smin_value = dst_reg->umin_value;
- dst_reg->smax_value = dst_reg->umax_value;
+ *dst_smin = min_array(tmp_prod, 4);
+ *dst_smax = max_array(tmp_prod, 4);
}
}
@@ -7731,7 +15082,6 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
bool src_known = tnum_subreg_is_const(src_reg->var_off);
bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
struct tnum var32_off = tnum_subreg(dst_reg->var_off);
- s32 smin_val = src_reg->s32_min_value;
u32 umax_val = src_reg->u32_max_value;
if (src_known && dst_known) {
@@ -7744,18 +15094,16 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
*/
dst_reg->u32_min_value = var32_off.value;
dst_reg->u32_max_value = min(dst_reg->u32_max_value, umax_val);
- if (dst_reg->s32_min_value < 0 || smin_val < 0) {
- /* Lose signed bounds when ANDing negative numbers,
- * ain't nobody got time for that.
- */
- dst_reg->s32_min_value = S32_MIN;
- dst_reg->s32_max_value = S32_MAX;
- } else {
- /* ANDing two positives gives a positive, so safe to
- * cast result into s64.
- */
+
+ /* Safe to set s32 bounds by casting u32 result into s32 when u32
+ * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded.
+ */
+ if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) {
dst_reg->s32_min_value = dst_reg->u32_min_value;
dst_reg->s32_max_value = dst_reg->u32_max_value;
+ } else {
+ dst_reg->s32_min_value = S32_MIN;
+ dst_reg->s32_max_value = S32_MAX;
}
}
@@ -7764,7 +15112,6 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg,
{
bool src_known = tnum_is_const(src_reg->var_off);
bool dst_known = tnum_is_const(dst_reg->var_off);
- s64 smin_val = src_reg->smin_value;
u64 umax_val = src_reg->umax_value;
if (src_known && dst_known) {
@@ -7777,18 +15124,16 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg,
*/
dst_reg->umin_value = dst_reg->var_off.value;
dst_reg->umax_value = min(dst_reg->umax_value, umax_val);
- if (dst_reg->smin_value < 0 || smin_val < 0) {
- /* Lose signed bounds when ANDing negative numbers,
- * ain't nobody got time for that.
- */
- dst_reg->smin_value = S64_MIN;
- dst_reg->smax_value = S64_MAX;
- } else {
- /* ANDing two positives gives a positive, so safe to
- * cast result into s64.
- */
+
+ /* Safe to set s64 bounds by casting u64 result into s64 when u64
+ * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded.
+ */
+ if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) {
dst_reg->smin_value = dst_reg->umin_value;
dst_reg->smax_value = dst_reg->umax_value;
+ } else {
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
}
/* We may learn something more from the var_off */
__update_reg_bounds(dst_reg);
@@ -7800,7 +15145,6 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg,
bool src_known = tnum_subreg_is_const(src_reg->var_off);
bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
struct tnum var32_off = tnum_subreg(dst_reg->var_off);
- s32 smin_val = src_reg->s32_min_value;
u32 umin_val = src_reg->u32_min_value;
if (src_known && dst_known) {
@@ -7813,18 +15157,16 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg,
*/
dst_reg->u32_min_value = max(dst_reg->u32_min_value, umin_val);
dst_reg->u32_max_value = var32_off.value | var32_off.mask;
- if (dst_reg->s32_min_value < 0 || smin_val < 0) {
- /* Lose signed bounds when ORing negative numbers,
- * ain't nobody got time for that.
- */
- dst_reg->s32_min_value = S32_MIN;
- dst_reg->s32_max_value = S32_MAX;
- } else {
- /* ORing two positives gives a positive, so safe to
- * cast result into s64.
- */
+
+ /* Safe to set s32 bounds by casting u32 result into s32 when u32
+ * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded.
+ */
+ if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) {
dst_reg->s32_min_value = dst_reg->u32_min_value;
dst_reg->s32_max_value = dst_reg->u32_max_value;
+ } else {
+ dst_reg->s32_min_value = S32_MIN;
+ dst_reg->s32_max_value = S32_MAX;
}
}
@@ -7833,7 +15175,6 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg,
{
bool src_known = tnum_is_const(src_reg->var_off);
bool dst_known = tnum_is_const(dst_reg->var_off);
- s64 smin_val = src_reg->smin_value;
u64 umin_val = src_reg->umin_value;
if (src_known && dst_known) {
@@ -7846,18 +15187,16 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg,
*/
dst_reg->umin_value = max(dst_reg->umin_value, umin_val);
dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask;
- if (dst_reg->smin_value < 0 || smin_val < 0) {
- /* Lose signed bounds when ORing negative numbers,
- * ain't nobody got time for that.
- */
- dst_reg->smin_value = S64_MIN;
- dst_reg->smax_value = S64_MAX;
- } else {
- /* ORing two positives gives a positive, so safe to
- * cast result into s64.
- */
+
+ /* Safe to set s64 bounds by casting u64 result into s64 when u64
+ * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded.
+ */
+ if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) {
dst_reg->smin_value = dst_reg->umin_value;
dst_reg->smax_value = dst_reg->umax_value;
+ } else {
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
}
/* We may learn something more from the var_off */
__update_reg_bounds(dst_reg);
@@ -7869,7 +15208,6 @@ static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg,
bool src_known = tnum_subreg_is_const(src_reg->var_off);
bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
struct tnum var32_off = tnum_subreg(dst_reg->var_off);
- s32 smin_val = src_reg->s32_min_value;
if (src_known && dst_known) {
__mark_reg32_known(dst_reg, var32_off.value);
@@ -7880,10 +15218,10 @@ static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg,
dst_reg->u32_min_value = var32_off.value;
dst_reg->u32_max_value = var32_off.value | var32_off.mask;
- if (dst_reg->s32_min_value >= 0 && smin_val >= 0) {
- /* XORing two positive sign numbers gives a positive,
- * so safe to cast u32 result into s32.
- */
+ /* Safe to set s32 bounds by casting u32 result into s32 when u32
+ * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded.
+ */
+ if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) {
dst_reg->s32_min_value = dst_reg->u32_min_value;
dst_reg->s32_max_value = dst_reg->u32_max_value;
} else {
@@ -7897,7 +15235,6 @@ static void scalar_min_max_xor(struct bpf_reg_state *dst_reg,
{
bool src_known = tnum_is_const(src_reg->var_off);
bool dst_known = tnum_is_const(dst_reg->var_off);
- s64 smin_val = src_reg->smin_value;
if (src_known && dst_known) {
/* dst_reg->var_off.value has been updated earlier */
@@ -7909,10 +15246,10 @@ static void scalar_min_max_xor(struct bpf_reg_state *dst_reg,
dst_reg->umin_value = dst_reg->var_off.value;
dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask;
- if (dst_reg->smin_value >= 0 && smin_val >= 0) {
- /* XORing two positive sign numbers gives a positive,
- * so safe to cast u64 result into s64.
- */
+ /* Safe to set s64 bounds by casting u64 result into s64 when u64
+ * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded.
+ */
+ if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) {
dst_reg->smin_value = dst_reg->umin_value;
dst_reg->smax_value = dst_reg->umax_value;
} else {
@@ -8120,6 +15457,47 @@ static void scalar_min_max_arsh(struct bpf_reg_state *dst_reg,
__update_reg_bounds(dst_reg);
}
+static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn,
+ const struct bpf_reg_state *src_reg)
+{
+ bool src_is_const = false;
+ u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32;
+
+ if (insn_bitness == 32) {
+ if (tnum_subreg_is_const(src_reg->var_off)
+ && src_reg->s32_min_value == src_reg->s32_max_value
+ && src_reg->u32_min_value == src_reg->u32_max_value)
+ src_is_const = true;
+ } else {
+ if (tnum_is_const(src_reg->var_off)
+ && src_reg->smin_value == src_reg->smax_value
+ && src_reg->umin_value == src_reg->umax_value)
+ src_is_const = true;
+ }
+
+ switch (BPF_OP(insn->code)) {
+ case BPF_ADD:
+ case BPF_SUB:
+ case BPF_NEG:
+ case BPF_AND:
+ case BPF_XOR:
+ case BPF_OR:
+ case BPF_MUL:
+ return true;
+
+ /* Shift operators range is only computable if shift dimension operand
+ * is a constant. Shifts greater than 31 or 63 are undefined. This
+ * includes shifts by a negative number.
+ */
+ case BPF_LSH:
+ case BPF_RSH:
+ case BPF_ARSH:
+ return (src_is_const && src_reg->umax_value < insn_bitness);
+ default:
+ return false;
+ }
+}
+
/* WARNING: This function does calculations on 64-bit values, but the actual
* execution may occur on 32-bit values. Therefore, things like bitshifts
* need extra checks in the 32-bit case.
@@ -8129,53 +15507,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
struct bpf_reg_state *dst_reg,
struct bpf_reg_state src_reg)
{
- struct bpf_reg_state *regs = cur_regs(env);
u8 opcode = BPF_OP(insn->code);
- bool src_known;
- s64 smin_val, smax_val;
- u64 umin_val, umax_val;
- s32 s32_min_val, s32_max_val;
- u32 u32_min_val, u32_max_val;
- u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32;
bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64);
int ret;
- smin_val = src_reg.smin_value;
- smax_val = src_reg.smax_value;
- umin_val = src_reg.umin_value;
- umax_val = src_reg.umax_value;
-
- s32_min_val = src_reg.s32_min_value;
- s32_max_val = src_reg.s32_max_value;
- u32_min_val = src_reg.u32_min_value;
- u32_max_val = src_reg.u32_max_value;
-
- if (alu32) {
- src_known = tnum_subreg_is_const(src_reg.var_off);
- if ((src_known &&
- (s32_min_val != s32_max_val || u32_min_val != u32_max_val)) ||
- s32_min_val > s32_max_val || u32_min_val > u32_max_val) {
- /* Taint dst register if offset had invalid bounds
- * derived from e.g. dead branches.
- */
- __mark_reg_unknown(env, dst_reg);
- return 0;
- }
- } else {
- src_known = tnum_is_const(src_reg.var_off);
- if ((src_known &&
- (smin_val != smax_val || umin_val != umax_val)) ||
- smin_val > smax_val || umin_val > umax_val) {
- /* Taint dst register if offset had invalid bounds
- * derived from e.g. dead branches.
- */
- __mark_reg_unknown(env, dst_reg);
- return 0;
- }
- }
-
- if (!src_known &&
- opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) {
+ if (!is_safe_to_compute_dst_reg_range(insn, &src_reg)) {
__mark_reg_unknown(env, dst_reg);
return 0;
}
@@ -8211,6 +15547,13 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
scalar_min_max_sub(dst_reg, &src_reg);
dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off);
break;
+ case BPF_NEG:
+ env->fake_reg[0] = *dst_reg;
+ __mark_reg_known(dst_reg, 0);
+ scalar32_min_max_sub(dst_reg, &env->fake_reg[0]);
+ scalar_min_max_sub(dst_reg, &env->fake_reg[0]);
+ dst_reg->var_off = tnum_neg(env->fake_reg[0].var_off);
+ break;
case BPF_MUL:
dst_reg->var_off = tnum_mul(dst_reg->var_off, src_reg.var_off);
scalar32_min_max_mul(dst_reg, &src_reg);
@@ -8232,56 +15575,31 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
scalar_min_max_xor(dst_reg, &src_reg);
break;
case BPF_LSH:
- if (umax_val >= insn_bitness) {
- /* Shifts greater than 31 or 63 are undefined.
- * This includes shifts by a negative number.
- */
- mark_reg_unknown(env, regs, insn->dst_reg);
- break;
- }
if (alu32)
scalar32_min_max_lsh(dst_reg, &src_reg);
else
scalar_min_max_lsh(dst_reg, &src_reg);
break;
case BPF_RSH:
- if (umax_val >= insn_bitness) {
- /* Shifts greater than 31 or 63 are undefined.
- * This includes shifts by a negative number.
- */
- mark_reg_unknown(env, regs, insn->dst_reg);
- break;
- }
if (alu32)
scalar32_min_max_rsh(dst_reg, &src_reg);
else
scalar_min_max_rsh(dst_reg, &src_reg);
break;
case BPF_ARSH:
- if (umax_val >= insn_bitness) {
- /* Shifts greater than 31 or 63 are undefined.
- * This includes shifts by a negative number.
- */
- mark_reg_unknown(env, regs, insn->dst_reg);
- break;
- }
if (alu32)
scalar32_min_max_arsh(dst_reg, &src_reg);
else
scalar_min_max_arsh(dst_reg, &src_reg);
break;
default:
- mark_reg_unknown(env, regs, insn->dst_reg);
break;
}
/* ALU32 ops are zero extended into 64bit register */
if (alu32)
zext_32_to_64(dst_reg);
-
- __update_reg_bounds(dst_reg);
- __reg_deduce_bounds(dst_reg);
- __reg_bound_offset(dst_reg);
+ reg_bounds_sync(dst_reg);
return 0;
}
@@ -8295,18 +15613,30 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
struct bpf_func_state *state = vstate->frame[vstate->curframe];
struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg;
struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
+ bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64);
u8 opcode = BPF_OP(insn->code);
int err;
dst_reg = &regs[insn->dst_reg];
src_reg = NULL;
+
+ if (dst_reg->type == PTR_TO_ARENA) {
+ struct bpf_insn_aux_data *aux = cur_aux(env);
+
+ if (BPF_CLASS(insn->code) == BPF_ALU64)
+ /*
+ * 32-bit operations zero upper bits automatically.
+ * 64-bit operations need to be converted to 32.
+ */
+ aux->needs_zext = true;
+
+ /* Any arithmetic operations are allowed on arena pointers */
+ return 0;
+ }
+
if (dst_reg->type != SCALAR_VALUE)
ptr_reg = dst_reg;
- else
- /* Make sure ID is cleared otherwise dst_reg min/max could be
- * incorrectly propagated into other registers by find_equal_scalars()
- */
- dst_reg->id = 0;
+
if (BPF_SRC(insn->code) == BPF_X) {
src_reg = &regs[insn->src_reg];
if (src_reg->type != SCALAR_VALUE) {
@@ -8341,6 +15671,11 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
return err;
return adjust_ptr_min_max_vals(env, insn,
dst_reg, src_reg);
+ } else if (dst_reg->precise) {
+ /* if dst_reg is precise, src_reg should be precise as well */
+ err = mark_chain_precision(env, insn->src_reg);
+ if (err)
+ return err;
}
} else {
/* Pretend the src is a reg with a known value, since we only
@@ -8356,16 +15691,53 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
/* Got here implies adding two SCALAR_VALUEs */
if (WARN_ON_ONCE(ptr_reg)) {
- print_verifier_state(env, state, true);
+ print_verifier_state(env, vstate, vstate->curframe, true);
verbose(env, "verifier internal error: unexpected ptr_reg\n");
- return -EINVAL;
+ return -EFAULT;
}
if (WARN_ON(!src_reg)) {
- print_verifier_state(env, state, true);
+ print_verifier_state(env, vstate, vstate->curframe, true);
verbose(env, "verifier internal error: no src_reg\n");
- return -EINVAL;
+ return -EFAULT;
}
- return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
+ err = adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
+ if (err)
+ return err;
+ /*
+ * Compilers can generate the code
+ * r1 = r2
+ * r1 += 0x1
+ * if r2 < 1000 goto ...
+ * use r1 in memory access
+ * So for 64-bit alu remember constant delta between r2 and r1 and
+ * update r1 after 'if' condition.
+ */
+ if (env->bpf_capable &&
+ BPF_OP(insn->code) == BPF_ADD && !alu32 &&
+ dst_reg->id && is_reg_const(src_reg, false)) {
+ u64 val = reg_const_value(src_reg, false);
+
+ if ((dst_reg->id & BPF_ADD_CONST) ||
+ /* prevent overflow in sync_linked_regs() later */
+ val > (u32)S32_MAX) {
+ /*
+ * If the register already went through rX += val
+ * we cannot accumulate another val into rx->off.
+ */
+ dst_reg->off = 0;
+ dst_reg->id = 0;
+ } else {
+ dst_reg->id |= BPF_ADD_CONST;
+ dst_reg->off = val;
+ }
+ } else {
+ /*
+ * Make sure ID is cleared otherwise dst_reg min/max could be
+ * incorrectly propagated into other registers by sync_linked_regs()
+ */
+ dst_reg->id = 0;
+ }
+ return 0;
}
/* check validity of 32-bit and 64-bit arithmetic operations */
@@ -8377,7 +15749,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (opcode == BPF_END || opcode == BPF_NEG) {
if (opcode == BPF_NEG) {
- if (BPF_SRC(insn->code) != 0 ||
+ if (BPF_SRC(insn->code) != BPF_K ||
insn->src_reg != BPF_REG_0 ||
insn->off != 0 || insn->imm != 0) {
verbose(env, "BPF_NEG uses reserved fields\n");
@@ -8386,7 +15758,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
} else {
if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
(insn->imm != 16 && insn->imm != 32 && insn->imm != 64) ||
- BPF_CLASS(insn->code) == BPF_ALU64) {
+ (BPF_CLASS(insn->code) == BPF_ALU64 &&
+ BPF_SRC(insn->code) != BPF_TO_LE)) {
verbose(env, "BPF_END uses reserved fields\n");
return -EINVAL;
}
@@ -8404,16 +15777,42 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check dest operand */
- err = check_reg_arg(env, insn->dst_reg, DST_OP);
+ if (opcode == BPF_NEG &&
+ regs[insn->dst_reg].type == SCALAR_VALUE) {
+ err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
+ err = err ?: adjust_scalar_min_max_vals(env, insn,
+ &regs[insn->dst_reg],
+ regs[insn->dst_reg]);
+ } else {
+ err = check_reg_arg(env, insn->dst_reg, DST_OP);
+ }
if (err)
return err;
} else if (opcode == BPF_MOV) {
if (BPF_SRC(insn->code) == BPF_X) {
- if (insn->imm != 0 || insn->off != 0) {
- verbose(env, "BPF_MOV uses reserved fields\n");
- return -EINVAL;
+ if (BPF_CLASS(insn->code) == BPF_ALU) {
+ if ((insn->off != 0 && insn->off != 8 && insn->off != 16) ||
+ insn->imm) {
+ verbose(env, "BPF_MOV uses reserved fields\n");
+ return -EINVAL;
+ }
+ } else if (insn->off == BPF_ADDR_SPACE_CAST) {
+ if (insn->imm != 1 && insn->imm != 1u << 16) {
+ verbose(env, "addr_space_cast insn can only convert between address space 1 and 0\n");
+ return -EINVAL;
+ }
+ if (!env->prog->aux->arena) {
+ verbose(env, "addr_space_cast insn can only be used in a program that has an associated arena\n");
+ return -EINVAL;
+ }
+ } else {
+ if ((insn->off != 0 && insn->off != 8 && insn->off != 16 &&
+ insn->off != 32) || insn->imm) {
+ verbose(env, "BPF_MOV uses reserved fields\n");
+ return -EINVAL;
+ }
}
/* check src operand */
@@ -8437,18 +15836,43 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
struct bpf_reg_state *dst_reg = regs + insn->dst_reg;
if (BPF_CLASS(insn->code) == BPF_ALU64) {
- /* case: R1 = R2
- * copy register state to dest reg
- */
- if (src_reg->type == SCALAR_VALUE && !src_reg->id)
- /* Assign src and dst registers the same ID
- * that will be used by find_equal_scalars()
- * to propagate min/max range.
+ if (insn->imm) {
+ /* off == BPF_ADDR_SPACE_CAST */
+ mark_reg_unknown(env, regs, insn->dst_reg);
+ if (insn->imm == 1) { /* cast from as(1) to as(0) */
+ dst_reg->type = PTR_TO_ARENA;
+ /* PTR_TO_ARENA is 32-bit */
+ dst_reg->subreg_def = env->insn_idx + 1;
+ }
+ } else if (insn->off == 0) {
+ /* case: R1 = R2
+ * copy register state to dest reg
*/
- src_reg->id = ++env->id_gen;
- *dst_reg = *src_reg;
- dst_reg->live |= REG_LIVE_WRITTEN;
- dst_reg->subreg_def = DEF_NOT_SUBREG;
+ assign_scalar_id_before_mov(env, src_reg);
+ copy_register_state(dst_reg, src_reg);
+ dst_reg->subreg_def = DEF_NOT_SUBREG;
+ } else {
+ /* case: R1 = (s8, s16 s32)R2 */
+ if (is_pointer_value(env, insn->src_reg)) {
+ verbose(env,
+ "R%d sign-extension part of pointer\n",
+ insn->src_reg);
+ return -EACCES;
+ } else if (src_reg->type == SCALAR_VALUE) {
+ bool no_sext;
+
+ no_sext = src_reg->umax_value < (1ULL << (insn->off - 1));
+ if (no_sext)
+ assign_scalar_id_before_mov(env, src_reg);
+ copy_register_state(dst_reg, src_reg);
+ if (!no_sext)
+ dst_reg->id = 0;
+ coerce_reg_to_size_sx(dst_reg, insn->off >> 3);
+ dst_reg->subreg_def = DEF_NOT_SUBREG;
+ } else {
+ mark_reg_unknown(env, regs, insn->dst_reg);
+ }
+ }
} else {
/* R1 = (u32) R2 */
if (is_pointer_value(env, insn->src_reg)) {
@@ -8457,23 +15881,37 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
insn->src_reg);
return -EACCES;
} else if (src_reg->type == SCALAR_VALUE) {
- *dst_reg = *src_reg;
- /* Make sure ID is cleared otherwise
- * dst_reg min/max could be incorrectly
- * propagated into src_reg by find_equal_scalars()
- */
- dst_reg->id = 0;
- dst_reg->live |= REG_LIVE_WRITTEN;
- dst_reg->subreg_def = env->insn_idx + 1;
+ if (insn->off == 0) {
+ bool is_src_reg_u32 = get_reg_width(src_reg) <= 32;
+
+ if (is_src_reg_u32)
+ assign_scalar_id_before_mov(env, src_reg);
+ copy_register_state(dst_reg, src_reg);
+ /* Make sure ID is cleared if src_reg is not in u32
+ * range otherwise dst_reg min/max could be incorrectly
+ * propagated into src_reg by sync_linked_regs()
+ */
+ if (!is_src_reg_u32)
+ dst_reg->id = 0;
+ dst_reg->subreg_def = env->insn_idx + 1;
+ } else {
+ /* case: W1 = (s8, s16)W2 */
+ bool no_sext = src_reg->umax_value < (1ULL << (insn->off - 1));
+
+ if (no_sext)
+ assign_scalar_id_before_mov(env, src_reg);
+ copy_register_state(dst_reg, src_reg);
+ if (!no_sext)
+ dst_reg->id = 0;
+ dst_reg->subreg_def = env->insn_idx + 1;
+ coerce_subreg_to_size_sx(dst_reg, insn->off >> 3);
+ }
} else {
mark_reg_unknown(env, regs,
insn->dst_reg);
}
zext_32_to_64(dst_reg);
-
- __update_reg_bounds(dst_reg);
- __reg_deduce_bounds(dst_reg);
- __reg_bound_offset(dst_reg);
+ reg_bounds_sync(dst_reg);
}
} else {
/* case: R = imm
@@ -8498,7 +15936,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
} else { /* all other ALU ops: and, sub, xor, add, ... */
if (BPF_SRC(insn->code) == BPF_X) {
- if (insn->imm != 0 || insn->off != 0) {
+ if (insn->imm != 0 || (insn->off != 0 && insn->off != 1) ||
+ (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) {
verbose(env, "BPF_ALU uses reserved fields\n");
return -EINVAL;
}
@@ -8507,7 +15946,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (err)
return err;
} else {
- if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
+ if (insn->src_reg != BPF_REG_0 || (insn->off != 0 && insn->off != 1) ||
+ (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) {
verbose(env, "BPF_ALU uses reserved fields\n");
return -EINVAL;
}
@@ -8536,35 +15976,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
/* check dest operand */
err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
+ err = err ?: adjust_reg_min_max_vals(env, insn);
if (err)
return err;
-
- return adjust_reg_min_max_vals(env, insn);
}
- return 0;
-}
-
-static void __find_good_pkt_pointers(struct bpf_func_state *state,
- struct bpf_reg_state *dst_reg,
- enum bpf_reg_type type, int new_range)
-{
- struct bpf_reg_state *reg;
- int i;
-
- for (i = 0; i < MAX_BPF_REG; i++) {
- reg = &state->regs[i];
- if (reg->type == type && reg->id == dst_reg->id)
- /* keep the maximum range already checked */
- reg->range = max(reg->range, new_range);
- }
-
- bpf_for_each_spilled_reg(i, state, reg) {
- if (!reg)
- continue;
- if (reg->type == type && reg->id == dst_reg->id)
- reg->range = max(reg->range, new_range);
- }
+ return reg_bounds_sanity_check(env, &regs[insn->dst_reg], "alu");
}
static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
@@ -8572,7 +15989,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
enum bpf_reg_type type,
bool range_right_open)
{
- int new_range, i;
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
+ int new_range;
if (dst_reg->off < 0 ||
(dst_reg->off == 0 && range_right_open))
@@ -8637,150 +16056,165 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
* the range won't allow anything.
* dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
*/
- for (i = 0; i <= vstate->curframe; i++)
- __find_good_pkt_pointers(vstate->frame[i], dst_reg, type,
- new_range);
+ bpf_for_each_reg_in_vstate(vstate, state, reg, ({
+ if (reg->type == type && reg->id == dst_reg->id)
+ /* keep the maximum range already checked */
+ reg->range = max(reg->range, new_range);
+ }));
}
-static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode)
-{
- struct tnum subreg = tnum_subreg(reg->var_off);
- s32 sval = (s32)val;
+/*
+ * <reg1> <op> <reg2>, currently assuming reg2 is a constant
+ */
+static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2,
+ u8 opcode, bool is_jmp32)
+{
+ struct tnum t1 = is_jmp32 ? tnum_subreg(reg1->var_off) : reg1->var_off;
+ struct tnum t2 = is_jmp32 ? tnum_subreg(reg2->var_off) : reg2->var_off;
+ u64 umin1 = is_jmp32 ? (u64)reg1->u32_min_value : reg1->umin_value;
+ u64 umax1 = is_jmp32 ? (u64)reg1->u32_max_value : reg1->umax_value;
+ s64 smin1 = is_jmp32 ? (s64)reg1->s32_min_value : reg1->smin_value;
+ s64 smax1 = is_jmp32 ? (s64)reg1->s32_max_value : reg1->smax_value;
+ u64 umin2 = is_jmp32 ? (u64)reg2->u32_min_value : reg2->umin_value;
+ u64 umax2 = is_jmp32 ? (u64)reg2->u32_max_value : reg2->umax_value;
+ s64 smin2 = is_jmp32 ? (s64)reg2->s32_min_value : reg2->smin_value;
+ s64 smax2 = is_jmp32 ? (s64)reg2->s32_max_value : reg2->smax_value;
+
+ if (reg1 == reg2) {
+ switch (opcode) {
+ case BPF_JGE:
+ case BPF_JLE:
+ case BPF_JSGE:
+ case BPF_JSLE:
+ case BPF_JEQ:
+ return 1;
+ case BPF_JGT:
+ case BPF_JLT:
+ case BPF_JSGT:
+ case BPF_JSLT:
+ case BPF_JNE:
+ return 0;
+ case BPF_JSET:
+ if (tnum_is_const(t1))
+ return t1.value != 0;
+ else
+ return (smin1 <= 0 && smax1 >= 0) ? -1 : 1;
+ default:
+ return -1;
+ }
+ }
switch (opcode) {
case BPF_JEQ:
- if (tnum_is_const(subreg))
- return !!tnum_equals_const(subreg, val);
- break;
- case BPF_JNE:
- if (tnum_is_const(subreg))
- return !tnum_equals_const(subreg, val);
- break;
- case BPF_JSET:
- if ((~subreg.mask & subreg.value) & val)
- return 1;
- if (!((subreg.mask | subreg.value) & val))
- return 0;
- break;
- case BPF_JGT:
- if (reg->u32_min_value > val)
- return 1;
- else if (reg->u32_max_value <= val)
- return 0;
- break;
- case BPF_JSGT:
- if (reg->s32_min_value > sval)
- return 1;
- else if (reg->s32_max_value <= sval)
- return 0;
- break;
- case BPF_JLT:
- if (reg->u32_max_value < val)
- return 1;
- else if (reg->u32_min_value >= val)
+ /* constants, umin/umax and smin/smax checks would be
+ * redundant in this case because they all should match
+ */
+ if (tnum_is_const(t1) && tnum_is_const(t2))
+ return t1.value == t2.value;
+ if (!tnum_overlap(t1, t2))
return 0;
- break;
- case BPF_JSLT:
- if (reg->s32_max_value < sval)
- return 1;
- else if (reg->s32_min_value >= sval)
+ /* non-overlapping ranges */
+ if (umin1 > umax2 || umax1 < umin2)
return 0;
- break;
- case BPF_JGE:
- if (reg->u32_min_value >= val)
- return 1;
- else if (reg->u32_max_value < val)
+ if (smin1 > smax2 || smax1 < smin2)
return 0;
+ if (!is_jmp32) {
+ /* if 64-bit ranges are inconclusive, see if we can
+ * utilize 32-bit subrange knowledge to eliminate
+ * branches that can't be taken a priori
+ */
+ if (reg1->u32_min_value > reg2->u32_max_value ||
+ reg1->u32_max_value < reg2->u32_min_value)
+ return 0;
+ if (reg1->s32_min_value > reg2->s32_max_value ||
+ reg1->s32_max_value < reg2->s32_min_value)
+ return 0;
+ }
break;
- case BPF_JSGE:
- if (reg->s32_min_value >= sval)
+ case BPF_JNE:
+ /* constants, umin/umax and smin/smax checks would be
+ * redundant in this case because they all should match
+ */
+ if (tnum_is_const(t1) && tnum_is_const(t2))
+ return t1.value != t2.value;
+ if (!tnum_overlap(t1, t2))
return 1;
- else if (reg->s32_max_value < sval)
- return 0;
- break;
- case BPF_JLE:
- if (reg->u32_max_value <= val)
+ /* non-overlapping ranges */
+ if (umin1 > umax2 || umax1 < umin2)
return 1;
- else if (reg->u32_min_value > val)
- return 0;
- break;
- case BPF_JSLE:
- if (reg->s32_max_value <= sval)
+ if (smin1 > smax2 || smax1 < smin2)
return 1;
- else if (reg->s32_min_value > sval)
- return 0;
- break;
- }
-
- return -1;
-}
-
-
-static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
-{
- s64 sval = (s64)val;
-
- switch (opcode) {
- case BPF_JEQ:
- if (tnum_is_const(reg->var_off))
- return !!tnum_equals_const(reg->var_off, val);
- break;
- case BPF_JNE:
- if (tnum_is_const(reg->var_off))
- return !tnum_equals_const(reg->var_off, val);
+ if (!is_jmp32) {
+ /* if 64-bit ranges are inconclusive, see if we can
+ * utilize 32-bit subrange knowledge to eliminate
+ * branches that can't be taken a priori
+ */
+ if (reg1->u32_min_value > reg2->u32_max_value ||
+ reg1->u32_max_value < reg2->u32_min_value)
+ return 1;
+ if (reg1->s32_min_value > reg2->s32_max_value ||
+ reg1->s32_max_value < reg2->s32_min_value)
+ return 1;
+ }
break;
case BPF_JSET:
- if ((~reg->var_off.mask & reg->var_off.value) & val)
+ if (!is_reg_const(reg2, is_jmp32)) {
+ swap(reg1, reg2);
+ swap(t1, t2);
+ }
+ if (!is_reg_const(reg2, is_jmp32))
+ return -1;
+ if ((~t1.mask & t1.value) & t2.value)
return 1;
- if (!((reg->var_off.mask | reg->var_off.value) & val))
+ if (!((t1.mask | t1.value) & t2.value))
return 0;
break;
case BPF_JGT:
- if (reg->umin_value > val)
+ if (umin1 > umax2)
return 1;
- else if (reg->umax_value <= val)
+ else if (umax1 <= umin2)
return 0;
break;
case BPF_JSGT:
- if (reg->smin_value > sval)
+ if (smin1 > smax2)
return 1;
- else if (reg->smax_value <= sval)
+ else if (smax1 <= smin2)
return 0;
break;
case BPF_JLT:
- if (reg->umax_value < val)
+ if (umax1 < umin2)
return 1;
- else if (reg->umin_value >= val)
+ else if (umin1 >= umax2)
return 0;
break;
case BPF_JSLT:
- if (reg->smax_value < sval)
+ if (smax1 < smin2)
return 1;
- else if (reg->smin_value >= sval)
+ else if (smin1 >= smax2)
return 0;
break;
case BPF_JGE:
- if (reg->umin_value >= val)
+ if (umin1 >= umax2)
return 1;
- else if (reg->umax_value < val)
+ else if (umax1 < umin2)
return 0;
break;
case BPF_JSGE:
- if (reg->smin_value >= sval)
+ if (smin1 >= smax2)
return 1;
- else if (reg->smax_value < sval)
+ else if (smax1 < smin2)
return 0;
break;
case BPF_JLE:
- if (reg->umax_value <= val)
+ if (umax1 <= umin2)
return 1;
- else if (reg->umin_value > val)
+ else if (umin1 > umax2)
return 0;
break;
case BPF_JSLE:
- if (reg->smax_value <= sval)
+ if (smax1 <= smin2)
return 1;
- else if (reg->smin_value > sval)
+ else if (smin1 > smax2)
return 0;
break;
}
@@ -8788,41 +16222,6 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
return -1;
}
-/* compute branch direction of the expression "if (reg opcode val) goto target;"
- * and return:
- * 1 - branch will be taken and "goto target" will be executed
- * 0 - branch will not be taken and fall-through to next insn
- * -1 - unknown. Example: "if (reg < 5)" is unknown when register value
- * range [0,10]
- */
-static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode,
- bool is_jmp32)
-{
- if (__is_pointer_value(false, reg)) {
- if (!reg_type_not_null(reg->type))
- return -1;
-
- /* If pointer is valid tests against zero will fail so we can
- * use this to direct branch taken.
- */
- if (val != 0)
- return -1;
-
- switch (opcode) {
- case BPF_JEQ:
- return 0;
- case BPF_JNE:
- return 1;
- default:
- return -1;
- }
- }
-
- if (is_jmp32)
- return is_branch32_taken(reg, val, opcode);
- return is_branch64_taken(reg, val, opcode);
-}
-
static int flip_opcode(u32 opcode)
{
/* How can we transform "a <op> b" into "b <op> a"? */
@@ -8884,222 +16283,295 @@ static int is_pkt_ptr_branch_taken(struct bpf_reg_state *dst_reg,
return -1;
}
-/* Adjusts the register min/max values in the case that the dst_reg is the
- * variable register that we are working on, and src_reg is a constant or we're
- * simply doing a BPF_K check.
- * In JEQ/JNE cases we also adjust the var_off values.
+/* compute branch direction of the expression "if (<reg1> opcode <reg2>) goto target;"
+ * and return:
+ * 1 - branch will be taken and "goto target" will be executed
+ * 0 - branch will not be taken and fall-through to next insn
+ * -1 - unknown. Example: "if (reg1 < 5)" is unknown when register value
+ * range [0,10]
*/
-static void reg_set_min_max(struct bpf_reg_state *true_reg,
- struct bpf_reg_state *false_reg,
- u64 val, u32 val32,
- u8 opcode, bool is_jmp32)
-{
- struct tnum false_32off = tnum_subreg(false_reg->var_off);
- struct tnum false_64off = false_reg->var_off;
- struct tnum true_32off = tnum_subreg(true_reg->var_off);
- struct tnum true_64off = true_reg->var_off;
- s64 sval = (s64)val;
- s32 sval32 = (s32)val32;
-
- /* If the dst_reg is a pointer, we can't learn anything about its
- * variable offset from the compare (unless src_reg were a pointer into
- * the same object, but we don't bother with that.
- * Since false_reg and true_reg have the same type by construction, we
- * only need to check one of them for pointerness.
- */
- if (__is_pointer_value(false, false_reg))
- return;
+static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2,
+ u8 opcode, bool is_jmp32)
+{
+ if (reg_is_pkt_pointer_any(reg1) && reg_is_pkt_pointer_any(reg2) && !is_jmp32)
+ return is_pkt_ptr_branch_taken(reg1, reg2, opcode);
+
+ if (__is_pointer_value(false, reg1) || __is_pointer_value(false, reg2)) {
+ u64 val;
+
+ /* arrange that reg2 is a scalar, and reg1 is a pointer */
+ if (!is_reg_const(reg2, is_jmp32)) {
+ opcode = flip_opcode(opcode);
+ swap(reg1, reg2);
+ }
+ /* and ensure that reg2 is a constant */
+ if (!is_reg_const(reg2, is_jmp32))
+ return -1;
+
+ if (!reg_not_null(reg1))
+ return -1;
+
+ /* If pointer is valid tests against zero will fail so we can
+ * use this to direct branch taken.
+ */
+ val = reg_const_value(reg2, is_jmp32);
+ if (val != 0)
+ return -1;
+
+ switch (opcode) {
+ case BPF_JEQ:
+ return 0;
+ case BPF_JNE:
+ return 1;
+ default:
+ return -1;
+ }
+ }
+
+ /* now deal with two scalars, but not necessarily constants */
+ return is_scalar_branch_taken(reg1, reg2, opcode, is_jmp32);
+}
+
+/* Opcode that corresponds to a *false* branch condition.
+ * E.g., if r1 < r2, then reverse (false) condition is r1 >= r2
+ */
+static u8 rev_opcode(u8 opcode)
+{
+ switch (opcode) {
+ case BPF_JEQ: return BPF_JNE;
+ case BPF_JNE: return BPF_JEQ;
+ /* JSET doesn't have it's reverse opcode in BPF, so add
+ * BPF_X flag to denote the reverse of that operation
+ */
+ case BPF_JSET: return BPF_JSET | BPF_X;
+ case BPF_JSET | BPF_X: return BPF_JSET;
+ case BPF_JGE: return BPF_JLT;
+ case BPF_JGT: return BPF_JLE;
+ case BPF_JLE: return BPF_JGT;
+ case BPF_JLT: return BPF_JGE;
+ case BPF_JSGE: return BPF_JSLT;
+ case BPF_JSGT: return BPF_JSLE;
+ case BPF_JSLE: return BPF_JSGT;
+ case BPF_JSLT: return BPF_JSGE;
+ default: return 0;
+ }
+}
+
+/* Refine range knowledge for <reg1> <op> <reg>2 conditional operation. */
+static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2,
+ u8 opcode, bool is_jmp32)
+{
+ struct tnum t;
+ u64 val;
+
+ /* In case of GE/GT/SGE/JST, reuse LE/LT/SLE/SLT logic from below */
+ switch (opcode) {
+ case BPF_JGE:
+ case BPF_JGT:
+ case BPF_JSGE:
+ case BPF_JSGT:
+ opcode = flip_opcode(opcode);
+ swap(reg1, reg2);
+ break;
+ default:
+ break;
+ }
switch (opcode) {
case BPF_JEQ:
+ if (is_jmp32) {
+ reg1->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value);
+ reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value);
+ reg1->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value);
+ reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value);
+ reg2->u32_min_value = reg1->u32_min_value;
+ reg2->u32_max_value = reg1->u32_max_value;
+ reg2->s32_min_value = reg1->s32_min_value;
+ reg2->s32_max_value = reg1->s32_max_value;
+
+ t = tnum_intersect(tnum_subreg(reg1->var_off), tnum_subreg(reg2->var_off));
+ reg1->var_off = tnum_with_subreg(reg1->var_off, t);
+ reg2->var_off = tnum_with_subreg(reg2->var_off, t);
+ } else {
+ reg1->umin_value = max(reg1->umin_value, reg2->umin_value);
+ reg1->umax_value = min(reg1->umax_value, reg2->umax_value);
+ reg1->smin_value = max(reg1->smin_value, reg2->smin_value);
+ reg1->smax_value = min(reg1->smax_value, reg2->smax_value);
+ reg2->umin_value = reg1->umin_value;
+ reg2->umax_value = reg1->umax_value;
+ reg2->smin_value = reg1->smin_value;
+ reg2->smax_value = reg1->smax_value;
+
+ reg1->var_off = tnum_intersect(reg1->var_off, reg2->var_off);
+ reg2->var_off = reg1->var_off;
+ }
+ break;
case BPF_JNE:
- {
- struct bpf_reg_state *reg =
- opcode == BPF_JEQ ? true_reg : false_reg;
-
- /* JEQ/JNE comparison doesn't change the register equivalence.
- * r1 = r2;
- * if (r1 == 42) goto label;
- * ...
- * label: // here both r1 and r2 are known to be 42.
- *
- * Hence when marking register as known preserve it's ID.
+ if (!is_reg_const(reg2, is_jmp32))
+ swap(reg1, reg2);
+ if (!is_reg_const(reg2, is_jmp32))
+ break;
+
+ /* try to recompute the bound of reg1 if reg2 is a const and
+ * is exactly the edge of reg1.
*/
- if (is_jmp32)
- __mark_reg32_known(reg, val32);
- else
- ___mark_reg_known(reg, val);
+ val = reg_const_value(reg2, is_jmp32);
+ if (is_jmp32) {
+ /* u32_min_value is not equal to 0xffffffff at this point,
+ * because otherwise u32_max_value is 0xffffffff as well,
+ * in such a case both reg1 and reg2 would be constants,
+ * jump would be predicted and reg_set_min_max() won't
+ * be called.
+ *
+ * Same reasoning works for all {u,s}{min,max}{32,64} cases
+ * below.
+ */
+ if (reg1->u32_min_value == (u32)val)
+ reg1->u32_min_value++;
+ if (reg1->u32_max_value == (u32)val)
+ reg1->u32_max_value--;
+ if (reg1->s32_min_value == (s32)val)
+ reg1->s32_min_value++;
+ if (reg1->s32_max_value == (s32)val)
+ reg1->s32_max_value--;
+ } else {
+ if (reg1->umin_value == (u64)val)
+ reg1->umin_value++;
+ if (reg1->umax_value == (u64)val)
+ reg1->umax_value--;
+ if (reg1->smin_value == (s64)val)
+ reg1->smin_value++;
+ if (reg1->smax_value == (s64)val)
+ reg1->smax_value--;
+ }
break;
- }
case BPF_JSET:
+ if (!is_reg_const(reg2, is_jmp32))
+ swap(reg1, reg2);
+ if (!is_reg_const(reg2, is_jmp32))
+ break;
+ val = reg_const_value(reg2, is_jmp32);
+ /* BPF_JSET (i.e., TRUE branch, *not* BPF_JSET | BPF_X)
+ * requires single bit to learn something useful. E.g., if we
+ * know that `r1 & 0x3` is true, then which bits (0, 1, or both)
+ * are actually set? We can learn something definite only if
+ * it's a single-bit value to begin with.
+ *
+ * BPF_JSET | BPF_X (i.e., negation of BPF_JSET) doesn't have
+ * this restriction. I.e., !(r1 & 0x3) means neither bit 0 nor
+ * bit 1 is set, which we can readily use in adjustments.
+ */
+ if (!is_power_of_2(val))
+ break;
if (is_jmp32) {
- false_32off = tnum_and(false_32off, tnum_const(~val32));
- if (is_power_of_2(val32))
- true_32off = tnum_or(true_32off,
- tnum_const(val32));
+ t = tnum_or(tnum_subreg(reg1->var_off), tnum_const(val));
+ reg1->var_off = tnum_with_subreg(reg1->var_off, t);
} else {
- false_64off = tnum_and(false_64off, tnum_const(~val));
- if (is_power_of_2(val))
- true_64off = tnum_or(true_64off,
- tnum_const(val));
+ reg1->var_off = tnum_or(reg1->var_off, tnum_const(val));
}
break;
- case BPF_JGE:
- case BPF_JGT:
- {
+ case BPF_JSET | BPF_X: /* reverse of BPF_JSET, see rev_opcode() */
+ if (!is_reg_const(reg2, is_jmp32))
+ swap(reg1, reg2);
+ if (!is_reg_const(reg2, is_jmp32))
+ break;
+ val = reg_const_value(reg2, is_jmp32);
+ /* Forget the ranges before narrowing tnums, to avoid invariant
+ * violations if we're on a dead branch.
+ */
+ __mark_reg_unbounded(reg1);
if (is_jmp32) {
- u32 false_umax = opcode == BPF_JGT ? val32 : val32 - 1;
- u32 true_umin = opcode == BPF_JGT ? val32 + 1 : val32;
-
- false_reg->u32_max_value = min(false_reg->u32_max_value,
- false_umax);
- true_reg->u32_min_value = max(true_reg->u32_min_value,
- true_umin);
+ t = tnum_and(tnum_subreg(reg1->var_off), tnum_const(~val));
+ reg1->var_off = tnum_with_subreg(reg1->var_off, t);
} else {
- u64 false_umax = opcode == BPF_JGT ? val : val - 1;
- u64 true_umin = opcode == BPF_JGT ? val + 1 : val;
-
- false_reg->umax_value = min(false_reg->umax_value, false_umax);
- true_reg->umin_value = max(true_reg->umin_value, true_umin);
+ reg1->var_off = tnum_and(reg1->var_off, tnum_const(~val));
}
break;
- }
- case BPF_JSGE:
- case BPF_JSGT:
- {
+ case BPF_JLE:
if (is_jmp32) {
- s32 false_smax = opcode == BPF_JSGT ? sval32 : sval32 - 1;
- s32 true_smin = opcode == BPF_JSGT ? sval32 + 1 : sval32;
-
- false_reg->s32_max_value = min(false_reg->s32_max_value, false_smax);
- true_reg->s32_min_value = max(true_reg->s32_min_value, true_smin);
+ reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value);
+ reg2->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value);
} else {
- s64 false_smax = opcode == BPF_JSGT ? sval : sval - 1;
- s64 true_smin = opcode == BPF_JSGT ? sval + 1 : sval;
-
- false_reg->smax_value = min(false_reg->smax_value, false_smax);
- true_reg->smin_value = max(true_reg->smin_value, true_smin);
+ reg1->umax_value = min(reg1->umax_value, reg2->umax_value);
+ reg2->umin_value = max(reg1->umin_value, reg2->umin_value);
}
break;
- }
- case BPF_JLE:
case BPF_JLT:
- {
if (is_jmp32) {
- u32 false_umin = opcode == BPF_JLT ? val32 : val32 + 1;
- u32 true_umax = opcode == BPF_JLT ? val32 - 1 : val32;
-
- false_reg->u32_min_value = max(false_reg->u32_min_value,
- false_umin);
- true_reg->u32_max_value = min(true_reg->u32_max_value,
- true_umax);
+ reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value - 1);
+ reg2->u32_min_value = max(reg1->u32_min_value + 1, reg2->u32_min_value);
} else {
- u64 false_umin = opcode == BPF_JLT ? val : val + 1;
- u64 true_umax = opcode == BPF_JLT ? val - 1 : val;
-
- false_reg->umin_value = max(false_reg->umin_value, false_umin);
- true_reg->umax_value = min(true_reg->umax_value, true_umax);
+ reg1->umax_value = min(reg1->umax_value, reg2->umax_value - 1);
+ reg2->umin_value = max(reg1->umin_value + 1, reg2->umin_value);
}
break;
- }
case BPF_JSLE:
+ if (is_jmp32) {
+ reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value);
+ reg2->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value);
+ } else {
+ reg1->smax_value = min(reg1->smax_value, reg2->smax_value);
+ reg2->smin_value = max(reg1->smin_value, reg2->smin_value);
+ }
+ break;
case BPF_JSLT:
- {
if (is_jmp32) {
- s32 false_smin = opcode == BPF_JSLT ? sval32 : sval32 + 1;
- s32 true_smax = opcode == BPF_JSLT ? sval32 - 1 : sval32;
-
- false_reg->s32_min_value = max(false_reg->s32_min_value, false_smin);
- true_reg->s32_max_value = min(true_reg->s32_max_value, true_smax);
+ reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value - 1);
+ reg2->s32_min_value = max(reg1->s32_min_value + 1, reg2->s32_min_value);
} else {
- s64 false_smin = opcode == BPF_JSLT ? sval : sval + 1;
- s64 true_smax = opcode == BPF_JSLT ? sval - 1 : sval;
-
- false_reg->smin_value = max(false_reg->smin_value, false_smin);
- true_reg->smax_value = min(true_reg->smax_value, true_smax);
+ reg1->smax_value = min(reg1->smax_value, reg2->smax_value - 1);
+ reg2->smin_value = max(reg1->smin_value + 1, reg2->smin_value);
}
break;
- }
default:
return;
}
-
- if (is_jmp32) {
- false_reg->var_off = tnum_or(tnum_clear_subreg(false_64off),
- tnum_subreg(false_32off));
- true_reg->var_off = tnum_or(tnum_clear_subreg(true_64off),
- tnum_subreg(true_32off));
- __reg_combine_32_into_64(false_reg);
- __reg_combine_32_into_64(true_reg);
- } else {
- false_reg->var_off = false_64off;
- true_reg->var_off = true_64off;
- __reg_combine_64_into_32(false_reg);
- __reg_combine_64_into_32(true_reg);
- }
}
-/* Same as above, but for the case that dst_reg holds a constant and src_reg is
- * the variable reg.
+/* Adjusts the register min/max values in the case that the dst_reg and
+ * src_reg are both SCALAR_VALUE registers (or we are simply doing a BPF_K
+ * check, in which case we have a fake SCALAR_VALUE representing insn->imm).
+ * Technically we can do similar adjustments for pointers to the same object,
+ * but we don't support that right now.
*/
-static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
- struct bpf_reg_state *false_reg,
- u64 val, u32 val32,
- u8 opcode, bool is_jmp32)
+static int reg_set_min_max(struct bpf_verifier_env *env,
+ struct bpf_reg_state *true_reg1,
+ struct bpf_reg_state *true_reg2,
+ struct bpf_reg_state *false_reg1,
+ struct bpf_reg_state *false_reg2,
+ u8 opcode, bool is_jmp32)
{
- opcode = flip_opcode(opcode);
- /* This uses zero as "not present in table"; luckily the zero opcode,
- * BPF_JA, can't get here.
+ int err;
+
+ /* If either register is a pointer, we can't learn anything about its
+ * variable offset from the compare (unless they were a pointer into
+ * the same object, but we don't bother with that).
*/
- if (opcode)
- reg_set_min_max(true_reg, false_reg, val, val32, opcode, is_jmp32);
-}
+ if (false_reg1->type != SCALAR_VALUE || false_reg2->type != SCALAR_VALUE)
+ return 0;
-/* Regs are known to be equal, so intersect their min/max/var_off */
-static void __reg_combine_min_max(struct bpf_reg_state *src_reg,
- struct bpf_reg_state *dst_reg)
-{
- src_reg->umin_value = dst_reg->umin_value = max(src_reg->umin_value,
- dst_reg->umin_value);
- src_reg->umax_value = dst_reg->umax_value = min(src_reg->umax_value,
- dst_reg->umax_value);
- src_reg->smin_value = dst_reg->smin_value = max(src_reg->smin_value,
- dst_reg->smin_value);
- src_reg->smax_value = dst_reg->smax_value = min(src_reg->smax_value,
- dst_reg->smax_value);
- src_reg->var_off = dst_reg->var_off = tnum_intersect(src_reg->var_off,
- dst_reg->var_off);
- /* We might have learned new bounds from the var_off. */
- __update_reg_bounds(src_reg);
- __update_reg_bounds(dst_reg);
- /* We might have learned something about the sign bit. */
- __reg_deduce_bounds(src_reg);
- __reg_deduce_bounds(dst_reg);
- /* We might have learned some bits from the bounds. */
- __reg_bound_offset(src_reg);
- __reg_bound_offset(dst_reg);
- /* Intersecting with the old var_off might have improved our bounds
- * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
- * then new var_off is (0; 0x7f...fc) which improves our umax.
+ /* We compute branch direction for same SCALAR_VALUE registers in
+ * is_scalar_branch_taken(). For unknown branch directions (e.g., BPF_JSET)
+ * on the same registers, we don't need to adjust the min/max values.
*/
- __update_reg_bounds(src_reg);
- __update_reg_bounds(dst_reg);
-}
+ if (false_reg1 == false_reg2)
+ return 0;
-static void reg_combine_min_max(struct bpf_reg_state *true_src,
- struct bpf_reg_state *true_dst,
- struct bpf_reg_state *false_src,
- struct bpf_reg_state *false_dst,
- u8 opcode)
-{
- switch (opcode) {
- case BPF_JEQ:
- __reg_combine_min_max(true_src, true_dst);
- break;
- case BPF_JNE:
- __reg_combine_min_max(false_src, false_dst);
- break;
- }
+ /* fallthrough (FALSE) branch */
+ regs_refine_cond_op(false_reg1, false_reg2, rev_opcode(opcode), is_jmp32);
+ reg_bounds_sync(false_reg1);
+ reg_bounds_sync(false_reg2);
+
+ /* jump (TRUE) branch */
+ regs_refine_cond_op(true_reg1, true_reg2, opcode, is_jmp32);
+ reg_bounds_sync(true_reg1);
+ reg_bounds_sync(true_reg2);
+
+ err = reg_bounds_sanity_check(env, true_reg1, "true_reg1");
+ err = err ?: reg_bounds_sanity_check(env, true_reg2, "true_reg2");
+ err = err ?: reg_bounds_sanity_check(env, false_reg1, "false_reg1");
+ err = err ?: reg_bounds_sanity_check(env, false_reg2, "false_reg2");
+ return err;
}
static void mark_ptr_or_null_reg(struct bpf_func_state *state,
@@ -9107,17 +16579,22 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
bool is_null)
{
if (type_may_be_null(reg->type) && reg->id == id &&
- !WARN_ON_ONCE(!reg->id)) {
- if (WARN_ON_ONCE(reg->smin_value || reg->smax_value ||
- !tnum_equals_const(reg->var_off, 0) ||
- reg->off)) {
- /* Old offset (both fixed and variable parts) should
- * have been known-zero, because we don't allow pointer
- * arithmetic on pointers that might be NULL. If we
- * see this happening, don't convert the register.
- */
+ (is_rcu_reg(reg) || !WARN_ON_ONCE(!reg->id))) {
+ /* Old offset (both fixed and variable parts) should have been
+ * known-zero, because we don't allow pointer arithmetic on
+ * pointers that might be NULL. If we see this happening, don't
+ * convert the register.
+ *
+ * But in some cases, some helpers that return local kptrs
+ * advance offset for the returned pointer. In those cases, it
+ * is fine to expect to see reg->off.
+ */
+ if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || !tnum_equals_const(reg->var_off, 0)))
return;
- }
+ if (!(type_is_ptr_alloc_obj(reg->type) || type_is_non_owning_ref(reg->type)) &&
+ WARN_ON_ONCE(reg->off))
+ return;
+
if (is_null) {
reg->type = SCALAR_VALUE;
/* We don't need id and ref_obj_id from this point
@@ -9134,7 +16611,7 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
if (!reg_may_point_to_spin_lock(reg)) {
/* For not-NULL ptr, reg->ref_obj_id will be reset
- * in release_reg_references().
+ * in release_reference().
*
* reg->id is still used by spin_lock ptr. Other
* than spin_lock ptr type, reg->id can be reset.
@@ -9144,22 +16621,6 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
}
}
-static void __mark_ptr_or_null_regs(struct bpf_func_state *state, u32 id,
- bool is_null)
-{
- struct bpf_reg_state *reg;
- int i;
-
- for (i = 0; i < MAX_BPF_REG; i++)
- mark_ptr_or_null_reg(state, &state->regs[i], id, is_null);
-
- bpf_for_each_spilled_reg(i, state, reg) {
- if (!reg)
- continue;
- mark_ptr_or_null_reg(state, reg, id, is_null);
- }
-}
-
/* The logic is similar to find_good_pkt_pointers(), both could eventually
* be folded together at some point.
*/
@@ -9167,20 +16628,20 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
bool is_null)
{
struct bpf_func_state *state = vstate->frame[vstate->curframe];
- struct bpf_reg_state *regs = state->regs;
+ struct bpf_reg_state *regs = state->regs, *reg;
u32 ref_obj_id = regs[regno].ref_obj_id;
u32 id = regs[regno].id;
- int i;
if (ref_obj_id && ref_obj_id == id && is_null)
/* regs[regno] is in the " == NULL" branch.
* No one could have freed the reference state before
* doing the NULL check.
*/
- WARN_ON_ONCE(release_reference_state(state, id));
+ WARN_ON_ONCE(release_reference_nomark(vstate, id));
- for (i = 0; i <= vstate->curframe; i++)
- __mark_ptr_or_null_regs(vstate->frame[i], id, is_null);
+ bpf_for_each_reg_in_vstate(vstate, state, reg, ({
+ mark_ptr_or_null_reg(state, reg, id, is_null);
+ }));
}
static bool try_match_pkt_pointers(const struct bpf_insn *insn,
@@ -9288,26 +16749,95 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn,
return true;
}
-static void find_equal_scalars(struct bpf_verifier_state *vstate,
- struct bpf_reg_state *known_reg)
+static void __collect_linked_regs(struct linked_regs *reg_set, struct bpf_reg_state *reg,
+ u32 id, u32 frameno, u32 spi_or_reg, bool is_reg)
{
- struct bpf_func_state *state;
+ struct linked_reg *e;
+
+ if (reg->type != SCALAR_VALUE || (reg->id & ~BPF_ADD_CONST) != id)
+ return;
+
+ e = linked_regs_push(reg_set);
+ if (e) {
+ e->frameno = frameno;
+ e->is_reg = is_reg;
+ e->regno = spi_or_reg;
+ } else {
+ reg->id = 0;
+ }
+}
+
+/* For all R being scalar registers or spilled scalar registers
+ * in verifier state, save R in linked_regs if R->id == id.
+ * If there are too many Rs sharing same id, reset id for leftover Rs.
+ */
+static void collect_linked_regs(struct bpf_verifier_state *vstate, u32 id,
+ struct linked_regs *linked_regs)
+{
+ struct bpf_func_state *func;
struct bpf_reg_state *reg;
int i, j;
- for (i = 0; i <= vstate->curframe; i++) {
- state = vstate->frame[i];
- for (j = 0; j < MAX_BPF_REG; j++) {
- reg = &state->regs[j];
- if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
- *reg = *known_reg;
+ id = id & ~BPF_ADD_CONST;
+ for (i = vstate->curframe; i >= 0; i--) {
+ func = vstate->frame[i];
+ for (j = 0; j < BPF_REG_FP; j++) {
+ reg = &func->regs[j];
+ __collect_linked_regs(linked_regs, reg, id, i, j, true);
}
-
- bpf_for_each_spilled_reg(j, state, reg) {
- if (!reg)
+ for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
+ if (!is_spilled_reg(&func->stack[j]))
continue;
- if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
- *reg = *known_reg;
+ reg = &func->stack[j].spilled_ptr;
+ __collect_linked_regs(linked_regs, reg, id, i, j, false);
+ }
+ }
+}
+
+/* For all R in linked_regs, copy known_reg range into R
+ * if R->id == known_reg->id.
+ */
+static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_state *known_reg,
+ struct linked_regs *linked_regs)
+{
+ struct bpf_reg_state fake_reg;
+ struct bpf_reg_state *reg;
+ struct linked_reg *e;
+ int i;
+
+ for (i = 0; i < linked_regs->cnt; ++i) {
+ e = &linked_regs->entries[i];
+ reg = e->is_reg ? &vstate->frame[e->frameno]->regs[e->regno]
+ : &vstate->frame[e->frameno]->stack[e->spi].spilled_ptr;
+ if (reg->type != SCALAR_VALUE || reg == known_reg)
+ continue;
+ if ((reg->id & ~BPF_ADD_CONST) != (known_reg->id & ~BPF_ADD_CONST))
+ continue;
+ if ((!(reg->id & BPF_ADD_CONST) && !(known_reg->id & BPF_ADD_CONST)) ||
+ reg->off == known_reg->off) {
+ s32 saved_subreg_def = reg->subreg_def;
+
+ copy_register_state(reg, known_reg);
+ reg->subreg_def = saved_subreg_def;
+ } else {
+ s32 saved_subreg_def = reg->subreg_def;
+ s32 saved_off = reg->off;
+
+ fake_reg.type = SCALAR_VALUE;
+ __mark_reg_known(&fake_reg, (s32)reg->off - (s32)known_reg->off);
+
+ /* reg = known_reg; reg += delta */
+ copy_register_state(reg, known_reg);
+ /*
+ * Must preserve off, id and add_const flag,
+ * otherwise another sync_linked_regs() will be incorrect.
+ */
+ reg->off = saved_off;
+ reg->subreg_def = saved_subreg_def;
+
+ scalar32_min_max_add(reg, &fake_reg);
+ scalar_min_max_add(reg, &fake_reg);
+ reg->var_off = tnum_add(reg->var_off, fake_reg.var_off);
}
}
}
@@ -9319,17 +16849,50 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
struct bpf_verifier_state *other_branch;
struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;
struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL;
+ struct bpf_reg_state *eq_branch_regs;
+ struct linked_regs linked_regs = {};
u8 opcode = BPF_OP(insn->code);
+ int insn_flags = 0;
bool is_jmp32;
int pred = -1;
int err;
/* Only conditional jumps are expected to reach here. */
- if (opcode == BPF_JA || opcode > BPF_JSLE) {
+ if (opcode == BPF_JA || opcode > BPF_JCOND) {
verbose(env, "invalid BPF_JMP/JMP32 opcode %x\n", opcode);
return -EINVAL;
}
+ if (opcode == BPF_JCOND) {
+ struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st;
+ int idx = *insn_idx;
+
+ if (insn->code != (BPF_JMP | BPF_JCOND) ||
+ insn->src_reg != BPF_MAY_GOTO ||
+ insn->dst_reg || insn->imm) {
+ verbose(env, "invalid may_goto imm %d\n", insn->imm);
+ return -EINVAL;
+ }
+ prev_st = find_prev_entry(env, cur_st->parent, idx);
+
+ /* branch out 'fallthrough' insn as a new state to explore */
+ queued_st = push_stack(env, idx + 1, idx, false);
+ if (IS_ERR(queued_st))
+ return PTR_ERR(queued_st);
+
+ queued_st->may_goto_depth++;
+ if (prev_st)
+ widen_imprecise_scalars(env, prev_st, queued_st);
+ *insn_idx += insn->off;
+ return 0;
+ }
+
+ /* check src2 operand */
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ dst_reg = &regs[insn->dst_reg];
if (BPF_SRC(insn->code) == BPF_X) {
if (insn->imm != 0) {
verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
@@ -9341,47 +16904,40 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
if (err)
return err;
- if (is_pointer_value(env, insn->src_reg)) {
+ src_reg = &regs[insn->src_reg];
+ if (!(reg_is_pkt_pointer_any(dst_reg) && reg_is_pkt_pointer_any(src_reg)) &&
+ is_pointer_value(env, insn->src_reg)) {
verbose(env, "R%d pointer comparison prohibited\n",
insn->src_reg);
return -EACCES;
}
- src_reg = &regs[insn->src_reg];
+
+ if (src_reg->type == PTR_TO_STACK)
+ insn_flags |= INSN_F_SRC_REG_STACK;
+ if (dst_reg->type == PTR_TO_STACK)
+ insn_flags |= INSN_F_DST_REG_STACK;
} else {
if (insn->src_reg != BPF_REG_0) {
verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
return -EINVAL;
}
- }
-
- /* check src2 operand */
- err = check_reg_arg(env, insn->dst_reg, SRC_OP);
- if (err)
- return err;
+ src_reg = &env->fake_reg[0];
+ memset(src_reg, 0, sizeof(*src_reg));
+ src_reg->type = SCALAR_VALUE;
+ __mark_reg_known(src_reg, insn->imm);
- dst_reg = &regs[insn->dst_reg];
- is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
+ if (dst_reg->type == PTR_TO_STACK)
+ insn_flags |= INSN_F_DST_REG_STACK;
+ }
- if (BPF_SRC(insn->code) == BPF_K) {
- pred = is_branch_taken(dst_reg, insn->imm, opcode, is_jmp32);
- } else if (src_reg->type == SCALAR_VALUE &&
- is_jmp32 && tnum_is_const(tnum_subreg(src_reg->var_off))) {
- pred = is_branch_taken(dst_reg,
- tnum_subreg(src_reg->var_off).value,
- opcode,
- is_jmp32);
- } else if (src_reg->type == SCALAR_VALUE &&
- !is_jmp32 && tnum_is_const(src_reg->var_off)) {
- pred = is_branch_taken(dst_reg,
- src_reg->var_off.value,
- opcode,
- is_jmp32);
- } else if (reg_is_pkt_pointer_any(dst_reg) &&
- reg_is_pkt_pointer_any(src_reg) &&
- !is_jmp32) {
- pred = is_pkt_ptr_branch_taken(dst_reg, src_reg, opcode);
+ if (insn_flags) {
+ err = push_jmp_history(env, this_branch, insn_flags, 0);
+ if (err)
+ return err;
}
+ is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
+ pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32);
if (pred >= 0) {
/* If we get here with a dst_reg pointer type it is because
* above is_branch_taken() special cased the 0 comparison.
@@ -9400,10 +16956,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
* the fall-through branch for simulation under speculative
* execution.
*/
- if (!env->bypass_spec_v1 &&
- !sanitize_speculative_path(env, insn, *insn_idx + 1,
- *insn_idx))
- return -EFAULT;
+ if (!env->bypass_spec_v1) {
+ err = sanitize_speculative_path(env, insn, *insn_idx + 1, *insn_idx);
+ if (err < 0)
+ return err;
+ }
+ if (env->log.level & BPF_LOG_LEVEL)
+ print_insn_state(env, this_branch, this_branch->curframe);
*insn_idx += insn->off;
return 0;
} else if (pred == 0) {
@@ -9411,71 +16970,105 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
* program will go. If needed, push the goto branch for
* simulation under speculative execution.
*/
- if (!env->bypass_spec_v1 &&
- !sanitize_speculative_path(env, insn,
- *insn_idx + insn->off + 1,
- *insn_idx))
- return -EFAULT;
+ if (!env->bypass_spec_v1) {
+ err = sanitize_speculative_path(env, insn, *insn_idx + insn->off + 1,
+ *insn_idx);
+ if (err < 0)
+ return err;
+ }
+ if (env->log.level & BPF_LOG_LEVEL)
+ print_insn_state(env, this_branch, this_branch->curframe);
return 0;
}
- other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx,
- false);
- if (!other_branch)
- return -EFAULT;
+ /* Push scalar registers sharing same ID to jump history,
+ * do this before creating 'other_branch', so that both
+ * 'this_branch' and 'other_branch' share this history
+ * if parent state is created.
+ */
+ if (BPF_SRC(insn->code) == BPF_X && src_reg->type == SCALAR_VALUE && src_reg->id)
+ collect_linked_regs(this_branch, src_reg->id, &linked_regs);
+ if (dst_reg->type == SCALAR_VALUE && dst_reg->id)
+ collect_linked_regs(this_branch, dst_reg->id, &linked_regs);
+ if (linked_regs.cnt > 1) {
+ err = push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs));
+ if (err)
+ return err;
+ }
+
+ other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, false);
+ if (IS_ERR(other_branch))
+ return PTR_ERR(other_branch);
other_branch_regs = other_branch->frame[other_branch->curframe]->regs;
- /* detect if we are comparing against a constant value so we can adjust
- * our min/max values for our dst register.
- * this is only legit if both are scalars (or pointers to the same
- * object, I suppose, but we don't support that right now), because
- * otherwise the different base pointers mean the offsets aren't
- * comparable.
- */
if (BPF_SRC(insn->code) == BPF_X) {
- struct bpf_reg_state *src_reg = &regs[insn->src_reg];
-
- if (dst_reg->type == SCALAR_VALUE &&
- src_reg->type == SCALAR_VALUE) {
- if (tnum_is_const(src_reg->var_off) ||
- (is_jmp32 &&
- tnum_is_const(tnum_subreg(src_reg->var_off))))
- reg_set_min_max(&other_branch_regs[insn->dst_reg],
- dst_reg,
- src_reg->var_off.value,
- tnum_subreg(src_reg->var_off).value,
- opcode, is_jmp32);
- else if (tnum_is_const(dst_reg->var_off) ||
- (is_jmp32 &&
- tnum_is_const(tnum_subreg(dst_reg->var_off))))
- reg_set_min_max_inv(&other_branch_regs[insn->src_reg],
- src_reg,
- dst_reg->var_off.value,
- tnum_subreg(dst_reg->var_off).value,
- opcode, is_jmp32);
- else if (!is_jmp32 &&
- (opcode == BPF_JEQ || opcode == BPF_JNE))
- /* Comparing for equality, we can combine knowledge */
- reg_combine_min_max(&other_branch_regs[insn->src_reg],
- &other_branch_regs[insn->dst_reg],
- src_reg, dst_reg, opcode);
- if (src_reg->id &&
- !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) {
- find_equal_scalars(this_branch, src_reg);
- find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]);
- }
-
- }
- } else if (dst_reg->type == SCALAR_VALUE) {
- reg_set_min_max(&other_branch_regs[insn->dst_reg],
- dst_reg, insn->imm, (u32)insn->imm,
- opcode, is_jmp32);
+ err = reg_set_min_max(env,
+ &other_branch_regs[insn->dst_reg],
+ &other_branch_regs[insn->src_reg],
+ dst_reg, src_reg, opcode, is_jmp32);
+ } else /* BPF_SRC(insn->code) == BPF_K */ {
+ /* reg_set_min_max() can mangle the fake_reg. Make a copy
+ * so that these are two different memory locations. The
+ * src_reg is not used beyond here in context of K.
+ */
+ memcpy(&env->fake_reg[1], &env->fake_reg[0],
+ sizeof(env->fake_reg[0]));
+ err = reg_set_min_max(env,
+ &other_branch_regs[insn->dst_reg],
+ &env->fake_reg[0],
+ dst_reg, &env->fake_reg[1],
+ opcode, is_jmp32);
}
+ if (err)
+ return err;
+ if (BPF_SRC(insn->code) == BPF_X &&
+ src_reg->type == SCALAR_VALUE && src_reg->id &&
+ !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) {
+ sync_linked_regs(this_branch, src_reg, &linked_regs);
+ sync_linked_regs(other_branch, &other_branch_regs[insn->src_reg], &linked_regs);
+ }
if (dst_reg->type == SCALAR_VALUE && dst_reg->id &&
!WARN_ON_ONCE(dst_reg->id != other_branch_regs[insn->dst_reg].id)) {
- find_equal_scalars(this_branch, dst_reg);
- find_equal_scalars(other_branch, &other_branch_regs[insn->dst_reg]);
+ sync_linked_regs(this_branch, dst_reg, &linked_regs);
+ sync_linked_regs(other_branch, &other_branch_regs[insn->dst_reg], &linked_regs);
+ }
+
+ /* if one pointer register is compared to another pointer
+ * register check if PTR_MAYBE_NULL could be lifted.
+ * E.g. register A - maybe null
+ * register B - not null
+ * for JNE A, B, ... - A is not null in the false branch;
+ * for JEQ A, B, ... - A is not null in the true branch.
+ *
+ * Since PTR_TO_BTF_ID points to a kernel struct that does
+ * not need to be null checked by the BPF program, i.e.,
+ * could be null even without PTR_MAYBE_NULL marking, so
+ * only propagate nullness when neither reg is that type.
+ */
+ if (!is_jmp32 && BPF_SRC(insn->code) == BPF_X &&
+ __is_pointer_value(false, src_reg) && __is_pointer_value(false, dst_reg) &&
+ type_may_be_null(src_reg->type) != type_may_be_null(dst_reg->type) &&
+ base_type(src_reg->type) != PTR_TO_BTF_ID &&
+ base_type(dst_reg->type) != PTR_TO_BTF_ID) {
+ eq_branch_regs = NULL;
+ switch (opcode) {
+ case BPF_JEQ:
+ eq_branch_regs = other_branch_regs;
+ break;
+ case BPF_JNE:
+ eq_branch_regs = regs;
+ break;
+ default:
+ /* do nothing */
+ break;
+ }
+ if (eq_branch_regs) {
+ if (type_may_be_null(src_reg->type))
+ mark_ptr_not_null_reg(&eq_branch_regs[insn->src_reg]);
+ else
+ mark_ptr_not_null_reg(&eq_branch_regs[insn->dst_reg]);
+ }
}
/* detect if R == 0 where R is returned from bpf_map_lookup_elem().
@@ -9500,7 +17093,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
return -EACCES;
}
if (env->log.level & BPF_LOG_LEVEL)
- print_insn_state(env, this_branch->frame[this_branch->curframe]);
+ print_insn_state(env, this_branch, this_branch->curframe);
return 0;
}
@@ -9548,12 +17141,11 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
dst_reg->mem_size = aux->btf_var.mem_size;
break;
case PTR_TO_BTF_ID:
- case PTR_TO_PERCPU_BTF_ID:
dst_reg->btf = aux->btf_var.btf;
dst_reg->btf_id = aux->btf_var.btf_id;
break;
default:
- verbose(env, "bpf verifier is misconfigured\n");
+ verifier_bug(env, "pseudo btf id: unexpected dst reg type");
return -EFAULT;
}
return 0;
@@ -9583,16 +17175,21 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (insn->src_reg == BPF_PSEUDO_MAP_VALUE ||
insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) {
+ if (map->map_type == BPF_MAP_TYPE_ARENA) {
+ __mark_reg_unknown(env, dst_reg);
+ return 0;
+ }
dst_reg->type = PTR_TO_MAP_VALUE;
dst_reg->off = aux->map_off;
- if (map_value_has_spin_lock(map))
- dst_reg->id = ++env->id_gen;
+ WARN_ON_ONCE(map->map_type != BPF_MAP_TYPE_INSN_ARRAY &&
+ map->max_entries != 1);
+ /* We want reg->id to be same (0) as map_value is not distinct */
} else if (insn->src_reg == BPF_PSEUDO_MAP_FD ||
insn->src_reg == BPF_PSEUDO_MAP_IDX) {
dst_reg->type = CONST_PTR_TO_MAP;
} else {
- verbose(env, "bpf verifier is misconfigured\n");
- return -EINVAL;
+ verifier_bug(env, "unexpected src reg value for ldimm64");
+ return -EFAULT;
}
return 0;
@@ -9638,8 +17235,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
if (!env->ops->gen_ld_abs) {
- verbose(env, "bpf verifier is misconfigured\n");
- return -EINVAL;
+ verifier_bug(env, "gen_ld_abs is null");
+ return -EFAULT;
}
if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
@@ -9658,16 +17255,9 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
* gen_ld_abs() may terminate the program at runtime, leading to
* reference leak.
*/
- err = check_reference_leak(env);
- if (err) {
- verbose(env, "BPF_LD_[ABS|IND] cannot be mixed with socket references\n");
+ err = check_resource_leak(env, false, true, "BPF_LD_[ABS|IND]");
+ if (err)
return err;
- }
-
- if (env->cur_state->active_spin_lock) {
- verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n");
- return -EINVAL;
- }
if (regs[ctx_reg].type != PTR_TO_CTX) {
verbose(env,
@@ -9702,23 +17292,52 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return 0;
}
-static int check_return_code(struct bpf_verifier_env *env)
+static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name)
{
+ const char *exit_ctx = "At program exit";
struct tnum enforce_attach_type_range = tnum_unknown;
const struct bpf_prog *prog = env->prog;
- struct bpf_reg_state *reg;
- struct tnum range = tnum_range(0, 1);
+ struct bpf_reg_state *reg = reg_state(env, regno);
+ struct bpf_retval_range range = retval_range(0, 1);
enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
int err;
struct bpf_func_state *frame = env->cur_state->frame[0];
const bool is_subprog = frame->subprogno;
+ bool return_32bit = false;
+ const struct btf_type *reg_type, *ret_type = NULL;
/* LSM and struct_ops func-ptr's return type could be "void" */
- if (!is_subprog &&
- (prog_type == BPF_PROG_TYPE_STRUCT_OPS ||
- prog_type == BPF_PROG_TYPE_LSM) &&
- !prog->aux->attach_func_proto->type)
- return 0;
+ if (!is_subprog || frame->in_exception_callback_fn) {
+ switch (prog_type) {
+ case BPF_PROG_TYPE_LSM:
+ if (prog->expected_attach_type == BPF_LSM_CGROUP)
+ /* See below, can be 0 or 0-1 depending on hook. */
+ break;
+ if (!prog->aux->attach_func_proto->type)
+ return 0;
+ break;
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ if (!prog->aux->attach_func_proto->type)
+ return 0;
+
+ if (frame->in_exception_callback_fn)
+ break;
+
+ /* Allow a struct_ops program to return a referenced kptr if it
+ * matches the operator's return type and is in its unmodified
+ * form. A scalar zero (i.e., a null pointer) is also allowed.
+ */
+ reg_type = reg->btf ? btf_type_by_id(reg->btf, reg->btf_id) : NULL;
+ ret_type = btf_type_resolve_ptr(prog->aux->attach_btf,
+ prog->aux->attach_func_proto->type,
+ NULL);
+ if (ret_type && ret_type == reg_type && reg->ref_obj_id)
+ return __check_ptr_off_reg(env, reg, regno, false);
+ break;
+ default:
+ break;
+ }
+ }
/* eBPF calling convention is such that R0 is used
* to return the value from eBPF program.
@@ -9726,36 +17345,25 @@ static int check_return_code(struct bpf_verifier_env *env)
* of bpf_exit, which means that program wrote
* something into it earlier
*/
- err = check_reg_arg(env, BPF_REG_0, SRC_OP);
+ err = check_reg_arg(env, regno, SRC_OP);
if (err)
return err;
- if (is_pointer_value(env, BPF_REG_0)) {
- verbose(env, "R0 leaks addr as return value\n");
+ if (is_pointer_value(env, regno)) {
+ verbose(env, "R%d leaks addr as return value\n", regno);
return -EACCES;
}
- reg = cur_regs(env) + BPF_REG_0;
-
if (frame->in_async_callback_fn) {
- /* enforce return zero from async callbacks like timer */
- if (reg->type != SCALAR_VALUE) {
- verbose(env, "In async callback the register R0 is not a known value (%s)\n",
- reg_type_str(env, reg->type));
- return -EINVAL;
- }
-
- if (!tnum_in(tnum_const(0), reg->var_off)) {
- verbose_invalid_scalar(env, reg, &range, "async callback", "R0");
- return -EINVAL;
- }
- return 0;
+ exit_ctx = "At async callback return";
+ range = frame->callback_ret_range;
+ goto enforce_retval;
}
- if (is_subprog) {
+ if (is_subprog && !frame->in_exception_callback_fn) {
if (reg->type != SCALAR_VALUE) {
- verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n",
- reg_type_str(env, reg->type));
+ verbose(env, "At subprogram exit the register R%d is not a scalar value (%s)\n",
+ regno, reg_type_str(env, reg->type));
return -EINVAL;
}
return 0;
@@ -9765,18 +17373,21 @@ static int check_return_code(struct bpf_verifier_env *env)
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG ||
env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG ||
+ env->prog->expected_attach_type == BPF_CGROUP_UNIX_RECVMSG ||
env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME ||
env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME ||
+ env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETPEERNAME ||
env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME ||
- env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME)
- range = tnum_range(1, 1);
+ env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME ||
+ env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETSOCKNAME)
+ range = retval_range(1, 1);
if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND ||
env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND)
- range = tnum_range(0, 3);
+ range = retval_range(0, 3);
break;
case BPF_PROG_TYPE_CGROUP_SKB:
if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) {
- range = tnum_range(0, 3);
+ range = retval_range(0, 3);
enforce_attach_type_range = tnum_range(2, 3);
}
break;
@@ -9789,13 +17400,13 @@ static int check_return_code(struct bpf_verifier_env *env)
case BPF_PROG_TYPE_RAW_TRACEPOINT:
if (!env->prog->aux->attach_btf_id)
return 0;
- range = tnum_const(0);
+ range = retval_range(0, 0);
break;
case BPF_PROG_TYPE_TRACING:
switch (env->prog->expected_attach_type) {
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
- range = tnum_const(0);
+ range = retval_range(0, 0);
break;
case BPF_TRACE_RAW_TP:
case BPF_MODIFY_RETURN:
@@ -9806,8 +17417,44 @@ static int check_return_code(struct bpf_verifier_env *env)
return -ENOTSUPP;
}
break;
+ case BPF_PROG_TYPE_KPROBE:
+ switch (env->prog->expected_attach_type) {
+ case BPF_TRACE_KPROBE_SESSION:
+ case BPF_TRACE_UPROBE_SESSION:
+ range = retval_range(0, 1);
+ break;
+ default:
+ return 0;
+ }
+ break;
case BPF_PROG_TYPE_SK_LOOKUP:
- range = tnum_range(SK_DROP, SK_PASS);
+ range = retval_range(SK_DROP, SK_PASS);
+ break;
+
+ case BPF_PROG_TYPE_LSM:
+ if (env->prog->expected_attach_type != BPF_LSM_CGROUP) {
+ /* no range found, any return value is allowed */
+ if (!get_func_retval_range(env->prog, &range))
+ return 0;
+ /* no restricted range, any return value is allowed */
+ if (range.minval == S32_MIN && range.maxval == S32_MAX)
+ return 0;
+ return_32bit = true;
+ } else if (!env->prog->aux->attach_func_proto->type) {
+ /* Make sure programs that attach to void
+ * hooks don't try to modify return value.
+ */
+ range = retval_range(1, 1);
+ }
+ break;
+
+ case BPF_PROG_TYPE_NETFILTER:
+ range = retval_range(NF_DROP, NF_ACCEPT);
+ break;
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ if (!ret_type)
+ return 0;
+ range = retval_range(0, 0);
break;
case BPF_PROG_TYPE_EXT:
/* freplace program can return anything as its return value
@@ -9817,14 +17464,24 @@ static int check_return_code(struct bpf_verifier_env *env)
return 0;
}
+enforce_retval:
if (reg->type != SCALAR_VALUE) {
- verbose(env, "At program exit the register R0 is not a known value (%s)\n",
- reg_type_str(env, reg->type));
+ verbose(env, "%s the register R%d is not a known value (%s)\n",
+ exit_ctx, regno, reg_type_str(env, reg->type));
return -EINVAL;
}
- if (!tnum_in(range, reg->var_off)) {
- verbose_invalid_scalar(env, reg, &range, "program exit", "R0");
+ err = mark_chain_precision(env, regno);
+ if (err)
+ return err;
+
+ if (!retval_range_within(range, reg, return_32bit)) {
+ verbose_invalid_scalar(env, reg, range, exit_ctx, reg_name);
+ if (!is_subprog &&
+ prog->expected_attach_type == BPF_LSM_CGROUP &&
+ prog_type == BPF_PROG_TYPE_LSM &&
+ !prog->aux->attach_func_proto->type)
+ verbose(env, "Note, BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n");
return -EINVAL;
}
@@ -9834,13 +17491,45 @@ static int check_return_code(struct bpf_verifier_env *env)
return 0;
}
+static void mark_subprog_changes_pkt_data(struct bpf_verifier_env *env, int off)
+{
+ struct bpf_subprog_info *subprog;
+
+ subprog = bpf_find_containing_subprog(env, off);
+ subprog->changes_pkt_data = true;
+}
+
+static void mark_subprog_might_sleep(struct bpf_verifier_env *env, int off)
+{
+ struct bpf_subprog_info *subprog;
+
+ subprog = bpf_find_containing_subprog(env, off);
+ subprog->might_sleep = true;
+}
+
+/* 't' is an index of a call-site.
+ * 'w' is a callee entry point.
+ * Eventually this function would be called when env->cfg.insn_state[w] == EXPLORED.
+ * Rely on DFS traversal order and absence of recursive calls to guarantee that
+ * callee's change_pkt_data marks would be correct at that moment.
+ */
+static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w)
+{
+ struct bpf_subprog_info *caller, *callee;
+
+ caller = bpf_find_containing_subprog(env, t);
+ callee = bpf_find_containing_subprog(env, w);
+ caller->changes_pkt_data |= callee->changes_pkt_data;
+ caller->might_sleep |= callee->might_sleep;
+}
+
/* non-recursive DFS pseudo code
* 1 procedure DFS-iterative(G,v):
* 2 label v as discovered
* 3 let S be a stack
* 4 S.push(v)
* 5 while S is not empty
- * 6 t <- S.pop()
+ * 6 t <- S.peek()
* 7 if t is what we're looking for:
* 8 return t
* 9 for all edges e in G.adjacentEdges(t) do
@@ -9874,24 +17563,34 @@ enum {
BRANCH = 2,
};
-static u32 state_htab_size(struct bpf_verifier_env *env)
+static void mark_prune_point(struct bpf_verifier_env *env, int idx)
{
- return env->prog->len;
+ env->insn_aux_data[idx].prune_point = true;
}
-static struct bpf_verifier_state_list **explored_state(
- struct bpf_verifier_env *env,
- int idx)
+static bool is_prune_point(struct bpf_verifier_env *env, int insn_idx)
{
- struct bpf_verifier_state *cur = env->cur_state;
- struct bpf_func_state *state = cur->frame[cur->curframe];
+ return env->insn_aux_data[insn_idx].prune_point;
+}
- return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)];
+static void mark_force_checkpoint(struct bpf_verifier_env *env, int idx)
+{
+ env->insn_aux_data[idx].force_checkpoint = true;
}
-static void init_explored_state(struct bpf_verifier_env *env, int idx)
+static bool is_force_checkpoint(struct bpf_verifier_env *env, int insn_idx)
{
- env->insn_aux_data[idx].prune_point = true;
+ return env->insn_aux_data[insn_idx].force_checkpoint;
+}
+
+static void mark_calls_callback(struct bpf_verifier_env *env, int idx)
+{
+ env->insn_aux_data[idx].calls_callback = true;
+}
+
+bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx)
+{
+ return env->insn_aux_data[insn_idx].calls_callback;
}
enum {
@@ -9904,8 +17603,7 @@ enum {
* w - next instruction
* e - edge
*/
-static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
- bool loop_ok)
+static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
{
int *insn_stack = env->cfg.insn_stack;
int *insn_state = env->cfg.insn_state;
@@ -9922,9 +17620,11 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
return -EINVAL;
}
- if (e == BRANCH)
+ if (e == BRANCH) {
/* mark branch target for state pruning */
- init_explored_state(env, w);
+ mark_prune_point(env, w);
+ mark_jmp_point(env, w);
+ }
if (insn_state[w] == 0) {
/* tree-edge */
@@ -9935,7 +17635,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
insn_stack[env->cfg.cur_stack++] = w;
return KEEP_EXPLORING;
} else if ((insn_state[w] & 0xF0) == DISCOVERED) {
- if (loop_ok && env->bpf_capable)
+ if (env->bpf_capable)
return DONE_EXPLORING;
verbose_linfo(env, t, "%d: ", t);
verbose_linfo(env, w, "%d: ", w);
@@ -9945,101 +17645,659 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
/* forward- or cross-edge */
insn_state[t] = DISCOVERED | e;
} else {
- verbose(env, "insn state internal bug\n");
+ verifier_bug(env, "insn state internal bug");
return -EFAULT;
}
return DONE_EXPLORING;
}
-static int visit_func_call_insn(int t, int insn_cnt,
- struct bpf_insn *insns,
+static int visit_func_call_insn(int t, struct bpf_insn *insns,
struct bpf_verifier_env *env,
bool visit_callee)
{
- int ret;
+ int ret, insn_sz;
+ int w;
- ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
+ insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1;
+ ret = push_insn(t, t + insn_sz, FALLTHROUGH, env);
if (ret)
return ret;
- if (t + 1 < insn_cnt)
- init_explored_state(env, t + 1);
+ mark_prune_point(env, t + insn_sz);
+ /* when we exit from subprog, we need to record non-linear history */
+ mark_jmp_point(env, t + insn_sz);
+
if (visit_callee) {
- init_explored_state(env, t);
- ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env,
- /* It's ok to allow recursion from CFG point of
- * view. __check_func_call() will do the actual
- * check.
- */
- bpf_pseudo_func(insns + t));
+ w = t + insns[t].imm + 1;
+ mark_prune_point(env, t);
+ merge_callee_effects(env, t, w);
+ ret = push_insn(t, w, BRANCH, env);
}
return ret;
}
+/* Bitmask with 1s for all caller saved registers */
+#define ALL_CALLER_SAVED_REGS ((1u << CALLER_SAVED_REGS) - 1)
+
+/* True if do_misc_fixups() replaces calls to helper number 'imm',
+ * replacement patch is presumed to follow bpf_fastcall contract
+ * (see mark_fastcall_pattern_for_call() below).
+ */
+static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm)
+{
+ switch (imm) {
+#ifdef CONFIG_X86_64
+ case BPF_FUNC_get_smp_processor_id:
+ return env->prog->jit_requested && bpf_jit_supports_percpu_insn();
+#endif
+ default:
+ return false;
+ }
+}
+
+struct call_summary {
+ u8 num_params;
+ bool is_void;
+ bool fastcall;
+};
+
+/* If @call is a kfunc or helper call, fills @cs and returns true,
+ * otherwise returns false.
+ */
+static bool get_call_summary(struct bpf_verifier_env *env, struct bpf_insn *call,
+ struct call_summary *cs)
+{
+ struct bpf_kfunc_call_arg_meta meta;
+ const struct bpf_func_proto *fn;
+ int i;
+
+ if (bpf_helper_call(call)) {
+
+ if (get_helper_proto(env, call->imm, &fn) < 0)
+ /* error would be reported later */
+ return false;
+ cs->fastcall = fn->allow_fastcall &&
+ (verifier_inlines_helper_call(env, call->imm) ||
+ bpf_jit_inlines_helper_call(call->imm));
+ cs->is_void = fn->ret_type == RET_VOID;
+ cs->num_params = 0;
+ for (i = 0; i < ARRAY_SIZE(fn->arg_type); ++i) {
+ if (fn->arg_type[i] == ARG_DONTCARE)
+ break;
+ cs->num_params++;
+ }
+ return true;
+ }
+
+ if (bpf_pseudo_kfunc_call(call)) {
+ int err;
+
+ err = fetch_kfunc_meta(env, call, &meta, NULL);
+ if (err < 0)
+ /* error would be reported later */
+ return false;
+ cs->num_params = btf_type_vlen(meta.func_proto);
+ cs->fastcall = meta.kfunc_flags & KF_FASTCALL;
+ cs->is_void = btf_type_is_void(btf_type_by_id(meta.btf, meta.func_proto->type));
+ return true;
+ }
+
+ return false;
+}
+
+/* LLVM define a bpf_fastcall function attribute.
+ * This attribute means that function scratches only some of
+ * the caller saved registers defined by ABI.
+ * For BPF the set of such registers could be defined as follows:
+ * - R0 is scratched only if function is non-void;
+ * - R1-R5 are scratched only if corresponding parameter type is defined
+ * in the function prototype.
+ *
+ * The contract between kernel and clang allows to simultaneously use
+ * such functions and maintain backwards compatibility with old
+ * kernels that don't understand bpf_fastcall calls:
+ *
+ * - for bpf_fastcall calls clang allocates registers as-if relevant r0-r5
+ * registers are not scratched by the call;
+ *
+ * - as a post-processing step, clang visits each bpf_fastcall call and adds
+ * spill/fill for every live r0-r5;
+ *
+ * - stack offsets used for the spill/fill are allocated as lowest
+ * stack offsets in whole function and are not used for any other
+ * purposes;
+ *
+ * - when kernel loads a program, it looks for such patterns
+ * (bpf_fastcall function surrounded by spills/fills) and checks if
+ * spill/fill stack offsets are used exclusively in fastcall patterns;
+ *
+ * - if so, and if verifier or current JIT inlines the call to the
+ * bpf_fastcall function (e.g. a helper call), kernel removes unnecessary
+ * spill/fill pairs;
+ *
+ * - when old kernel loads a program, presence of spill/fill pairs
+ * keeps BPF program valid, albeit slightly less efficient.
+ *
+ * For example:
+ *
+ * r1 = 1;
+ * r2 = 2;
+ * *(u64 *)(r10 - 8) = r1; r1 = 1;
+ * *(u64 *)(r10 - 16) = r2; r2 = 2;
+ * call %[to_be_inlined] --> call %[to_be_inlined]
+ * r2 = *(u64 *)(r10 - 16); r0 = r1;
+ * r1 = *(u64 *)(r10 - 8); r0 += r2;
+ * r0 = r1; exit;
+ * r0 += r2;
+ * exit;
+ *
+ * The purpose of mark_fastcall_pattern_for_call is to:
+ * - look for such patterns;
+ * - mark spill and fill instructions in env->insn_aux_data[*].fastcall_pattern;
+ * - mark set env->insn_aux_data[*].fastcall_spills_num for call instruction;
+ * - update env->subprog_info[*]->fastcall_stack_off to find an offset
+ * at which bpf_fastcall spill/fill stack slots start;
+ * - update env->subprog_info[*]->keep_fastcall_stack.
+ *
+ * The .fastcall_pattern and .fastcall_stack_off are used by
+ * check_fastcall_stack_contract() to check if every stack access to
+ * fastcall spill/fill stack slot originates from spill/fill
+ * instructions, members of fastcall patterns.
+ *
+ * If such condition holds true for a subprogram, fastcall patterns could
+ * be rewritten by remove_fastcall_spills_fills().
+ * Otherwise bpf_fastcall patterns are not changed in the subprogram
+ * (code, presumably, generated by an older clang version).
+ *
+ * For example, it is *not* safe to remove spill/fill below:
+ *
+ * r1 = 1;
+ * *(u64 *)(r10 - 8) = r1; r1 = 1;
+ * call %[to_be_inlined] --> call %[to_be_inlined]
+ * r1 = *(u64 *)(r10 - 8); r0 = *(u64 *)(r10 - 8); <---- wrong !!!
+ * r0 = *(u64 *)(r10 - 8); r0 += r1;
+ * r0 += r1; exit;
+ * exit;
+ */
+static void mark_fastcall_pattern_for_call(struct bpf_verifier_env *env,
+ struct bpf_subprog_info *subprog,
+ int insn_idx, s16 lowest_off)
+{
+ struct bpf_insn *insns = env->prog->insnsi, *stx, *ldx;
+ struct bpf_insn *call = &env->prog->insnsi[insn_idx];
+ u32 clobbered_regs_mask;
+ struct call_summary cs;
+ u32 expected_regs_mask;
+ s16 off;
+ int i;
+
+ if (!get_call_summary(env, call, &cs))
+ return;
+
+ /* A bitmask specifying which caller saved registers are clobbered
+ * by a call to a helper/kfunc *as if* this helper/kfunc follows
+ * bpf_fastcall contract:
+ * - includes R0 if function is non-void;
+ * - includes R1-R5 if corresponding parameter has is described
+ * in the function prototype.
+ */
+ clobbered_regs_mask = GENMASK(cs.num_params, cs.is_void ? 1 : 0);
+ /* e.g. if helper call clobbers r{0,1}, expect r{2,3,4,5} in the pattern */
+ expected_regs_mask = ~clobbered_regs_mask & ALL_CALLER_SAVED_REGS;
+
+ /* match pairs of form:
+ *
+ * *(u64 *)(r10 - Y) = rX (where Y % 8 == 0)
+ * ...
+ * call %[to_be_inlined]
+ * ...
+ * rX = *(u64 *)(r10 - Y)
+ */
+ for (i = 1, off = lowest_off; i <= ARRAY_SIZE(caller_saved); ++i, off += BPF_REG_SIZE) {
+ if (insn_idx - i < 0 || insn_idx + i >= env->prog->len)
+ break;
+ stx = &insns[insn_idx - i];
+ ldx = &insns[insn_idx + i];
+ /* must be a stack spill/fill pair */
+ if (stx->code != (BPF_STX | BPF_MEM | BPF_DW) ||
+ ldx->code != (BPF_LDX | BPF_MEM | BPF_DW) ||
+ stx->dst_reg != BPF_REG_10 ||
+ ldx->src_reg != BPF_REG_10)
+ break;
+ /* must be a spill/fill for the same reg */
+ if (stx->src_reg != ldx->dst_reg)
+ break;
+ /* must be one of the previously unseen registers */
+ if ((BIT(stx->src_reg) & expected_regs_mask) == 0)
+ break;
+ /* must be a spill/fill for the same expected offset,
+ * no need to check offset alignment, BPF_DW stack access
+ * is always 8-byte aligned.
+ */
+ if (stx->off != off || ldx->off != off)
+ break;
+ expected_regs_mask &= ~BIT(stx->src_reg);
+ env->insn_aux_data[insn_idx - i].fastcall_pattern = 1;
+ env->insn_aux_data[insn_idx + i].fastcall_pattern = 1;
+ }
+ if (i == 1)
+ return;
+
+ /* Conditionally set 'fastcall_spills_num' to allow forward
+ * compatibility when more helper functions are marked as
+ * bpf_fastcall at compile time than current kernel supports, e.g:
+ *
+ * 1: *(u64 *)(r10 - 8) = r1
+ * 2: call A ;; assume A is bpf_fastcall for current kernel
+ * 3: r1 = *(u64 *)(r10 - 8)
+ * 4: *(u64 *)(r10 - 8) = r1
+ * 5: call B ;; assume B is not bpf_fastcall for current kernel
+ * 6: r1 = *(u64 *)(r10 - 8)
+ *
+ * There is no need to block bpf_fastcall rewrite for such program.
+ * Set 'fastcall_pattern' for both calls to keep check_fastcall_stack_contract() happy,
+ * don't set 'fastcall_spills_num' for call B so that remove_fastcall_spills_fills()
+ * does not remove spill/fill pair {4,6}.
+ */
+ if (cs.fastcall)
+ env->insn_aux_data[insn_idx].fastcall_spills_num = i - 1;
+ else
+ subprog->keep_fastcall_stack = 1;
+ subprog->fastcall_stack_off = min(subprog->fastcall_stack_off, off);
+}
+
+static int mark_fastcall_patterns(struct bpf_verifier_env *env)
+{
+ struct bpf_subprog_info *subprog = env->subprog_info;
+ struct bpf_insn *insn;
+ s16 lowest_off;
+ int s, i;
+
+ for (s = 0; s < env->subprog_cnt; ++s, ++subprog) {
+ /* find lowest stack spill offset used in this subprog */
+ lowest_off = 0;
+ for (i = subprog->start; i < (subprog + 1)->start; ++i) {
+ insn = env->prog->insnsi + i;
+ if (insn->code != (BPF_STX | BPF_MEM | BPF_DW) ||
+ insn->dst_reg != BPF_REG_10)
+ continue;
+ lowest_off = min(lowest_off, insn->off);
+ }
+ /* use this offset to find fastcall patterns */
+ for (i = subprog->start; i < (subprog + 1)->start; ++i) {
+ insn = env->prog->insnsi + i;
+ if (insn->code != (BPF_JMP | BPF_CALL))
+ continue;
+ mark_fastcall_pattern_for_call(env, subprog, i, lowest_off);
+ }
+ }
+ return 0;
+}
+
+static struct bpf_iarray *iarray_realloc(struct bpf_iarray *old, size_t n_elem)
+{
+ size_t new_size = sizeof(struct bpf_iarray) + n_elem * sizeof(old->items[0]);
+ struct bpf_iarray *new;
+
+ new = kvrealloc(old, new_size, GFP_KERNEL_ACCOUNT);
+ if (!new) {
+ /* this is what callers always want, so simplify the call site */
+ kvfree(old);
+ return NULL;
+ }
+
+ new->cnt = n_elem;
+ return new;
+}
+
+static int copy_insn_array(struct bpf_map *map, u32 start, u32 end, u32 *items)
+{
+ struct bpf_insn_array_value *value;
+ u32 i;
+
+ for (i = start; i <= end; i++) {
+ value = map->ops->map_lookup_elem(map, &i);
+ /*
+ * map_lookup_elem of an array map will never return an error,
+ * but not checking it makes some static analysers to worry
+ */
+ if (IS_ERR(value))
+ return PTR_ERR(value);
+ else if (!value)
+ return -EINVAL;
+ items[i - start] = value->xlated_off;
+ }
+ return 0;
+}
+
+static int cmp_ptr_to_u32(const void *a, const void *b)
+{
+ return *(u32 *)a - *(u32 *)b;
+}
+
+static int sort_insn_array_uniq(u32 *items, int cnt)
+{
+ int unique = 1;
+ int i;
+
+ sort(items, cnt, sizeof(items[0]), cmp_ptr_to_u32, NULL);
+
+ for (i = 1; i < cnt; i++)
+ if (items[i] != items[unique - 1])
+ items[unique++] = items[i];
+
+ return unique;
+}
+
+/*
+ * sort_unique({map[start], ..., map[end]}) into off
+ */
+static int copy_insn_array_uniq(struct bpf_map *map, u32 start, u32 end, u32 *off)
+{
+ u32 n = end - start + 1;
+ int err;
+
+ err = copy_insn_array(map, start, end, off);
+ if (err)
+ return err;
+
+ return sort_insn_array_uniq(off, n);
+}
+
+/*
+ * Copy all unique offsets from the map
+ */
+static struct bpf_iarray *jt_from_map(struct bpf_map *map)
+{
+ struct bpf_iarray *jt;
+ int err;
+ int n;
+
+ jt = iarray_realloc(NULL, map->max_entries);
+ if (!jt)
+ return ERR_PTR(-ENOMEM);
+
+ n = copy_insn_array_uniq(map, 0, map->max_entries - 1, jt->items);
+ if (n < 0) {
+ err = n;
+ goto err_free;
+ }
+ if (n == 0) {
+ err = -EINVAL;
+ goto err_free;
+ }
+ jt->cnt = n;
+ return jt;
+
+err_free:
+ kvfree(jt);
+ return ERR_PTR(err);
+}
+
+/*
+ * Find and collect all maps which fit in the subprog. Return the result as one
+ * combined jump table in jt->items (allocated with kvcalloc)
+ */
+static struct bpf_iarray *jt_from_subprog(struct bpf_verifier_env *env,
+ int subprog_start, int subprog_end)
+{
+ struct bpf_iarray *jt = NULL;
+ struct bpf_map *map;
+ struct bpf_iarray *jt_cur;
+ int i;
+
+ for (i = 0; i < env->insn_array_map_cnt; i++) {
+ /*
+ * TODO (when needed): collect only jump tables, not static keys
+ * or maps for indirect calls
+ */
+ map = env->insn_array_maps[i];
+
+ jt_cur = jt_from_map(map);
+ if (IS_ERR(jt_cur)) {
+ kvfree(jt);
+ return jt_cur;
+ }
+
+ /*
+ * This is enough to check one element. The full table is
+ * checked to fit inside the subprog later in create_jt()
+ */
+ if (jt_cur->items[0] >= subprog_start && jt_cur->items[0] < subprog_end) {
+ u32 old_cnt = jt ? jt->cnt : 0;
+ jt = iarray_realloc(jt, old_cnt + jt_cur->cnt);
+ if (!jt) {
+ kvfree(jt_cur);
+ return ERR_PTR(-ENOMEM);
+ }
+ memcpy(jt->items + old_cnt, jt_cur->items, jt_cur->cnt << 2);
+ }
+
+ kvfree(jt_cur);
+ }
+
+ if (!jt) {
+ verbose(env, "no jump tables found for subprog starting at %u\n", subprog_start);
+ return ERR_PTR(-EINVAL);
+ }
+
+ jt->cnt = sort_insn_array_uniq(jt->items, jt->cnt);
+ return jt;
+}
+
+static struct bpf_iarray *
+create_jt(int t, struct bpf_verifier_env *env)
+{
+ static struct bpf_subprog_info *subprog;
+ int subprog_start, subprog_end;
+ struct bpf_iarray *jt;
+ int i;
+
+ subprog = bpf_find_containing_subprog(env, t);
+ subprog_start = subprog->start;
+ subprog_end = (subprog + 1)->start;
+ jt = jt_from_subprog(env, subprog_start, subprog_end);
+ if (IS_ERR(jt))
+ return jt;
+
+ /* Check that the every element of the jump table fits within the given subprogram */
+ for (i = 0; i < jt->cnt; i++) {
+ if (jt->items[i] < subprog_start || jt->items[i] >= subprog_end) {
+ verbose(env, "jump table for insn %d points outside of the subprog [%u,%u]\n",
+ t, subprog_start, subprog_end);
+ kvfree(jt);
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
+ return jt;
+}
+
+/* "conditional jump with N edges" */
+static int visit_gotox_insn(int t, struct bpf_verifier_env *env)
+{
+ int *insn_stack = env->cfg.insn_stack;
+ int *insn_state = env->cfg.insn_state;
+ bool keep_exploring = false;
+ struct bpf_iarray *jt;
+ int i, w;
+
+ jt = env->insn_aux_data[t].jt;
+ if (!jt) {
+ jt = create_jt(t, env);
+ if (IS_ERR(jt))
+ return PTR_ERR(jt);
+
+ env->insn_aux_data[t].jt = jt;
+ }
+
+ mark_prune_point(env, t);
+ for (i = 0; i < jt->cnt; i++) {
+ w = jt->items[i];
+ if (w < 0 || w >= env->prog->len) {
+ verbose(env, "indirect jump out of range from insn %d to %d\n", t, w);
+ return -EINVAL;
+ }
+
+ mark_jmp_point(env, w);
+
+ /* EXPLORED || DISCOVERED */
+ if (insn_state[w])
+ continue;
+
+ if (env->cfg.cur_stack >= env->prog->len)
+ return -E2BIG;
+
+ insn_stack[env->cfg.cur_stack++] = w;
+ insn_state[w] |= DISCOVERED;
+ keep_exploring = true;
+ }
+
+ return keep_exploring ? KEEP_EXPLORING : DONE_EXPLORING;
+}
+
+static int visit_tailcall_insn(struct bpf_verifier_env *env, int t)
+{
+ static struct bpf_subprog_info *subprog;
+ struct bpf_iarray *jt;
+
+ if (env->insn_aux_data[t].jt)
+ return 0;
+
+ jt = iarray_realloc(NULL, 2);
+ if (!jt)
+ return -ENOMEM;
+
+ subprog = bpf_find_containing_subprog(env, t);
+ jt->items[0] = t + 1;
+ jt->items[1] = subprog->exit_idx;
+ env->insn_aux_data[t].jt = jt;
+ return 0;
+}
+
/* Visits the instruction at index t and returns one of the following:
* < 0 - an error occurred
* DONE_EXPLORING - the instruction was fully explored
* KEEP_EXPLORING - there is still work to be done before it is fully explored
*/
-static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
+static int visit_insn(int t, struct bpf_verifier_env *env)
{
- struct bpf_insn *insns = env->prog->insnsi;
- int ret;
+ struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t];
+ int ret, off, insn_sz;
- if (bpf_pseudo_func(insns + t))
- return visit_func_call_insn(t, insn_cnt, insns, env, true);
+ if (bpf_pseudo_func(insn))
+ return visit_func_call_insn(t, insns, env, true);
/* All non-branch instructions have a single fall-through edge. */
- if (BPF_CLASS(insns[t].code) != BPF_JMP &&
- BPF_CLASS(insns[t].code) != BPF_JMP32)
- return push_insn(t, t + 1, FALLTHROUGH, env, false);
+ if (BPF_CLASS(insn->code) != BPF_JMP &&
+ BPF_CLASS(insn->code) != BPF_JMP32) {
+ insn_sz = bpf_is_ldimm64(insn) ? 2 : 1;
+ return push_insn(t, t + insn_sz, FALLTHROUGH, env);
+ }
- switch (BPF_OP(insns[t].code)) {
+ switch (BPF_OP(insn->code)) {
case BPF_EXIT:
return DONE_EXPLORING;
case BPF_CALL:
- if (insns[t].imm == BPF_FUNC_timer_set_callback)
- /* Mark this call insn to trigger is_state_visited() check
- * before call itself is processed by __check_func_call().
- * Otherwise new async state will be pushed for further
- * exploration.
+ if (is_async_callback_calling_insn(insn))
+ /* Mark this call insn as a prune point to trigger
+ * is_state_visited() check before call itself is
+ * processed by __check_func_call(). Otherwise new
+ * async state will be pushed for further exploration.
*/
- init_explored_state(env, t);
- return visit_func_call_insn(t, insn_cnt, insns, env,
- insns[t].src_reg == BPF_PSEUDO_CALL);
+ mark_prune_point(env, t);
+ /* For functions that invoke callbacks it is not known how many times
+ * callback would be called. Verifier models callback calling functions
+ * by repeatedly visiting callback bodies and returning to origin call
+ * instruction.
+ * In order to stop such iteration verifier needs to identify when a
+ * state identical some state from a previous iteration is reached.
+ * Check below forces creation of checkpoint before callback calling
+ * instruction to allow search for such identical states.
+ */
+ if (is_sync_callback_calling_insn(insn)) {
+ mark_calls_callback(env, t);
+ mark_force_checkpoint(env, t);
+ mark_prune_point(env, t);
+ mark_jmp_point(env, t);
+ }
+ if (bpf_helper_call(insn)) {
+ const struct bpf_func_proto *fp;
+
+ ret = get_helper_proto(env, insn->imm, &fp);
+ /* If called in a non-sleepable context program will be
+ * rejected anyway, so we should end up with precise
+ * sleepable marks on subprogs, except for dead code
+ * elimination.
+ */
+ if (ret == 0 && fp->might_sleep)
+ mark_subprog_might_sleep(env, t);
+ if (bpf_helper_changes_pkt_data(insn->imm))
+ mark_subprog_changes_pkt_data(env, t);
+ if (insn->imm == BPF_FUNC_tail_call)
+ visit_tailcall_insn(env, t);
+ } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+ struct bpf_kfunc_call_arg_meta meta;
+
+ ret = fetch_kfunc_meta(env, insn, &meta, NULL);
+ if (ret == 0 && is_iter_next_kfunc(&meta)) {
+ mark_prune_point(env, t);
+ /* Checking and saving state checkpoints at iter_next() call
+ * is crucial for fast convergence of open-coded iterator loop
+ * logic, so we need to force it. If we don't do that,
+ * is_state_visited() might skip saving a checkpoint, causing
+ * unnecessarily long sequence of not checkpointed
+ * instructions and jumps, leading to exhaustion of jump
+ * history buffer, and potentially other undesired outcomes.
+ * It is expected that with correct open-coded iterators
+ * convergence will happen quickly, so we don't run a risk of
+ * exhausting memory.
+ */
+ mark_force_checkpoint(env, t);
+ }
+ /* Same as helpers, if called in a non-sleepable context
+ * program will be rejected anyway, so we should end up
+ * with precise sleepable marks on subprogs, except for
+ * dead code elimination.
+ */
+ if (ret == 0 && is_kfunc_sleepable(&meta))
+ mark_subprog_might_sleep(env, t);
+ if (ret == 0 && is_kfunc_pkt_changing(&meta))
+ mark_subprog_changes_pkt_data(env, t);
+ }
+ return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL);
case BPF_JA:
- if (BPF_SRC(insns[t].code) != BPF_K)
- return -EINVAL;
+ if (BPF_SRC(insn->code) == BPF_X)
+ return visit_gotox_insn(t, env);
+
+ if (BPF_CLASS(insn->code) == BPF_JMP)
+ off = insn->off;
+ else
+ off = insn->imm;
/* unconditional jump with single edge */
- ret = push_insn(t, t + insns[t].off + 1, FALLTHROUGH, env,
- true);
+ ret = push_insn(t, t + off + 1, FALLTHROUGH, env);
if (ret)
return ret;
- /* unconditional jmp is not a good pruning point,
- * but it's marked, since backtracking needs
- * to record jmp history in is_state_visited().
- */
- init_explored_state(env, t + insns[t].off + 1);
- /* tell verifier to check for equivalent states
- * after every call and jump
- */
- if (t + 1 < insn_cnt)
- init_explored_state(env, t + 1);
+ mark_prune_point(env, t + off + 1);
+ mark_jmp_point(env, t + off + 1);
return ret;
default:
/* conditional jump with two edges */
- init_explored_state(env, t);
- ret = push_insn(t, t + 1, FALLTHROUGH, env, true);
+ mark_prune_point(env, t);
+ if (is_may_goto_insn(insn))
+ mark_force_checkpoint(env, t);
+
+ ret = push_insn(t, t + 1, FALLTHROUGH, env);
if (ret)
return ret;
- return push_insn(t, t + insns[t].off + 1, BRANCH, env, true);
+ return push_insn(t, t + insn->off + 1, BRANCH, env);
}
}
@@ -10050,27 +18308,31 @@ static int check_cfg(struct bpf_verifier_env *env)
{
int insn_cnt = env->prog->len;
int *insn_stack, *insn_state;
- int ret = 0;
- int i;
+ int ex_insn_beg, i, ret = 0;
- insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
+ insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
if (!insn_state)
return -ENOMEM;
- insn_stack = env->cfg.insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
+ insn_stack = env->cfg.insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
if (!insn_stack) {
kvfree(insn_state);
return -ENOMEM;
}
+ ex_insn_beg = env->exception_callback_subprog
+ ? env->subprog_info[env->exception_callback_subprog].start
+ : 0;
+
insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */
insn_stack[0] = 0; /* 0 is the first instruction */
env->cfg.cur_stack = 1;
+walk_cfg:
while (env->cfg.cur_stack > 0) {
int t = insn_stack[env->cfg.cur_stack - 1];
- ret = visit_insn(t, insn_cnt, env);
+ ret = visit_insn(t, env);
switch (ret) {
case DONE_EXPLORING:
insn_state[t] = EXPLORED;
@@ -10080,7 +18342,7 @@ static int check_cfg(struct bpf_verifier_env *env)
break;
default:
if (ret > 0) {
- verbose(env, "visit_insn internal bug\n");
+ verifier_bug(env, "visit_insn internal bug");
ret = -EFAULT;
}
goto err_free;
@@ -10088,19 +18350,38 @@ static int check_cfg(struct bpf_verifier_env *env)
}
if (env->cfg.cur_stack < 0) {
- verbose(env, "pop stack internal bug\n");
+ verifier_bug(env, "pop stack internal bug");
ret = -EFAULT;
goto err_free;
}
+ if (ex_insn_beg && insn_state[ex_insn_beg] != EXPLORED) {
+ insn_state[ex_insn_beg] = DISCOVERED;
+ insn_stack[0] = ex_insn_beg;
+ env->cfg.cur_stack = 1;
+ goto walk_cfg;
+ }
+
for (i = 0; i < insn_cnt; i++) {
+ struct bpf_insn *insn = &env->prog->insnsi[i];
+
if (insn_state[i] != EXPLORED) {
verbose(env, "unreachable insn %d\n", i);
ret = -EINVAL;
goto err_free;
}
+ if (bpf_is_ldimm64(insn)) {
+ if (insn_state[i + 1] != 0) {
+ verbose(env, "jump into the middle of ldimm64 insn %d\n", i);
+ ret = -EINVAL;
+ goto err_free;
+ }
+ i++; /* skip second half of ldimm64 */
+ }
}
ret = 0; /* cfg looks good */
+ env->prog->aux->changes_pkt_data = env->subprog_info[0].changes_pkt_data;
+ env->prog->aux->might_sleep = env->subprog_info[0].might_sleep;
err_free:
kvfree(insn_state);
@@ -10109,6 +18390,57 @@ err_free:
return ret;
}
+/*
+ * For each subprogram 'i' fill array env->cfg.insn_subprogram sub-range
+ * [env->subprog_info[i].postorder_start, env->subprog_info[i+1].postorder_start)
+ * with indices of 'i' instructions in postorder.
+ */
+static int compute_postorder(struct bpf_verifier_env *env)
+{
+ u32 cur_postorder, i, top, stack_sz, s;
+ int *stack = NULL, *postorder = NULL, *state = NULL;
+ struct bpf_iarray *succ;
+
+ postorder = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT);
+ state = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT);
+ stack = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT);
+ if (!postorder || !state || !stack) {
+ kvfree(postorder);
+ kvfree(state);
+ kvfree(stack);
+ return -ENOMEM;
+ }
+ cur_postorder = 0;
+ for (i = 0; i < env->subprog_cnt; i++) {
+ env->subprog_info[i].postorder_start = cur_postorder;
+ stack[0] = env->subprog_info[i].start;
+ stack_sz = 1;
+ do {
+ top = stack[stack_sz - 1];
+ state[top] |= DISCOVERED;
+ if (state[top] & EXPLORED) {
+ postorder[cur_postorder++] = top;
+ stack_sz--;
+ continue;
+ }
+ succ = bpf_insn_successors(env, top);
+ for (s = 0; s < succ->cnt; ++s) {
+ if (!state[succ->items[s]]) {
+ stack[stack_sz++] = succ->items[s];
+ state[succ->items[s]] |= DISCOVERED;
+ }
+ }
+ state[top] |= EXPLORED;
+ } while (stack_sz);
+ }
+ env->subprog_info[i].postorder_start = cur_postorder;
+ env->cfg.insn_postorder = postorder;
+ env->cfg.cur_postorder = cur_postorder;
+ kvfree(stack);
+ kvfree(state);
+ return 0;
+}
+
static int check_abnormal_return(struct bpf_verifier_env *env)
{
int i;
@@ -10130,20 +18462,18 @@ static int check_abnormal_return(struct bpf_verifier_env *env)
#define MIN_BPF_FUNCINFO_SIZE 8
#define MAX_FUNCINFO_REC_SIZE 252
-static int check_btf_func(struct bpf_verifier_env *env,
- const union bpf_attr *attr,
- bpfptr_t uattr)
+static int check_btf_func_early(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ bpfptr_t uattr)
{
- const struct btf_type *type, *func_proto, *ret_type;
- u32 i, nfuncs, urec_size, min_size;
u32 krec_size = sizeof(struct bpf_func_info);
+ const struct btf_type *type, *func_proto;
+ u32 i, nfuncs, urec_size, min_size;
struct bpf_func_info *krecord;
- struct bpf_func_info_aux *info_aux = NULL;
struct bpf_prog *prog;
const struct btf *btf;
- bpfptr_t urecord;
u32 prev_offset = 0;
- bool scalar_return;
+ bpfptr_t urecord;
int ret = -ENOMEM;
nfuncs = attr->func_info_cnt;
@@ -10153,11 +18483,6 @@ static int check_btf_func(struct bpf_verifier_env *env,
return 0;
}
- if (nfuncs != env->subprog_cnt) {
- verbose(env, "number of funcs in func_info doesn't match number of subprogs\n");
- return -EINVAL;
- }
-
urec_size = attr->func_info_rec_size;
if (urec_size < MIN_BPF_FUNCINFO_SIZE ||
urec_size > MAX_FUNCINFO_REC_SIZE ||
@@ -10172,12 +18497,9 @@ static int check_btf_func(struct bpf_verifier_env *env,
urecord = make_bpfptr(attr->func_info, uattr.is_kernel);
min_size = min_t(u32, krec_size, urec_size);
- krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN);
+ krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
if (!krecord)
return -ENOMEM;
- info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL | __GFP_NOWARN);
- if (!info_aux)
- goto err_free;
for (i = 0; i < nfuncs; i++) {
ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size);
@@ -10216,11 +18538,6 @@ static int check_btf_func(struct bpf_verifier_env *env,
goto err_free;
}
- if (env->subprog_info[i].start != krecord[i].insn_off) {
- verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n");
- goto err_free;
- }
-
/* check type_id */
type = btf_type_by_id(btf, krecord[i].type_id);
if (!type || !btf_type_is_func(type)) {
@@ -10228,15 +18545,80 @@ static int check_btf_func(struct bpf_verifier_env *env,
krecord[i].type_id);
goto err_free;
}
- info_aux[i].linkage = BTF_INFO_VLEN(type->info);
func_proto = btf_type_by_id(btf, type->type);
if (unlikely(!func_proto || !btf_type_is_func_proto(func_proto)))
/* btf_func_check() already verified it during BTF load */
goto err_free;
+
+ prev_offset = krecord[i].insn_off;
+ bpfptr_add(&urecord, urec_size);
+ }
+
+ prog->aux->func_info = krecord;
+ prog->aux->func_info_cnt = nfuncs;
+ return 0;
+
+err_free:
+ kvfree(krecord);
+ return ret;
+}
+
+static int check_btf_func(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ bpfptr_t uattr)
+{
+ const struct btf_type *type, *func_proto, *ret_type;
+ u32 i, nfuncs, urec_size;
+ struct bpf_func_info *krecord;
+ struct bpf_func_info_aux *info_aux = NULL;
+ struct bpf_prog *prog;
+ const struct btf *btf;
+ bpfptr_t urecord;
+ bool scalar_return;
+ int ret = -ENOMEM;
+
+ nfuncs = attr->func_info_cnt;
+ if (!nfuncs) {
+ if (check_abnormal_return(env))
+ return -EINVAL;
+ return 0;
+ }
+ if (nfuncs != env->subprog_cnt) {
+ verbose(env, "number of funcs in func_info doesn't match number of subprogs\n");
+ return -EINVAL;
+ }
+
+ urec_size = attr->func_info_rec_size;
+
+ prog = env->prog;
+ btf = prog->aux->btf;
+
+ urecord = make_bpfptr(attr->func_info, uattr.is_kernel);
+
+ krecord = prog->aux->func_info;
+ info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+ if (!info_aux)
+ return -ENOMEM;
+
+ for (i = 0; i < nfuncs; i++) {
+ /* check insn_off */
+ ret = -EINVAL;
+
+ if (env->subprog_info[i].start != krecord[i].insn_off) {
+ verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n");
+ goto err_free;
+ }
+
+ /* Already checked type_id */
+ type = btf_type_by_id(btf, krecord[i].type_id);
+ info_aux[i].linkage = BTF_INFO_VLEN(type->info);
+ /* Already checked func_proto */
+ func_proto = btf_type_by_id(btf, type->type);
+
ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL);
scalar_return =
- btf_type_is_small_int(ret_type) || btf_type_is_enum(ret_type);
+ btf_type_is_small_int(ret_type) || btf_is_any_enum(ret_type);
if (i && !scalar_return && env->subprog_info[i].has_ld_abs) {
verbose(env, "LD_ABS is only allowed in functions that return 'int'.\n");
goto err_free;
@@ -10246,17 +18628,13 @@ static int check_btf_func(struct bpf_verifier_env *env,
goto err_free;
}
- prev_offset = krecord[i].insn_off;
bpfptr_add(&urecord, urec_size);
}
- prog->aux->func_info = krecord;
- prog->aux->func_info_cnt = nfuncs;
prog->aux->func_info_aux = info_aux;
return 0;
err_free:
- kvfree(krecord);
kfree(info_aux);
return ret;
}
@@ -10269,12 +18647,12 @@ static void adjust_btf_func(struct bpf_verifier_env *env)
if (!aux->func_info)
return;
- for (i = 0; i < env->subprog_cnt; i++)
+ /* func_info is not available for hidden subprogs */
+ for (i = 0; i < env->subprog_cnt - env->hidden_subprog_cnt; i++)
aux->func_info[i].insn_off = env->subprog_info[i].start;
}
-#define MIN_BPF_LINEINFO_SIZE (offsetof(struct bpf_line_info, line_col) + \
- sizeof(((struct bpf_line_info *)(0))->line_col))
+#define MIN_BPF_LINEINFO_SIZE offsetofend(struct bpf_line_info, line_col)
#define MAX_LINEINFO_REC_SIZE MAX_FUNCINFO_REC_SIZE
static int check_btf_line(struct bpf_verifier_env *env,
@@ -10305,7 +18683,7 @@ static int check_btf_line(struct bpf_verifier_env *env,
* pass in a smaller bpf_line_info object.
*/
linfo = kvcalloc(nr_linfo, sizeof(struct bpf_line_info),
- GFP_KERNEL | __GFP_NOWARN);
+ GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
if (!linfo)
return -ENOMEM;
@@ -10474,9 +18852,9 @@ static int check_core_relo(struct bpf_verifier_env *env,
return err;
}
-static int check_btf_info(struct bpf_verifier_env *env,
- const union bpf_attr *attr,
- bpfptr_t uattr)
+static int check_btf_info_early(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ bpfptr_t uattr)
{
struct btf *btf;
int err;
@@ -10496,6 +18874,24 @@ static int check_btf_info(struct bpf_verifier_env *env,
}
env->prog->aux->btf = btf;
+ err = check_btf_func_early(env, attr, uattr);
+ if (err)
+ return err;
+ return 0;
+}
+
+static int check_btf_info(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ bpfptr_t uattr)
+{
+ int err;
+
+ if (!attr->func_info_cnt && !attr->line_info_cnt) {
+ if (check_abnormal_return(env))
+ return -EINVAL;
+ return 0;
+ }
+
err = check_btf_func(env, attr, uattr);
if (err)
return err;
@@ -10512,8 +18908,8 @@ static int check_btf_info(struct bpf_verifier_env *env,
}
/* check %cur's range satisfies %old's */
-static bool range_within(struct bpf_reg_state *old,
- struct bpf_reg_state *cur)
+static bool range_within(const struct bpf_reg_state *old,
+ const struct bpf_reg_state *cur)
{
return old->umin_value <= cur->umin_value &&
old->umax_value >= cur->umax_value &&
@@ -10535,36 +18931,57 @@ static bool range_within(struct bpf_reg_state *old,
* So we look through our idmap to see if this old id has been seen before. If
* so, we require the new id to match; otherwise, we add the id pair to the map.
*/
-static bool check_ids(u32 old_id, u32 cur_id, struct bpf_id_pair *idmap)
+static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
{
+ struct bpf_id_pair *map = idmap->map;
unsigned int i;
+ /* either both IDs should be set or both should be zero */
+ if (!!old_id != !!cur_id)
+ return false;
+
+ if (old_id == 0) /* cur_id == 0 as well */
+ return true;
+
for (i = 0; i < BPF_ID_MAP_SIZE; i++) {
- if (!idmap[i].old) {
+ if (!map[i].old) {
/* Reached an empty slot; haven't seen this id before */
- idmap[i].old = old_id;
- idmap[i].cur = cur_id;
+ map[i].old = old_id;
+ map[i].cur = cur_id;
return true;
}
- if (idmap[i].old == old_id)
- return idmap[i].cur == cur_id;
+ if (map[i].old == old_id)
+ return map[i].cur == cur_id;
+ if (map[i].cur == cur_id)
+ return false;
}
/* We ran out of idmap slots, which should be impossible */
WARN_ON_ONCE(1);
return false;
}
+/* Similar to check_ids(), but allocate a unique temporary ID
+ * for 'old_id' or 'cur_id' of zero.
+ * This makes pairs like '0 vs unique ID', 'unique ID vs 0' valid.
+ */
+static bool check_scalar_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
+{
+ old_id = old_id ? old_id : ++idmap->tmp_id_gen;
+ cur_id = cur_id ? cur_id : ++idmap->tmp_id_gen;
+
+ return check_ids(old_id, cur_id, idmap);
+}
+
static void clean_func_state(struct bpf_verifier_env *env,
- struct bpf_func_state *st)
+ struct bpf_func_state *st,
+ u32 ip)
{
- enum bpf_reg_liveness live;
+ u16 live_regs = env->insn_aux_data[ip].live_regs_before;
int i, j;
for (i = 0; i < BPF_REG_FP; i++) {
- live = st->regs[i].live;
/* liveness must not touch this register anymore */
- st->regs[i].live |= REG_LIVE_DONE;
- if (!(live & REG_LIVE_READ))
+ if (!(live_regs & BIT(i)))
/* since the register is unused, clear its state
* to make further comparison simpler
*/
@@ -10572,10 +18989,7 @@ static void clean_func_state(struct bpf_verifier_env *env,
}
for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) {
- live = st->stack[i].spilled_ptr.live;
- /* liveness must not touch this stack slot anymore */
- st->stack[i].spilled_ptr.live |= REG_LIVE_DONE;
- if (!(live & REG_LIVE_READ)) {
+ if (!bpf_stack_slot_alive(env, st->frameno, i)) {
__mark_reg_not_init(env, &st->stack[i].spilled_ptr);
for (j = 0; j < BPF_REG_SIZE; j++)
st->stack[i].slot_type[j] = STACK_INVALID;
@@ -10586,43 +19000,41 @@ static void clean_func_state(struct bpf_verifier_env *env,
static void clean_verifier_state(struct bpf_verifier_env *env,
struct bpf_verifier_state *st)
{
- int i;
-
- if (st->frame[0]->regs[0].live & REG_LIVE_DONE)
- /* all regs in this state in all frames were already marked */
- return;
+ int i, ip;
- for (i = 0; i <= st->curframe; i++)
- clean_func_state(env, st->frame[i]);
+ bpf_live_stack_query_init(env, st);
+ st->cleaned = true;
+ for (i = 0; i <= st->curframe; i++) {
+ ip = frame_insn_idx(st, i);
+ clean_func_state(env, st->frame[i], ip);
+ }
}
/* the parentage chains form a tree.
* the verifier states are added to state lists at given insn and
* pushed into state stack for future exploration.
- * when the verifier reaches bpf_exit insn some of the verifer states
+ * when the verifier reaches bpf_exit insn some of the verifier states
* stored in the state lists have their final liveness state already,
* but a lot of states will get revised from liveness point of view when
* the verifier explores other branches.
* Example:
- * 1: r0 = 1
+ * 1: *(u64)(r10 - 8) = 1
* 2: if r1 == 100 goto pc+1
- * 3: r0 = 2
- * 4: exit
- * when the verifier reaches exit insn the register r0 in the state list of
- * insn 2 will be seen as !REG_LIVE_READ. Then the verifier pops the other_branch
- * of insn 2 and goes exploring further. At the insn 4 it will walk the
- * parentage chain from insn 4 into insn 2 and will mark r0 as REG_LIVE_READ.
+ * 3: *(u64)(r10 - 8) = 2
+ * 4: r0 = *(u64)(r10 - 8)
+ * 5: exit
+ * when the verifier reaches exit insn the stack slot -8 in the state list of
+ * insn 2 is not yet marked alive. Then the verifier pops the other_branch
+ * of insn 2 and goes exploring further. After the insn 4 read, liveness
+ * analysis would propagate read mark for -8 at insn 2.
*
* Since the verifier pushes the branch states as it sees them while exploring
* the program the condition of walking the branch instruction for the second
* time means that all states below this branch were already explored and
* their final liveness marks are already propagated.
* Hence when the verifier completes the search of state list in is_state_visited()
- * we can call this clean_live_states() function to mark all liveness states
- * as REG_LIVE_DONE to indicate that 'parent' pointers of 'struct bpf_reg_state'
- * will not be used.
- * This function also clears the registers and stack for states that !READ
- * to simplify state merging.
+ * we can call this clean_live_states() function to clear dead the registers and stack
+ * slots to simplify state merging.
*
* Important note here that walking the same branch instruction in the callee
* doesn't meant that the states are DONE. The verifier has to compare
@@ -10632,103 +19044,135 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn,
struct bpf_verifier_state *cur)
{
struct bpf_verifier_state_list *sl;
- int i;
+ struct list_head *pos, *head;
- sl = *explored_state(env, insn);
- while (sl) {
+ head = explored_state(env, insn);
+ list_for_each(pos, head) {
+ sl = container_of(pos, struct bpf_verifier_state_list, node);
if (sl->state.branches)
- goto next;
+ continue;
if (sl->state.insn_idx != insn ||
- sl->state.curframe != cur->curframe)
- goto next;
- for (i = 0; i <= cur->curframe; i++)
- if (sl->state.frame[i]->callsite != cur->frame[i]->callsite)
- goto next;
+ !same_callsites(&sl->state, cur))
+ continue;
+ if (sl->state.cleaned)
+ /* all regs in this state in all frames were already marked */
+ continue;
+ if (incomplete_read_marks(env, &sl->state))
+ continue;
clean_verifier_state(env, &sl->state);
-next:
- sl = sl->next;
}
}
-/* Returns true if (rold safe implies rcur safe) */
-static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
- struct bpf_reg_state *rcur, struct bpf_id_pair *idmap)
+static bool regs_exact(const struct bpf_reg_state *rold,
+ const struct bpf_reg_state *rcur,
+ struct bpf_idmap *idmap)
{
- bool equal;
-
- if (!(rold->live & REG_LIVE_READ))
- /* explored state didn't use this */
- return true;
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
+ check_ids(rold->id, rcur->id, idmap) &&
+ check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
+}
- equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, parent)) == 0;
+enum exact_level {
+ NOT_EXACT,
+ EXACT,
+ RANGE_WITHIN
+};
- if (rold->type == PTR_TO_STACK)
- /* two stack pointers are equal only if they're pointing to
- * the same stack frame, since fp-8 in foo != fp-8 in bar
- */
- return equal && rold->frameno == rcur->frameno;
+/* Returns true if (rold safe implies rcur safe) */
+static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
+ struct bpf_reg_state *rcur, struct bpf_idmap *idmap,
+ enum exact_level exact)
+{
+ if (exact == EXACT)
+ return regs_exact(rold, rcur, idmap);
- if (equal)
- return true;
+ if (rold->type == NOT_INIT) {
+ if (exact == NOT_EXACT || rcur->type == NOT_INIT)
+ /* explored state can't have used this */
+ return true;
+ }
- if (rold->type == NOT_INIT)
- /* explored state can't have used this */
- return true;
- if (rcur->type == NOT_INIT)
+ /* Enforce that register types have to match exactly, including their
+ * modifiers (like PTR_MAYBE_NULL, MEM_RDONLY, etc), as a general
+ * rule.
+ *
+ * One can make a point that using a pointer register as unbounded
+ * SCALAR would be technically acceptable, but this could lead to
+ * pointer leaks because scalars are allowed to leak while pointers
+ * are not. We could make this safe in special cases if root is
+ * calling us, but it's probably not worth the hassle.
+ *
+ * Also, register types that are *not* MAYBE_NULL could technically be
+ * safe to use as their MAYBE_NULL variants (e.g., PTR_TO_MAP_VALUE
+ * is safe to be used as PTR_TO_MAP_VALUE_OR_NULL, provided both point
+ * to the same map).
+ * However, if the old MAYBE_NULL register then got NULL checked,
+ * doing so could have affected others with the same id, and we can't
+ * check for that because we lost the id when we converted to
+ * a non-MAYBE_NULL variant.
+ * So, as a general rule we don't allow mixing MAYBE_NULL and
+ * non-MAYBE_NULL registers as well.
+ */
+ if (rold->type != rcur->type)
return false;
+
switch (base_type(rold->type)) {
case SCALAR_VALUE:
- if (env->explore_alu_limits)
- return false;
- if (rcur->type == SCALAR_VALUE) {
- if (!rold->precise && !rcur->precise)
- return true;
- /* new val must satisfy old val knowledge */
- return range_within(rold, rcur) &&
- tnum_in(rold->var_off, rcur->var_off);
- } else {
- /* We're trying to use a pointer in place of a scalar.
- * Even if the scalar was unbounded, this could lead to
- * pointer leaks because scalars are allowed to leak
- * while pointers are not. We could make this safe in
- * special cases if root is calling us, but it's
- * probably not worth the hassle.
+ if (env->explore_alu_limits) {
+ /* explore_alu_limits disables tnum_in() and range_within()
+ * logic and requires everything to be strict
*/
- return false;
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
+ check_scalar_ids(rold->id, rcur->id, idmap);
}
+ if (!rold->precise && exact == NOT_EXACT)
+ return true;
+ if ((rold->id & BPF_ADD_CONST) != (rcur->id & BPF_ADD_CONST))
+ return false;
+ if ((rold->id & BPF_ADD_CONST) && (rold->off != rcur->off))
+ return false;
+ /* Why check_ids() for scalar registers?
+ *
+ * Consider the following BPF code:
+ * 1: r6 = ... unbound scalar, ID=a ...
+ * 2: r7 = ... unbound scalar, ID=b ...
+ * 3: if (r6 > r7) goto +1
+ * 4: r6 = r7
+ * 5: if (r6 > X) goto ...
+ * 6: ... memory operation using r7 ...
+ *
+ * First verification path is [1-6]:
+ * - at (4) same bpf_reg_state::id (b) would be assigned to r6 and r7;
+ * - at (5) r6 would be marked <= X, sync_linked_regs() would also mark
+ * r7 <= X, because r6 and r7 share same id.
+ * Next verification path is [1-4, 6].
+ *
+ * Instruction (6) would be reached in two states:
+ * I. r6{.id=b}, r7{.id=b} via path 1-6;
+ * II. r6{.id=a}, r7{.id=b} via path 1-4, 6.
+ *
+ * Use check_ids() to distinguish these states.
+ * ---
+ * Also verify that new value satisfies old value range knowledge.
+ */
+ return range_within(rold, rcur) &&
+ tnum_in(rold->var_off, rcur->var_off) &&
+ check_scalar_ids(rold->id, rcur->id, idmap);
case PTR_TO_MAP_KEY:
case PTR_TO_MAP_VALUE:
- /* a PTR_TO_MAP_VALUE could be safe to use as a
- * PTR_TO_MAP_VALUE_OR_NULL into the same map.
- * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL-
- * checked, doing so could have affected others with the same
- * id, and we can't check for that because we lost the id when
- * we converted to a PTR_TO_MAP_VALUE.
- */
- if (type_may_be_null(rold->type)) {
- if (!type_may_be_null(rcur->type))
- return false;
- if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)))
- return false;
- /* Check our ids match any regs they're supposed to */
- return check_ids(rold->id, rcur->id, idmap);
- }
-
+ case PTR_TO_MEM:
+ case PTR_TO_BUF:
+ case PTR_TO_TP_BUFFER:
/* If the new min/max/var_off satisfy the old ones and
* everything else matches, we are OK.
- * 'id' is not compared, since it's only used for maps with
- * bpf_spin_lock inside map element and in such cases if
- * the rest of the prog is valid for one map element then
- * it's valid for all map elements regardless of the key
- * used in bpf_map_lookup()
*/
- return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 &&
range_within(rold, rcur) &&
- tnum_in(rold->var_off, rcur->var_off);
+ tnum_in(rold->var_off, rcur->var_off) &&
+ check_ids(rold->id, rcur->id, idmap) &&
+ check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
case PTR_TO_PACKET_META:
case PTR_TO_PACKET:
- if (rcur->type != rold->type)
- return false;
/* We must have at least as much range as the old ptr
* did, so that any accesses which were safe before are
* still safe. This is true even if old range < old off,
@@ -10743,34 +19187,66 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
if (rold->off != rcur->off)
return false;
/* id relations must be preserved */
- if (rold->id && !check_ids(rold->id, rcur->id, idmap))
+ if (!check_ids(rold->id, rcur->id, idmap))
return false;
/* new val must satisfy old val knowledge */
return range_within(rold, rcur) &&
tnum_in(rold->var_off, rcur->var_off);
- case PTR_TO_CTX:
- case CONST_PTR_TO_MAP:
- case PTR_TO_PACKET_END:
- case PTR_TO_FLOW_KEYS:
- case PTR_TO_SOCKET:
- case PTR_TO_SOCK_COMMON:
- case PTR_TO_TCP_SOCK:
- case PTR_TO_XDP_SOCK:
- /* Only valid matches are exact, which memcmp() above
- * would have accepted
+ case PTR_TO_STACK:
+ /* two stack pointers are equal only if they're pointing to
+ * the same stack frame, since fp-8 in foo != fp-8 in bar
*/
+ return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno;
+ case PTR_TO_ARENA:
+ return true;
+ case PTR_TO_INSN:
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 &&
+ rold->off == rcur->off && range_within(rold, rcur) &&
+ tnum_in(rold->var_off, rcur->var_off);
default:
- /* Don't know what's going on, just say it's not safe */
+ return regs_exact(rold, rcur, idmap);
+ }
+}
+
+static struct bpf_reg_state unbound_reg;
+
+static __init int unbound_reg_init(void)
+{
+ __mark_reg_unknown_imprecise(&unbound_reg);
+ return 0;
+}
+late_initcall(unbound_reg_init);
+
+static bool is_stack_all_misc(struct bpf_verifier_env *env,
+ struct bpf_stack_state *stack)
+{
+ u32 i;
+
+ for (i = 0; i < ARRAY_SIZE(stack->slot_type); ++i) {
+ if ((stack->slot_type[i] == STACK_MISC) ||
+ (stack->slot_type[i] == STACK_INVALID && env->allow_uninit_stack))
+ continue;
return false;
}
- /* Shouldn't get here; if we do, say it's not safe */
- WARN_ON_ONCE(1);
- return false;
+ return true;
+}
+
+static struct bpf_reg_state *scalar_reg_for_stack(struct bpf_verifier_env *env,
+ struct bpf_stack_state *stack)
+{
+ if (is_spilled_scalar_reg64(stack))
+ return &stack->spilled_ptr;
+
+ if (is_stack_all_misc(env, stack))
+ return &unbound_reg;
+
+ return NULL;
}
static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
- struct bpf_func_state *cur, struct bpf_id_pair *idmap)
+ struct bpf_func_state *cur, struct bpf_idmap *idmap,
+ enum exact_level exact)
{
int i, spi;
@@ -10779,23 +19255,43 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
* didn't use them
*/
for (i = 0; i < old->allocated_stack; i++) {
+ struct bpf_reg_state *old_reg, *cur_reg;
+
spi = i / BPF_REG_SIZE;
- if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) {
- i += BPF_REG_SIZE - 1;
- /* explored state didn't use this */
- continue;
- }
+ if (exact != NOT_EXACT &&
+ (i >= cur->allocated_stack ||
+ old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
+ cur->stack[spi].slot_type[i % BPF_REG_SIZE]))
+ return false;
if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
continue;
+ if (env->allow_uninit_stack &&
+ old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC)
+ continue;
+
/* explored stack has more populated slots than current stack
* and these slots were used
*/
if (i >= cur->allocated_stack)
return false;
+ /* 64-bit scalar spill vs all slots MISC and vice versa.
+ * Load from all slots MISC produces unbound scalar.
+ * Construct a fake register for such stack and call
+ * regsafe() to ensure scalar ids are compared.
+ */
+ old_reg = scalar_reg_for_stack(env, &old->stack[spi]);
+ cur_reg = scalar_reg_for_stack(env, &cur->stack[spi]);
+ if (old_reg && cur_reg) {
+ if (!regsafe(env, old_reg, cur_reg, idmap, exact))
+ return false;
+ i += BPF_REG_SIZE - 1;
+ continue;
+ }
+
/* if old state was safe with misc data in the stack
* it will be safe with zero-initialized stack.
* The opposite is not true
@@ -10813,10 +19309,9 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
return false;
if (i % BPF_REG_SIZE != BPF_REG_SIZE - 1)
continue;
- if (!is_spilled_reg(&old->stack[spi]))
- continue;
- if (!regsafe(env, &old->stack[spi].spilled_ptr,
- &cur->stack[spi].spilled_ptr, idmap))
+ /* Both old and cur are having same slot_type */
+ switch (old->stack[spi].slot_type[BPF_REG_SIZE - 1]) {
+ case STACK_SPILL:
/* when explored and current stack slot are both storing
* spilled registers, check that stored pointers types
* are the same as well.
@@ -10827,17 +19322,98 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
* such verifier states are not equivalent.
* return false to continue verification of this path
*/
+ if (!regsafe(env, &old->stack[spi].spilled_ptr,
+ &cur->stack[spi].spilled_ptr, idmap, exact))
+ return false;
+ break;
+ case STACK_DYNPTR:
+ old_reg = &old->stack[spi].spilled_ptr;
+ cur_reg = &cur->stack[spi].spilled_ptr;
+ if (old_reg->dynptr.type != cur_reg->dynptr.type ||
+ old_reg->dynptr.first_slot != cur_reg->dynptr.first_slot ||
+ !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
+ return false;
+ break;
+ case STACK_ITER:
+ old_reg = &old->stack[spi].spilled_ptr;
+ cur_reg = &cur->stack[spi].spilled_ptr;
+ /* iter.depth is not compared between states as it
+ * doesn't matter for correctness and would otherwise
+ * prevent convergence; we maintain it only to prevent
+ * infinite loop check triggering, see
+ * iter_active_depths_differ()
+ */
+ if (old_reg->iter.btf != cur_reg->iter.btf ||
+ old_reg->iter.btf_id != cur_reg->iter.btf_id ||
+ old_reg->iter.state != cur_reg->iter.state ||
+ /* ignore {old_reg,cur_reg}->iter.depth, see above */
+ !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
+ return false;
+ break;
+ case STACK_IRQ_FLAG:
+ old_reg = &old->stack[spi].spilled_ptr;
+ cur_reg = &cur->stack[spi].spilled_ptr;
+ if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap) ||
+ old_reg->irq.kfunc_class != cur_reg->irq.kfunc_class)
+ return false;
+ break;
+ case STACK_MISC:
+ case STACK_ZERO:
+ case STACK_INVALID:
+ continue;
+ /* Ensure that new unhandled slot types return false by default */
+ default:
return false;
+ }
}
return true;
}
-static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur)
+static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *cur,
+ struct bpf_idmap *idmap)
{
+ int i;
+
if (old->acquired_refs != cur->acquired_refs)
return false;
- return !memcmp(old->refs, cur->refs,
- sizeof(*old->refs) * old->acquired_refs);
+
+ if (old->active_locks != cur->active_locks)
+ return false;
+
+ if (old->active_preempt_locks != cur->active_preempt_locks)
+ return false;
+
+ if (old->active_rcu_locks != cur->active_rcu_locks)
+ return false;
+
+ if (!check_ids(old->active_irq_id, cur->active_irq_id, idmap))
+ return false;
+
+ if (!check_ids(old->active_lock_id, cur->active_lock_id, idmap) ||
+ old->active_lock_ptr != cur->active_lock_ptr)
+ return false;
+
+ for (i = 0; i < old->acquired_refs; i++) {
+ if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap) ||
+ old->refs[i].type != cur->refs[i].type)
+ return false;
+ switch (old->refs[i].type) {
+ case REF_TYPE_PTR:
+ case REF_TYPE_IRQ:
+ break;
+ case REF_TYPE_LOCK:
+ case REF_TYPE_RES_LOCK:
+ case REF_TYPE_RES_LOCK_IRQ:
+ if (old->refs[i].ptr != cur->refs[i].ptr)
+ return false;
+ break;
+ default:
+ WARN_ONCE(1, "Unhandled enum type for reference state: %d\n", old->refs[i].type);
+ return false;
+ }
+ }
+
+ return true;
}
/* compare two verifier states
@@ -10867,172 +19443,163 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur)
* the current state will reach 'bpf_exit' instruction safely
*/
static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old,
- struct bpf_func_state *cur)
+ struct bpf_func_state *cur, u32 insn_idx, enum exact_level exact)
{
- int i;
+ u16 live_regs = env->insn_aux_data[insn_idx].live_regs_before;
+ u16 i;
+
+ if (old->callback_depth > cur->callback_depth)
+ return false;
- memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch));
for (i = 0; i < MAX_BPF_REG; i++)
- if (!regsafe(env, &old->regs[i], &cur->regs[i],
- env->idmap_scratch))
+ if (((1 << i) & live_regs) &&
+ !regsafe(env, &old->regs[i], &cur->regs[i],
+ &env->idmap_scratch, exact))
return false;
- if (!stacksafe(env, old, cur, env->idmap_scratch))
- return false;
-
- if (!refsafe(old, cur))
+ if (!stacksafe(env, old, cur, &env->idmap_scratch, exact))
return false;
return true;
}
+static void reset_idmap_scratch(struct bpf_verifier_env *env)
+{
+ env->idmap_scratch.tmp_id_gen = env->id_gen;
+ memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map));
+}
+
static bool states_equal(struct bpf_verifier_env *env,
struct bpf_verifier_state *old,
- struct bpf_verifier_state *cur)
+ struct bpf_verifier_state *cur,
+ enum exact_level exact)
{
+ u32 insn_idx;
int i;
if (old->curframe != cur->curframe)
return false;
+ reset_idmap_scratch(env);
+
/* Verification state from speculative execution simulation
* must never prune a non-speculative execution one.
*/
if (old->speculative && !cur->speculative)
return false;
- if (old->active_spin_lock != cur->active_spin_lock)
+ if (old->in_sleepable != cur->in_sleepable)
+ return false;
+
+ if (!refsafe(old, cur, &env->idmap_scratch))
return false;
/* for states to be equal callsites have to be the same
* and all frame states need to be equivalent
*/
for (i = 0; i <= old->curframe; i++) {
+ insn_idx = frame_insn_idx(old, i);
if (old->frame[i]->callsite != cur->frame[i]->callsite)
return false;
- if (!func_states_equal(env, old->frame[i], cur->frame[i]))
+ if (!func_states_equal(env, old->frame[i], cur->frame[i], insn_idx, exact))
return false;
}
return true;
}
-/* Return 0 if no propagation happened. Return negative error code if error
- * happened. Otherwise, return the propagated bit.
- */
-static int propagate_liveness_reg(struct bpf_verifier_env *env,
- struct bpf_reg_state *reg,
- struct bpf_reg_state *parent_reg)
-{
- u8 parent_flag = parent_reg->live & REG_LIVE_READ;
- u8 flag = reg->live & REG_LIVE_READ;
- int err;
-
- /* When comes here, read flags of PARENT_REG or REG could be any of
- * REG_LIVE_READ64, REG_LIVE_READ32, REG_LIVE_NONE. There is no need
- * of propagation if PARENT_REG has strongest REG_LIVE_READ64.
- */
- if (parent_flag == REG_LIVE_READ64 ||
- /* Or if there is no read flag from REG. */
- !flag ||
- /* Or if the read flag from REG is the same as PARENT_REG. */
- parent_flag == flag)
- return 0;
-
- err = mark_reg_read(env, reg, parent_reg, flag);
- if (err)
- return err;
-
- return flag;
-}
-
-/* A write screens off any subsequent reads; but write marks come from the
- * straight-line code between a state and its parent. When we arrive at an
- * equivalent state (jump target or such) we didn't arrive by the straight-line
- * code, so read marks in the state must propagate to the parent regardless
- * of the state's write marks. That's what 'parent == state->parent' comparison
- * in mark_reg_read() is for.
+/* find precise scalars in the previous equivalent state and
+ * propagate them into the current state
*/
-static int propagate_liveness(struct bpf_verifier_env *env,
- const struct bpf_verifier_state *vstate,
- struct bpf_verifier_state *vparent)
+static int propagate_precision(struct bpf_verifier_env *env,
+ const struct bpf_verifier_state *old,
+ struct bpf_verifier_state *cur,
+ bool *changed)
{
- struct bpf_reg_state *state_reg, *parent_reg;
- struct bpf_func_state *state, *parent;
- int i, frame, err = 0;
+ struct bpf_reg_state *state_reg;
+ struct bpf_func_state *state;
+ int i, err = 0, fr;
+ bool first;
- if (vparent->curframe != vstate->curframe) {
- WARN(1, "propagate_live: parent frame %d current frame %d\n",
- vparent->curframe, vstate->curframe);
- return -EFAULT;
- }
- /* Propagate read liveness of registers... */
- BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
- for (frame = 0; frame <= vstate->curframe; frame++) {
- parent = vparent->frame[frame];
- state = vstate->frame[frame];
- parent_reg = parent->regs;
+ for (fr = old->curframe; fr >= 0; fr--) {
+ state = old->frame[fr];
state_reg = state->regs;
- /* We don't need to worry about FP liveness, it's read-only */
- for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) {
- err = propagate_liveness_reg(env, &state_reg[i],
- &parent_reg[i]);
- if (err < 0)
- return err;
- if (err == REG_LIVE_READ64)
- mark_insn_zext(env, &parent_reg[i]);
+ first = true;
+ for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
+ if (state_reg->type != SCALAR_VALUE ||
+ !state_reg->precise)
+ continue;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ if (first)
+ verbose(env, "frame %d: propagating r%d", fr, i);
+ else
+ verbose(env, ",r%d", i);
+ }
+ bt_set_frame_reg(&env->bt, fr, i);
+ first = false;
}
- /* Propagate stack slots. */
- for (i = 0; i < state->allocated_stack / BPF_REG_SIZE &&
- i < parent->allocated_stack / BPF_REG_SIZE; i++) {
- parent_reg = &parent->stack[i].spilled_ptr;
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (!is_spilled_reg(&state->stack[i]))
+ continue;
state_reg = &state->stack[i].spilled_ptr;
- err = propagate_liveness_reg(env, state_reg,
- parent_reg);
- if (err < 0)
- return err;
+ if (state_reg->type != SCALAR_VALUE ||
+ !state_reg->precise)
+ continue;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ if (first)
+ verbose(env, "frame %d: propagating fp%d",
+ fr, (-i - 1) * BPF_REG_SIZE);
+ else
+ verbose(env, ",fp%d", (-i - 1) * BPF_REG_SIZE);
+ }
+ bt_set_frame_slot(&env->bt, fr, i);
+ first = false;
}
+ if (!first && (env->log.level & BPF_LOG_LEVEL2))
+ verbose(env, "\n");
}
+
+ err = __mark_chain_precision(env, cur, -1, changed);
+ if (err < 0)
+ return err;
+
return 0;
}
-/* find precise scalars in the previous equivalent state and
- * propagate them into the current state
+#define MAX_BACKEDGE_ITERS 64
+
+/* Propagate read and precision marks from visit->backedges[*].state->equal_state
+ * to corresponding parent states of visit->backedges[*].state until fixed point is reached,
+ * then free visit->backedges.
+ * After execution of this function incomplete_read_marks() will return false
+ * for all states corresponding to @visit->callchain.
*/
-static int propagate_precision(struct bpf_verifier_env *env,
- const struct bpf_verifier_state *old)
+static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visit *visit)
{
- struct bpf_reg_state *state_reg;
- struct bpf_func_state *state;
- int i, err = 0;
+ struct bpf_scc_backedge *backedge;
+ struct bpf_verifier_state *st;
+ bool changed;
+ int i, err;
- state = old->frame[old->curframe];
- state_reg = state->regs;
- for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
- if (state_reg->type != SCALAR_VALUE ||
- !state_reg->precise)
- continue;
- if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "propagating r%d\n", i);
- err = mark_chain_precision(env, i);
- if (err < 0)
- return err;
- }
+ i = 0;
+ do {
+ if (i++ > MAX_BACKEDGE_ITERS) {
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "%s: too many iterations\n", __func__);
+ for (backedge = visit->backedges; backedge; backedge = backedge->next)
+ mark_all_scalars_precise(env, &backedge->state);
+ break;
+ }
+ changed = false;
+ for (backedge = visit->backedges; backedge; backedge = backedge->next) {
+ st = &backedge->state;
+ err = propagate_precision(env, st->equal_state, st, &changed);
+ if (err)
+ return err;
+ }
+ } while (changed);
- for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
- if (!is_spilled_reg(&state->stack[i]))
- continue;
- state_reg = &state->stack[i].spilled_ptr;
- if (state_reg->type != SCALAR_VALUE ||
- !state_reg->precise)
- continue;
- if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "propagating fp%d\n",
- (-i - 1) * BPF_REG_SIZE);
- err = mark_chain_precision_stack(env, i);
- if (err < 0)
- return err;
- }
+ free_backedges(visit);
return 0;
}
@@ -11049,26 +19616,110 @@ static bool states_maybe_looping(struct bpf_verifier_state *old,
fcur = cur->frame[fr];
for (i = 0; i < MAX_BPF_REG; i++)
if (memcmp(&fold->regs[i], &fcur->regs[i],
- offsetof(struct bpf_reg_state, parent)))
+ offsetof(struct bpf_reg_state, frameno)))
return false;
return true;
}
+static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx)
+{
+ return env->insn_aux_data[insn_idx].is_iter_next;
+}
+
+/* is_state_visited() handles iter_next() (see process_iter_next_call() for
+ * terminology) calls specially: as opposed to bounded BPF loops, it *expects*
+ * states to match, which otherwise would look like an infinite loop. So while
+ * iter_next() calls are taken care of, we still need to be careful and
+ * prevent erroneous and too eager declaration of "infinite loop", when
+ * iterators are involved.
+ *
+ * Here's a situation in pseudo-BPF assembly form:
+ *
+ * 0: again: ; set up iter_next() call args
+ * 1: r1 = &it ; <CHECKPOINT HERE>
+ * 2: call bpf_iter_num_next ; this is iter_next() call
+ * 3: if r0 == 0 goto done
+ * 4: ... something useful here ...
+ * 5: goto again ; another iteration
+ * 6: done:
+ * 7: r1 = &it
+ * 8: call bpf_iter_num_destroy ; clean up iter state
+ * 9: exit
+ *
+ * This is a typical loop. Let's assume that we have a prune point at 1:,
+ * before we get to `call bpf_iter_num_next` (e.g., because of that `goto
+ * again`, assuming other heuristics don't get in a way).
+ *
+ * When we first time come to 1:, let's say we have some state X. We proceed
+ * to 2:, fork states, enqueue ACTIVE, validate NULL case successfully, exit.
+ * Now we come back to validate that forked ACTIVE state. We proceed through
+ * 3-5, come to goto, jump to 1:. Let's assume our state didn't change, so we
+ * are converging. But the problem is that we don't know that yet, as this
+ * convergence has to happen at iter_next() call site only. So if nothing is
+ * done, at 1: verifier will use bounded loop logic and declare infinite
+ * looping (and would be *technically* correct, if not for iterator's
+ * "eventual sticky NULL" contract, see process_iter_next_call()). But we
+ * don't want that. So what we do in process_iter_next_call() when we go on
+ * another ACTIVE iteration, we bump slot->iter.depth, to mark that it's
+ * a different iteration. So when we suspect an infinite loop, we additionally
+ * check if any of the *ACTIVE* iterator states depths differ. If yes, we
+ * pretend we are not looping and wait for next iter_next() call.
+ *
+ * This only applies to ACTIVE state. In DRAINED state we don't expect to
+ * loop, because that would actually mean infinite loop, as DRAINED state is
+ * "sticky", and so we'll keep returning into the same instruction with the
+ * same state (at least in one of possible code paths).
+ *
+ * This approach allows to keep infinite loop heuristic even in the face of
+ * active iterator. E.g., C snippet below is and will be detected as
+ * infinitely looping:
+ *
+ * struct bpf_iter_num it;
+ * int *p, x;
+ *
+ * bpf_iter_num_new(&it, 0, 10);
+ * while ((p = bpf_iter_num_next(&t))) {
+ * x = p;
+ * while (x--) {} // <<-- infinite loop here
+ * }
+ *
+ */
+static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf_verifier_state *cur)
+{
+ struct bpf_reg_state *slot, *cur_slot;
+ struct bpf_func_state *state;
+ int i, fr;
+
+ for (fr = old->curframe; fr >= 0; fr--) {
+ state = old->frame[fr];
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] != STACK_ITER)
+ continue;
+
+ slot = &state->stack[i].spilled_ptr;
+ if (slot->iter.state != BPF_ITER_STATE_ACTIVE)
+ continue;
+
+ cur_slot = &cur->frame[fr]->stack[i].spilled_ptr;
+ if (cur_slot->iter.depth != slot->iter.depth)
+ return true;
+ }
+ }
+ return false;
+}
static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
{
struct bpf_verifier_state_list *new_sl;
- struct bpf_verifier_state_list *sl, **pprev;
+ struct bpf_verifier_state_list *sl;
struct bpf_verifier_state *cur = env->cur_state, *new;
- int i, j, err, states_cnt = 0;
- bool add_new_state = env->test_state_freq ? true : false;
+ bool force_new_state, add_new_state, loop;
+ int n, err, states_cnt = 0;
+ struct list_head *pos, *tmp, *head;
- cur->last_insn_idx = env->prev_insn_idx;
- if (!env->insn_aux_data[insn_idx].prune_point)
- /* this 'insn_idx' instruction wasn't marked, so we will not
- * be doing state search here
- */
- return 0;
+ force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) ||
+ /* Avoid accumulating infinitely long jmp history */
+ cur->jmp_history_cnt > 40;
/* bpf progs typically have pruning point every 4 instructions
* http://vger.kernel.org/bpfconf2019.html#session-1
@@ -11078,19 +19729,20 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* In tests that amounts to up to 50% reduction into total verifier
* memory consumption and 20% verifier time speedup.
*/
+ add_new_state = force_new_state;
if (env->jmps_processed - env->prev_jmps_processed >= 2 &&
env->insn_processed - env->prev_insn_processed >= 8)
add_new_state = true;
- pprev = explored_state(env, insn_idx);
- sl = *pprev;
-
clean_live_states(env, insn_idx, cur);
- while (sl) {
+ loop = false;
+ head = explored_state(env, insn_idx);
+ list_for_each_safe(pos, tmp, head) {
+ sl = container_of(pos, struct bpf_verifier_state_list, node);
states_cnt++;
if (sl->state.insn_idx != insn_idx)
- goto next;
+ continue;
if (sl->state.branches) {
struct bpf_func_state *frame = sl->state.frame[sl->state.curframe];
@@ -11108,10 +19760,92 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* Since the verifier still needs to catch infinite loops
* inside async callbacks.
*/
- } else if (states_maybe_looping(&sl->state, cur) &&
- states_equal(env, &sl->state, cur)) {
+ goto skip_inf_loop_check;
+ }
+ /* BPF open-coded iterators loop detection is special.
+ * states_maybe_looping() logic is too simplistic in detecting
+ * states that *might* be equivalent, because it doesn't know
+ * about ID remapping, so don't even perform it.
+ * See process_iter_next_call() and iter_active_depths_differ()
+ * for overview of the logic. When current and one of parent
+ * states are detected as equivalent, it's a good thing: we prove
+ * convergence and can stop simulating further iterations.
+ * It's safe to assume that iterator loop will finish, taking into
+ * account iter_next() contract of eventually returning
+ * sticky NULL result.
+ *
+ * Note, that states have to be compared exactly in this case because
+ * read and precision marks might not be finalized inside the loop.
+ * E.g. as in the program below:
+ *
+ * 1. r7 = -16
+ * 2. r6 = bpf_get_prandom_u32()
+ * 3. while (bpf_iter_num_next(&fp[-8])) {
+ * 4. if (r6 != 42) {
+ * 5. r7 = -32
+ * 6. r6 = bpf_get_prandom_u32()
+ * 7. continue
+ * 8. }
+ * 9. r0 = r10
+ * 10. r0 += r7
+ * 11. r8 = *(u64 *)(r0 + 0)
+ * 12. r6 = bpf_get_prandom_u32()
+ * 13. }
+ *
+ * Here verifier would first visit path 1-3, create a checkpoint at 3
+ * with r7=-16, continue to 4-7,3. Existing checkpoint at 3 does
+ * not have read or precision mark for r7 yet, thus inexact states
+ * comparison would discard current state with r7=-32
+ * => unsafe memory access at 11 would not be caught.
+ */
+ if (is_iter_next_insn(env, insn_idx)) {
+ if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
+ struct bpf_func_state *cur_frame;
+ struct bpf_reg_state *iter_state, *iter_reg;
+ int spi;
+
+ cur_frame = cur->frame[cur->curframe];
+ /* btf_check_iter_kfuncs() enforces that
+ * iter state pointer is always the first arg
+ */
+ iter_reg = &cur_frame->regs[BPF_REG_1];
+ /* current state is valid due to states_equal(),
+ * so we can assume valid iter and reg state,
+ * no need for extra (re-)validations
+ */
+ spi = __get_spi(iter_reg->off + iter_reg->var_off.value);
+ iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr;
+ if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) {
+ loop = true;
+ goto hit;
+ }
+ }
+ goto skip_inf_loop_check;
+ }
+ if (is_may_goto_insn_at(env, insn_idx)) {
+ if (sl->state.may_goto_depth != cur->may_goto_depth &&
+ states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
+ loop = true;
+ goto hit;
+ }
+ }
+ if (bpf_calls_callback(env, insn_idx)) {
+ if (states_equal(env, &sl->state, cur, RANGE_WITHIN))
+ goto hit;
+ goto skip_inf_loop_check;
+ }
+ /* attempt to detect infinite loop to avoid unnecessary doomed work */
+ if (states_maybe_looping(&sl->state, cur) &&
+ states_equal(env, &sl->state, cur, EXACT) &&
+ !iter_active_depths_differ(&sl->state, cur) &&
+ sl->state.may_goto_depth == cur->may_goto_depth &&
+ sl->state.callback_unroll_depth == cur->callback_unroll_depth) {
verbose_linfo(env, insn_idx, "; ");
verbose(env, "infinite loop detected at insn %d\n", insn_idx);
+ verbose(env, "cur state:");
+ print_verifier_state(env, cur, cur->curframe, true);
+ verbose(env, "old state:");
+ print_verifier_state(env, &sl->state, cur->curframe, true);
return -EINVAL;
}
/* if the verifier is processing a loop, avoid adding new state
@@ -11126,34 +19860,118 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* This threshold shouldn't be too high either, since states
* at the end of the loop are likely to be useful in pruning.
*/
- if (env->jmps_processed - env->prev_jmps_processed < 20 &&
+skip_inf_loop_check:
+ if (!force_new_state &&
+ env->jmps_processed - env->prev_jmps_processed < 20 &&
env->insn_processed - env->prev_insn_processed < 100)
add_new_state = false;
goto miss;
}
- if (states_equal(env, &sl->state, cur)) {
+ /* See comments for mark_all_regs_read_and_precise() */
+ loop = incomplete_read_marks(env, &sl->state);
+ if (states_equal(env, &sl->state, cur, loop ? RANGE_WITHIN : NOT_EXACT)) {
+hit:
sl->hit_cnt++;
- /* reached equivalent register/stack state,
- * prune the search.
- * Registers read by the continuation are read by us.
- * If we have any write marks in env->cur_state, they
- * will prevent corresponding reads in the continuation
- * from reaching our parent (an explored_state). Our
- * own state will get the read marks recorded, but
- * they'll be immediately forgotten as we're pruning
- * this state and will pop a new one.
- */
- err = propagate_liveness(env, &sl->state, cur);
/* if previous state reached the exit with precision and
- * current state is equivalent to it (except precsion marks)
+ * current state is equivalent to it (except precision marks)
* the precision needs to be propagated back in
* the current state.
*/
- err = err ? : push_jmp_history(env, cur);
- err = err ? : propagate_precision(env, &sl->state);
+ err = 0;
+ if (is_jmp_point(env, env->insn_idx))
+ err = push_jmp_history(env, cur, 0, 0);
+ err = err ? : propagate_precision(env, &sl->state, cur, NULL);
if (err)
return err;
+ /* When processing iterator based loops above propagate_liveness and
+ * propagate_precision calls are not sufficient to transfer all relevant
+ * read and precision marks. E.g. consider the following case:
+ *
+ * .-> A --. Assume the states are visited in the order A, B, C.
+ * | | | Assume that state B reaches a state equivalent to state A.
+ * | v v At this point, state C is not processed yet, so state A
+ * '-- B C has not received any read or precision marks from C.
+ * Thus, marks propagated from A to B are incomplete.
+ *
+ * The verifier mitigates this by performing the following steps:
+ *
+ * - Prior to the main verification pass, strongly connected components
+ * (SCCs) are computed over the program's control flow graph,
+ * intraprocedurally.
+ *
+ * - During the main verification pass, `maybe_enter_scc()` checks
+ * whether the current verifier state is entering an SCC. If so, an
+ * instance of a `bpf_scc_visit` object is created, and the state
+ * entering the SCC is recorded as the entry state.
+ *
+ * - This instance is associated not with the SCC itself, but with a
+ * `bpf_scc_callchain`: a tuple consisting of the call sites leading to
+ * the SCC and the SCC id. See `compute_scc_callchain()`.
+ *
+ * - When a verification path encounters a `states_equal(...,
+ * RANGE_WITHIN)` condition, there exists a call chain describing the
+ * current state and a corresponding `bpf_scc_visit` instance. A copy
+ * of the current state is created and added to
+ * `bpf_scc_visit->backedges`.
+ *
+ * - When a verification path terminates, `maybe_exit_scc()` is called
+ * from `update_branch_counts()`. For states with `branches == 0`, it
+ * checks whether the state is the entry state of any `bpf_scc_visit`
+ * instance. If it is, this indicates that all paths originating from
+ * this SCC visit have been explored. `propagate_backedges()` is then
+ * called, which propagates read and precision marks through the
+ * backedges until a fixed point is reached.
+ * (In the earlier example, this would propagate marks from A to B,
+ * from C to A, and then again from A to B.)
+ *
+ * A note on callchains
+ * --------------------
+ *
+ * Consider the following example:
+ *
+ * void foo() { loop { ... SCC#1 ... } }
+ * void main() {
+ * A: foo();
+ * B: ...
+ * C: foo();
+ * }
+ *
+ * Here, there are two distinct callchains leading to SCC#1:
+ * - (A, SCC#1)
+ * - (C, SCC#1)
+ *
+ * Each callchain identifies a separate `bpf_scc_visit` instance that
+ * accumulates backedge states. The `propagate_{liveness,precision}()`
+ * functions traverse the parent state of each backedge state, which
+ * means these parent states must remain valid (i.e., not freed) while
+ * the corresponding `bpf_scc_visit` instance exists.
+ *
+ * Associating `bpf_scc_visit` instances directly with SCCs instead of
+ * callchains would break this invariant:
+ * - States explored during `C: foo()` would contribute backedges to
+ * SCC#1, but SCC#1 would only be exited once the exploration of
+ * `A: foo()` completes.
+ * - By that time, the states explored between `A: foo()` and `C: foo()`
+ * (i.e., `B: ...`) may have already been freed, causing the parent
+ * links for states from `C: foo()` to become invalid.
+ */
+ if (loop) {
+ struct bpf_scc_backedge *backedge;
+
+ backedge = kzalloc(sizeof(*backedge), GFP_KERNEL_ACCOUNT);
+ if (!backedge)
+ return -ENOMEM;
+ err = copy_verifier_state(&backedge->state, cur);
+ backedge->state.equal_state = &sl->state;
+ backedge->state.insn_idx = insn_idx;
+ err = err ?: add_scc_backedge(env, &sl->state, backedge);
+ if (err) {
+ free_verifier_state(&backedge->state, false);
+ kfree(backedge);
+ return err;
+ }
+ }
return 1;
}
miss:
@@ -11169,45 +19987,32 @@ miss:
* to keep checking from state equivalence point of view.
* Higher numbers increase max_states_per_insn and verification time,
* but do not meaningfully decrease insn_processed.
+ * 'n' controls how many times state could miss before eviction.
+ * Use bigger 'n' for checkpoints because evicting checkpoint states
+ * too early would hinder iterator convergence.
*/
- if (sl->miss_cnt > sl->hit_cnt * 3 + 3) {
+ n = is_force_checkpoint(env, insn_idx) && sl->state.branches > 0 ? 64 : 3;
+ if (sl->miss_cnt > sl->hit_cnt * n + n) {
/* the state is unlikely to be useful. Remove it to
* speed up verification
*/
- *pprev = sl->next;
- if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) {
- u32 br = sl->state.branches;
-
- WARN_ONCE(br,
- "BUG live_done but branches_to_explore %d\n",
- br);
- free_verifier_state(&sl->state, false);
- kfree(sl);
- env->peak_states--;
- } else {
- /* cannot free this state, since parentage chain may
- * walk it later. Add it for free_list instead to
- * be freed at the end of verification
- */
- sl->next = env->free_list;
- env->free_list = sl;
- }
- sl = *pprev;
- continue;
+ sl->in_free_list = true;
+ list_del(&sl->node);
+ list_add(&sl->node, &env->free_list);
+ env->free_list_size++;
+ env->explored_states_size--;
+ maybe_free_verifier_state(env, sl);
}
-next:
- pprev = &sl->next;
- sl = *pprev;
}
if (env->max_states_per_insn < states_cnt)
env->max_states_per_insn = states_cnt;
if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
- return push_jmp_history(env, cur);
+ return 0;
if (!add_new_state)
- return push_jmp_history(env, cur);
+ return 0;
/* There were no equivalent states, remember the current one.
* Technically the current state is not proven to be safe yet,
@@ -11218,14 +20023,19 @@ next:
* When looping the sl->state.branches will be > 0 and this state
* will not be considered for equivalence until branches == 0.
*/
- new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL);
+ new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL_ACCOUNT);
if (!new_sl)
return -ENOMEM;
env->total_states++;
- env->peak_states++;
+ env->explored_states_size++;
+ update_peak_states(env);
env->prev_jmps_processed = env->jmps_processed;
env->prev_insn_processed = env->insn_processed;
+ /* forget precise markings we inherited, see __mark_chain_precision */
+ if (env->bpf_capable)
+ mark_all_scalars_imprecise(env, cur);
+
/* add new state to the head of linked list */
new = &new_sl->state;
err = copy_verifier_state(new, cur);
@@ -11235,45 +20045,21 @@ next:
return err;
}
new->insn_idx = insn_idx;
- WARN_ONCE(new->branches != 1,
- "BUG is_state_visited:branches_to_explore=%d insn %d\n", new->branches, insn_idx);
+ verifier_bug_if(new->branches != 1, env,
+ "%s:branches_to_explore=%d insn %d",
+ __func__, new->branches, insn_idx);
+ err = maybe_enter_scc(env, new);
+ if (err) {
+ free_verifier_state(new, false);
+ kfree(new_sl);
+ return err;
+ }
cur->parent = new;
cur->first_insn_idx = insn_idx;
+ cur->dfs_depth = new->dfs_depth + 1;
clear_jmp_history(cur);
- new_sl->next = *explored_state(env, insn_idx);
- *explored_state(env, insn_idx) = new_sl;
- /* connect new state to parentage chain. Current frame needs all
- * registers connected. Only r6 - r9 of the callers are alive (pushed
- * to the stack implicitly by JITs) so in callers' frames connect just
- * r6 - r9 as an optimization. Callers will have r1 - r5 connected to
- * the state of the call instruction (with WRITTEN set), and r0 comes
- * from callee with its full parentage chain, anyway.
- */
- /* clear write marks in current state: the writes we did are not writes
- * our child did, so they don't screen off its reads from us.
- * (There are no read marks in current state, because reads always mark
- * their parent and current state never has children yet. Only
- * explored_states can get read marks.)
- */
- for (j = 0; j <= cur->curframe; j++) {
- for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++)
- cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i];
- for (i = 0; i < BPF_REG_FP; i++)
- cur->frame[j]->regs[i].live = REG_LIVE_NONE;
- }
-
- /* all stack frames are accessible from callee, clear them all */
- for (j = 0; j <= cur->curframe; j++) {
- struct bpf_func_state *frame = cur->frame[j];
- struct bpf_func_state *newframe = new->frame[j];
-
- for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) {
- frame->stack[i].spilled_ptr.live = REG_LIVE_NONE;
- frame->stack[i].spilled_ptr.parent =
- &newframe->stack[i].spilled_ptr;
- }
- }
+ list_add(&new_sl->node, head);
return 0;
}
@@ -11287,6 +20073,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type)
case PTR_TO_TCP_SOCK:
case PTR_TO_XDP_SOCK:
case PTR_TO_BTF_ID:
+ case PTR_TO_ARENA:
return false;
default:
return true;
@@ -11311,20 +20098,394 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev)
!reg_type_mismatch_ok(prev));
}
+static bool is_ptr_to_mem_or_btf_id(enum bpf_reg_type type)
+{
+ switch (base_type(type)) {
+ case PTR_TO_MEM:
+ case PTR_TO_BTF_ID:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool is_ptr_to_mem(enum bpf_reg_type type)
+{
+ return base_type(type) == PTR_TO_MEM;
+}
+
+static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type,
+ bool allow_trust_mismatch)
+{
+ enum bpf_reg_type *prev_type = &env->insn_aux_data[env->insn_idx].ptr_type;
+ enum bpf_reg_type merged_type;
+
+ if (*prev_type == NOT_INIT) {
+ /* Saw a valid insn
+ * dst_reg = *(u32 *)(src_reg + off)
+ * save type to validate intersecting paths
+ */
+ *prev_type = type;
+ } else if (reg_type_mismatch(type, *prev_type)) {
+ /* Abuser program is trying to use the same insn
+ * dst_reg = *(u32*) (src_reg + off)
+ * with different pointer types:
+ * src_reg == ctx in one branch and
+ * src_reg == stack|map in some other branch.
+ * Reject it.
+ */
+ if (allow_trust_mismatch &&
+ is_ptr_to_mem_or_btf_id(type) &&
+ is_ptr_to_mem_or_btf_id(*prev_type)) {
+ /*
+ * Have to support a use case when one path through
+ * the program yields TRUSTED pointer while another
+ * is UNTRUSTED. Fallback to UNTRUSTED to generate
+ * BPF_PROBE_MEM/BPF_PROBE_MEMSX.
+ * Same behavior of MEM_RDONLY flag.
+ */
+ if (is_ptr_to_mem(type) || is_ptr_to_mem(*prev_type))
+ merged_type = PTR_TO_MEM;
+ else
+ merged_type = PTR_TO_BTF_ID;
+ if ((type & PTR_UNTRUSTED) || (*prev_type & PTR_UNTRUSTED))
+ merged_type |= PTR_UNTRUSTED;
+ if ((type & MEM_RDONLY) || (*prev_type & MEM_RDONLY))
+ merged_type |= MEM_RDONLY;
+ *prev_type = merged_type;
+ } else {
+ verbose(env, "same insn cannot be used with different pointers\n");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+enum {
+ PROCESS_BPF_EXIT = 1
+};
+
+static int process_bpf_exit_full(struct bpf_verifier_env *env,
+ bool *do_print_state,
+ bool exception_exit)
+{
+ /* We must do check_reference_leak here before
+ * prepare_func_exit to handle the case when
+ * state->curframe > 0, it may be a callback function,
+ * for which reference_state must match caller reference
+ * state when it exits.
+ */
+ int err = check_resource_leak(env, exception_exit,
+ !env->cur_state->curframe,
+ "BPF_EXIT instruction in main prog");
+ if (err)
+ return err;
+
+ /* The side effect of the prepare_func_exit which is
+ * being skipped is that it frees bpf_func_state.
+ * Typically, process_bpf_exit will only be hit with
+ * outermost exit. copy_verifier_state in pop_stack will
+ * handle freeing of any extra bpf_func_state left over
+ * from not processing all nested function exits. We
+ * also skip return code checks as they are not needed
+ * for exceptional exits.
+ */
+ if (exception_exit)
+ return PROCESS_BPF_EXIT;
+
+ if (env->cur_state->curframe) {
+ /* exit from nested function */
+ err = prepare_func_exit(env, &env->insn_idx);
+ if (err)
+ return err;
+ *do_print_state = true;
+ return 0;
+ }
+
+ err = check_return_code(env, BPF_REG_0, "R0");
+ if (err)
+ return err;
+ return PROCESS_BPF_EXIT;
+}
+
+static int indirect_jump_min_max_index(struct bpf_verifier_env *env,
+ int regno,
+ struct bpf_map *map,
+ u32 *pmin_index, u32 *pmax_index)
+{
+ struct bpf_reg_state *reg = reg_state(env, regno);
+ u64 min_index, max_index;
+ const u32 size = 8;
+
+ if (check_add_overflow(reg->umin_value, reg->off, &min_index) ||
+ (min_index > (u64) U32_MAX * size)) {
+ verbose(env, "the sum of R%u umin_value %llu and off %u is too big\n",
+ regno, reg->umin_value, reg->off);
+ return -ERANGE;
+ }
+ if (check_add_overflow(reg->umax_value, reg->off, &max_index) ||
+ (max_index > (u64) U32_MAX * size)) {
+ verbose(env, "the sum of R%u umax_value %llu and off %u is too big\n",
+ regno, reg->umax_value, reg->off);
+ return -ERANGE;
+ }
+
+ min_index /= size;
+ max_index /= size;
+
+ if (max_index >= map->max_entries) {
+ verbose(env, "R%u points to outside of jump table: [%llu,%llu] max_entries %u\n",
+ regno, min_index, max_index, map->max_entries);
+ return -EINVAL;
+ }
+
+ *pmin_index = min_index;
+ *pmax_index = max_index;
+ return 0;
+}
+
+/* gotox *dst_reg */
+static int check_indirect_jump(struct bpf_verifier_env *env, struct bpf_insn *insn)
+{
+ struct bpf_verifier_state *other_branch;
+ struct bpf_reg_state *dst_reg;
+ struct bpf_map *map;
+ u32 min_index, max_index;
+ int err = 0;
+ int n;
+ int i;
+
+ dst_reg = reg_state(env, insn->dst_reg);
+ if (dst_reg->type != PTR_TO_INSN) {
+ verbose(env, "R%d has type %s, expected PTR_TO_INSN\n",
+ insn->dst_reg, reg_type_str(env, dst_reg->type));
+ return -EINVAL;
+ }
+
+ map = dst_reg->map_ptr;
+ if (verifier_bug_if(!map, env, "R%d has an empty map pointer", insn->dst_reg))
+ return -EFAULT;
+
+ if (verifier_bug_if(map->map_type != BPF_MAP_TYPE_INSN_ARRAY, env,
+ "R%d has incorrect map type %d", insn->dst_reg, map->map_type))
+ return -EFAULT;
+
+ err = indirect_jump_min_max_index(env, insn->dst_reg, map, &min_index, &max_index);
+ if (err)
+ return err;
+
+ /* Ensure that the buffer is large enough */
+ if (!env->gotox_tmp_buf || env->gotox_tmp_buf->cnt < max_index - min_index + 1) {
+ env->gotox_tmp_buf = iarray_realloc(env->gotox_tmp_buf,
+ max_index - min_index + 1);
+ if (!env->gotox_tmp_buf)
+ return -ENOMEM;
+ }
+
+ n = copy_insn_array_uniq(map, min_index, max_index, env->gotox_tmp_buf->items);
+ if (n < 0)
+ return n;
+ if (n == 0) {
+ verbose(env, "register R%d doesn't point to any offset in map id=%d\n",
+ insn->dst_reg, map->id);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < n - 1; i++) {
+ other_branch = push_stack(env, env->gotox_tmp_buf->items[i],
+ env->insn_idx, env->cur_state->speculative);
+ if (IS_ERR(other_branch))
+ return PTR_ERR(other_branch);
+ }
+ env->insn_idx = env->gotox_tmp_buf->items[n-1];
+ return 0;
+}
+
+static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state)
+{
+ int err;
+ struct bpf_insn *insn = &env->prog->insnsi[env->insn_idx];
+ u8 class = BPF_CLASS(insn->code);
+
+ if (class == BPF_ALU || class == BPF_ALU64) {
+ err = check_alu_op(env, insn);
+ if (err)
+ return err;
+
+ } else if (class == BPF_LDX) {
+ bool is_ldsx = BPF_MODE(insn->code) == BPF_MEMSX;
+
+ /* Check for reserved fields is already done in
+ * resolve_pseudo_ldimm64().
+ */
+ err = check_load_mem(env, insn, false, is_ldsx, true, "ldx");
+ if (err)
+ return err;
+ } else if (class == BPF_STX) {
+ if (BPF_MODE(insn->code) == BPF_ATOMIC) {
+ err = check_atomic(env, insn);
+ if (err)
+ return err;
+ env->insn_idx++;
+ return 0;
+ }
+
+ if (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0) {
+ verbose(env, "BPF_STX uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ err = check_store_reg(env, insn, false);
+ if (err)
+ return err;
+ } else if (class == BPF_ST) {
+ enum bpf_reg_type dst_reg_type;
+
+ if (BPF_MODE(insn->code) != BPF_MEM ||
+ insn->src_reg != BPF_REG_0) {
+ verbose(env, "BPF_ST uses reserved fields\n");
+ return -EINVAL;
+ }
+ /* check src operand */
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ dst_reg_type = cur_regs(env)[insn->dst_reg].type;
+
+ /* check that memory (dst_reg + off) is writeable */
+ err = check_mem_access(env, env->insn_idx, insn->dst_reg,
+ insn->off, BPF_SIZE(insn->code),
+ BPF_WRITE, -1, false, false);
+ if (err)
+ return err;
+
+ err = save_aux_ptr_type(env, dst_reg_type, false);
+ if (err)
+ return err;
+ } else if (class == BPF_JMP || class == BPF_JMP32) {
+ u8 opcode = BPF_OP(insn->code);
+
+ env->jmps_processed++;
+ if (opcode == BPF_CALL) {
+ if (BPF_SRC(insn->code) != BPF_K ||
+ (insn->src_reg != BPF_PSEUDO_KFUNC_CALL &&
+ insn->off != 0) ||
+ (insn->src_reg != BPF_REG_0 &&
+ insn->src_reg != BPF_PSEUDO_CALL &&
+ insn->src_reg != BPF_PSEUDO_KFUNC_CALL) ||
+ insn->dst_reg != BPF_REG_0 || class == BPF_JMP32) {
+ verbose(env, "BPF_CALL uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ if (env->cur_state->active_locks) {
+ if ((insn->src_reg == BPF_REG_0 &&
+ insn->imm != BPF_FUNC_spin_unlock) ||
+ (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
+ (insn->off != 0 || !kfunc_spin_allowed(insn->imm)))) {
+ verbose(env,
+ "function calls are not allowed while holding a lock\n");
+ return -EINVAL;
+ }
+ }
+ if (insn->src_reg == BPF_PSEUDO_CALL) {
+ err = check_func_call(env, insn, &env->insn_idx);
+ } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+ err = check_kfunc_call(env, insn, &env->insn_idx);
+ if (!err && is_bpf_throw_kfunc(insn))
+ return process_bpf_exit_full(env, do_print_state, true);
+ } else {
+ err = check_helper_call(env, insn, &env->insn_idx);
+ }
+ if (err)
+ return err;
+
+ mark_reg_scratched(env, BPF_REG_0);
+ } else if (opcode == BPF_JA) {
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (insn->src_reg != BPF_REG_0 ||
+ insn->imm != 0 || insn->off != 0) {
+ verbose(env, "BPF_JA|BPF_X uses reserved fields\n");
+ return -EINVAL;
+ }
+ return check_indirect_jump(env, insn);
+ }
+
+ if (BPF_SRC(insn->code) != BPF_K ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->dst_reg != BPF_REG_0 ||
+ (class == BPF_JMP && insn->imm != 0) ||
+ (class == BPF_JMP32 && insn->off != 0)) {
+ verbose(env, "BPF_JA uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ if (class == BPF_JMP)
+ env->insn_idx += insn->off + 1;
+ else
+ env->insn_idx += insn->imm + 1;
+ return 0;
+ } else if (opcode == BPF_EXIT) {
+ if (BPF_SRC(insn->code) != BPF_K ||
+ insn->imm != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->dst_reg != BPF_REG_0 ||
+ class == BPF_JMP32) {
+ verbose(env, "BPF_EXIT uses reserved fields\n");
+ return -EINVAL;
+ }
+ return process_bpf_exit_full(env, do_print_state, false);
+ } else {
+ err = check_cond_jmp_op(env, insn, &env->insn_idx);
+ if (err)
+ return err;
+ }
+ } else if (class == BPF_LD) {
+ u8 mode = BPF_MODE(insn->code);
+
+ if (mode == BPF_ABS || mode == BPF_IND) {
+ err = check_ld_abs(env, insn);
+ if (err)
+ return err;
+
+ } else if (mode == BPF_IMM) {
+ err = check_ld_imm(env, insn);
+ if (err)
+ return err;
+
+ env->insn_idx++;
+ sanitize_mark_insn_seen(env);
+ } else {
+ verbose(env, "invalid BPF_LD mode\n");
+ return -EINVAL;
+ }
+ } else {
+ verbose(env, "unknown insn class %d\n", class);
+ return -EINVAL;
+ }
+
+ env->insn_idx++;
+ return 0;
+}
+
static int do_check(struct bpf_verifier_env *env)
{
bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
struct bpf_verifier_state *state = env->cur_state;
struct bpf_insn *insns = env->prog->insnsi;
- struct bpf_reg_state *regs;
int insn_cnt = env->prog->len;
bool do_print_state = false;
int prev_insn_idx = -1;
for (;;) {
struct bpf_insn *insn;
- u8 class;
- int err;
+ struct bpf_insn_aux_data *insn_aux;
+ int err, marks_err;
+
+ /* reset current history entry on each new instruction */
+ env->cur_hist_ent = NULL;
env->prev_insn_idx = prev_insn_idx;
if (env->insn_idx >= insn_cnt) {
@@ -11334,7 +20495,7 @@ static int do_check(struct bpf_verifier_env *env)
}
insn = &insns[env->insn_idx];
- class = BPF_CLASS(insn->code);
+ insn_aux = &env->insn_aux_data[env->insn_idx];
if (++env->insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
verbose(env,
@@ -11343,21 +20504,32 @@ static int do_check(struct bpf_verifier_env *env)
return -E2BIG;
}
- err = is_state_visited(env, env->insn_idx);
- if (err < 0)
- return err;
- if (err == 1) {
- /* found equivalent state, can prune the search */
- if (env->log.level & BPF_LOG_LEVEL) {
- if (do_print_state)
- verbose(env, "\nfrom %d to %d%s: safe\n",
- env->prev_insn_idx, env->insn_idx,
- env->cur_state->speculative ?
- " (speculative execution)" : "");
- else
- verbose(env, "%d: safe\n", env->insn_idx);
+ state->last_insn_idx = env->prev_insn_idx;
+ state->insn_idx = env->insn_idx;
+
+ if (is_prune_point(env, env->insn_idx)) {
+ err = is_state_visited(env, env->insn_idx);
+ if (err < 0)
+ return err;
+ if (err == 1) {
+ /* found equivalent state, can prune the search */
+ if (env->log.level & BPF_LOG_LEVEL) {
+ if (do_print_state)
+ verbose(env, "\nfrom %d to %d%s: safe\n",
+ env->prev_insn_idx, env->insn_idx,
+ env->cur_state->speculative ?
+ " (speculative execution)" : "");
+ else
+ verbose(env, "%d: safe\n", env->insn_idx);
+ }
+ goto process_bpf_exit;
}
- goto process_bpf_exit;
+ }
+
+ if (is_jmp_point(env, env->insn_idx)) {
+ err = push_jmp_history(env, state, 0, 0);
+ if (err)
+ return err;
}
if (signal_pending(current))
@@ -11371,276 +20543,101 @@ static int do_check(struct bpf_verifier_env *env)
env->prev_insn_idx, env->insn_idx,
env->cur_state->speculative ?
" (speculative execution)" : "");
- print_verifier_state(env, state->frame[state->curframe], true);
+ print_verifier_state(env, state, state->curframe, true);
do_print_state = false;
}
if (env->log.level & BPF_LOG_LEVEL) {
- const struct bpf_insn_cbs cbs = {
- .cb_call = disasm_kfunc_name,
- .cb_print = verbose,
- .private_data = env,
- };
-
if (verifier_state_scratched(env))
- print_insn_state(env, state->frame[state->curframe]);
+ print_insn_state(env, state, state->curframe);
verbose_linfo(env, env->insn_idx, "; ");
- env->prev_log_len = env->log.len_used;
+ env->prev_log_pos = env->log.end_pos;
verbose(env, "%d: ", env->insn_idx);
- print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
- env->prev_insn_print_len = env->log.len_used - env->prev_log_len;
- env->prev_log_len = env->log.len_used;
+ verbose_insn(env, insn);
+ env->prev_insn_print_pos = env->log.end_pos - env->prev_log_pos;
+ env->prev_log_pos = env->log.end_pos;
}
- if (bpf_prog_is_dev_bound(env->prog->aux)) {
+ if (bpf_prog_is_offloaded(env->prog->aux)) {
err = bpf_prog_offload_verify_insn(env, env->insn_idx,
env->prev_insn_idx);
if (err)
return err;
}
- regs = cur_regs(env);
sanitize_mark_insn_seen(env);
prev_insn_idx = env->insn_idx;
- if (class == BPF_ALU || class == BPF_ALU64) {
- err = check_alu_op(env, insn);
- if (err)
- return err;
-
- } else if (class == BPF_LDX) {
- enum bpf_reg_type *prev_src_type, src_reg_type;
-
- /* check for reserved fields is already done */
-
- /* check src operand */
- err = check_reg_arg(env, insn->src_reg, SRC_OP);
- if (err)
- return err;
-
- err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
- if (err)
- return err;
-
- src_reg_type = regs[insn->src_reg].type;
+ /* Reduce verification complexity by stopping speculative path
+ * verification when a nospec is encountered.
+ */
+ if (state->speculative && insn_aux->nospec)
+ goto process_bpf_exit;
- /* check that memory (src_reg + off) is readable,
- * the state of dst_reg will be updated by this func
+ err = bpf_reset_stack_write_marks(env, env->insn_idx);
+ if (err)
+ return err;
+ err = do_check_insn(env, &do_print_state);
+ if (err >= 0 || error_recoverable_with_nospec(err)) {
+ marks_err = bpf_commit_stack_write_marks(env);
+ if (marks_err)
+ return marks_err;
+ }
+ if (error_recoverable_with_nospec(err) && state->speculative) {
+ /* Prevent this speculative path from ever reaching the
+ * insn that would have been unsafe to execute.
*/
- err = check_mem_access(env, env->insn_idx, insn->src_reg,
- insn->off, BPF_SIZE(insn->code),
- BPF_READ, insn->dst_reg, false);
- if (err)
- return err;
-
- prev_src_type = &env->insn_aux_data[env->insn_idx].ptr_type;
-
- if (*prev_src_type == NOT_INIT) {
- /* saw a valid insn
- * dst_reg = *(u32 *)(src_reg + off)
- * save type to validate intersecting paths
- */
- *prev_src_type = src_reg_type;
-
- } else if (reg_type_mismatch(src_reg_type, *prev_src_type)) {
- /* ABuser program is trying to use the same insn
- * dst_reg = *(u32*) (src_reg + off)
- * with different pointer types:
- * src_reg == ctx in one branch and
- * src_reg == stack|map in some other branch.
- * Reject it.
- */
- verbose(env, "same insn cannot be used with different pointers\n");
- return -EINVAL;
- }
-
- } else if (class == BPF_STX) {
- enum bpf_reg_type *prev_dst_type, dst_reg_type;
-
- if (BPF_MODE(insn->code) == BPF_ATOMIC) {
- err = check_atomic(env, env->insn_idx, insn);
- if (err)
- return err;
- env->insn_idx++;
- continue;
- }
-
- if (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0) {
- verbose(env, "BPF_STX uses reserved fields\n");
- return -EINVAL;
- }
-
- /* check src1 operand */
- err = check_reg_arg(env, insn->src_reg, SRC_OP);
- if (err)
- return err;
- /* check src2 operand */
- err = check_reg_arg(env, insn->dst_reg, SRC_OP);
+ insn_aux->nospec = true;
+ /* If it was an ADD/SUB insn, potentially remove any
+ * markings for alu sanitization.
+ */
+ insn_aux->alu_state = 0;
+ goto process_bpf_exit;
+ } else if (err < 0) {
+ return err;
+ } else if (err == PROCESS_BPF_EXIT) {
+ goto process_bpf_exit;
+ }
+ WARN_ON_ONCE(err);
+
+ if (state->speculative && insn_aux->nospec_result) {
+ /* If we are on a path that performed a jump-op, this
+ * may skip a nospec patched-in after the jump. This can
+ * currently never happen because nospec_result is only
+ * used for the write-ops
+ * `*(size*)(dst_reg+off)=src_reg|imm32` which must
+ * never skip the following insn. Still, add a warning
+ * to document this in case nospec_result is used
+ * elsewhere in the future.
+ *
+ * All non-branch instructions have a single
+ * fall-through edge. For these, nospec_result should
+ * already work.
+ */
+ if (verifier_bug_if(BPF_CLASS(insn->code) == BPF_JMP ||
+ BPF_CLASS(insn->code) == BPF_JMP32, env,
+ "speculation barrier after jump instruction may not have the desired effect"))
+ return -EFAULT;
+process_bpf_exit:
+ mark_verifier_state_scratched(env);
+ err = update_branch_counts(env, env->cur_state);
if (err)
return err;
-
- dst_reg_type = regs[insn->dst_reg].type;
-
- /* check that memory (dst_reg + off) is writeable */
- err = check_mem_access(env, env->insn_idx, insn->dst_reg,
- insn->off, BPF_SIZE(insn->code),
- BPF_WRITE, insn->src_reg, false);
+ err = bpf_update_live_stack(env);
if (err)
return err;
-
- prev_dst_type = &env->insn_aux_data[env->insn_idx].ptr_type;
-
- if (*prev_dst_type == NOT_INIT) {
- *prev_dst_type = dst_reg_type;
- } else if (reg_type_mismatch(dst_reg_type, *prev_dst_type)) {
- verbose(env, "same insn cannot be used with different pointers\n");
- return -EINVAL;
- }
-
- } else if (class == BPF_ST) {
- if (BPF_MODE(insn->code) != BPF_MEM ||
- insn->src_reg != BPF_REG_0) {
- verbose(env, "BPF_ST uses reserved fields\n");
- return -EINVAL;
- }
- /* check src operand */
- err = check_reg_arg(env, insn->dst_reg, SRC_OP);
- if (err)
- return err;
-
- if (is_ctx_reg(env, insn->dst_reg)) {
- verbose(env, "BPF_ST stores into R%d %s is not allowed\n",
- insn->dst_reg,
- reg_type_str(env, reg_state(env, insn->dst_reg)->type));
- return -EACCES;
- }
-
- /* check that memory (dst_reg + off) is writeable */
- err = check_mem_access(env, env->insn_idx, insn->dst_reg,
- insn->off, BPF_SIZE(insn->code),
- BPF_WRITE, -1, false);
- if (err)
- return err;
-
- } else if (class == BPF_JMP || class == BPF_JMP32) {
- u8 opcode = BPF_OP(insn->code);
-
- env->jmps_processed++;
- if (opcode == BPF_CALL) {
- if (BPF_SRC(insn->code) != BPF_K ||
- (insn->src_reg != BPF_PSEUDO_KFUNC_CALL
- && insn->off != 0) ||
- (insn->src_reg != BPF_REG_0 &&
- insn->src_reg != BPF_PSEUDO_CALL &&
- insn->src_reg != BPF_PSEUDO_KFUNC_CALL) ||
- insn->dst_reg != BPF_REG_0 ||
- class == BPF_JMP32) {
- verbose(env, "BPF_CALL uses reserved fields\n");
- return -EINVAL;
- }
-
- if (env->cur_state->active_spin_lock &&
- (insn->src_reg == BPF_PSEUDO_CALL ||
- insn->imm != BPF_FUNC_spin_unlock)) {
- verbose(env, "function calls are not allowed while holding a lock\n");
- return -EINVAL;
- }
- if (insn->src_reg == BPF_PSEUDO_CALL)
- err = check_func_call(env, insn, &env->insn_idx);
- else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL)
- err = check_kfunc_call(env, insn);
- else
- err = check_helper_call(env, insn, &env->insn_idx);
- if (err)
- return err;
- } else if (opcode == BPF_JA) {
- if (BPF_SRC(insn->code) != BPF_K ||
- insn->imm != 0 ||
- insn->src_reg != BPF_REG_0 ||
- insn->dst_reg != BPF_REG_0 ||
- class == BPF_JMP32) {
- verbose(env, "BPF_JA uses reserved fields\n");
- return -EINVAL;
- }
-
- env->insn_idx += insn->off + 1;
- continue;
-
- } else if (opcode == BPF_EXIT) {
- if (BPF_SRC(insn->code) != BPF_K ||
- insn->imm != 0 ||
- insn->src_reg != BPF_REG_0 ||
- insn->dst_reg != BPF_REG_0 ||
- class == BPF_JMP32) {
- verbose(env, "BPF_EXIT uses reserved fields\n");
- return -EINVAL;
- }
-
- if (env->cur_state->active_spin_lock) {
- verbose(env, "bpf_spin_unlock is missing\n");
- return -EINVAL;
- }
-
- if (state->curframe) {
- /* exit from nested function */
- err = prepare_func_exit(env, &env->insn_idx);
- if (err)
- return err;
- do_print_state = true;
- continue;
- }
-
- err = check_reference_leak(env);
- if (err)
+ err = pop_stack(env, &prev_insn_idx, &env->insn_idx,
+ pop_log);
+ if (err < 0) {
+ if (err != -ENOENT)
return err;
-
- err = check_return_code(env);
- if (err)
- return err;
-process_bpf_exit:
- mark_verifier_state_scratched(env);
- update_branch_counts(env, env->cur_state);
- err = pop_stack(env, &prev_insn_idx,
- &env->insn_idx, pop_log);
- if (err < 0) {
- if (err != -ENOENT)
- return err;
- break;
- } else {
- do_print_state = true;
- continue;
- }
- } else {
- err = check_cond_jmp_op(env, insn, &env->insn_idx);
- if (err)
- return err;
- }
- } else if (class == BPF_LD) {
- u8 mode = BPF_MODE(insn->code);
-
- if (mode == BPF_ABS || mode == BPF_IND) {
- err = check_ld_abs(env, insn);
- if (err)
- return err;
-
- } else if (mode == BPF_IMM) {
- err = check_ld_imm(env, insn);
- if (err)
- return err;
-
- env->insn_idx++;
- sanitize_mark_insn_seen(env);
+ break;
} else {
- verbose(env, "invalid BPF_LD mode\n");
- return -EINVAL;
+ do_print_state = true;
+ continue;
}
- } else {
- verbose(env, "unknown insn class %d\n", class);
- return -EINVAL;
}
-
- env->insn_idx++;
}
return 0;
@@ -11676,50 +20673,71 @@ static int find_btf_percpu_datasec(struct btf *btf)
return -ENOENT;
}
+/*
+ * Add btf to the used_btfs array and return the index. (If the btf was
+ * already added, then just return the index.) Upon successful insertion
+ * increase btf refcnt, and, if present, also refcount the corresponding
+ * kernel module.
+ */
+static int __add_used_btf(struct bpf_verifier_env *env, struct btf *btf)
+{
+ struct btf_mod_pair *btf_mod;
+ int i;
+
+ /* check whether we recorded this BTF (and maybe module) already */
+ for (i = 0; i < env->used_btf_cnt; i++)
+ if (env->used_btfs[i].btf == btf)
+ return i;
+
+ if (env->used_btf_cnt >= MAX_USED_BTFS) {
+ verbose(env, "The total number of btfs per program has reached the limit of %u\n",
+ MAX_USED_BTFS);
+ return -E2BIG;
+ }
+
+ btf_get(btf);
+
+ btf_mod = &env->used_btfs[env->used_btf_cnt];
+ btf_mod->btf = btf;
+ btf_mod->module = NULL;
+
+ /* if we reference variables from kernel module, bump its refcount */
+ if (btf_is_module(btf)) {
+ btf_mod->module = btf_try_get_module(btf);
+ if (!btf_mod->module) {
+ btf_put(btf);
+ return -ENXIO;
+ }
+ }
+
+ return env->used_btf_cnt++;
+}
+
/* replace pseudo btf_id with kernel symbol address */
-static int check_pseudo_btf_id(struct bpf_verifier_env *env,
- struct bpf_insn *insn,
- struct bpf_insn_aux_data *aux)
+static int __check_pseudo_btf_id(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ struct bpf_insn_aux_data *aux,
+ struct btf *btf)
{
const struct btf_var_secinfo *vsi;
const struct btf_type *datasec;
- struct btf_mod_pair *btf_mod;
const struct btf_type *t;
const char *sym_name;
bool percpu = false;
u32 type, id = insn->imm;
- struct btf *btf;
s32 datasec_id;
u64 addr;
- int i, btf_fd, err;
-
- btf_fd = insn[1].imm;
- if (btf_fd) {
- btf = btf_get_by_fd(btf_fd);
- if (IS_ERR(btf)) {
- verbose(env, "invalid module BTF object FD specified.\n");
- return -EINVAL;
- }
- } else {
- if (!btf_vmlinux) {
- verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n");
- return -EINVAL;
- }
- btf = btf_vmlinux;
- btf_get(btf);
- }
+ int i;
t = btf_type_by_id(btf, id);
if (!t) {
verbose(env, "ldimm64 insn specifies invalid btf_id %d.\n", id);
- err = -ENOENT;
- goto err_put;
+ return -ENOENT;
}
- if (!btf_type_is_var(t)) {
- verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n", id);
- err = -EINVAL;
- goto err_put;
+ if (!btf_type_is_var(t) && !btf_type_is_func(t)) {
+ verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR or KIND_FUNC\n", id);
+ return -EINVAL;
}
sym_name = btf_name_by_offset(btf, t->name_off);
@@ -11727,8 +20745,15 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
if (!addr) {
verbose(env, "ldimm64 failed to find the address for kernel symbol '%s'.\n",
sym_name);
- err = -ENOENT;
- goto err_put;
+ return -ENOENT;
+ }
+ insn[0].imm = (u32)addr;
+ insn[1].imm = addr >> 32;
+
+ if (btf_type_is_func(t)) {
+ aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY;
+ aux->btf_var.mem_size = 0;
+ return 0;
}
datasec_id = find_btf_percpu_datasec(btf);
@@ -11742,13 +20767,10 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
}
}
- insn[0].imm = (u32)addr;
- insn[1].imm = addr >> 32;
-
type = t->type;
t = btf_type_skip_modifiers(btf, type, NULL);
if (percpu) {
- aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID;
+ aux->btf_var.reg_type = PTR_TO_BTF_ID | MEM_PERCPU;
aux->btf_var.btf = btf;
aux->btf_var.btf_id = type;
} else if (!btf_type_is_struct(t)) {
@@ -11762,8 +20784,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
tname = btf_name_by_offset(btf, t->name_off);
verbose(env, "ldimm64 unable to resolve the size of type '%s': %ld\n",
tname, PTR_ERR(ret));
- err = -EINVAL;
- goto err_put;
+ return -EINVAL;
}
aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY;
aux->btf_var.mem_size = tsize;
@@ -11773,46 +20794,42 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
aux->btf_var.btf_id = type;
}
- /* check whether we recorded this BTF (and maybe module) already */
- for (i = 0; i < env->used_btf_cnt; i++) {
- if (env->used_btfs[i].btf == btf) {
- btf_put(btf);
- return 0;
- }
- }
+ return 0;
+}
- if (env->used_btf_cnt >= MAX_USED_BTFS) {
- err = -E2BIG;
- goto err_put;
- }
+static int check_pseudo_btf_id(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ struct bpf_insn_aux_data *aux)
+{
+ struct btf *btf;
+ int btf_fd;
+ int err;
- btf_mod = &env->used_btfs[env->used_btf_cnt];
- btf_mod->btf = btf;
- btf_mod->module = NULL;
+ btf_fd = insn[1].imm;
+ if (btf_fd) {
+ CLASS(fd, f)(btf_fd);
- /* if we reference variables from kernel module, bump its refcount */
- if (btf_is_module(btf)) {
- btf_mod->module = btf_try_get_module(btf);
- if (!btf_mod->module) {
- err = -ENXIO;
- goto err_put;
+ btf = __btf_get_by_fd(f);
+ if (IS_ERR(btf)) {
+ verbose(env, "invalid module BTF object FD specified.\n");
+ return -EINVAL;
}
+ } else {
+ if (!btf_vmlinux) {
+ verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n");
+ return -EINVAL;
+ }
+ btf = btf_vmlinux;
}
- env->used_btf_cnt++;
+ err = __check_pseudo_btf_id(env, insn, aux, btf);
+ if (err)
+ return err;
+ err = __add_used_btf(env, btf);
+ if (err < 0)
+ return err;
return 0;
-err_put:
- btf_put(btf);
- return err;
-}
-
-static int check_map_prealloc(struct bpf_map *map)
-{
- return (map->map_type != BPF_MAP_TYPE_HASH &&
- map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
- map->map_type != BPF_MAP_TYPE_HASH_OF_MAPS) ||
- !(map->map_flags & BPF_F_NO_PREALLOC);
}
static bool is_tracing_prog_type(enum bpf_prog_type type)
@@ -11822,19 +20839,17 @@ static bool is_tracing_prog_type(enum bpf_prog_type type)
case BPF_PROG_TYPE_TRACEPOINT:
case BPF_PROG_TYPE_PERF_EVENT:
case BPF_PROG_TYPE_RAW_TRACEPOINT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
return true;
default:
return false;
}
}
-static bool is_preallocated_map(struct bpf_map *map)
+static bool bpf_map_is_cgroup_storage(struct bpf_map *map)
{
- if (!check_map_prealloc(map))
- return false;
- if (map->inner_map_meta && !check_map_prealloc(map->inner_map_meta))
- return false;
- return true;
+ return (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE);
}
static int check_map_prog_compatibility(struct bpf_verifier_env *env,
@@ -11843,37 +20858,22 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
{
enum bpf_prog_type prog_type = resolve_prog_type(prog);
- /*
- * Validate that trace type programs use preallocated hash maps.
- *
- * For programs attached to PERF events this is mandatory as the
- * perf NMI can hit any arbitrary code sequence.
- *
- * All other trace types using preallocated hash maps are unsafe as
- * well because tracepoint or kprobes can be inside locked regions
- * of the memory allocator or at a place where a recursion into the
- * memory allocator would see inconsistent state.
- *
- * On RT enabled kernels run-time allocation of all trace type
- * programs is strictly prohibited due to lock type constraints. On
- * !RT kernels it is allowed for backwards compatibility reasons for
- * now, but warnings are emitted so developers are made aware of
- * the unsafety and can fix their programs before this is enforced.
- */
- if (is_tracing_prog_type(prog_type) && !is_preallocated_map(map)) {
- if (prog_type == BPF_PROG_TYPE_PERF_EVENT) {
- verbose(env, "perf_event programs can only use preallocated hash map\n");
- return -EINVAL;
- }
- if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
- verbose(env, "trace type programs can only use preallocated hash map\n");
+
+ if (map->excl_prog_sha &&
+ memcmp(map->excl_prog_sha, prog->digest, SHA256_DIGEST_SIZE)) {
+ verbose(env, "program's hash doesn't match map's excl_prog_hash\n");
+ return -EACCES;
+ }
+
+ if (btf_record_has_field(map->record, BPF_LIST_HEAD) ||
+ btf_record_has_field(map->record, BPF_RB_ROOT)) {
+ if (is_tracing_prog_type(prog_type)) {
+ verbose(env, "tracing progs cannot use bpf_{list_head,rb_root} yet\n");
return -EINVAL;
}
- WARN_ONCE(1, "trace type BPF program uses run-time allocation\n");
- verbose(env, "trace type programs with run-time allocated hash maps are unsafe. Switch to preallocated hash maps.\n");
}
- if (map_value_has_spin_lock(map)) {
+ if (btf_record_has_field(map->record, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK)) {
if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) {
verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n");
return -EINVAL;
@@ -11883,21 +20883,23 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
verbose(env, "tracing progs cannot use bpf_spin_lock yet\n");
return -EINVAL;
}
+ }
- if (prog->aux->sleepable) {
- verbose(env, "sleepable progs cannot use bpf_spin_lock yet\n");
+ if (btf_record_has_field(map->record, BPF_TIMER)) {
+ if (is_tracing_prog_type(prog_type)) {
+ verbose(env, "tracing progs cannot use bpf_timer yet\n");
return -EINVAL;
}
}
- if (map_value_has_timer(map)) {
+ if (btf_record_has_field(map->record, BPF_WORKQUEUE)) {
if (is_tracing_prog_type(prog_type)) {
- verbose(env, "tracing progs cannot use bpf_timer yet\n");
+ verbose(env, "tracing progs cannot use bpf_wq yet\n");
return -EINVAL;
}
}
- if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&
+ if ((bpf_prog_is_offloaded(prog->aux) || bpf_map_is_offloaded(map)) &&
!bpf_offload_prog_map_match(prog, map)) {
verbose(env, "offload device mismatch between prog and map\n");
return -EINVAL;
@@ -11908,7 +20910,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
return -EINVAL;
}
- if (prog->aux->sleepable)
+ if (prog->sleepable)
switch (map->map_type) {
case BPF_MAP_TYPE_HASH:
case BPF_MAP_TYPE_LRU_HASH:
@@ -11918,30 +20920,115 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_LRU_PERCPU_HASH:
case BPF_MAP_TYPE_ARRAY_OF_MAPS:
case BPF_MAP_TYPE_HASH_OF_MAPS:
- if (!is_preallocated_map(map)) {
- verbose(env,
- "Sleepable programs can only use preallocated maps\n");
- return -EINVAL;
- }
- break;
case BPF_MAP_TYPE_RINGBUF:
+ case BPF_MAP_TYPE_USER_RINGBUF:
case BPF_MAP_TYPE_INODE_STORAGE:
case BPF_MAP_TYPE_SK_STORAGE:
case BPF_MAP_TYPE_TASK_STORAGE:
+ case BPF_MAP_TYPE_CGRP_STORAGE:
+ case BPF_MAP_TYPE_QUEUE:
+ case BPF_MAP_TYPE_STACK:
+ case BPF_MAP_TYPE_ARENA:
+ case BPF_MAP_TYPE_INSN_ARRAY:
break;
default:
verbose(env,
- "Sleepable programs can only use array, hash, and ringbuf maps\n");
+ "Sleepable programs can only use array, hash, ringbuf and local storage maps\n");
return -EINVAL;
}
+ if (bpf_map_is_cgroup_storage(map) &&
+ bpf_cgroup_storage_assign(env->prog->aux, map)) {
+ verbose(env, "only one cgroup storage of each type is allowed\n");
+ return -EBUSY;
+ }
+
+ if (map->map_type == BPF_MAP_TYPE_ARENA) {
+ if (env->prog->aux->arena) {
+ verbose(env, "Only one arena per program\n");
+ return -EBUSY;
+ }
+ if (!env->allow_ptr_leaks || !env->bpf_capable) {
+ verbose(env, "CAP_BPF and CAP_PERFMON are required to use arena\n");
+ return -EPERM;
+ }
+ if (!env->prog->jit_requested) {
+ verbose(env, "JIT is required to use arena\n");
+ return -EOPNOTSUPP;
+ }
+ if (!bpf_jit_supports_arena()) {
+ verbose(env, "JIT doesn't support arena\n");
+ return -EOPNOTSUPP;
+ }
+ env->prog->aux->arena = (void *)map;
+ if (!bpf_arena_get_user_vm_start(env->prog->aux->arena)) {
+ verbose(env, "arena's user address must be set via map_extra or mmap()\n");
+ return -EINVAL;
+ }
+ }
+
return 0;
}
-static bool bpf_map_is_cgroup_storage(struct bpf_map *map)
+static int __add_used_map(struct bpf_verifier_env *env, struct bpf_map *map)
{
- return (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
- map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE);
+ int i, err;
+
+ /* check whether we recorded this map already */
+ for (i = 0; i < env->used_map_cnt; i++)
+ if (env->used_maps[i] == map)
+ return i;
+
+ if (env->used_map_cnt >= MAX_USED_MAPS) {
+ verbose(env, "The total number of maps per program has reached the limit of %u\n",
+ MAX_USED_MAPS);
+ return -E2BIG;
+ }
+
+ err = check_map_prog_compatibility(env, map, env->prog);
+ if (err)
+ return err;
+
+ if (env->prog->sleepable)
+ atomic64_inc(&map->sleepable_refcnt);
+
+ /* hold the map. If the program is rejected by verifier,
+ * the map will be released by release_maps() or it
+ * will be used by the valid program until it's unloaded
+ * and all maps are released in bpf_free_used_maps()
+ */
+ bpf_map_inc(map);
+
+ env->used_maps[env->used_map_cnt++] = map;
+
+ if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) {
+ err = bpf_insn_array_init(map, env->prog);
+ if (err) {
+ verbose(env, "Failed to properly initialize insn array\n");
+ return err;
+ }
+ env->insn_array_maps[env->insn_array_map_cnt++] = map;
+ }
+
+ return env->used_map_cnt - 1;
+}
+
+/* Add map behind fd to used maps list, if it's not already there, and return
+ * its index.
+ * Returns <0 on error, or >= 0 index, on success.
+ */
+static int add_used_map(struct bpf_verifier_env *env, int fd)
+{
+ struct bpf_map *map;
+ CLASS(fd, f)(fd);
+
+ map = __bpf_map_get(f);
+ if (IS_ERR(map)) {
+ verbose(env, "fd %d is not pointing to valid bpf_map\n", fd);
+ return PTR_ERR(map);
+ }
+
+ return __add_used_map(env, map);
}
/* find and rewrite pseudo imm in ld_imm64 instructions:
@@ -11955,7 +21042,7 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
{
struct bpf_insn *insn = env->prog->insnsi;
int insn_cnt = env->prog->len;
- int i, j, err;
+ int i, err;
err = bpf_prog_calc_tag(env->prog);
if (err)
@@ -11963,7 +21050,8 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
for (i = 0; i < insn_cnt; i++, insn++) {
if (BPF_CLASS(insn->code) == BPF_LDX &&
- (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) {
+ ((BPF_MODE(insn->code) != BPF_MEM && BPF_MODE(insn->code) != BPF_MEMSX) ||
+ insn->imm != 0)) {
verbose(env, "BPF_LDX uses reserved fields\n");
return -EINVAL;
}
@@ -11971,7 +21059,7 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
struct bpf_insn_aux_data *aux;
struct bpf_map *map;
- struct fd f;
+ int map_idx;
u64 addr;
u32 fd;
@@ -12034,21 +21122,14 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
break;
}
- f = fdget(fd);
- map = __bpf_map_get(f);
- if (IS_ERR(map)) {
- verbose(env, "fd %d is not pointing to valid bpf_map\n",
- insn[0].imm);
- return PTR_ERR(map);
- }
-
- err = check_map_prog_compatibility(env, map, env->prog);
- if (err) {
- fdput(f);
- return err;
- }
+ map_idx = add_used_map(env, fd);
+ if (map_idx < 0)
+ return map_idx;
+ map = env->used_maps[map_idx];
aux = &env->insn_aux_data[i];
+ aux->map_index = map_idx;
+
if (insn[0].src_reg == BPF_PSEUDO_MAP_FD ||
insn[0].src_reg == BPF_PSEUDO_MAP_IDX) {
addr = (unsigned long)map;
@@ -12057,13 +21138,11 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
if (off >= BPF_MAX_VAR_OFF) {
verbose(env, "direct value offset of %u is not allowed\n", off);
- fdput(f);
return -EINVAL;
}
if (!map->ops->map_direct_value_addr) {
verbose(env, "no direct value access support for this map type\n");
- fdput(f);
return -EINVAL;
}
@@ -12071,7 +21150,6 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
if (err) {
verbose(env, "invalid access to map value pointer, value_size=%u off=%u\n",
map->value_size, off);
- fdput(f);
return err;
}
@@ -12082,38 +21160,6 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
insn[0].imm = (u32)addr;
insn[1].imm = addr >> 32;
- /* check whether we recorded this map already */
- for (j = 0; j < env->used_map_cnt; j++) {
- if (env->used_maps[j] == map) {
- aux->map_index = j;
- fdput(f);
- goto next_insn;
- }
- }
-
- if (env->used_map_cnt >= MAX_USED_MAPS) {
- fdput(f);
- return -E2BIG;
- }
-
- /* hold the map. If the program is rejected by verifier,
- * the map will be released by release_maps() or it
- * will be used by the valid program until it's unloaded
- * and all maps are released in free_used_maps()
- */
- bpf_map_inc(map);
-
- aux->map_index = env->used_map_cnt;
- env->used_maps[env->used_map_cnt++] = map;
-
- if (bpf_map_is_cgroup_storage(map) &&
- bpf_cgroup_storage_assign(env->prog->aux, map)) {
- verbose(env, "only one cgroup storage of each type is allowed\n");
- fdput(f);
- return -EBUSY;
- }
-
- fdput(f);
next_insn:
insn++;
i++;
@@ -12144,8 +21190,7 @@ static void release_maps(struct bpf_verifier_env *env)
/* drop refcnt of maps used by the rejected program */
static void release_btfs(struct bpf_verifier_env *env)
{
- __bpf_free_used_btfs(env->prog->aux, env->used_btfs,
- env->used_btf_cnt);
+ __bpf_free_used_btfs(env->used_btfs, env->used_btf_cnt);
}
/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */
@@ -12169,12 +21214,11 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
* [0, off) and [off, end) to new locations, so the patched range stays zero
*/
static void adjust_insn_aux_data(struct bpf_verifier_env *env,
- struct bpf_insn_aux_data *new_data,
struct bpf_prog *new_prog, u32 off, u32 cnt)
{
- struct bpf_insn_aux_data *old_data = env->insn_aux_data;
+ struct bpf_insn_aux_data *data = env->insn_aux_data;
struct bpf_insn *insn = new_prog->insnsi;
- u32 old_seen = old_data[off].seen;
+ u32 old_seen = data[off].seen;
u32 prog_len;
int i;
@@ -12182,22 +21226,20 @@ static void adjust_insn_aux_data(struct bpf_verifier_env *env,
* (cnt == 1) is taken or not. There is no guarantee INSN at OFF is the
* original insn at old prog.
*/
- old_data[off].zext_dst = insn_has_def32(env, insn + off + cnt - 1);
+ data[off].zext_dst = insn_has_def32(insn + off + cnt - 1);
if (cnt == 1)
return;
prog_len = new_prog->len;
- memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
- memcpy(new_data + off + cnt - 1, old_data + off,
- sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
+ memmove(data + off + cnt - 1, data + off,
+ sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
+ memset(data + off, 0, sizeof(struct bpf_insn_aux_data) * (cnt - 1));
for (i = off; i < off + cnt - 1; i++) {
/* Expand insni[off]'s seen count to the patched range. */
- new_data[i].seen = old_seen;
- new_data[i].zext_dst = insn_has_def32(env, insn + i);
+ data[i].seen = old_seen;
+ data[i].zext_dst = insn_has_def32(insn + i);
}
- env->insn_aux_data = new_data;
- vfree(old_data);
}
static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len)
@@ -12214,6 +21256,33 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len
}
}
+static void release_insn_arrays(struct bpf_verifier_env *env)
+{
+ int i;
+
+ for (i = 0; i < env->insn_array_map_cnt; i++)
+ bpf_insn_array_release(env->insn_array_maps[i]);
+}
+
+static void adjust_insn_arrays(struct bpf_verifier_env *env, u32 off, u32 len)
+{
+ int i;
+
+ if (len == 1)
+ return;
+
+ for (i = 0; i < env->insn_array_map_cnt; i++)
+ bpf_insn_array_adjust(env->insn_array_maps[i], off, len);
+}
+
+static void adjust_insn_arrays_after_remove(struct bpf_verifier_env *env, u32 off, u32 len)
+{
+ int i;
+
+ for (i = 0; i < env->insn_array_map_cnt; i++)
+ bpf_insn_array_adjust_after_remove(env->insn_array_maps[i], off, len);
+}
+
static void adjust_poke_descs(struct bpf_prog *prog, u32 off, u32 len)
{
struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab;
@@ -12235,10 +21304,14 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
struct bpf_insn_aux_data *new_data = NULL;
if (len > 1) {
- new_data = vzalloc(array_size(env->prog->len + len - 1,
- sizeof(struct bpf_insn_aux_data)));
+ new_data = vrealloc(env->insn_aux_data,
+ array_size(env->prog->len + len - 1,
+ sizeof(struct bpf_insn_aux_data)),
+ GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!new_data)
return NULL;
+
+ env->insn_aux_data = new_data;
}
new_prog = bpf_patch_insn_single(env->prog, off, patch, len);
@@ -12247,15 +21320,53 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
verbose(env,
"insn %d cannot be patched due to 16-bit range\n",
env->insn_aux_data[off].orig_idx);
- vfree(new_data);
return NULL;
}
- adjust_insn_aux_data(env, new_data, new_prog, off, len);
+ adjust_insn_aux_data(env, new_prog, off, len);
adjust_subprog_starts(env, off, len);
+ adjust_insn_arrays(env, off, len);
adjust_poke_descs(new_prog, off, len);
return new_prog;
}
+/*
+ * For all jmp insns in a given 'prog' that point to 'tgt_idx' insn adjust the
+ * jump offset by 'delta'.
+ */
+static int adjust_jmp_off(struct bpf_prog *prog, u32 tgt_idx, u32 delta)
+{
+ struct bpf_insn *insn = prog->insnsi;
+ u32 insn_cnt = prog->len, i;
+ s32 imm;
+ s16 off;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ u8 code = insn->code;
+
+ if (tgt_idx <= i && i < tgt_idx + delta)
+ continue;
+
+ if ((BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) ||
+ BPF_OP(code) == BPF_CALL || BPF_OP(code) == BPF_EXIT)
+ continue;
+
+ if (insn->code == (BPF_JMP32 | BPF_JA)) {
+ if (i + 1 + insn->imm != tgt_idx)
+ continue;
+ if (check_add_overflow(insn->imm, delta, &imm))
+ return -ERANGE;
+ insn->imm = imm;
+ } else {
+ if (i + 1 + insn->off != tgt_idx)
+ continue;
+ if (check_add_overflow(insn->off, delta, &off))
+ return -ERANGE;
+ insn->off = off;
+ }
+ }
+ return 0;
+}
+
static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env,
u32 off, u32 cnt)
{
@@ -12376,15 +21487,39 @@ static int bpf_adj_linfo_after_remove(struct bpf_verifier_env *env, u32 off,
return 0;
}
+/*
+ * Clean up dynamically allocated fields of aux data for instructions [start, ...]
+ */
+static void clear_insn_aux_data(struct bpf_verifier_env *env, int start, int len)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ struct bpf_insn *insns = env->prog->insnsi;
+ int end = start + len;
+ int i;
+
+ for (i = start; i < end; i++) {
+ if (aux_data[i].jt) {
+ kvfree(aux_data[i].jt);
+ aux_data[i].jt = NULL;
+ }
+
+ if (bpf_is_ldimm64(&insns[i]))
+ i++;
+ }
+}
+
static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
{
struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
unsigned int orig_prog_len = env->prog->len;
int err;
- if (bpf_prog_is_dev_bound(env->prog->aux))
+ if (bpf_prog_is_offloaded(env->prog->aux))
bpf_prog_offload_remove_insns(env, off, cnt);
+ /* Should be called before bpf_remove_insns, as it uses prog->insnsi */
+ clear_insn_aux_data(env, off, cnt);
+
err = bpf_remove_insns(env->prog, off, cnt);
if (err)
return err;
@@ -12397,6 +21532,8 @@ static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
if (err)
return err;
+ adjust_insn_arrays_after_remove(env, off, cnt);
+
memmove(aux_data + off, aux_data + off + cnt,
sizeof(*aux_data) * (orig_prog_len - off - cnt));
@@ -12434,13 +21571,13 @@ static bool insn_is_cond_jump(u8 code)
{
u8 op;
+ op = BPF_OP(code);
if (BPF_CLASS(code) == BPF_JMP32)
- return true;
+ return op != BPF_JA;
if (BPF_CLASS(code) != BPF_JMP)
return false;
- op = BPF_OP(code);
return op != BPF_JA && op != BPF_EXIT && op != BPF_CALL;
}
@@ -12463,7 +21600,7 @@ static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env)
else
continue;
- if (bpf_prog_is_dev_bound(env->prog->aux))
+ if (bpf_prog_is_offloaded(env->prog->aux))
bpf_prog_offload_replace_insn(env, i, &ja);
memcpy(insn, &ja, sizeof(ja));
@@ -12494,22 +21631,29 @@ static int opt_remove_dead_code(struct bpf_verifier_env *env)
return 0;
}
+static const struct bpf_insn NOP = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
+static const struct bpf_insn MAY_GOTO_0 = BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, 0, 0);
+
static int opt_remove_nops(struct bpf_verifier_env *env)
{
- const struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
struct bpf_insn *insn = env->prog->insnsi;
int insn_cnt = env->prog->len;
+ bool is_may_goto_0, is_ja;
int i, err;
for (i = 0; i < insn_cnt; i++) {
- if (memcmp(&insn[i], &ja, sizeof(ja)))
+ is_may_goto_0 = !memcmp(&insn[i], &MAY_GOTO_0, sizeof(MAY_GOTO_0));
+ is_ja = !memcmp(&insn[i], &NOP, sizeof(NOP));
+
+ if (!is_may_goto_0 && !is_ja)
continue;
err = verifier_remove_insns(env, i, 1);
if (err)
return err;
insn_cnt--;
- i--;
+ /* Go back one insn to catch may_goto +1; may_goto +0 sequence */
+ i -= (is_may_goto_0 && i > 0) ? 2 : 1;
}
return 0;
@@ -12518,7 +21662,10 @@ static int opt_remove_nops(struct bpf_verifier_env *env)
static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
const union bpf_attr *attr)
{
- struct bpf_insn *patch, zext_patch[2], rnd_hi32_patch[4];
+ struct bpf_insn *patch;
+ /* use env->insn_buf as two independent buffers */
+ struct bpf_insn *zext_patch = env->insn_buf;
+ struct bpf_insn *rnd_hi32_patch = &env->insn_buf[2];
struct bpf_insn_aux_data *aux = env->insn_aux_data;
int i, patch_len, delta = 0, len = env->prog->len;
struct bpf_insn *insns = env->prog->insnsi;
@@ -12553,7 +21700,7 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
* BPF_STX + SRC_OP, so it is safe to pass NULL
* here.
*/
- if (is_reg64(env, &insn, load_reg, NULL, DST_OP)) {
+ if (is_reg64(&insn, load_reg, NULL, DST_OP)) {
if (class == BPF_LD &&
BPF_MODE(code) == BPF_IMM)
i++;
@@ -12565,7 +21712,7 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
aux[adj_idx].ptr_type == PTR_TO_CTX)
continue;
- imm_rnd = get_random_int();
+ imm_rnd = get_random_u32();
rnd_hi32_patch[0] = insn;
rnd_hi32_patch[1].imm = imm_rnd;
rnd_hi32_patch[3].dst_reg = load_reg;
@@ -12587,10 +21734,13 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
if (!bpf_jit_needs_zext() && !is_cmpxchg_insn(&insn))
continue;
- if (WARN_ON(load_reg == -1)) {
- verbose(env, "verifier bug. zext_dst is set, but no reg is defined\n");
+ /* Zero-extension is done by the caller. */
+ if (bpf_pseudo_kfunc_call(&insn))
+ continue;
+
+ if (verifier_bug_if(load_reg == -1, env,
+ "zext_dst is set, but no reg is defined"))
return -EFAULT;
- }
zext_patch[0] = insn;
zext_patch[1].dst_reg = load_reg;
@@ -12617,25 +21767,54 @@ apply_patch_buffer:
*/
static int convert_ctx_accesses(struct bpf_verifier_env *env)
{
+ struct bpf_subprog_info *subprogs = env->subprog_info;
const struct bpf_verifier_ops *ops = env->ops;
- int i, cnt, size, ctx_field_size, delta = 0;
+ int i, cnt, size, ctx_field_size, ret, delta = 0, epilogue_cnt = 0;
const int insn_cnt = env->prog->len;
- struct bpf_insn insn_buf[16], *insn;
+ struct bpf_insn *epilogue_buf = env->epilogue_buf;
+ struct bpf_insn *insn_buf = env->insn_buf;
+ struct bpf_insn *insn;
u32 target_size, size_default, off;
struct bpf_prog *new_prog;
enum bpf_access_type type;
bool is_narrower_load;
+ int epilogue_idx = 0;
+
+ if (ops->gen_epilogue) {
+ epilogue_cnt = ops->gen_epilogue(epilogue_buf, env->prog,
+ -(subprogs[0].stack_depth + 8));
+ if (epilogue_cnt >= INSN_BUF_SIZE) {
+ verifier_bug(env, "epilogue is too long");
+ return -EFAULT;
+ } else if (epilogue_cnt) {
+ /* Save the ARG_PTR_TO_CTX for the epilogue to use */
+ cnt = 0;
+ subprogs[0].stack_depth += 8;
+ insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_1,
+ -subprogs[0].stack_depth);
+ insn_buf[cnt++] = env->prog->insnsi[0];
+ new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+ env->prog = new_prog;
+ delta += cnt - 1;
+
+ ret = add_kfunc_in_insns(env, epilogue_buf, epilogue_cnt - 1);
+ if (ret < 0)
+ return ret;
+ }
+ }
if (ops->gen_prologue || env->seen_direct_write) {
if (!ops->gen_prologue) {
- verbose(env, "bpf verifier is misconfigured\n");
- return -EINVAL;
+ verifier_bug(env, "gen_prologue is null");
+ return -EFAULT;
}
cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
env->prog);
- if (cnt >= ARRAY_SIZE(insn_buf)) {
- verbose(env, "bpf verifier is misconfigured\n");
- return -EINVAL;
+ if (cnt >= INSN_BUF_SIZE) {
+ verifier_bug(env, "prologue is too long");
+ return -EFAULT;
} else if (cnt) {
new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
if (!new_prog)
@@ -12643,24 +21822,55 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
env->prog = new_prog;
delta += cnt - 1;
+
+ ret = add_kfunc_in_insns(env, insn_buf, cnt - 1);
+ if (ret < 0)
+ return ret;
}
}
- if (bpf_prog_is_dev_bound(env->prog->aux))
+ if (delta)
+ WARN_ON(adjust_jmp_off(env->prog, 0, delta));
+
+ if (bpf_prog_is_offloaded(env->prog->aux))
return 0;
insn = env->prog->insnsi + delta;
for (i = 0; i < insn_cnt; i++, insn++) {
bpf_convert_ctx_access_t convert_ctx_access;
- bool ctx_access;
+ u8 mode;
+
+ if (env->insn_aux_data[i + delta].nospec) {
+ WARN_ON_ONCE(env->insn_aux_data[i + delta].alu_state);
+ struct bpf_insn *patch = insn_buf;
+
+ *patch++ = BPF_ST_NOSPEC();
+ *patch++ = *insn;
+ cnt = patch - insn_buf;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ /* This can not be easily merged with the
+ * nospec_result-case, because an insn may require a
+ * nospec before and after itself. Therefore also do not
+ * 'continue' here but potentially apply further
+ * patching to insn. *insn should equal patch[1] now.
+ */
+ }
if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||
insn->code == (BPF_LDX | BPF_MEM | BPF_H) ||
insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
- insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) {
+ insn->code == (BPF_LDX | BPF_MEM | BPF_DW) ||
+ insn->code == (BPF_LDX | BPF_MEMSX | BPF_B) ||
+ insn->code == (BPF_LDX | BPF_MEMSX | BPF_H) ||
+ insn->code == (BPF_LDX | BPF_MEMSX | BPF_W)) {
type = BPF_READ;
- ctx_access = true;
} else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
@@ -12670,20 +21880,48 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
insn->code == (BPF_ST | BPF_MEM | BPF_W) ||
insn->code == (BPF_ST | BPF_MEM | BPF_DW)) {
type = BPF_WRITE;
- ctx_access = BPF_CLASS(insn->code) == BPF_STX;
+ } else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_B) ||
+ insn->code == (BPF_STX | BPF_ATOMIC | BPF_H) ||
+ insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) ||
+ insn->code == (BPF_STX | BPF_ATOMIC | BPF_DW)) &&
+ env->insn_aux_data[i + delta].ptr_type == PTR_TO_ARENA) {
+ insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code);
+ env->prog->aux->num_exentries++;
+ continue;
+ } else if (insn->code == (BPF_JMP | BPF_EXIT) &&
+ epilogue_cnt &&
+ i + delta < subprogs[1].start) {
+ /* Generate epilogue for the main prog */
+ if (epilogue_idx) {
+ /* jump back to the earlier generated epilogue */
+ insn_buf[0] = BPF_JMP32_A(epilogue_idx - i - delta - 1);
+ cnt = 1;
+ } else {
+ memcpy(insn_buf, epilogue_buf,
+ epilogue_cnt * sizeof(*epilogue_buf));
+ cnt = epilogue_cnt;
+ /* epilogue_idx cannot be 0. It must have at
+ * least one ctx ptr saving insn before the
+ * epilogue.
+ */
+ epilogue_idx = i + delta;
+ }
+ goto patch_insn_buf;
} else {
continue;
}
if (type == BPF_WRITE &&
- env->insn_aux_data[i + delta].sanitize_stack_spill) {
- struct bpf_insn patch[] = {
- *insn,
- BPF_ST_NOSPEC(),
- };
+ env->insn_aux_data[i + delta].nospec_result) {
+ /* nospec_result is only used to mitigate Spectre v4 and
+ * to limit verification-time for Spectre v1.
+ */
+ struct bpf_insn *patch = insn_buf;
- cnt = ARRAY_SIZE(patch);
- new_prog = bpf_patch_insn_data(env, i + delta, patch, cnt);
+ *patch++ = *insn;
+ *patch++ = BPF_ST_NOSPEC();
+ cnt = patch - insn_buf;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
if (!new_prog)
return -ENOMEM;
@@ -12693,10 +21931,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
continue;
}
- if (!ctx_access)
- continue;
-
- switch (env->insn_aux_data[i + delta].ptr_type) {
+ switch ((int)env->insn_aux_data[i + delta].ptr_type) {
case PTR_TO_CTX:
if (!ops->convert_ctx_access)
continue;
@@ -12713,21 +21948,44 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
convert_ctx_access = bpf_xdp_sock_convert_ctx_access;
break;
case PTR_TO_BTF_ID:
+ case PTR_TO_BTF_ID | PTR_UNTRUSTED:
+ /* PTR_TO_BTF_ID | MEM_ALLOC always has a valid lifetime, unlike
+ * PTR_TO_BTF_ID, and an active ref_obj_id, but the same cannot
+ * be said once it is marked PTR_UNTRUSTED, hence we must handle
+ * any faults for loads into such types. BPF_WRITE is disallowed
+ * for this case.
+ */
+ case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED:
+ case PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED:
if (type == BPF_READ) {
- insn->code = BPF_LDX | BPF_PROBE_MEM |
- BPF_SIZE((insn)->code);
+ if (BPF_MODE(insn->code) == BPF_MEM)
+ insn->code = BPF_LDX | BPF_PROBE_MEM |
+ BPF_SIZE((insn)->code);
+ else
+ insn->code = BPF_LDX | BPF_PROBE_MEMSX |
+ BPF_SIZE((insn)->code);
env->prog->aux->num_exentries++;
- } else if (resolve_prog_type(env->prog) != BPF_PROG_TYPE_STRUCT_OPS) {
- verbose(env, "Writes through BTF pointers are not allowed\n");
- return -EINVAL;
}
continue;
+ case PTR_TO_ARENA:
+ if (BPF_MODE(insn->code) == BPF_MEMSX) {
+ if (!bpf_jit_supports_insn(insn, true)) {
+ verbose(env, "sign extending loads from arena are not supported yet\n");
+ return -EOPNOTSUPP;
+ }
+ insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32SX | BPF_SIZE(insn->code);
+ } else {
+ insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code);
+ }
+ env->prog->aux->num_exentries++;
+ continue;
default:
continue;
}
ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size;
size = BPF_LDST_BYTES(insn);
+ mode = BPF_MODE(insn->code);
/* If the read access is a narrower load of the field,
* convert to a 4/8-byte load, to minimum program type specific
@@ -12741,8 +21999,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
u8 size_code;
if (type == BPF_WRITE) {
- verbose(env, "bpf verifier narrow ctx access misconfigured\n");
- return -EINVAL;
+ verifier_bug(env, "narrow ctx access misconfigured");
+ return -EFAULT;
}
size_code = BPF_H;
@@ -12758,18 +22016,18 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
target_size = 0;
cnt = convert_ctx_access(type, insn, insn_buf, env->prog,
&target_size);
- if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) ||
+ if (cnt == 0 || cnt >= INSN_BUF_SIZE ||
(ctx_field_size && !target_size)) {
- verbose(env, "bpf verifier is misconfigured\n");
- return -EINVAL;
+ verifier_bug(env, "error during ctx access conversion (%d)", cnt);
+ return -EFAULT;
}
if (is_narrower_load && size < target_size) {
u8 shift = bpf_ctx_narrow_access_offset(
off, size, size_default) * 8;
- if (shift && cnt + 1 >= ARRAY_SIZE(insn_buf)) {
- verbose(env, "bpf verifier narrow ctx load misconfigured\n");
- return -EINVAL;
+ if (shift && cnt + 1 >= INSN_BUF_SIZE) {
+ verifier_bug(env, "narrow ctx load misconfigured");
+ return -EFAULT;
}
if (ctx_field_size <= 4) {
if (shift)
@@ -12783,11 +22041,16 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH,
insn->dst_reg,
shift);
- insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg,
+ insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
(1ULL << size * 8) - 1);
}
}
+ if (mode == BPF_MEMSX)
+ insn_buf[cnt++] = BPF_RAW_INSN(BPF_ALU64 | BPF_MOV | BPF_X,
+ insn->dst_reg, insn->dst_reg,
+ size * 8, 0);
+patch_insn_buf:
new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
if (!new_prog)
return -ENOMEM;
@@ -12810,6 +22073,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
struct bpf_insn *insn;
void *old_bpf_func;
int err, num_exentries;
+ int old_len, subprog_start_adjustment = 0;
if (env->subprog_cnt <= 1)
return 0;
@@ -12823,11 +22087,9 @@ static int jit_subprogs(struct bpf_verifier_env *env)
* propagated in any case.
*/
subprog = find_subprog(env, i + insn->imm + 1);
- if (subprog < 0) {
- WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
- i + insn->imm + 1);
+ if (verifier_bug_if(subprog < 0, env, "No program to jit at insn %d",
+ i + insn->imm + 1))
return -EFAULT;
- }
/* temporarily remember subprog id inside insn instead of
* aux_data, since next loop will split up all insns into funcs
*/
@@ -12838,12 +22100,19 @@ static int jit_subprogs(struct bpf_verifier_env *env)
env->insn_aux_data[i].call_imm = insn->imm;
/* point imm to __bpf_call_base+1 from JITs point of view */
insn->imm = 1;
- if (bpf_pseudo_func(insn))
+ if (bpf_pseudo_func(insn)) {
+#if defined(MODULES_VADDR)
+ u64 addr = MODULES_VADDR;
+#else
+ u64 addr = VMALLOC_START;
+#endif
/* jit (e.g. x86_64) may emit fewer instructions
* if it learns a u32 imm is the same as a u64 imm.
- * Force a non zero here.
+ * Set close enough to possible prog address.
*/
- insn[1].imm = 1;
+ insn[0].imm = (u32)addr;
+ insn[1].imm = addr >> 32;
+ }
}
err = bpf_prog_alloc_jited_linfo(prog);
@@ -12875,12 +22144,16 @@ static int jit_subprogs(struct bpf_verifier_env *env)
if (bpf_prog_calc_tag(func[i]))
goto out_free;
func[i]->is_func = 1;
+ func[i]->sleepable = prog->sleepable;
func[i]->aux->func_idx = i;
/* Below members will be freed only at prog->aux */
func[i]->aux->btf = prog->aux->btf;
+ func[i]->aux->subprog_start = subprog_start + subprog_start_adjustment;
func[i]->aux->func_info = prog->aux->func_info;
+ func[i]->aux->func_info_cnt = prog->aux->func_info_cnt;
func[i]->aux->poke_tab = prog->aux->poke_tab;
func[i]->aux->size_poke_tab = prog->aux->size_poke_tab;
+ func[i]->aux->main_prog_aux = prog->aux;
for (j = 0; j < prog->aux->size_poke_tab; j++) {
struct bpf_jit_poke_descriptor *poke;
@@ -12891,28 +22164,55 @@ static int jit_subprogs(struct bpf_verifier_env *env)
poke->aux = func[i]->aux;
}
- /* Use bpf_prog_F_tag to indicate functions in stack traces.
- * Long term would need debug info to populate names
- */
func[i]->aux->name[0] = 'F';
func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
+ if (env->subprog_info[i].priv_stack_mode == PRIV_STACK_ADAPTIVE)
+ func[i]->aux->jits_use_priv_stack = true;
+
func[i]->jit_requested = 1;
+ func[i]->blinding_requested = prog->blinding_requested;
func[i]->aux->kfunc_tab = prog->aux->kfunc_tab;
func[i]->aux->kfunc_btf_tab = prog->aux->kfunc_btf_tab;
func[i]->aux->linfo = prog->aux->linfo;
func[i]->aux->nr_linfo = prog->aux->nr_linfo;
func[i]->aux->jited_linfo = prog->aux->jited_linfo;
func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx;
+ func[i]->aux->arena = prog->aux->arena;
+ func[i]->aux->used_maps = env->used_maps;
+ func[i]->aux->used_map_cnt = env->used_map_cnt;
num_exentries = 0;
insn = func[i]->insnsi;
for (j = 0; j < func[i]->len; j++, insn++) {
if (BPF_CLASS(insn->code) == BPF_LDX &&
- BPF_MODE(insn->code) == BPF_PROBE_MEM)
+ (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
+ BPF_MODE(insn->code) == BPF_PROBE_MEM32 ||
+ BPF_MODE(insn->code) == BPF_PROBE_MEM32SX ||
+ BPF_MODE(insn->code) == BPF_PROBE_MEMSX))
+ num_exentries++;
+ if ((BPF_CLASS(insn->code) == BPF_STX ||
+ BPF_CLASS(insn->code) == BPF_ST) &&
+ BPF_MODE(insn->code) == BPF_PROBE_MEM32)
+ num_exentries++;
+ if (BPF_CLASS(insn->code) == BPF_STX &&
+ BPF_MODE(insn->code) == BPF_PROBE_ATOMIC)
num_exentries++;
}
func[i]->aux->num_exentries = num_exentries;
func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable;
+ func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb;
+ func[i]->aux->changes_pkt_data = env->subprog_info[i].changes_pkt_data;
+ func[i]->aux->might_sleep = env->subprog_info[i].might_sleep;
+ if (!i)
+ func[i]->aux->exception_boundary = env->seen_exception;
+
+ /*
+ * To properly pass the absolute subprog start to jit
+ * all instruction adjustments should be accumulated
+ */
+ old_len = func[i]->len;
func[i] = bpf_int_jit_compile(func[i]);
+ subprog_start_adjustment += func[i]->len - old_len;
+
if (!func[i]->jited) {
err = -ENOTSUPP;
goto out_free;
@@ -12951,7 +22251,8 @@ static int jit_subprogs(struct bpf_verifier_env *env)
* the call instruction, as an index for this list
*/
func[i]->aux->func = func;
- func[i]->aux->func_cnt = env->subprog_cnt;
+ func[i]->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
+ func[i]->aux->real_func_cnt = env->subprog_cnt;
}
for (i = 0; i < env->subprog_cnt; i++) {
old_bpf_func = func[i]->bpf_func;
@@ -12964,14 +22265,28 @@ static int jit_subprogs(struct bpf_verifier_env *env)
cond_resched();
}
- /* finally lock prog and jit images for all functions and
- * populate kallsysm
+ /*
+ * Cleanup func[i]->aux fields which aren't required
+ * or can become invalid in future
*/
for (i = 0; i < env->subprog_cnt; i++) {
- bpf_prog_lock_ro(func[i]);
- bpf_prog_kallsyms_add(func[i]);
+ func[i]->aux->used_maps = NULL;
+ func[i]->aux->used_map_cnt = 0;
+ }
+
+ /* finally lock prog and jit images for all functions and
+ * populate kallsysm. Begin at the first subprogram, since
+ * bpf_prog_load will add the kallsyms for the main program.
+ */
+ for (i = 1; i < env->subprog_cnt; i++) {
+ err = bpf_prog_lock_ro(func[i]);
+ if (err)
+ goto out_free;
}
+ for (i = 1; i < env->subprog_cnt; i++)
+ bpf_prog_kallsyms_add(func[i]);
+
/* Last step: make now unused interpreter insns from main
* prog consistent for later dump requests, so they can
* later look the same as if they were interpreted only.
@@ -12992,8 +22307,14 @@ static int jit_subprogs(struct bpf_verifier_env *env)
prog->jited = 1;
prog->bpf_func = func[0]->bpf_func;
+ prog->jited_len = func[0]->jited_len;
+ prog->aux->extable = func[0]->aux->extable;
+ prog->aux->num_exentries = func[0]->aux->num_exentries;
prog->aux->func = func;
- prog->aux->func_cnt = env->subprog_cnt;
+ prog->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
+ prog->aux->real_func_cnt = env->subprog_cnt;
+ prog->aux->bpf_exception_cb = (void *)func[env->exception_callback_subprog]->bpf_func;
+ prog->aux->exception_boundary = func[0]->aux->exception_boundary;
bpf_prog_jit_attempt_done(prog);
return 0;
out_free:
@@ -13019,6 +22340,7 @@ out_free:
out_undo_insn:
/* cleanup main prog to be interpreted */
prog->jit_requested = 0;
+ prog->blinding_requested = 0;
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
if (!bpf_pseudo_call(insn))
continue;
@@ -13040,7 +22362,7 @@ static int fixup_call_args(struct bpf_verifier_env *env)
int err = 0;
if (env->prog->jit_requested &&
- !bpf_prog_is_dev_bound(env->prog->aux)) {
+ !bpf_prog_is_offloaded(env->prog->aux)) {
err = jit_subprogs(env);
if (err == 0)
return 0;
@@ -13080,28 +22402,203 @@ static int fixup_call_args(struct bpf_verifier_env *env)
return err;
}
-static int fixup_kfunc_call(struct bpf_verifier_env *env,
- struct bpf_insn *insn)
+/* replace a generic kfunc with a specialized version if necessary */
+static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc, int insn_idx)
{
- const struct bpf_kfunc_desc *desc;
+ struct bpf_prog *prog = env->prog;
+ bool seen_direct_write;
+ void *xdp_kfunc;
+ bool is_rdonly;
+ u32 func_id = desc->func_id;
+ u16 offset = desc->offset;
+ unsigned long addr = desc->addr;
+
+ if (offset) /* return if module BTF is used */
+ return 0;
+
+ if (bpf_dev_bound_kfunc_id(func_id)) {
+ xdp_kfunc = bpf_dev_bound_resolve_kfunc(prog, func_id);
+ if (xdp_kfunc)
+ addr = (unsigned long)xdp_kfunc;
+ /* fallback to default kfunc when not supported by netdev */
+ } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) {
+ seen_direct_write = env->seen_direct_write;
+ is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE);
+
+ if (is_rdonly)
+ addr = (unsigned long)bpf_dynptr_from_skb_rdonly;
+
+ /* restore env->seen_direct_write to its original value, since
+ * may_access_direct_pkt_data mutates it
+ */
+ env->seen_direct_write = seen_direct_write;
+ } else if (func_id == special_kfunc_list[KF_bpf_set_dentry_xattr]) {
+ if (bpf_lsm_has_d_inode_locked(prog))
+ addr = (unsigned long)bpf_set_dentry_xattr_locked;
+ } else if (func_id == special_kfunc_list[KF_bpf_remove_dentry_xattr]) {
+ if (bpf_lsm_has_d_inode_locked(prog))
+ addr = (unsigned long)bpf_remove_dentry_xattr_locked;
+ } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) {
+ if (!env->insn_aux_data[insn_idx].non_sleepable)
+ addr = (unsigned long)bpf_dynptr_from_file_sleepable;
+ }
+ desc->addr = addr;
+ return 0;
+}
+
+static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux,
+ u16 struct_meta_reg,
+ u16 node_offset_reg,
+ struct bpf_insn *insn,
+ struct bpf_insn *insn_buf,
+ int *cnt)
+{
+ struct btf_struct_meta *kptr_struct_meta = insn_aux->kptr_struct_meta;
+ struct bpf_insn addr[2] = { BPF_LD_IMM64(struct_meta_reg, (long)kptr_struct_meta) };
+
+ insn_buf[0] = addr[0];
+ insn_buf[1] = addr[1];
+ insn_buf[2] = BPF_MOV64_IMM(node_offset_reg, insn_aux->insert_off);
+ insn_buf[3] = *insn;
+ *cnt = 4;
+}
+
+static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ struct bpf_insn *insn_buf, int insn_idx, int *cnt)
+{
+ struct bpf_kfunc_desc *desc;
+ int err;
if (!insn->imm) {
verbose(env, "invalid kernel function call not eliminated in verifier pass\n");
return -EINVAL;
}
- /* insn->imm has the btf func_id. Replace it with
- * an address (relative to __bpf_base_call).
+ *cnt = 0;
+
+ /* insn->imm has the btf func_id. Replace it with an offset relative to
+ * __bpf_call_base, unless the JIT needs to call functions that are
+ * further than 32 bits away (bpf_jit_supports_far_kfunc_call()).
*/
desc = find_kfunc_desc(env->prog, insn->imm, insn->off);
if (!desc) {
- verbose(env, "verifier internal error: kernel function descriptor not found for func_id %u\n",
- insn->imm);
+ verifier_bug(env, "kernel function descriptor not found for func_id %u",
+ insn->imm);
return -EFAULT;
}
- insn->imm = desc->imm;
+ err = specialize_kfunc(env, desc, insn_idx);
+ if (err)
+ return err;
+
+ if (!bpf_jit_supports_far_kfunc_call())
+ insn->imm = BPF_CALL_IMM(desc->addr);
+ if (insn->off)
+ return 0;
+ if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
+ desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+ struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
+ struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
+ u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size;
+
+ if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && kptr_struct_meta) {
+ verifier_bug(env, "NULL kptr_struct_meta expected at insn_idx %d",
+ insn_idx);
+ return -EFAULT;
+ }
+
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_1, obj_new_size);
+ insn_buf[1] = addr[0];
+ insn_buf[2] = addr[1];
+ insn_buf[3] = *insn;
+ *cnt = 4;
+ } else if (desc->func_id == special_kfunc_list[KF_bpf_obj_drop_impl] ||
+ desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] ||
+ desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
+ struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
+ struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
+
+ if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] && kptr_struct_meta) {
+ verifier_bug(env, "NULL kptr_struct_meta expected at insn_idx %d",
+ insn_idx);
+ return -EFAULT;
+ }
+
+ if (desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] &&
+ !kptr_struct_meta) {
+ verifier_bug(env, "kptr_struct_meta expected at insn_idx %d",
+ insn_idx);
+ return -EFAULT;
+ }
+
+ insn_buf[0] = addr[0];
+ insn_buf[1] = addr[1];
+ insn_buf[2] = *insn;
+ *cnt = 3;
+ } else if (desc->func_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
+ desc->func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
+ desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
+ int struct_meta_reg = BPF_REG_3;
+ int node_offset_reg = BPF_REG_4;
+
+ /* rbtree_add has extra 'less' arg, so args-to-fixup are in diff regs */
+ if (desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ struct_meta_reg = BPF_REG_4;
+ node_offset_reg = BPF_REG_5;
+ }
+ if (!kptr_struct_meta) {
+ verifier_bug(env, "kptr_struct_meta expected at insn_idx %d",
+ insn_idx);
+ return -EFAULT;
+ }
+
+ __fixup_collection_insert_kfunc(&env->insn_aux_data[insn_idx], struct_meta_reg,
+ node_offset_reg, insn, insn_buf, cnt);
+ } else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
+ desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
+ insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
+ *cnt = 1;
+ }
+
+ if (env->insn_aux_data[insn_idx].arg_prog) {
+ u32 regno = env->insn_aux_data[insn_idx].arg_prog;
+ struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(regno, (long)env->prog->aux) };
+ int idx = *cnt;
+
+ insn_buf[idx++] = ld_addrs[0];
+ insn_buf[idx++] = ld_addrs[1];
+ insn_buf[idx++] = *insn;
+ *cnt = idx;
+ }
+ return 0;
+}
+
+/* The function requires that first instruction in 'patch' is insnsi[prog->len - 1] */
+static int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *patch, int len)
+{
+ struct bpf_subprog_info *info = env->subprog_info;
+ int cnt = env->subprog_cnt;
+ struct bpf_prog *prog;
+
+ /* We only reserve one slot for hidden subprogs in subprog_info. */
+ if (env->hidden_subprog_cnt) {
+ verifier_bug(env, "only one hidden subprog supported");
+ return -EFAULT;
+ }
+ /* We're not patching any existing instruction, just appending the new
+ * ones for the hidden subprog. Hence all of the adjustment operations
+ * in bpf_patch_insn_data are no-ops.
+ */
+ prog = bpf_patch_insn_data(env, env->prog->len - 1, patch, len);
+ if (!prog)
+ return -ENOMEM;
+ env->prog = prog;
+ info[cnt + 1].start = info[cnt].start;
+ info[cnt].start = prog->len - len + 1;
+ env->subprog_cnt++;
+ env->hidden_subprog_cnt++;
return 0;
}
@@ -13112,58 +22609,202 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
{
struct bpf_prog *prog = env->prog;
enum bpf_attach_type eatype = prog->expected_attach_type;
- bool expect_blinding = bpf_jit_blinding_enabled(prog);
enum bpf_prog_type prog_type = resolve_prog_type(prog);
struct bpf_insn *insn = prog->insnsi;
const struct bpf_func_proto *fn;
const int insn_cnt = prog->len;
const struct bpf_map_ops *ops;
struct bpf_insn_aux_data *aux;
- struct bpf_insn insn_buf[16];
+ struct bpf_insn *insn_buf = env->insn_buf;
struct bpf_prog *new_prog;
struct bpf_map *map_ptr;
- int i, ret, cnt, delta = 0;
+ int i, ret, cnt, delta = 0, cur_subprog = 0;
+ struct bpf_subprog_info *subprogs = env->subprog_info;
+ u16 stack_depth = subprogs[cur_subprog].stack_depth;
+ u16 stack_depth_extra = 0;
+
+ if (env->seen_exception && !env->exception_callback_subprog) {
+ struct bpf_insn *patch = insn_buf;
+
+ *patch++ = env->prog->insnsi[insn_cnt - 1];
+ *patch++ = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
+ *patch++ = BPF_EXIT_INSN();
+ ret = add_hidden_subprog(env, insn_buf, patch - insn_buf);
+ if (ret < 0)
+ return ret;
+ prog = env->prog;
+ insn = prog->insnsi;
+
+ env->exception_callback_subprog = env->subprog_cnt - 1;
+ /* Don't update insn_cnt, as add_hidden_subprog always appends insns */
+ mark_subprog_exc_cb(env, env->exception_callback_subprog);
+ }
+
+ for (i = 0; i < insn_cnt;) {
+ if (insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->imm) {
+ if ((insn->off == BPF_ADDR_SPACE_CAST && insn->imm == 1) ||
+ (((struct bpf_map *)env->prog->aux->arena)->map_flags & BPF_F_NO_USER_CONV)) {
+ /* convert to 32-bit mov that clears upper 32-bit */
+ insn->code = BPF_ALU | BPF_MOV | BPF_X;
+ /* clear off and imm, so it's a normal 'wX = wY' from JIT pov */
+ insn->off = 0;
+ insn->imm = 0;
+ } /* cast from as(0) to as(1) should be handled by JIT */
+ goto next_insn;
+ }
+
+ if (env->insn_aux_data[i + delta].needs_zext)
+ /* Convert BPF_CLASS(insn->code) == BPF_ALU64 to 32-bit ALU */
+ insn->code = BPF_ALU | BPF_OP(insn->code) | BPF_SRC(insn->code);
+
+ /* Make sdiv/smod divide-by-minus-one exceptions impossible. */
+ if ((insn->code == (BPF_ALU64 | BPF_MOD | BPF_K) ||
+ insn->code == (BPF_ALU64 | BPF_DIV | BPF_K) ||
+ insn->code == (BPF_ALU | BPF_MOD | BPF_K) ||
+ insn->code == (BPF_ALU | BPF_DIV | BPF_K)) &&
+ insn->off == 1 && insn->imm == -1) {
+ bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
+ bool isdiv = BPF_OP(insn->code) == BPF_DIV;
+ struct bpf_insn *patch = insn_buf;
- for (i = 0; i < insn_cnt; i++, insn++) {
- /* Make divide-by-zero exceptions impossible. */
+ if (isdiv)
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_NEG | BPF_K, insn->dst_reg,
+ 0, 0, 0);
+ else
+ *patch++ = BPF_MOV32_IMM(insn->dst_reg, 0);
+
+ cnt = patch - insn_buf;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Make divide-by-zero and divide-by-minus-one exceptions impossible. */
if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
bool isdiv = BPF_OP(insn->code) == BPF_DIV;
- struct bpf_insn *patchlet;
- struct bpf_insn chk_and_div[] = {
+ bool is_sdiv = isdiv && insn->off == 1;
+ bool is_smod = !isdiv && insn->off == 1;
+ struct bpf_insn *patch = insn_buf;
+
+ if (is_sdiv) {
+ /* [R,W]x sdiv 0 -> 0
+ * LLONG_MIN sdiv -1 -> LLONG_MIN
+ * INT_MIN sdiv -1 -> INT_MIN
+ */
+ *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_ADD | BPF_K, BPF_REG_AX,
+ 0, 0, 1);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JGT | BPF_K, BPF_REG_AX,
+ 0, 4, 1);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JEQ | BPF_K, BPF_REG_AX,
+ 0, 1, 0);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_MOV | BPF_K, insn->dst_reg,
+ 0, 0, 0);
+ /* BPF_NEG(LLONG_MIN) == -LLONG_MIN == LLONG_MIN */
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_NEG | BPF_K, insn->dst_reg,
+ 0, 0, 0);
+ *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *patch++ = *insn;
+ cnt = patch - insn_buf;
+ } else if (is_smod) {
+ /* [R,W]x mod 0 -> [R,W]x */
+ /* [R,W]x mod -1 -> 0 */
+ *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_ADD | BPF_K, BPF_REG_AX,
+ 0, 0, 1);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JGT | BPF_K, BPF_REG_AX,
+ 0, 3, 1);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JEQ | BPF_K, BPF_REG_AX,
+ 0, 3 + (is64 ? 0 : 1), 1);
+ *patch++ = BPF_MOV32_IMM(insn->dst_reg, 0);
+ *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *patch++ = *insn;
+
+ if (!is64) {
+ *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *patch++ = BPF_MOV32_REG(insn->dst_reg, insn->dst_reg);
+ }
+ cnt = patch - insn_buf;
+ } else if (isdiv) {
/* [R,W]x div 0 -> 0 */
- BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
- BPF_JNE | BPF_K, insn->src_reg,
- 0, 2, 0),
- BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg),
- BPF_JMP_IMM(BPF_JA, 0, 0, 1),
- *insn,
- };
- struct bpf_insn chk_and_mod[] = {
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JNE | BPF_K, insn->src_reg,
+ 0, 2, 0);
+ *patch++ = BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg);
+ *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *patch++ = *insn;
+ cnt = patch - insn_buf;
+ } else {
/* [R,W]x mod 0 -> [R,W]x */
- BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
- BPF_JEQ | BPF_K, insn->src_reg,
- 0, 1 + (is64 ? 0 : 1), 0),
- *insn,
- BPF_JMP_IMM(BPF_JA, 0, 0, 1),
- BPF_MOV32_REG(insn->dst_reg, insn->dst_reg),
- };
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JEQ | BPF_K, insn->src_reg,
+ 0, 1 + (is64 ? 0 : 1), 0);
+ *patch++ = *insn;
+
+ if (!is64) {
+ *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *patch++ = BPF_MOV32_REG(insn->dst_reg, insn->dst_reg);
+ }
+ cnt = patch - insn_buf;
+ }
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
- patchlet = isdiv ? chk_and_div : chk_and_mod;
- cnt = isdiv ? ARRAY_SIZE(chk_and_div) :
- ARRAY_SIZE(chk_and_mod) - (is64 ? 2 : 0);
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Make it impossible to de-reference a userspace address */
+ if (BPF_CLASS(insn->code) == BPF_LDX &&
+ (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
+ BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) {
+ struct bpf_insn *patch = insn_buf;
+ u64 uaddress_limit = bpf_arch_uaddress_limit();
- new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt);
+ if (!uaddress_limit)
+ goto next_insn;
+
+ *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
+ if (insn->off)
+ *patch++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_AX, insn->off);
+ *patch++ = BPF_ALU64_IMM(BPF_RSH, BPF_REG_AX, 32);
+ *patch++ = BPF_JMP_IMM(BPF_JLE, BPF_REG_AX, uaddress_limit >> 32, 2);
+ *patch++ = *insn;
+ *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *patch++ = BPF_MOV64_IMM(insn->dst_reg, 0);
+
+ cnt = patch - insn_buf;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
if (!new_prog)
return -ENOMEM;
delta += cnt - 1;
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
- continue;
+ goto next_insn;
}
/* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */
@@ -13171,9 +22812,9 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
(BPF_MODE(insn->code) == BPF_ABS ||
BPF_MODE(insn->code) == BPF_IND)) {
cnt = env->ops->gen_ld_abs(insn, insn_buf);
- if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
- verbose(env, "bpf verifier is misconfigured\n");
- return -EINVAL;
+ if (cnt == 0 || cnt >= INSN_BUF_SIZE) {
+ verifier_bug(env, "%d insns generated for ld_abs", cnt);
+ return -EFAULT;
}
new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
@@ -13183,7 +22824,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
delta += cnt - 1;
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
- continue;
+ goto next_insn;
}
/* Rewrite pointer arithmetic to mitigate speculation attacks. */
@@ -13191,14 +22832,14 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) {
const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X;
const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X;
- struct bpf_insn *patch = &insn_buf[0];
+ struct bpf_insn *patch = insn_buf;
bool issrc, isneg, isimm;
u32 off_reg;
aux = &env->insn_aux_data[i + delta];
if (!aux->alu_state ||
aux->alu_state == BPF_ALU_NON_POINTER)
- continue;
+ goto next_insn;
isneg = aux->alu_state & BPF_ALU_NEG_VALUE;
issrc = (aux->alu_state & BPF_ALU_SANITIZE) ==
@@ -13236,20 +22877,100 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
delta += cnt - 1;
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
- continue;
+ goto next_insn;
+ }
+
+ if (is_may_goto_insn(insn) && bpf_jit_supports_timed_may_goto()) {
+ int stack_off_cnt = -stack_depth - 16;
+
+ /*
+ * Two 8 byte slots, depth-16 stores the count, and
+ * depth-8 stores the start timestamp of the loop.
+ *
+ * The starting value of count is BPF_MAX_TIMED_LOOPS
+ * (0xffff). Every iteration loads it and subs it by 1,
+ * until the value becomes 0 in AX (thus, 1 in stack),
+ * after which we call arch_bpf_timed_may_goto, which
+ * either sets AX to 0xffff to keep looping, or to 0
+ * upon timeout. AX is then stored into the stack. In
+ * the next iteration, we either see 0 and break out, or
+ * continue iterating until the next time value is 0
+ * after subtraction, rinse and repeat.
+ */
+ stack_depth_extra = 16;
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off_cnt);
+ if (insn->off >= 0)
+ insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 5);
+ else
+ insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1);
+ insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1);
+ insn_buf[3] = BPF_JMP_IMM(BPF_JNE, BPF_REG_AX, 0, 2);
+ /*
+ * AX is used as an argument to pass in stack_off_cnt
+ * (to add to r10/fp), and also as the return value of
+ * the call to arch_bpf_timed_may_goto.
+ */
+ insn_buf[4] = BPF_MOV64_IMM(BPF_REG_AX, stack_off_cnt);
+ insn_buf[5] = BPF_EMIT_CALL(arch_bpf_timed_may_goto);
+ insn_buf[6] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off_cnt);
+ cnt = 7;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ } else if (is_may_goto_insn(insn)) {
+ int stack_off = -stack_depth - 8;
+
+ stack_depth_extra = 8;
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off);
+ if (insn->off >= 0)
+ insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 2);
+ else
+ insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1);
+ insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1);
+ insn_buf[3] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off);
+ cnt = 4;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
}
if (insn->code != (BPF_JMP | BPF_CALL))
- continue;
+ goto next_insn;
if (insn->src_reg == BPF_PSEUDO_CALL)
- continue;
+ goto next_insn;
if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
- ret = fixup_kfunc_call(env, insn);
+ ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt);
if (ret)
return ret;
- continue;
+ if (cnt == 0)
+ goto next_insn;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
}
+ /* Skip inlining the helper call if the JIT does it. */
+ if (bpf_jit_inlines_helper_call(insn->imm))
+ goto next_insn;
+
if (insn->imm == BPF_FUNC_get_route_realm)
prog->dst_needed = 1;
if (insn->imm == BPF_FUNC_get_prandom_u32)
@@ -13276,14 +22997,14 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
insn->code = BPF_JMP | BPF_TAIL_CALL;
aux = &env->insn_aux_data[i + delta];
- if (env->bpf_capable && !expect_blinding &&
+ if (env->bpf_capable && !prog->blinding_requested &&
prog->jit_requested &&
!bpf_map_key_poisoned(aux) &&
!bpf_map_ptr_poisoned(aux) &&
!bpf_map_ptr_unpriv(aux)) {
struct bpf_jit_poke_descriptor desc = {
.reason = BPF_POKE_REASON_TAIL_CALL,
- .tail_call.map = BPF_MAP_PTR(aux->map_ptr_state),
+ .tail_call.map = aux->map_ptr_state.map_ptr,
.tail_call.key = bpf_map_key_immediate(aux),
.insn_idx = i + delta,
};
@@ -13295,11 +23016,11 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
}
insn->imm = ret + 1;
- continue;
+ goto next_insn;
}
if (!bpf_map_ptr_unpriv(aux))
- continue;
+ goto next_insn;
/* instead of changing every JIT dealing with tail_call
* emit two extra insns:
@@ -13312,7 +23033,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
return -EINVAL;
}
- map_ptr = BPF_MAP_PTR(aux->map_ptr_state);
+ map_ptr = aux->map_ptr_state.map_ptr;
insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
map_ptr->max_entries, 2);
insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
@@ -13328,7 +23049,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
delta += cnt - 1;
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
- continue;
+ goto next_insn;
}
if (insn->imm == BPF_FUNC_timer_set_callback) {
@@ -13364,6 +23085,43 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
goto patch_call_imm;
}
+ if (is_storage_get_function(insn->imm)) {
+ if (env->insn_aux_data[i + delta].non_sleepable)
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC);
+ else
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL);
+ insn_buf[1] = *insn;
+ cnt = 2;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto patch_call_imm;
+ }
+
+ /* bpf_per_cpu_ptr() and bpf_this_cpu_ptr() */
+ if (env->insn_aux_data[i + delta].call_with_percpu_alloc_ptr) {
+ /* patch with 'r1 = *(u64 *)(r1 + 0)' since for percpu data,
+ * bpf_mem_alloc() returns a ptr to the percpu data ptr.
+ */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0);
+ insn_buf[1] = *insn;
+ cnt = 2;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto patch_call_imm;
+ }
+
/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
* and other inlining handlers are currently limited to 64 bit
* only.
@@ -13376,21 +23134,22 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
insn->imm == BPF_FUNC_map_pop_elem ||
insn->imm == BPF_FUNC_map_peek_elem ||
insn->imm == BPF_FUNC_redirect_map ||
- insn->imm == BPF_FUNC_for_each_map_elem)) {
+ insn->imm == BPF_FUNC_for_each_map_elem ||
+ insn->imm == BPF_FUNC_map_lookup_percpu_elem)) {
aux = &env->insn_aux_data[i + delta];
if (bpf_map_ptr_poisoned(aux))
goto patch_call_imm;
- map_ptr = BPF_MAP_PTR(aux->map_ptr_state);
+ map_ptr = aux->map_ptr_state.map_ptr;
ops = map_ptr->ops;
if (insn->imm == BPF_FUNC_map_lookup_elem &&
ops->map_gen_lookup) {
cnt = ops->map_gen_lookup(map_ptr, insn_buf);
if (cnt == -EOPNOTSUPP)
goto patch_map_ops_generic;
- if (cnt <= 0 || cnt >= ARRAY_SIZE(insn_buf)) {
- verbose(env, "bpf verifier is misconfigured\n");
- return -EINVAL;
+ if (cnt <= 0 || cnt >= INSN_BUF_SIZE) {
+ verifier_bug(env, "%d insns generated for map lookup", cnt);
+ return -EFAULT;
}
new_prog = bpf_patch_insn_data(env, i + delta,
@@ -13401,57 +23160,62 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
delta += cnt - 1;
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
- continue;
+ goto next_insn;
}
BUILD_BUG_ON(!__same_type(ops->map_lookup_elem,
(void *(*)(struct bpf_map *map, void *key))NULL));
BUILD_BUG_ON(!__same_type(ops->map_delete_elem,
- (int (*)(struct bpf_map *map, void *key))NULL));
+ (long (*)(struct bpf_map *map, void *key))NULL));
BUILD_BUG_ON(!__same_type(ops->map_update_elem,
- (int (*)(struct bpf_map *map, void *key, void *value,
+ (long (*)(struct bpf_map *map, void *key, void *value,
u64 flags))NULL));
BUILD_BUG_ON(!__same_type(ops->map_push_elem,
- (int (*)(struct bpf_map *map, void *value,
+ (long (*)(struct bpf_map *map, void *value,
u64 flags))NULL));
BUILD_BUG_ON(!__same_type(ops->map_pop_elem,
- (int (*)(struct bpf_map *map, void *value))NULL));
+ (long (*)(struct bpf_map *map, void *value))NULL));
BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
- (int (*)(struct bpf_map *map, void *value))NULL));
+ (long (*)(struct bpf_map *map, void *value))NULL));
BUILD_BUG_ON(!__same_type(ops->map_redirect,
- (int (*)(struct bpf_map *map, u32 ifindex, u64 flags))NULL));
+ (long (*)(struct bpf_map *map, u64 index, u64 flags))NULL));
BUILD_BUG_ON(!__same_type(ops->map_for_each_callback,
- (int (*)(struct bpf_map *map,
+ (long (*)(struct bpf_map *map,
bpf_callback_t callback_fn,
void *callback_ctx,
u64 flags))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_lookup_percpu_elem,
+ (void *(*)(struct bpf_map *map, void *key, u32 cpu))NULL));
patch_map_ops_generic:
switch (insn->imm) {
case BPF_FUNC_map_lookup_elem:
insn->imm = BPF_CALL_IMM(ops->map_lookup_elem);
- continue;
+ goto next_insn;
case BPF_FUNC_map_update_elem:
insn->imm = BPF_CALL_IMM(ops->map_update_elem);
- continue;
+ goto next_insn;
case BPF_FUNC_map_delete_elem:
insn->imm = BPF_CALL_IMM(ops->map_delete_elem);
- continue;
+ goto next_insn;
case BPF_FUNC_map_push_elem:
insn->imm = BPF_CALL_IMM(ops->map_push_elem);
- continue;
+ goto next_insn;
case BPF_FUNC_map_pop_elem:
insn->imm = BPF_CALL_IMM(ops->map_pop_elem);
- continue;
+ goto next_insn;
case BPF_FUNC_map_peek_elem:
insn->imm = BPF_CALL_IMM(ops->map_peek_elem);
- continue;
+ goto next_insn;
case BPF_FUNC_redirect_map:
insn->imm = BPF_CALL_IMM(ops->map_redirect);
- continue;
+ goto next_insn;
case BPF_FUNC_for_each_map_elem:
insn->imm = BPF_CALL_IMM(ops->map_for_each_callback);
- continue;
+ goto next_insn;
+ case BPF_FUNC_map_lookup_percpu_elem:
+ insn->imm = BPF_CALL_IMM(ops->map_lookup_percpu_elem);
+ goto next_insn;
}
goto patch_call_imm;
@@ -13479,9 +23243,37 @@ patch_map_ops_generic:
delta += cnt - 1;
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
- continue;
+ goto next_insn;
}
+#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
+ /* Implement bpf_get_smp_processor_id() inline. */
+ if (insn->imm == BPF_FUNC_get_smp_processor_id &&
+ verifier_inlines_helper_call(env, insn->imm)) {
+ /* BPF_FUNC_get_smp_processor_id inlining is an
+ * optimization, so if cpu_number is ever
+ * changed in some incompatible and hard to support
+ * way, it's fine to back out this inlining logic
+ */
+#ifdef CONFIG_SMP
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, (u32)(unsigned long)&cpu_number);
+ insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
+ insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0);
+ cnt = 3;
+#else
+ insn_buf[0] = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0);
+ cnt = 1;
+#endif
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+#endif
/* Implement bpf_get_func_arg inline. */
if (prog_type == BPF_PROG_TYPE_TRACING &&
insn->imm == BPF_FUNC_get_func_arg) {
@@ -13504,7 +23296,7 @@ patch_map_ops_generic:
delta += cnt - 1;
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
- continue;
+ goto next_insn;
}
/* Implement bpf_get_func_ret inline. */
@@ -13532,7 +23324,7 @@ patch_map_ops_generic:
delta += cnt - 1;
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
- continue;
+ goto next_insn;
}
/* Implement get_func_arg_cnt inline. */
@@ -13547,7 +23339,7 @@ patch_map_ops_generic:
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
- continue;
+ goto next_insn;
}
/* Implement bpf_get_func_ip inline. */
@@ -13562,21 +23354,152 @@ patch_map_ops_generic:
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
- continue;
+ goto next_insn;
+ }
+
+ /* Implement bpf_get_branch_snapshot inline. */
+ if (IS_ENABLED(CONFIG_PERF_EVENTS) &&
+ prog->jit_requested && BITS_PER_LONG == 64 &&
+ insn->imm == BPF_FUNC_get_branch_snapshot) {
+ /* We are dealing with the following func protos:
+ * u64 bpf_get_branch_snapshot(void *buf, u32 size, u64 flags);
+ * int perf_snapshot_branch_stack(struct perf_branch_entry *entries, u32 cnt);
+ */
+ const u32 br_entry_size = sizeof(struct perf_branch_entry);
+
+ /* struct perf_branch_entry is part of UAPI and is
+ * used as an array element, so extremely unlikely to
+ * ever grow or shrink
+ */
+ BUILD_BUG_ON(br_entry_size != 24);
+
+ /* if (unlikely(flags)) return -EINVAL */
+ insn_buf[0] = BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 0, 7);
+
+ /* Transform size (bytes) into number of entries (cnt = size / 24).
+ * But to avoid expensive division instruction, we implement
+ * divide-by-3 through multiplication, followed by further
+ * division by 8 through 3-bit right shift.
+ * Refer to book "Hacker's Delight, 2nd ed." by Henry S. Warren, Jr.,
+ * p. 227, chapter "Unsigned Division by 3" for details and proofs.
+ *
+ * N / 3 <=> M * N / 2^33, where M = (2^33 + 1) / 3 = 0xaaaaaaab.
+ */
+ insn_buf[1] = BPF_MOV32_IMM(BPF_REG_0, 0xaaaaaaab);
+ insn_buf[2] = BPF_ALU64_REG(BPF_MUL, BPF_REG_2, BPF_REG_0);
+ insn_buf[3] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 36);
+
+ /* call perf_snapshot_branch_stack implementation */
+ insn_buf[4] = BPF_EMIT_CALL(static_call_query(perf_snapshot_branch_stack));
+ /* if (entry_cnt == 0) return -ENOENT */
+ insn_buf[5] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4);
+ /* return entry_cnt * sizeof(struct perf_branch_entry) */
+ insn_buf[6] = BPF_ALU32_IMM(BPF_MUL, BPF_REG_0, br_entry_size);
+ insn_buf[7] = BPF_JMP_A(3);
+ /* return -EINVAL; */
+ insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL);
+ insn_buf[9] = BPF_JMP_A(1);
+ /* return -ENOENT; */
+ insn_buf[10] = BPF_MOV64_IMM(BPF_REG_0, -ENOENT);
+ cnt = 11;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
}
+ /* Implement bpf_kptr_xchg inline */
+ if (prog->jit_requested && BITS_PER_LONG == 64 &&
+ insn->imm == BPF_FUNC_kptr_xchg &&
+ bpf_jit_supports_ptr_xchg()) {
+ insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_2);
+ insn_buf[1] = BPF_ATOMIC_OP(BPF_DW, BPF_XCHG, BPF_REG_1, BPF_REG_0, 0);
+ cnt = 2;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
patch_call_imm:
fn = env->ops->get_func_proto(insn->imm, env->prog);
/* all functions that have prototype and verifier allowed
* programs to call them, must be real in-kernel functions
*/
if (!fn->func) {
- verbose(env,
- "kernel subsystem misconfigured func %s#%d\n",
- func_id_name(insn->imm), insn->imm);
+ verifier_bug(env,
+ "not inlined functions %s#%d is missing func",
+ func_id_name(insn->imm), insn->imm);
return -EFAULT;
}
insn->imm = fn->func - __bpf_call_base;
+next_insn:
+ if (subprogs[cur_subprog + 1].start == i + delta + 1) {
+ subprogs[cur_subprog].stack_depth += stack_depth_extra;
+ subprogs[cur_subprog].stack_extra = stack_depth_extra;
+
+ stack_depth = subprogs[cur_subprog].stack_depth;
+ if (stack_depth > MAX_BPF_STACK && !prog->jit_requested) {
+ verbose(env, "stack size %d(extra %d) is too large\n",
+ stack_depth, stack_depth_extra);
+ return -EINVAL;
+ }
+ cur_subprog++;
+ stack_depth = subprogs[cur_subprog].stack_depth;
+ stack_depth_extra = 0;
+ }
+ i++;
+ insn++;
+ }
+
+ env->prog->aux->stack_depth = subprogs[0].stack_depth;
+ for (i = 0; i < env->subprog_cnt; i++) {
+ int delta = bpf_jit_supports_timed_may_goto() ? 2 : 1;
+ int subprog_start = subprogs[i].start;
+ int stack_slots = subprogs[i].stack_extra / 8;
+ int slots = delta, cnt = 0;
+
+ if (!stack_slots)
+ continue;
+ /* We need two slots in case timed may_goto is supported. */
+ if (stack_slots > slots) {
+ verifier_bug(env, "stack_slots supports may_goto only");
+ return -EFAULT;
+ }
+
+ stack_depth = subprogs[i].stack_depth;
+ if (bpf_jit_supports_timed_may_goto()) {
+ insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth,
+ BPF_MAX_TIMED_LOOPS);
+ insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth + 8, 0);
+ } else {
+ /* Add ST insn to subprog prologue to init extra stack */
+ insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth,
+ BPF_MAX_LOOPS);
+ }
+ /* Copy first actual insn to preserve it */
+ insn_buf[cnt++] = env->prog->insnsi[subprog_start];
+
+ new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+ env->prog = prog = new_prog;
+ /*
+ * If may_goto is a first insn of a prog there could be a jmp
+ * insn that points to it, hence adjust all such jmps to point
+ * to insn after BPF_ST that inits may_goto count.
+ * Adjustment will succeed because bpf_patch_insn_data() didn't fail.
+ */
+ WARN_ON(adjust_jmp_off(env->prog, subprog_start, delta));
}
/* Since poke tab is now finalized, publish aux to tracker. */
@@ -13585,8 +23508,8 @@ patch_call_imm:
if (!map_ptr->ops->map_poke_track ||
!map_ptr->ops->map_poke_untrack ||
!map_ptr->ops->map_poke_run) {
- verbose(env, "bpf verifier is misconfigured\n");
- return -EINVAL;
+ verifier_bug(env, "poke tab is misconfigured");
+ return -EFAULT;
}
ret = map_ptr->ops->map_poke_track(map_ptr, prog->aux);
@@ -13596,44 +23519,232 @@ patch_call_imm:
}
}
- sort_kfunc_descs_by_imm(env->prog);
+ ret = sort_kfunc_descs_by_imm_off(env);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
+ int position,
+ s32 stack_base,
+ u32 callback_subprogno,
+ u32 *total_cnt)
+{
+ s32 r6_offset = stack_base + 0 * BPF_REG_SIZE;
+ s32 r7_offset = stack_base + 1 * BPF_REG_SIZE;
+ s32 r8_offset = stack_base + 2 * BPF_REG_SIZE;
+ int reg_loop_max = BPF_REG_6;
+ int reg_loop_cnt = BPF_REG_7;
+ int reg_loop_ctx = BPF_REG_8;
+
+ struct bpf_insn *insn_buf = env->insn_buf;
+ struct bpf_prog *new_prog;
+ u32 callback_start;
+ u32 call_insn_offset;
+ s32 callback_offset;
+ u32 cnt = 0;
+
+ /* This represents an inlined version of bpf_iter.c:bpf_loop,
+ * be careful to modify this code in sync.
+ */
+
+ /* Return error and jump to the end of the patch if
+ * expected number of iterations is too big.
+ */
+ insn_buf[cnt++] = BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2);
+ insn_buf[cnt++] = BPF_MOV32_IMM(BPF_REG_0, -E2BIG);
+ insn_buf[cnt++] = BPF_JMP_IMM(BPF_JA, 0, 0, 16);
+ /* spill R6, R7, R8 to use these as loop vars */
+ insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset);
+ insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset);
+ insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset);
+ /* initialize loop vars */
+ insn_buf[cnt++] = BPF_MOV64_REG(reg_loop_max, BPF_REG_1);
+ insn_buf[cnt++] = BPF_MOV32_IMM(reg_loop_cnt, 0);
+ insn_buf[cnt++] = BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3);
+ /* loop header,
+ * if reg_loop_cnt >= reg_loop_max skip the loop body
+ */
+ insn_buf[cnt++] = BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max, 5);
+ /* callback call,
+ * correct callback offset would be set after patching
+ */
+ insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt);
+ insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx);
+ insn_buf[cnt++] = BPF_CALL_REL(0);
+ /* increment loop counter */
+ insn_buf[cnt++] = BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1);
+ /* jump to loop header if callback returned 0 */
+ insn_buf[cnt++] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -6);
+ /* return value of bpf_loop,
+ * set R0 to the number of iterations
+ */
+ insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt);
+ /* restore original values of R6, R7, R8 */
+ insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset);
+ insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset);
+ insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset);
+
+ *total_cnt = cnt;
+ new_prog = bpf_patch_insn_data(env, position, insn_buf, cnt);
+ if (!new_prog)
+ return new_prog;
+
+ /* callback start is known only after patching */
+ callback_start = env->subprog_info[callback_subprogno].start;
+ /* Note: insn_buf[12] is an offset of BPF_CALL_REL instruction */
+ call_insn_offset = position + 12;
+ callback_offset = callback_start - call_insn_offset - 1;
+ new_prog->insnsi[call_insn_offset].imm = callback_offset;
+
+ return new_prog;
+}
+
+static bool is_bpf_loop_call(struct bpf_insn *insn)
+{
+ return insn->code == (BPF_JMP | BPF_CALL) &&
+ insn->src_reg == 0 &&
+ insn->imm == BPF_FUNC_loop;
+}
+
+/* For all sub-programs in the program (including main) check
+ * insn_aux_data to see if there are bpf_loop calls that require
+ * inlining. If such calls are found the calls are replaced with a
+ * sequence of instructions produced by `inline_bpf_loop` function and
+ * subprog stack_depth is increased by the size of 3 registers.
+ * This stack space is used to spill values of the R6, R7, R8. These
+ * registers are used to store the loop bound, counter and context
+ * variables.
+ */
+static int optimize_bpf_loop(struct bpf_verifier_env *env)
+{
+ struct bpf_subprog_info *subprogs = env->subprog_info;
+ int i, cur_subprog = 0, cnt, delta = 0;
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ u16 stack_depth = subprogs[cur_subprog].stack_depth;
+ u16 stack_depth_roundup = round_up(stack_depth, 8) - stack_depth;
+ u16 stack_depth_extra = 0;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ struct bpf_loop_inline_state *inline_state =
+ &env->insn_aux_data[i + delta].loop_inline_state;
+
+ if (is_bpf_loop_call(insn) && inline_state->fit_for_inline) {
+ struct bpf_prog *new_prog;
+
+ stack_depth_extra = BPF_REG_SIZE * 3 + stack_depth_roundup;
+ new_prog = inline_bpf_loop(env,
+ i + delta,
+ -(stack_depth + stack_depth_extra),
+ inline_state->callback_subprogno,
+ &cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ }
+
+ if (subprogs[cur_subprog + 1].start == i + delta + 1) {
+ subprogs[cur_subprog].stack_depth += stack_depth_extra;
+ cur_subprog++;
+ stack_depth = subprogs[cur_subprog].stack_depth;
+ stack_depth_roundup = round_up(stack_depth, 8) - stack_depth;
+ stack_depth_extra = 0;
+ }
+ }
+
+ env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
+
+ return 0;
+}
+
+/* Remove unnecessary spill/fill pairs, members of fastcall pattern,
+ * adjust subprograms stack depth when possible.
+ */
+static int remove_fastcall_spills_fills(struct bpf_verifier_env *env)
+{
+ struct bpf_subprog_info *subprog = env->subprog_info;
+ struct bpf_insn_aux_data *aux = env->insn_aux_data;
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ u32 spills_num;
+ bool modified = false;
+ int i, j;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (aux[i].fastcall_spills_num > 0) {
+ spills_num = aux[i].fastcall_spills_num;
+ /* NOPs would be removed by opt_remove_nops() */
+ for (j = 1; j <= spills_num; ++j) {
+ *(insn - j) = NOP;
+ *(insn + j) = NOP;
+ }
+ modified = true;
+ }
+ if ((subprog + 1)->start == i + 1) {
+ if (modified && !subprog->keep_fastcall_stack)
+ subprog->stack_depth = -subprog->fastcall_stack_off;
+ subprog++;
+ modified = false;
+ }
+ }
return 0;
}
static void free_states(struct bpf_verifier_env *env)
{
- struct bpf_verifier_state_list *sl, *sln;
- int i;
+ struct bpf_verifier_state_list *sl;
+ struct list_head *head, *pos, *tmp;
+ struct bpf_scc_info *info;
+ int i, j;
+
+ free_verifier_state(env->cur_state, true);
+ env->cur_state = NULL;
+ while (!pop_stack(env, NULL, NULL, false));
- sl = env->free_list;
- while (sl) {
- sln = sl->next;
+ list_for_each_safe(pos, tmp, &env->free_list) {
+ sl = container_of(pos, struct bpf_verifier_state_list, node);
free_verifier_state(&sl->state, false);
kfree(sl);
- sl = sln;
}
- env->free_list = NULL;
+ INIT_LIST_HEAD(&env->free_list);
+
+ for (i = 0; i < env->scc_cnt; ++i) {
+ info = env->scc_info[i];
+ if (!info)
+ continue;
+ for (j = 0; j < info->num_visits; j++)
+ free_backedges(&info->visits[j]);
+ kvfree(info);
+ env->scc_info[i] = NULL;
+ }
if (!env->explored_states)
return;
for (i = 0; i < state_htab_size(env); i++) {
- sl = env->explored_states[i];
+ head = &env->explored_states[i];
- while (sl) {
- sln = sl->next;
+ list_for_each_safe(pos, tmp, head) {
+ sl = container_of(pos, struct bpf_verifier_state_list, node);
free_verifier_state(&sl->state, false);
kfree(sl);
- sl = sln;
}
- env->explored_states[i] = NULL;
+ INIT_LIST_HEAD(&env->explored_states[i]);
}
}
static int do_check_common(struct bpf_verifier_env *env, int subprog)
{
bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
+ struct bpf_subprog_info *sub = subprog_info(env, subprog);
+ struct bpf_prog_aux *aux = env->prog->aux;
struct bpf_verifier_state *state;
struct bpf_reg_state *regs;
int ret, i;
@@ -13641,13 +23752,14 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
env->prev_linfo = NULL;
env->pass_cnt++;
- state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
+ state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL_ACCOUNT);
if (!state)
return -ENOMEM;
state->curframe = 0;
state->speculative = false;
state->branches = 1;
- state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
+ state->in_sleepable = env->prog->sleepable;
+ state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL_ACCOUNT);
if (!state->frame[0]) {
kfree(state);
return -ENOMEM;
@@ -13657,61 +23769,112 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
BPF_MAIN_FUNC /* callsite */,
0 /* frameno */,
subprog);
+ state->first_insn_idx = env->subprog_info[subprog].start;
+ state->last_insn_idx = -1;
regs = state->frame[state->curframe]->regs;
if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) {
- ret = btf_prepare_func_args(env, subprog, regs);
+ const char *sub_name = subprog_name(env, subprog);
+ struct bpf_subprog_arg_info *arg;
+ struct bpf_reg_state *reg;
+
+ if (env->log.level & BPF_LOG_LEVEL)
+ verbose(env, "Validating %s() func#%d...\n", sub_name, subprog);
+ ret = btf_prepare_func_args(env, subprog);
if (ret)
goto out;
- for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
- if (regs[i].type == PTR_TO_CTX)
+
+ if (subprog_is_exc_cb(env, subprog)) {
+ state->frame[0]->in_exception_callback_fn = true;
+ /* We have already ensured that the callback returns an integer, just
+ * like all global subprogs. We need to determine it only has a single
+ * scalar argument.
+ */
+ if (sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_ANYTHING) {
+ verbose(env, "exception cb only supports single integer argument\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+ for (i = BPF_REG_1; i <= sub->arg_cnt; i++) {
+ arg = &sub->args[i - BPF_REG_1];
+ reg = &regs[i];
+
+ if (arg->arg_type == ARG_PTR_TO_CTX) {
+ reg->type = PTR_TO_CTX;
mark_reg_known_zero(env, regs, i);
- else if (regs[i].type == SCALAR_VALUE)
+ } else if (arg->arg_type == ARG_ANYTHING) {
+ reg->type = SCALAR_VALUE;
mark_reg_unknown(env, regs, i);
- else if (base_type(regs[i].type) == PTR_TO_MEM) {
- const u32 mem_size = regs[i].mem_size;
-
+ } else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) {
+ /* assume unspecial LOCAL dynptr type */
+ __mark_dynptr_reg(reg, BPF_DYNPTR_TYPE_LOCAL, true, ++env->id_gen);
+ } else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) {
+ reg->type = PTR_TO_MEM;
+ reg->type |= arg->arg_type &
+ (PTR_MAYBE_NULL | PTR_UNTRUSTED | MEM_RDONLY);
+ mark_reg_known_zero(env, regs, i);
+ reg->mem_size = arg->mem_size;
+ if (arg->arg_type & PTR_MAYBE_NULL)
+ reg->id = ++env->id_gen;
+ } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) {
+ reg->type = PTR_TO_BTF_ID;
+ if (arg->arg_type & PTR_MAYBE_NULL)
+ reg->type |= PTR_MAYBE_NULL;
+ if (arg->arg_type & PTR_UNTRUSTED)
+ reg->type |= PTR_UNTRUSTED;
+ if (arg->arg_type & PTR_TRUSTED)
+ reg->type |= PTR_TRUSTED;
mark_reg_known_zero(env, regs, i);
- regs[i].mem_size = mem_size;
- regs[i].id = ++env->id_gen;
+ reg->btf = bpf_get_btf_vmlinux(); /* can't fail at this point */
+ reg->btf_id = arg->btf_id;
+ reg->id = ++env->id_gen;
+ } else if (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) {
+ /* caller can pass either PTR_TO_ARENA or SCALAR */
+ mark_reg_unknown(env, regs, i);
+ } else {
+ verifier_bug(env, "unhandled arg#%d type %d",
+ i - BPF_REG_1, arg->arg_type);
+ ret = -EFAULT;
+ goto out;
}
}
} else {
+ /* if main BPF program has associated BTF info, validate that
+ * it's matching expected signature, and otherwise mark BTF
+ * info for main program as unreliable
+ */
+ if (env->prog->aux->func_info_aux) {
+ ret = btf_prepare_func_args(env, 0);
+ if (ret || sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_PTR_TO_CTX)
+ env->prog->aux->func_info_aux[0].unreliable = true;
+ }
+
/* 1st arg to a function */
regs[BPF_REG_1].type = PTR_TO_CTX;
mark_reg_known_zero(env, regs, BPF_REG_1);
- ret = btf_check_subprog_arg_match(env, subprog, regs);
- if (ret == -EFAULT)
- /* unlikely verifier bug. abort.
- * ret == 0 and ret < 0 are sadly acceptable for
- * main() function due to backward compatibility.
- * Like socket filter program may be written as:
- * int bpf_prog(struct pt_regs *ctx)
- * and never dereference that ctx in the program.
- * 'struct pt_regs' is a type mismatch for socket
- * filter that should be using 'struct __sk_buff'.
- */
- goto out;
+ }
+
+ /* Acquire references for struct_ops program arguments tagged with "__ref" */
+ if (!subprog && env->prog->type == BPF_PROG_TYPE_STRUCT_OPS) {
+ for (i = 0; i < aux->ctx_arg_info_size; i++)
+ aux->ctx_arg_info[i].ref_obj_id = aux->ctx_arg_info[i].refcounted ?
+ acquire_reference(env, 0) : 0;
}
ret = do_check(env);
out:
- /* check for NULL is necessary, since cur_state can be freed inside
- * do_check() under memory pressure.
- */
- if (env->cur_state) {
- free_verifier_state(env->cur_state, true);
- env->cur_state = NULL;
- }
- while (!pop_stack(env, NULL, NULL, false));
if (!ret && pop_log)
bpf_vlog_reset(&env->log, 0);
free_states(env);
return ret;
}
-/* Verify all global functions in a BPF program one by one based on their BTF.
- * All global functions must pass verification. Otherwise the whole program is rejected.
+/* Lazily verify all global functions based on their BTF, if they are called
+ * from main BPF program or any of subprograms transitively.
+ * BPF global subprogs called from dead code are not validated.
+ * All callable global functions must pass verification.
+ * Otherwise the whole program is rejected.
* Consider:
* int bar(int);
* int foo(int f)
@@ -13730,25 +23893,50 @@ out:
static int do_check_subprogs(struct bpf_verifier_env *env)
{
struct bpf_prog_aux *aux = env->prog->aux;
- int i, ret;
+ struct bpf_func_info_aux *sub_aux;
+ int i, ret, new_cnt;
if (!aux->func_info)
return 0;
+ /* exception callback is presumed to be always called */
+ if (env->exception_callback_subprog)
+ subprog_aux(env, env->exception_callback_subprog)->called = true;
+
+again:
+ new_cnt = 0;
for (i = 1; i < env->subprog_cnt; i++) {
- if (aux->func_info_aux[i].linkage != BTF_FUNC_GLOBAL)
+ if (!subprog_is_global(env, i))
continue;
+
+ sub_aux = subprog_aux(env, i);
+ if (!sub_aux->called || sub_aux->verified)
+ continue;
+
env->insn_idx = env->subprog_info[i].start;
WARN_ON_ONCE(env->insn_idx == 0);
ret = do_check_common(env, i);
if (ret) {
return ret;
} else if (env->log.level & BPF_LOG_LEVEL) {
- verbose(env,
- "Func#%d is safe for any args that match its prototype\n",
- i);
+ verbose(env, "Func#%d ('%s') is safe for any args that match its prototype\n",
+ i, subprog_name(env, i));
}
+
+ /* We verified new global subprog, it might have called some
+ * more global subprogs that we haven't verified yet, so we
+ * need to do another pass over subprogs to verify those.
+ */
+ sub_aux->verified = true;
+ new_cnt++;
}
+
+ /* We can't loop forever as we verify at least one global subprog on
+ * each pass.
+ */
+ if (new_cnt)
+ goto again;
+
return 0;
}
@@ -13788,29 +23976,57 @@ static void print_verification_stats(struct bpf_verifier_env *env)
env->peak_states, env->longest_mark_read_walk);
}
+int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog,
+ const struct bpf_ctx_arg_aux *info, u32 cnt)
+{
+ prog->aux->ctx_arg_info = kmemdup_array(info, cnt, sizeof(*info), GFP_KERNEL_ACCOUNT);
+ prog->aux->ctx_arg_info_size = cnt;
+
+ return prog->aux->ctx_arg_info ? 0 : -ENOMEM;
+}
+
static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
{
const struct btf_type *t, *func_proto;
+ const struct bpf_struct_ops_desc *st_ops_desc;
const struct bpf_struct_ops *st_ops;
const struct btf_member *member;
struct bpf_prog *prog = env->prog;
- u32 btf_id, member_idx;
+ bool has_refcounted_arg = false;
+ u32 btf_id, member_idx, member_off;
+ struct btf *btf;
const char *mname;
+ int i, err;
if (!prog->gpl_compatible) {
verbose(env, "struct ops programs must have a GPL compatible license\n");
return -EINVAL;
}
+ if (!prog->aux->attach_btf_id)
+ return -ENOTSUPP;
+
+ btf = prog->aux->attach_btf;
+ if (btf_is_module(btf)) {
+ /* Make sure st_ops is valid through the lifetime of env */
+ env->attach_btf_mod = btf_try_get_module(btf);
+ if (!env->attach_btf_mod) {
+ verbose(env, "struct_ops module %s is not found\n",
+ btf_get_name(btf));
+ return -ENOTSUPP;
+ }
+ }
+
btf_id = prog->aux->attach_btf_id;
- st_ops = bpf_struct_ops_find(btf_id);
- if (!st_ops) {
+ st_ops_desc = bpf_struct_ops_find(btf, btf_id);
+ if (!st_ops_desc) {
verbose(env, "attach_btf_id %u is not a supported struct\n",
btf_id);
return -ENOTSUPP;
}
+ st_ops = st_ops_desc->st_ops;
- t = st_ops->type;
+ t = st_ops_desc->type;
member_idx = prog->expected_attach_type;
if (member_idx >= btf_type_vlen(t)) {
verbose(env, "attach to invalid member idx %u of struct %s\n",
@@ -13819,8 +24035,8 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
}
member = &btf_type_member(t)[member_idx];
- mname = btf_name_by_offset(btf_vmlinux, member->name_off);
- func_proto = btf_type_resolve_func_ptr(btf_vmlinux, member->type,
+ mname = btf_name_by_offset(btf, member->name_off);
+ func_proto = btf_type_resolve_func_ptr(btf, member->type,
NULL);
if (!func_proto) {
verbose(env, "attach to invalid member %s(@idx %u) of struct %s\n",
@@ -13828,8 +24044,16 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
return -EINVAL;
}
+ member_off = __btf_member_bit_offset(t, member) / 8;
+ err = bpf_struct_ops_supported(st_ops, member_off);
+ if (err) {
+ verbose(env, "attach to unsupported member %s of struct %s\n",
+ mname, st_ops->name);
+ return err;
+ }
+
if (st_ops->check_member) {
- int err = st_ops->check_member(t, member);
+ err = st_ops->check_member(t, member, prog);
if (err) {
verbose(env, "attach to unsupported member %s of struct %s\n",
@@ -13838,11 +24062,37 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
}
}
+ if (prog->aux->priv_stack_requested && !bpf_jit_supports_private_stack()) {
+ verbose(env, "Private stack not supported by jit\n");
+ return -EACCES;
+ }
+
+ for (i = 0; i < st_ops_desc->arg_info[member_idx].cnt; i++) {
+ if (st_ops_desc->arg_info[member_idx].info->refcounted) {
+ has_refcounted_arg = true;
+ break;
+ }
+ }
+
+ /* Tail call is not allowed for programs with refcounted arguments since we
+ * cannot guarantee that valid refcounted kptrs will be passed to the callee.
+ */
+ for (i = 0; i < env->subprog_cnt; i++) {
+ if (has_refcounted_arg && env->subprog_info[i].has_tail_call) {
+ verbose(env, "program with __ref argument cannot tail call\n");
+ return -EINVAL;
+ }
+ }
+
+ prog->aux->st_ops = st_ops;
+ prog->aux->attach_st_ops_member_off = member_off;
+
prog->aux->attach_func_proto = func_proto;
prog->aux->attach_func_name = mname;
env->ops = st_ops->verifier_ops;
- return 0;
+ return bpf_prog_ctx_arg_info_init(prog, st_ops_desc->arg_info[member_idx].info,
+ st_ops_desc->arg_info[member_idx].cnt);
}
#define SECURITY_PREFIX "security_"
@@ -13863,8 +24113,12 @@ BTF_SET_START(btf_non_sleepable_error_inject)
* Assume non-sleepable from bpf safety point of view.
*/
BTF_ID(func, __filemap_add_folio)
+#ifdef CONFIG_FAIL_PAGE_ALLOC
BTF_ID(func, should_fail_alloc_page)
+#endif
+#ifdef CONFIG_FAILSLAB
BTF_ID(func, should_failslab)
+#endif
BTF_SET_END(btf_non_sleepable_error_inject)
static int check_non_sleepable_error_inject(u32 btf_id)
@@ -13879,13 +24133,17 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
struct bpf_attach_target_info *tgt_info)
{
bool prog_extension = prog->type == BPF_PROG_TYPE_EXT;
+ bool prog_tracing = prog->type == BPF_PROG_TYPE_TRACING;
+ char trace_symbol[KSYM_SYMBOL_LEN];
const char prefix[] = "btf_trace_";
+ struct bpf_raw_event_map *btp;
int ret = 0, subprog = -1, i;
const struct btf_type *t;
bool conservative = true;
- const char *tname;
+ const char *tname, *fname;
struct btf *btf;
long addr = 0;
+ struct module *mod = NULL;
if (!btf_id) {
bpf_log(log, "Tracing programs must provide btf_id\n");
@@ -13909,6 +24167,14 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
}
if (tgt_prog) {
struct bpf_prog_aux *aux = tgt_prog->aux;
+ bool tgt_changes_pkt_data;
+ bool tgt_might_sleep;
+
+ if (bpf_prog_is_dev_bound(prog->aux) &&
+ !bpf_prog_dev_bound_match(prog, tgt_prog)) {
+ bpf_log(log, "Target program bound device mismatch");
+ return -EINVAL;
+ }
for (i = 0; i < aux->func_info_cnt; i++)
if (aux->func_info[i].type_id == btf_id) {
@@ -13919,6 +24185,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
bpf_log(log, "Subprog %s doesn't exist\n", tname);
return -EINVAL;
}
+ if (aux->func && aux->func[subprog]->aux->exception_cb) {
+ bpf_log(log,
+ "%s programs cannot attach to exception callback\n",
+ prog_extension ? "Extension" : "FENTRY/FEXIT");
+ return -EINVAL;
+ }
conservative = aux->func_info_aux[subprog].unreliable;
if (prog_extension) {
if (conservative) {
@@ -13931,15 +24203,43 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
"Extension programs should be JITed\n");
return -EINVAL;
}
+ tgt_changes_pkt_data = aux->func
+ ? aux->func[subprog]->aux->changes_pkt_data
+ : aux->changes_pkt_data;
+ if (prog->aux->changes_pkt_data && !tgt_changes_pkt_data) {
+ bpf_log(log,
+ "Extension program changes packet data, while original does not\n");
+ return -EINVAL;
+ }
+
+ tgt_might_sleep = aux->func
+ ? aux->func[subprog]->aux->might_sleep
+ : aux->might_sleep;
+ if (prog->aux->might_sleep && !tgt_might_sleep) {
+ bpf_log(log,
+ "Extension program may sleep, while original does not\n");
+ return -EINVAL;
+ }
}
if (!tgt_prog->jited) {
bpf_log(log, "Can attach to only JITed progs\n");
return -EINVAL;
}
- if (tgt_prog->type == prog->type) {
- /* Cannot fentry/fexit another fentry/fexit program.
- * Cannot attach program extension to another extension.
- * It's ok to attach fentry/fexit to extension program.
+ if (prog_tracing) {
+ if (aux->attach_tracing_prog) {
+ /*
+ * Target program is an fentry/fexit which is already attached
+ * to another tracing program. More levels of nesting
+ * attachment are not allowed.
+ */
+ bpf_log(log, "Cannot nest tracing program attach more than once\n");
+ return -EINVAL;
+ }
+ } else if (tgt_prog->type == prog->type) {
+ /*
+ * To avoid potential call chain cycles, prevent attaching of a
+ * program extension to another extension. It's ok to attach
+ * fentry/fexit to extension program.
*/
bpf_log(log, "Cannot recursively attach\n");
return -EINVAL;
@@ -13952,16 +24252,15 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
* except fentry/fexit. The reason is the following.
* The fentry/fexit programs are used for performance
* analysis, stats and can be attached to any program
- * type except themselves. When extension program is
- * replacing XDP function it is necessary to allow
- * performance analysis of all functions. Both original
- * XDP program and its program extension. Hence
- * attaching fentry/fexit to BPF_PROG_TYPE_EXT is
- * allowed. If extending of fentry/fexit was allowed it
- * would be possible to create long call chain
- * fentry->extension->fentry->extension beyond
- * reasonable stack size. Hence extending fentry is not
- * allowed.
+ * type. When extension program is replacing XDP function
+ * it is necessary to allow performance analysis of all
+ * functions. Both original XDP program and its program
+ * extension. Hence attaching fentry/fexit to
+ * BPF_PROG_TYPE_EXT is allowed. If extending of
+ * fentry/fexit was allowed it would be possible to create
+ * long call chain fentry->extension->fentry->extension
+ * beyond reasonable stack size. Hence extending fentry
+ * is not allowed.
*/
bpf_log(log, "Cannot extend fentry/fexit\n");
return -EINVAL;
@@ -13991,10 +24290,34 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
return -EINVAL;
}
tname += sizeof(prefix) - 1;
- t = btf_type_by_id(btf, t->type);
- if (!btf_type_is_ptr(t))
- /* should never happen in valid vmlinux build */
+
+ /* The func_proto of "btf_trace_##tname" is generated from typedef without argument
+ * names. Thus using bpf_raw_event_map to get argument names.
+ */
+ btp = bpf_get_raw_tracepoint(tname);
+ if (!btp)
return -EINVAL;
+ fname = kallsyms_lookup((unsigned long)btp->bpf_func, NULL, NULL, NULL,
+ trace_symbol);
+ bpf_put_raw_tracepoint(btp);
+
+ if (fname)
+ ret = btf_find_by_name_kind(btf, fname, BTF_KIND_FUNC);
+
+ if (!fname || ret < 0) {
+ bpf_log(log, "Cannot find btf of tracepoint template, fall back to %s%s.\n",
+ prefix, tname);
+ t = btf_type_by_id(btf, t->type);
+ if (!btf_type_is_ptr(t))
+ /* should never happen in valid vmlinux build */
+ return -EINVAL;
+ } else {
+ t = btf_type_by_id(btf, ret);
+ if (!btf_type_is_func(t))
+ /* should never happen in valid vmlinux build */
+ return -EINVAL;
+ }
+
t = btf_type_by_id(btf, t->type);
if (!btf_type_is_func_proto(t))
/* should never happen in valid vmlinux build */
@@ -14020,6 +24343,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
fallthrough;
case BPF_MODIFY_RETURN:
case BPF_LSM_MAC:
+ case BPF_LSM_CGROUP:
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
if (!btf_type_is_func(t)) {
@@ -14052,8 +24376,17 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
else
addr = (long) tgt_prog->aux->func[subprog]->bpf_func;
} else {
- addr = kallsyms_lookup_name(tname);
+ if (btf_is_module(btf)) {
+ mod = btf_try_get_module(btf);
+ if (mod)
+ addr = find_kallsyms_symbol_value(mod, tname);
+ else
+ addr = 0;
+ } else {
+ addr = kallsyms_lookup_name(tname);
+ }
if (!addr) {
+ module_put(mod);
bpf_log(log,
"The address of function %s cannot be found\n",
tname);
@@ -14061,16 +24394,27 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
}
}
- if (prog->aux->sleepable) {
+ if (prog->sleepable) {
ret = -EINVAL;
switch (prog->type) {
case BPF_PROG_TYPE_TRACING:
- /* fentry/fexit/fmod_ret progs can be sleepable only if they are
+
+ /* fentry/fexit/fmod_ret progs can be sleepable if they are
* attached to ALLOW_ERROR_INJECTION and are not in denylist.
*/
if (!check_non_sleepable_error_inject(btf_id) &&
within_error_injection_list(addr))
ret = 0;
+ /* fentry/fexit/fmod_ret progs can also be sleepable if they are
+ * in the fmodret id set with the KF_SLEEPABLE flag.
+ */
+ else {
+ u32 *flags = btf_kfunc_is_modify_return(btf, btf_id,
+ prog);
+
+ if (flags && (*flags & KF_SLEEPABLE))
+ ret = 0;
+ }
break;
case BPF_PROG_TYPE_LSM:
/* LSM progs check that they are attached to bpf_lsm_*() funcs.
@@ -14083,16 +24427,22 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
break;
}
if (ret) {
+ module_put(mod);
bpf_log(log, "%s is not sleepable\n", tname);
return ret;
}
} else if (prog->expected_attach_type == BPF_MODIFY_RETURN) {
if (tgt_prog) {
+ module_put(mod);
bpf_log(log, "can't modify return codes of BPF programs\n");
return -EINVAL;
}
- ret = check_attach_modify_return(addr, tname);
+ ret = -EINVAL;
+ if (btf_kfunc_is_modify_return(btf, btf_id, prog) ||
+ !check_attach_modify_return(addr, tname))
+ ret = 0;
if (ret) {
+ module_put(mod);
bpf_log(log, "%s() is not modifiable\n", tname);
return ret;
}
@@ -14103,20 +24453,75 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
tgt_info->tgt_addr = addr;
tgt_info->tgt_name = tname;
tgt_info->tgt_type = t;
+ tgt_info->tgt_mod = mod;
return 0;
}
BTF_SET_START(btf_id_deny)
BTF_ID_UNUSED
#ifdef CONFIG_SMP
+BTF_ID(func, ___migrate_enable)
BTF_ID(func, migrate_disable)
BTF_ID(func, migrate_enable)
#endif
#if !defined CONFIG_PREEMPT_RCU && !defined CONFIG_TINY_RCU
BTF_ID(func, rcu_read_unlock_strict)
#endif
+#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE)
+BTF_ID(func, preempt_count_add)
+BTF_ID(func, preempt_count_sub)
+#endif
+#ifdef CONFIG_PREEMPT_RCU
+BTF_ID(func, __rcu_read_lock)
+BTF_ID(func, __rcu_read_unlock)
+#endif
BTF_SET_END(btf_id_deny)
+/* fexit and fmod_ret can't be used to attach to __noreturn functions.
+ * Currently, we must manually list all __noreturn functions here. Once a more
+ * robust solution is implemented, this workaround can be removed.
+ */
+BTF_SET_START(noreturn_deny)
+#ifdef CONFIG_IA32_EMULATION
+BTF_ID(func, __ia32_sys_exit)
+BTF_ID(func, __ia32_sys_exit_group)
+#endif
+#ifdef CONFIG_KUNIT
+BTF_ID(func, __kunit_abort)
+BTF_ID(func, kunit_try_catch_throw)
+#endif
+#ifdef CONFIG_MODULES
+BTF_ID(func, __module_put_and_kthread_exit)
+#endif
+#ifdef CONFIG_X86_64
+BTF_ID(func, __x64_sys_exit)
+BTF_ID(func, __x64_sys_exit_group)
+#endif
+BTF_ID(func, do_exit)
+BTF_ID(func, do_group_exit)
+BTF_ID(func, kthread_complete_and_exit)
+BTF_ID(func, kthread_exit)
+BTF_ID(func, make_task_dead)
+BTF_SET_END(noreturn_deny)
+
+static bool can_be_sleepable(struct bpf_prog *prog)
+{
+ if (prog->type == BPF_PROG_TYPE_TRACING) {
+ switch (prog->expected_attach_type) {
+ case BPF_TRACE_FENTRY:
+ case BPF_TRACE_FEXIT:
+ case BPF_MODIFY_RETURN:
+ case BPF_TRACE_ITER:
+ return true;
+ default:
+ return false;
+ }
+ }
+ return prog->type == BPF_PROG_TYPE_LSM ||
+ prog->type == BPF_PROG_TYPE_KPROBE /* only for uprobes */ ||
+ prog->type == BPF_PROG_TYPE_STRUCT_OPS;
+}
+
static int check_attach_btf_id(struct bpf_verifier_env *env)
{
struct bpf_prog *prog = env->prog;
@@ -14128,16 +24533,15 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
u64 key;
if (prog->type == BPF_PROG_TYPE_SYSCALL) {
- if (prog->aux->sleepable)
+ if (prog->sleepable)
/* attach_btf_id checked to be zero already */
return 0;
verbose(env, "Syscall programs can only be sleepable\n");
return -EINVAL;
}
- if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING &&
- prog->type != BPF_PROG_TYPE_LSM) {
- verbose(env, "Only fentry/fexit/fmod_ret and lsm programs can be sleepable\n");
+ if (prog->sleepable && !can_be_sleepable(prog)) {
+ verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n");
return -EINVAL;
}
@@ -14165,6 +24569,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
/* store info about the attachment target that will be used later */
prog->aux->attach_func_proto = tgt_info.tgt_type;
prog->aux->attach_func_name = tgt_info.tgt_name;
+ prog->aux->mod = tgt_info.tgt_mod;
if (tgt_prog) {
prog->aux->saved_dst_prog_type = tgt_prog->type;
@@ -14175,9 +24580,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
prog->aux->attach_btf_trace = true;
return 0;
} else if (prog->expected_attach_type == BPF_TRACE_ITER) {
- if (!bpf_iter_prog_supported(prog))
- return -EINVAL;
- return 0;
+ return bpf_iter_prog_supported(prog);
}
if (prog->type == BPF_PROG_TYPE_LSM) {
@@ -14186,6 +24589,14 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
return ret;
} else if (prog->type == BPF_PROG_TYPE_TRACING &&
btf_id_set_contains(&btf_id_deny, btf_id)) {
+ verbose(env, "Attaching tracing programs to function '%s' is rejected.\n",
+ tgt_info.tgt_name);
+ return -EINVAL;
+ } else if ((prog->expected_attach_type == BPF_TRACE_FEXIT ||
+ prog->expected_attach_type == BPF_MODIFY_RETURN) &&
+ btf_id_set_contains(&noreturn_deny, btf_id)) {
+ verbose(env, "Attaching fexit/fmod_ret to __noreturn function '%s' is rejected.\n",
+ tgt_info.tgt_name);
return -EINVAL;
}
@@ -14194,6 +24605,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
if (!tr)
return -ENOMEM;
+ if (tgt_prog && tgt_prog->aux->tail_call_reachable)
+ tr->flags = BPF_TRAMP_F_TAIL_CALL_CTX;
+
prog->aux->dst_trampoline = tr;
return 0;
}
@@ -14209,14 +24623,499 @@ struct btf *bpf_get_btf_vmlinux(void)
return btf_vmlinux;
}
-int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
+/*
+ * The add_fd_from_fd_array() is executed only if fd_array_cnt is non-zero. In
+ * this case expect that every file descriptor in the array is either a map or
+ * a BTF. Everything else is considered to be trash.
+ */
+static int add_fd_from_fd_array(struct bpf_verifier_env *env, int fd)
+{
+ struct bpf_map *map;
+ struct btf *btf;
+ CLASS(fd, f)(fd);
+ int err;
+
+ map = __bpf_map_get(f);
+ if (!IS_ERR(map)) {
+ err = __add_used_map(env, map);
+ if (err < 0)
+ return err;
+ return 0;
+ }
+
+ btf = __btf_get_by_fd(f);
+ if (!IS_ERR(btf)) {
+ err = __add_used_btf(env, btf);
+ if (err < 0)
+ return err;
+ return 0;
+ }
+
+ verbose(env, "fd %d is not pointing to valid bpf_map or btf\n", fd);
+ return PTR_ERR(map);
+}
+
+static int process_fd_array(struct bpf_verifier_env *env, union bpf_attr *attr, bpfptr_t uattr)
+{
+ size_t size = sizeof(int);
+ int ret;
+ int fd;
+ u32 i;
+
+ env->fd_array = make_bpfptr(attr->fd_array, uattr.is_kernel);
+
+ /*
+ * The only difference between old (no fd_array_cnt is given) and new
+ * APIs is that in the latter case the fd_array is expected to be
+ * continuous and is scanned for map fds right away
+ */
+ if (!attr->fd_array_cnt)
+ return 0;
+
+ /* Check for integer overflow */
+ if (attr->fd_array_cnt >= (U32_MAX / size)) {
+ verbose(env, "fd_array_cnt is too big (%u)\n", attr->fd_array_cnt);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < attr->fd_array_cnt; i++) {
+ if (copy_from_bpfptr_offset(&fd, env->fd_array, i * size, size))
+ return -EFAULT;
+
+ ret = add_fd_from_fd_array(env, fd);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/* Each field is a register bitmask */
+struct insn_live_regs {
+ u16 use; /* registers read by instruction */
+ u16 def; /* registers written by instruction */
+ u16 in; /* registers that may be alive before instruction */
+ u16 out; /* registers that may be alive after instruction */
+};
+
+/* Bitmask with 1s for all caller saved registers */
+#define ALL_CALLER_SAVED_REGS ((1u << CALLER_SAVED_REGS) - 1)
+
+/* Compute info->{use,def} fields for the instruction */
+static void compute_insn_live_regs(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ struct insn_live_regs *info)
+{
+ struct call_summary cs;
+ u8 class = BPF_CLASS(insn->code);
+ u8 code = BPF_OP(insn->code);
+ u8 mode = BPF_MODE(insn->code);
+ u16 src = BIT(insn->src_reg);
+ u16 dst = BIT(insn->dst_reg);
+ u16 r0 = BIT(0);
+ u16 def = 0;
+ u16 use = 0xffff;
+
+ switch (class) {
+ case BPF_LD:
+ switch (mode) {
+ case BPF_IMM:
+ if (BPF_SIZE(insn->code) == BPF_DW) {
+ def = dst;
+ use = 0;
+ }
+ break;
+ case BPF_LD | BPF_ABS:
+ case BPF_LD | BPF_IND:
+ /* stick with defaults */
+ break;
+ }
+ break;
+ case BPF_LDX:
+ switch (mode) {
+ case BPF_MEM:
+ case BPF_MEMSX:
+ def = dst;
+ use = src;
+ break;
+ }
+ break;
+ case BPF_ST:
+ switch (mode) {
+ case BPF_MEM:
+ def = 0;
+ use = dst;
+ break;
+ }
+ break;
+ case BPF_STX:
+ switch (mode) {
+ case BPF_MEM:
+ def = 0;
+ use = dst | src;
+ break;
+ case BPF_ATOMIC:
+ switch (insn->imm) {
+ case BPF_CMPXCHG:
+ use = r0 | dst | src;
+ def = r0;
+ break;
+ case BPF_LOAD_ACQ:
+ def = dst;
+ use = src;
+ break;
+ case BPF_STORE_REL:
+ def = 0;
+ use = dst | src;
+ break;
+ default:
+ use = dst | src;
+ if (insn->imm & BPF_FETCH)
+ def = src;
+ else
+ def = 0;
+ }
+ break;
+ }
+ break;
+ case BPF_ALU:
+ case BPF_ALU64:
+ switch (code) {
+ case BPF_END:
+ use = dst;
+ def = dst;
+ break;
+ case BPF_MOV:
+ def = dst;
+ if (BPF_SRC(insn->code) == BPF_K)
+ use = 0;
+ else
+ use = src;
+ break;
+ default:
+ def = dst;
+ if (BPF_SRC(insn->code) == BPF_K)
+ use = dst;
+ else
+ use = dst | src;
+ }
+ break;
+ case BPF_JMP:
+ case BPF_JMP32:
+ switch (code) {
+ case BPF_JA:
+ case BPF_JCOND:
+ def = 0;
+ use = 0;
+ break;
+ case BPF_EXIT:
+ def = 0;
+ use = r0;
+ break;
+ case BPF_CALL:
+ def = ALL_CALLER_SAVED_REGS;
+ use = def & ~BIT(BPF_REG_0);
+ if (get_call_summary(env, insn, &cs))
+ use = GENMASK(cs.num_params, 1);
+ break;
+ default:
+ def = 0;
+ if (BPF_SRC(insn->code) == BPF_K)
+ use = dst;
+ else
+ use = dst | src;
+ }
+ break;
+ }
+
+ info->def = def;
+ info->use = use;
+}
+
+/* Compute may-live registers after each instruction in the program.
+ * The register is live after the instruction I if it is read by some
+ * instruction S following I during program execution and is not
+ * overwritten between I and S.
+ *
+ * Store result in env->insn_aux_data[i].live_regs.
+ */
+static int compute_live_registers(struct bpf_verifier_env *env)
+{
+ struct bpf_insn_aux_data *insn_aux = env->insn_aux_data;
+ struct bpf_insn *insns = env->prog->insnsi;
+ struct insn_live_regs *state;
+ int insn_cnt = env->prog->len;
+ int err = 0, i, j;
+ bool changed;
+
+ /* Use the following algorithm:
+ * - define the following:
+ * - I.use : a set of all registers read by instruction I;
+ * - I.def : a set of all registers written by instruction I;
+ * - I.in : a set of all registers that may be alive before I execution;
+ * - I.out : a set of all registers that may be alive after I execution;
+ * - insn_successors(I): a set of instructions S that might immediately
+ * follow I for some program execution;
+ * - associate separate empty sets 'I.in' and 'I.out' with each instruction;
+ * - visit each instruction in a postorder and update
+ * state[i].in, state[i].out as follows:
+ *
+ * state[i].out = U [state[s].in for S in insn_successors(i)]
+ * state[i].in = (state[i].out / state[i].def) U state[i].use
+ *
+ * (where U stands for set union, / stands for set difference)
+ * - repeat the computation while {in,out} fields changes for
+ * any instruction.
+ */
+ state = kvcalloc(insn_cnt, sizeof(*state), GFP_KERNEL_ACCOUNT);
+ if (!state) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < insn_cnt; ++i)
+ compute_insn_live_regs(env, &insns[i], &state[i]);
+
+ changed = true;
+ while (changed) {
+ changed = false;
+ for (i = 0; i < env->cfg.cur_postorder; ++i) {
+ int insn_idx = env->cfg.insn_postorder[i];
+ struct insn_live_regs *live = &state[insn_idx];
+ struct bpf_iarray *succ;
+ u16 new_out = 0;
+ u16 new_in = 0;
+
+ succ = bpf_insn_successors(env, insn_idx);
+ for (int s = 0; s < succ->cnt; ++s)
+ new_out |= state[succ->items[s]].in;
+ new_in = (new_out & ~live->def) | live->use;
+ if (new_out != live->out || new_in != live->in) {
+ live->in = new_in;
+ live->out = new_out;
+ changed = true;
+ }
+ }
+ }
+
+ for (i = 0; i < insn_cnt; ++i)
+ insn_aux[i].live_regs_before = state[i].in;
+
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "Live regs before insn:\n");
+ for (i = 0; i < insn_cnt; ++i) {
+ if (env->insn_aux_data[i].scc)
+ verbose(env, "%3d ", env->insn_aux_data[i].scc);
+ else
+ verbose(env, " ");
+ verbose(env, "%3d: ", i);
+ for (j = BPF_REG_0; j < BPF_REG_10; ++j)
+ if (insn_aux[i].live_regs_before & BIT(j))
+ verbose(env, "%d", j);
+ else
+ verbose(env, ".");
+ verbose(env, " ");
+ verbose_insn(env, &insns[i]);
+ if (bpf_is_ldimm64(&insns[i]))
+ i++;
+ }
+ }
+
+out:
+ kvfree(state);
+ return err;
+}
+
+/*
+ * Compute strongly connected components (SCCs) on the CFG.
+ * Assign an SCC number to each instruction, recorded in env->insn_aux[*].scc.
+ * If instruction is a sole member of its SCC and there are no self edges,
+ * assign it SCC number of zero.
+ * Uses a non-recursive adaptation of Tarjan's algorithm for SCC computation.
+ */
+static int compute_scc(struct bpf_verifier_env *env)
+{
+ const u32 NOT_ON_STACK = U32_MAX;
+
+ struct bpf_insn_aux_data *aux = env->insn_aux_data;
+ const u32 insn_cnt = env->prog->len;
+ int stack_sz, dfs_sz, err = 0;
+ u32 *stack, *pre, *low, *dfs;
+ u32 i, j, t, w;
+ u32 next_preorder_num;
+ u32 next_scc_id;
+ bool assign_scc;
+ struct bpf_iarray *succ;
+
+ next_preorder_num = 1;
+ next_scc_id = 1;
+ /*
+ * - 'stack' accumulates vertices in DFS order, see invariant comment below;
+ * - 'pre[t] == p' => preorder number of vertex 't' is 'p';
+ * - 'low[t] == n' => smallest preorder number of the vertex reachable from 't' is 'n';
+ * - 'dfs' DFS traversal stack, used to emulate explicit recursion.
+ */
+ stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
+ pre = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
+ low = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
+ dfs = kvcalloc(insn_cnt, sizeof(*dfs), GFP_KERNEL_ACCOUNT);
+ if (!stack || !pre || !low || !dfs) {
+ err = -ENOMEM;
+ goto exit;
+ }
+ /*
+ * References:
+ * [1] R. Tarjan "Depth-First Search and Linear Graph Algorithms"
+ * [2] D. J. Pearce "A Space-Efficient Algorithm for Finding Strongly Connected Components"
+ *
+ * The algorithm maintains the following invariant:
+ * - suppose there is a path 'u' ~> 'v', such that 'pre[v] < pre[u]';
+ * - then, vertex 'u' remains on stack while vertex 'v' is on stack.
+ *
+ * Consequently:
+ * - If 'low[v] < pre[v]', there is a path from 'v' to some vertex 'u',
+ * such that 'pre[u] == low[v]'; vertex 'u' is currently on the stack,
+ * and thus there is an SCC (loop) containing both 'u' and 'v'.
+ * - If 'low[v] == pre[v]', loops containing 'v' have been explored,
+ * and 'v' can be considered the root of some SCC.
+ *
+ * Here is a pseudo-code for an explicitly recursive version of the algorithm:
+ *
+ * NOT_ON_STACK = insn_cnt + 1
+ * pre = [0] * insn_cnt
+ * low = [0] * insn_cnt
+ * scc = [0] * insn_cnt
+ * stack = []
+ *
+ * next_preorder_num = 1
+ * next_scc_id = 1
+ *
+ * def recur(w):
+ * nonlocal next_preorder_num
+ * nonlocal next_scc_id
+ *
+ * pre[w] = next_preorder_num
+ * low[w] = next_preorder_num
+ * next_preorder_num += 1
+ * stack.append(w)
+ * for s in successors(w):
+ * # Note: for classic algorithm the block below should look as:
+ * #
+ * # if pre[s] == 0:
+ * # recur(s)
+ * # low[w] = min(low[w], low[s])
+ * # elif low[s] != NOT_ON_STACK:
+ * # low[w] = min(low[w], pre[s])
+ * #
+ * # But replacing both 'min' instructions with 'low[w] = min(low[w], low[s])'
+ * # does not break the invariant and makes itartive version of the algorithm
+ * # simpler. See 'Algorithm #3' from [2].
+ *
+ * # 's' not yet visited
+ * if pre[s] == 0:
+ * recur(s)
+ * # if 's' is on stack, pick lowest reachable preorder number from it;
+ * # if 's' is not on stack 'low[s] == NOT_ON_STACK > low[w]',
+ * # so 'min' would be a noop.
+ * low[w] = min(low[w], low[s])
+ *
+ * if low[w] == pre[w]:
+ * # 'w' is the root of an SCC, pop all vertices
+ * # below 'w' on stack and assign same SCC to them.
+ * while True:
+ * t = stack.pop()
+ * low[t] = NOT_ON_STACK
+ * scc[t] = next_scc_id
+ * if t == w:
+ * break
+ * next_scc_id += 1
+ *
+ * for i in range(0, insn_cnt):
+ * if pre[i] == 0:
+ * recur(i)
+ *
+ * Below implementation replaces explicit recursion with array 'dfs'.
+ */
+ for (i = 0; i < insn_cnt; i++) {
+ if (pre[i])
+ continue;
+ stack_sz = 0;
+ dfs_sz = 1;
+ dfs[0] = i;
+dfs_continue:
+ while (dfs_sz) {
+ w = dfs[dfs_sz - 1];
+ if (pre[w] == 0) {
+ low[w] = next_preorder_num;
+ pre[w] = next_preorder_num;
+ next_preorder_num++;
+ stack[stack_sz++] = w;
+ }
+ /* Visit 'w' successors */
+ succ = bpf_insn_successors(env, w);
+ for (j = 0; j < succ->cnt; ++j) {
+ if (pre[succ->items[j]]) {
+ low[w] = min(low[w], low[succ->items[j]]);
+ } else {
+ dfs[dfs_sz++] = succ->items[j];
+ goto dfs_continue;
+ }
+ }
+ /*
+ * Preserve the invariant: if some vertex above in the stack
+ * is reachable from 'w', keep 'w' on the stack.
+ */
+ if (low[w] < pre[w]) {
+ dfs_sz--;
+ goto dfs_continue;
+ }
+ /*
+ * Assign SCC number only if component has two or more elements,
+ * or if component has a self reference.
+ */
+ assign_scc = stack[stack_sz - 1] != w;
+ for (j = 0; j < succ->cnt; ++j) {
+ if (succ->items[j] == w) {
+ assign_scc = true;
+ break;
+ }
+ }
+ /* Pop component elements from stack */
+ do {
+ t = stack[--stack_sz];
+ low[t] = NOT_ON_STACK;
+ if (assign_scc)
+ aux[t].scc = next_scc_id;
+ } while (t != w);
+ if (assign_scc)
+ next_scc_id++;
+ dfs_sz--;
+ }
+ }
+ env->scc_info = kvcalloc(next_scc_id, sizeof(*env->scc_info), GFP_KERNEL_ACCOUNT);
+ if (!env->scc_info) {
+ err = -ENOMEM;
+ goto exit;
+ }
+ env->scc_cnt = next_scc_id;
+exit:
+ kvfree(stack);
+ kvfree(pre);
+ kvfree(low);
+ kvfree(dfs);
+ return err;
+}
+
+int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
{
u64 start_time = ktime_get_ns();
struct bpf_verifier_env *env;
- struct bpf_verifier_log *log;
- int i, len, ret = -EINVAL;
+ int i, len, ret = -EINVAL, err;
+ u32 log_true_size;
bool is_priv;
+ BTF_TYPE_EMIT(enum bpf_features);
+
/* no program is valid */
if (ARRAY_SIZE(bpf_verifier_ops) == 0)
return -EINVAL;
@@ -14224,10 +25123,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
/* 'struct bpf_verifier_env' can be global, but since it's not small,
* allocate/free it every time bpf_check() is called
*/
- env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
+ env = kvzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL_ACCOUNT);
if (!env)
return -ENOMEM;
- log = &env->log;
+
+ env->bt.env = env;
len = (*prog)->len;
env->insn_aux_data =
@@ -14237,10 +25137,17 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
goto err_free_env;
for (i = 0; i < len; i++)
env->insn_aux_data[i].orig_idx = i;
+ env->succ = iarray_realloc(NULL, 2);
+ if (!env->succ)
+ goto err_free_env;
env->prog = *prog;
env->ops = bpf_verifier_ops[env->prog->type];
- env->fd_array = make_bpfptr(attr->fd_array, uattr.is_kernel);
- is_priv = bpf_capable();
+
+ env->allow_ptr_leaks = bpf_allow_ptr_leaks(env->prog->aux->token);
+ env->allow_uninit_stack = bpf_allow_uninit_stack(env->prog->aux->token);
+ env->bypass_spec_v1 = bpf_bypass_spec_v1(env->prog->aux->token);
+ env->bypass_spec_v4 = bpf_bypass_spec_v4(env->prog->aux->token);
+ env->bpf_capable = is_priv = bpf_token_capable(env->prog->aux->token, CAP_BPF);
bpf_get_btf_vmlinux();
@@ -14248,20 +25155,18 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
if (!is_priv)
mutex_lock(&bpf_verifier_lock);
- if (attr->log_level || attr->log_buf || attr->log_size) {
- /* user requested verbose verifier output
- * and supplied buffer to store the verification trace
- */
- log->level = attr->log_level;
- log->ubuf = (char __user *) (unsigned long) attr->log_buf;
- log->len_total = attr->log_size;
+ /* user could have requested verbose verifier output
+ * and supplied buffer to store the verification trace
+ */
+ ret = bpf_vlog_init(&env->log, attr->log_level,
+ (char __user *) (unsigned long) attr->log_buf,
+ attr->log_size);
+ if (ret)
+ goto err_unlock;
- /* log attributes have to be sane */
- if (!bpf_verifier_log_attr_valid(log)) {
- ret = -EINVAL;
- goto err_unlock;
- }
- }
+ ret = process_fd_array(env, attr, uattr);
+ if (ret)
+ goto skip_full_check;
mark_verifier_state_clean(env);
@@ -14278,23 +25183,25 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
if (attr->prog_flags & BPF_F_ANY_ALIGNMENT)
env->strict_alignment = false;
- env->allow_ptr_leaks = bpf_allow_ptr_leaks();
- env->allow_uninit_stack = bpf_allow_uninit_stack();
- env->allow_ptr_to_map_access = bpf_allow_ptr_to_map_access();
- env->bypass_spec_v1 = bpf_bypass_spec_v1();
- env->bypass_spec_v4 = bpf_bypass_spec_v4();
- env->bpf_capable = bpf_capable();
-
if (is_priv)
env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
+ env->test_reg_invariants = attr->prog_flags & BPF_F_TEST_REG_INVARIANTS;
env->explored_states = kvcalloc(state_htab_size(env),
- sizeof(struct bpf_verifier_state_list *),
- GFP_USER);
+ sizeof(struct list_head),
+ GFP_KERNEL_ACCOUNT);
ret = -ENOMEM;
if (!env->explored_states)
goto skip_full_check;
+ for (i = 0; i < state_htab_size(env); i++)
+ INIT_LIST_HEAD(&env->explored_states[i]);
+ INIT_LIST_HEAD(&env->free_list);
+
+ ret = check_btf_info_early(env, attr, uattr);
+ if (ret < 0)
+ goto skip_full_check;
+
ret = add_subprog_and_kfunc(env);
if (ret < 0)
goto skip_full_check;
@@ -14307,15 +25214,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
if (ret < 0)
goto skip_full_check;
- ret = check_attach_btf_id(env);
- if (ret)
- goto skip_full_check;
-
ret = resolve_pseudo_ldimm64(env);
if (ret < 0)
goto skip_full_check;
- if (bpf_prog_is_dev_bound(env->prog->aux)) {
+ if (bpf_prog_is_offloaded(env->prog->aux)) {
ret = bpf_prog_offload_verifier_prep(env->prog);
if (ret)
goto skip_full_check;
@@ -14325,19 +25228,52 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
if (ret < 0)
goto skip_full_check;
- ret = do_check_subprogs(env);
- ret = ret ?: do_check_main(env);
+ ret = compute_postorder(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ ret = bpf_stack_liveness_init(env);
+ if (ret)
+ goto skip_full_check;
+
+ ret = check_attach_btf_id(env);
+ if (ret)
+ goto skip_full_check;
+
+ ret = compute_scc(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ ret = compute_live_registers(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ ret = mark_fastcall_patterns(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ ret = do_check_main(env);
+ ret = ret ?: do_check_subprogs(env);
- if (ret == 0 && bpf_prog_is_dev_bound(env->prog->aux))
+ if (ret == 0 && bpf_prog_is_offloaded(env->prog->aux))
ret = bpf_prog_offload_finalize(env);
skip_full_check:
kvfree(env->explored_states);
+ /* might decrease stack depth, keep it before passes that
+ * allocate additional slots.
+ */
+ if (ret == 0)
+ ret = remove_fastcall_spills_fills(env);
+
if (ret == 0)
ret = check_max_stack_depth(env);
/* instruction rewrites happen after this point */
+ if (ret == 0)
+ ret = optimize_bpf_loop(env);
+
if (is_priv) {
if (ret == 0)
opt_hard_wire_dead_code_branches(env);
@@ -14360,7 +25296,7 @@ skip_full_check:
/* do 32-bit optimization after insn patching has done so those patched
* insns could be handled correctly.
*/
- if (ret == 0 && !bpf_prog_is_dev_bound(env->prog->aux)) {
+ if (ret == 0 && !bpf_prog_is_offloaded(env->prog->aux)) {
ret = opt_subreg_zext_lo32_rnd_hi32(env, attr);
env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? !ret
: false;
@@ -14373,9 +25309,14 @@ skip_full_check:
print_verification_stats(env);
env->prog->aux->verified_insns = env->insn_processed;
- if (log->level && bpf_verifier_log_full(log))
- ret = -ENOSPC;
- if (log->level && !log->ubuf) {
+ /* preserve original error even if log finalization is successful */
+ err = bpf_vlog_finalize(&env->log, &log_true_size);
+ if (err)
+ ret = err;
+
+ if (uattr_size >= offsetofend(union bpf_attr, log_true_size) &&
+ copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, log_true_size),
+ &log_true_size, sizeof(log_true_size))) {
ret = -EFAULT;
goto err_release_maps;
}
@@ -14387,7 +25328,7 @@ skip_full_check:
/* if program passed verifier, update used_maps in bpf_prog_info */
env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt,
sizeof(env->used_maps[0]),
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (!env->prog->aux->used_maps) {
ret = -ENOMEM;
@@ -14402,7 +25343,7 @@ skip_full_check:
/* if program passed verifier, update used_btfs in bpf_prog_aux */
env->prog->aux->used_btfs = kmalloc_array(env->used_btf_cnt,
sizeof(env->used_btfs[0]),
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (!env->prog->aux->used_btfs) {
ret = -ENOMEM;
goto err_release_maps;
@@ -14422,6 +25363,8 @@ skip_full_check:
adjust_btf_func(env);
err_release_maps:
+ if (ret)
+ release_insn_arrays(env);
if (!env->prog->aux->used_maps)
/* if we didn't copy map pointers into bpf_prog_info, release
* them now. Otherwise free_used_maps() will release them.
@@ -14437,11 +25380,19 @@ err_release_maps:
env->prog->expected_attach_type = 0;
*prog = env->prog;
+
+ module_put(env->attach_btf_mod);
err_unlock:
if (!is_priv)
mutex_unlock(&bpf_verifier_lock);
+ clear_insn_aux_data(env, 0, env->prog->len);
vfree(env->insn_aux_data);
err_free_env:
- kfree(env);
+ bpf_stack_liveness_free(env);
+ kvfree(env->cfg.insn_postorder);
+ kvfree(env->scc_info);
+ kvfree(env->succ);
+ kvfree(env->gotox_tmp_buf);
+ kvfree(env);
return ret;
}
diff --git a/kernel/capability.c b/kernel/capability.c
index 46a361dde042..829f49ae07b9 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -20,13 +20,6 @@
#include <linux/user_namespace.h>
#include <linux/uaccess.h>
-/*
- * Leveraged for setting/resetting capabilities
- */
-
-const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
-EXPORT_SYMBOL(__cap_empty_set);
-
int file_caps_enabled = 1;
static int __init file_caps_disable(char *str)
@@ -45,10 +38,8 @@ __setup("no_file_caps", file_caps_disable);
static void warn_legacy_capability_use(void)
{
- char name[sizeof(current->comm)];
-
pr_info_once("warning: `%s' uses 32-bit capabilities (legacy support in use)\n",
- get_task_comm(name, current));
+ current->comm);
}
/*
@@ -69,10 +60,8 @@ static void warn_legacy_capability_use(void)
static void warn_deprecated_v2(void)
{
- char name[sizeof(current->comm)];
-
pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n",
- get_task_comm(name, current));
+ current->comm);
}
/*
@@ -119,7 +108,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
int ret;
if (pid && (pid != task_pid_vnr(current))) {
- struct task_struct *target;
+ const struct task_struct *target;
rcu_read_lock();
@@ -151,6 +140,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
pid_t pid;
unsigned tocopy;
kernel_cap_t pE, pI, pP;
+ struct __user_cap_data_struct kdata[2];
ret = cap_validate_magic(header, &tocopy);
if ((dataptr == NULL) || (ret != 0))
@@ -163,42 +153,46 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
return -EINVAL;
ret = cap_get_target_pid(pid, &pE, &pI, &pP);
- if (!ret) {
- struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
- unsigned i;
-
- for (i = 0; i < tocopy; i++) {
- kdata[i].effective = pE.cap[i];
- kdata[i].permitted = pP.cap[i];
- kdata[i].inheritable = pI.cap[i];
- }
-
- /*
- * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S,
- * we silently drop the upper capabilities here. This
- * has the effect of making older libcap
- * implementations implicitly drop upper capability
- * bits when they perform a: capget/modify/capset
- * sequence.
- *
- * This behavior is considered fail-safe
- * behavior. Upgrading the application to a newer
- * version of libcap will enable access to the newer
- * capabilities.
- *
- * An alternative would be to return an error here
- * (-ERANGE), but that causes legacy applications to
- * unexpectedly fail; the capget/modify/capset aborts
- * before modification is attempted and the application
- * fails.
- */
- if (copy_to_user(dataptr, kdata, tocopy
- * sizeof(struct __user_cap_data_struct))) {
- return -EFAULT;
- }
- }
+ if (ret)
+ return ret;
- return ret;
+ /*
+ * Annoying legacy format with 64-bit capabilities exposed
+ * as two sets of 32-bit fields, so we need to split the
+ * capability values up.
+ */
+ kdata[0].effective = pE.val; kdata[1].effective = pE.val >> 32;
+ kdata[0].permitted = pP.val; kdata[1].permitted = pP.val >> 32;
+ kdata[0].inheritable = pI.val; kdata[1].inheritable = pI.val >> 32;
+
+ /*
+ * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S,
+ * we silently drop the upper capabilities here. This
+ * has the effect of making older libcap
+ * implementations implicitly drop upper capability
+ * bits when they perform a: capget/modify/capset
+ * sequence.
+ *
+ * This behavior is considered fail-safe
+ * behavior. Upgrading the application to a newer
+ * version of libcap will enable access to the newer
+ * capabilities.
+ *
+ * An alternative would be to return an error here
+ * (-ERANGE), but that causes legacy applications to
+ * unexpectedly fail; the capget/modify/capset aborts
+ * before modification is attempted and the application
+ * fails.
+ */
+ if (copy_to_user(dataptr, kdata, tocopy * sizeof(kdata[0])))
+ return -EFAULT;
+
+ return 0;
+}
+
+static kernel_cap_t mk_kernel_cap(u32 low, u32 high)
+{
+ return (kernel_cap_t) { (low | ((u64)high << 32)) & CAP_VALID_MASK };
}
/**
@@ -221,8 +215,8 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
*/
SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
{
- struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
- unsigned i, tocopy, copybytes;
+ struct __user_cap_data_struct kdata[2] = { { 0, }, };
+ unsigned tocopy, copybytes;
kernel_cap_t inheritable, permitted, effective;
struct cred *new;
int ret;
@@ -246,21 +240,9 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
if (copy_from_user(&kdata, data, copybytes))
return -EFAULT;
- for (i = 0; i < tocopy; i++) {
- effective.cap[i] = kdata[i].effective;
- permitted.cap[i] = kdata[i].permitted;
- inheritable.cap[i] = kdata[i].inheritable;
- }
- while (i < _KERNEL_CAPABILITY_U32S) {
- effective.cap[i] = 0;
- permitted.cap[i] = 0;
- inheritable.cap[i] = 0;
- i++;
- }
-
- effective.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
- permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
- inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+ effective = mk_kernel_cap(kdata[0].effective, kdata[1].effective);
+ permitted = mk_kernel_cap(kdata[0].permitted, kdata[1].permitted);
+ inheritable = mk_kernel_cap(kdata[0].inheritable, kdata[1].inheritable);
new = prepare_creds();
if (!new)
@@ -304,22 +286,6 @@ bool has_ns_capability(struct task_struct *t,
}
/**
- * has_capability - Does a task have a capability in init_user_ns
- * @t: The task in question
- * @cap: The capability to be tested for
- *
- * Return true if the specified task has the given superior capability
- * currently in effect to the initial user namespace, false if not.
- *
- * Note that this does not set PF_SUPERPRIV on the task.
- */
-bool has_capability(struct task_struct *t, int cap)
-{
- return has_ns_capability(t, &init_user_ns, cap);
-}
-EXPORT_SYMBOL(has_capability);
-
-/**
* has_ns_capability_noaudit - Does a task have a capability (unaudited)
* in a specific user ns.
* @t: The task in question
@@ -360,6 +326,7 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
{
return has_ns_capability_noaudit(t, &init_user_ns, cap);
}
+EXPORT_SYMBOL(has_capability_noaudit);
static bool ns_capable_common(struct user_namespace *ns,
int cap,
@@ -480,20 +447,22 @@ EXPORT_SYMBOL(file_ns_capable);
/**
* privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode?
* @ns: The user namespace in question
+ * @idmap: idmap of the mount @inode was found from
* @inode: The inode in question
*
* Return true if the inode uid and gid are within the namespace.
*/
bool privileged_wrt_inode_uidgid(struct user_namespace *ns,
- struct user_namespace *mnt_userns,
+ struct mnt_idmap *idmap,
const struct inode *inode)
{
- return kuid_has_mapping(ns, i_uid_into_mnt(mnt_userns, inode)) &&
- kgid_has_mapping(ns, i_gid_into_mnt(mnt_userns, inode));
+ return vfsuid_has_mapping(ns, i_uid_into_vfsuid(idmap, inode)) &&
+ vfsgid_has_mapping(ns, i_gid_into_vfsgid(idmap, inode));
}
/**
* capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
+ * @idmap: idmap of the mount @inode was found from
* @inode: The inode in question
* @cap: The capability in question
*
@@ -501,13 +470,13 @@ bool privileged_wrt_inode_uidgid(struct user_namespace *ns,
* its own user namespace and that the given inode's uid and gid are
* mapped into the current user namespace.
*/
-bool capable_wrt_inode_uidgid(struct user_namespace *mnt_userns,
+bool capable_wrt_inode_uidgid(struct mnt_idmap *idmap,
const struct inode *inode, int cap)
{
struct user_namespace *ns = current_user_ns();
return ns_capable(ns, cap) &&
- privileged_wrt_inode_uidgid(ns, mnt_userns, inode);
+ privileged_wrt_inode_uidgid(ns, idmap, inode);
}
EXPORT_SYMBOL(capable_wrt_inode_uidgid);
diff --git a/kernel/cfi.c b/kernel/cfi.c
index 9594cfd1cf2c..4dad04ead06c 100644
--- a/kernel/cfi.c
+++ b/kernel/cfi.c
@@ -1,329 +1,115 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Clang Control Flow Integrity (CFI) error and slowpath handling.
+ * Clang Control Flow Integrity (CFI) error handling.
*
- * Copyright (C) 2021 Google LLC
+ * Copyright (C) 2022 Google LLC
*/
-#include <linux/hardirq.h>
-#include <linux/kallsyms.h>
-#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/printk.h>
-#include <linux/ratelimit.h>
-#include <linux/rcupdate.h>
-#include <linux/vmalloc.h>
-#include <asm/cacheflush.h>
-#include <asm/set_memory.h>
+#include <linux/bpf.h>
+#include <linux/cfi_types.h>
+#include <linux/cfi.h>
-/* Compiler-defined handler names */
-#ifdef CONFIG_CFI_PERMISSIVE
-#define cfi_failure_handler __ubsan_handle_cfi_check_fail
-#else
-#define cfi_failure_handler __ubsan_handle_cfi_check_fail_abort
-#endif
+bool cfi_warn __ro_after_init = IS_ENABLED(CONFIG_CFI_PERMISSIVE);
-static inline void handle_cfi_failure(void *ptr)
+enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr,
+ unsigned long *target, u32 type)
{
- if (IS_ENABLED(CONFIG_CFI_PERMISSIVE))
- WARN_RATELIMIT(1, "CFI failure (target: %pS):\n", ptr);
+ if (target)
+ pr_err("CFI failure at %pS (target: %pS; expected type: 0x%08x)\n",
+ (void *)addr, (void *)*target, type);
else
- panic("CFI failure (target: %pS)\n", ptr);
-}
+ pr_err("CFI failure at %pS (no target information)\n",
+ (void *)addr);
-#ifdef CONFIG_MODULES
-#ifdef CONFIG_CFI_CLANG_SHADOW
-/*
- * Index type. A 16-bit index can address at most (2^16)-2 pages (taking
- * into account SHADOW_INVALID), i.e. ~256M with 4k pages.
- */
-typedef u16 shadow_t;
-#define SHADOW_INVALID ((shadow_t)~0UL)
+ if (cfi_warn) {
+ __warn(NULL, 0, (void *)addr, 0, regs, NULL);
+ return BUG_TRAP_TYPE_WARN;
+ }
-struct cfi_shadow {
- /* Page index for the beginning of the shadow */
- unsigned long base;
- /* An array of __cfi_check locations (as indices to the shadow) */
- shadow_t shadow[1];
-} __packed;
+ return BUG_TRAP_TYPE_BUG;
+}
/*
- * The shadow covers ~128M from the beginning of the module region. If
- * the region is larger, we fall back to __module_address for the rest.
+ * Declare two non-existent functions with types that match bpf_func_t and
+ * bpf_callback_t pointers, and use DEFINE_CFI_TYPE to define type hash
+ * variables for each function type. The cfi_bpf_* variables are used by
+ * arch-specific BPF JIT implementations to ensure indirectly callable JIT
+ * code has matching CFI type hashes.
*/
-#define __SHADOW_RANGE (_UL(SZ_128M) >> PAGE_SHIFT)
-
-/* The in-memory size of struct cfi_shadow, always at least one page */
-#define __SHADOW_PAGES ((__SHADOW_RANGE * sizeof(shadow_t)) >> PAGE_SHIFT)
-#define SHADOW_PAGES max(1UL, __SHADOW_PAGES)
-#define SHADOW_SIZE (SHADOW_PAGES << PAGE_SHIFT)
-
-/* The actual size of the shadow array, minus metadata */
-#define SHADOW_ARR_SIZE (SHADOW_SIZE - offsetof(struct cfi_shadow, shadow))
-#define SHADOW_ARR_SLOTS (SHADOW_ARR_SIZE / sizeof(shadow_t))
-
-static DEFINE_MUTEX(shadow_update_lock);
-static struct cfi_shadow __rcu *cfi_shadow __read_mostly;
-
-/* Returns the index in the shadow for the given address */
-static inline int ptr_to_shadow(const struct cfi_shadow *s, unsigned long ptr)
-{
- unsigned long index;
- unsigned long page = ptr >> PAGE_SHIFT;
-
- if (unlikely(page < s->base))
- return -1; /* Outside of module area */
-
- index = page - s->base;
-
- if (index >= SHADOW_ARR_SLOTS)
- return -1; /* Cannot be addressed with shadow */
-
- return (int)index;
-}
-
-/* Returns the page address for an index in the shadow */
-static inline unsigned long shadow_to_ptr(const struct cfi_shadow *s,
- int index)
-{
- if (unlikely(index < 0 || index >= SHADOW_ARR_SLOTS))
- return 0;
-
- return (s->base + index) << PAGE_SHIFT;
-}
-
-/* Returns the __cfi_check function address for the given shadow location */
-static inline unsigned long shadow_to_check_fn(const struct cfi_shadow *s,
- int index)
-{
- if (unlikely(index < 0 || index >= SHADOW_ARR_SLOTS))
- return 0;
+extern typeof(*(bpf_func_t)0) __bpf_prog_runX;
+DEFINE_CFI_TYPE(cfi_bpf_hash, __bpf_prog_runX);
- if (unlikely(s->shadow[index] == SHADOW_INVALID))
- return 0;
+extern typeof(*(bpf_callback_t)0) __bpf_callback_fn;
+DEFINE_CFI_TYPE(cfi_bpf_subprog_hash, __bpf_callback_fn);
- /* __cfi_check is always page aligned */
- return (s->base + s->shadow[index]) << PAGE_SHIFT;
-}
-
-static void prepare_next_shadow(const struct cfi_shadow __rcu *prev,
- struct cfi_shadow *next)
+#ifdef CONFIG_ARCH_USES_CFI_TRAPS
+static inline unsigned long trap_address(s32 *p)
{
- int i, index, check;
-
- /* Mark everything invalid */
- memset(next->shadow, 0xFF, SHADOW_ARR_SIZE);
-
- if (!prev)
- return; /* No previous shadow */
-
- /* If the base address didn't change, an update is not needed */
- if (prev->base == next->base) {
- memcpy(next->shadow, prev->shadow, SHADOW_ARR_SIZE);
- return;
- }
-
- /* Convert the previous shadow to the new address range */
- for (i = 0; i < SHADOW_ARR_SLOTS; ++i) {
- if (prev->shadow[i] == SHADOW_INVALID)
- continue;
-
- index = ptr_to_shadow(next, shadow_to_ptr(prev, i));
- if (index < 0)
- continue;
-
- check = ptr_to_shadow(next,
- shadow_to_check_fn(prev, prev->shadow[i]));
- if (check < 0)
- continue;
-
- next->shadow[index] = (shadow_t)check;
- }
+ return (unsigned long)((long)p + (long)*p);
}
-static void add_module_to_shadow(struct cfi_shadow *s, struct module *mod,
- unsigned long min_addr, unsigned long max_addr)
+static bool is_trap(unsigned long addr, s32 *start, s32 *end)
{
- int check_index;
- unsigned long check = (unsigned long)mod->cfi_check;
- unsigned long ptr;
+ s32 *p;
- if (unlikely(!PAGE_ALIGNED(check))) {
- pr_warn("cfi: not using shadow for module %s\n", mod->name);
- return;
+ for (p = start; p < end; ++p) {
+ if (trap_address(p) == addr)
+ return true;
}
- check_index = ptr_to_shadow(s, check);
- if (check_index < 0)
- return; /* Module not addressable with shadow */
-
- /* For each page, store the check function index in the shadow */
- for (ptr = min_addr; ptr <= max_addr; ptr += PAGE_SIZE) {
- int index = ptr_to_shadow(s, ptr);
-
- if (index >= 0) {
- /* Each page must only contain one module */
- WARN_ON_ONCE(s->shadow[index] != SHADOW_INVALID);
- s->shadow[index] = (shadow_t)check_index;
- }
- }
-}
-
-static void remove_module_from_shadow(struct cfi_shadow *s, struct module *mod,
- unsigned long min_addr, unsigned long max_addr)
-{
- unsigned long ptr;
-
- for (ptr = min_addr; ptr <= max_addr; ptr += PAGE_SIZE) {
- int index = ptr_to_shadow(s, ptr);
-
- if (index >= 0)
- s->shadow[index] = SHADOW_INVALID;
- }
+ return false;
}
-typedef void (*update_shadow_fn)(struct cfi_shadow *, struct module *,
- unsigned long min_addr, unsigned long max_addr);
-
-static void update_shadow(struct module *mod, unsigned long base_addr,
- update_shadow_fn fn)
+#ifdef CONFIG_MODULES
+/* Populates `kcfi_trap(_end)?` fields in `struct module`. */
+void module_cfi_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs,
+ struct module *mod)
{
- struct cfi_shadow *prev;
- struct cfi_shadow *next;
- unsigned long min_addr, max_addr;
+ char *secstrings;
+ unsigned int i;
- next = vmalloc(SHADOW_SIZE);
+ mod->kcfi_traps = NULL;
+ mod->kcfi_traps_end = NULL;
- mutex_lock(&shadow_update_lock);
- prev = rcu_dereference_protected(cfi_shadow,
- mutex_is_locked(&shadow_update_lock));
+ secstrings = (char *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
- if (next) {
- next->base = base_addr >> PAGE_SHIFT;
- prepare_next_shadow(prev, next);
-
- min_addr = (unsigned long)mod->core_layout.base;
- max_addr = min_addr + mod->core_layout.text_size;
- fn(next, mod, min_addr & PAGE_MASK, max_addr & PAGE_MASK);
-
- set_memory_ro((unsigned long)next, SHADOW_PAGES);
- }
-
- rcu_assign_pointer(cfi_shadow, next);
- mutex_unlock(&shadow_update_lock);
- synchronize_rcu();
+ for (i = 1; i < hdr->e_shnum; i++) {
+ if (strcmp(secstrings + sechdrs[i].sh_name, "__kcfi_traps"))
+ continue;
- if (prev) {
- set_memory_rw((unsigned long)prev, SHADOW_PAGES);
- vfree(prev);
+ mod->kcfi_traps = (s32 *)sechdrs[i].sh_addr;
+ mod->kcfi_traps_end = (s32 *)(sechdrs[i].sh_addr + sechdrs[i].sh_size);
+ break;
}
}
-void cfi_module_add(struct module *mod, unsigned long base_addr)
-{
- update_shadow(mod, base_addr, add_module_to_shadow);
-}
-
-void cfi_module_remove(struct module *mod, unsigned long base_addr)
-{
- update_shadow(mod, base_addr, remove_module_from_shadow);
-}
-
-static inline cfi_check_fn ptr_to_check_fn(const struct cfi_shadow __rcu *s,
- unsigned long ptr)
-{
- int index;
-
- if (unlikely(!s))
- return NULL; /* No shadow available */
-
- index = ptr_to_shadow(s, ptr);
- if (index < 0)
- return NULL; /* Cannot be addressed with shadow */
-
- return (cfi_check_fn)shadow_to_check_fn(s, index);
-}
-
-static inline cfi_check_fn find_shadow_check_fn(unsigned long ptr)
-{
- cfi_check_fn fn;
-
- rcu_read_lock_sched_notrace();
- fn = ptr_to_check_fn(rcu_dereference_sched(cfi_shadow), ptr);
- rcu_read_unlock_sched_notrace();
-
- return fn;
-}
-
-#else /* !CONFIG_CFI_CLANG_SHADOW */
-
-static inline cfi_check_fn find_shadow_check_fn(unsigned long ptr)
+static bool is_module_cfi_trap(unsigned long addr)
{
- return NULL;
-}
-
-#endif /* CONFIG_CFI_CLANG_SHADOW */
-
-static inline cfi_check_fn find_module_check_fn(unsigned long ptr)
-{
- cfi_check_fn fn = NULL;
struct module *mod;
+ bool found = false;
- rcu_read_lock_sched_notrace();
- mod = __module_address(ptr);
+ guard(rcu)();
+ mod = __module_address(addr);
if (mod)
- fn = mod->cfi_check;
- rcu_read_unlock_sched_notrace();
-
- return fn;
-}
-
-static inline cfi_check_fn find_check_fn(unsigned long ptr)
-{
- cfi_check_fn fn = NULL;
-
- if (is_kernel_text(ptr))
- return __cfi_check;
-
- /*
- * Indirect call checks can happen when RCU is not watching. Both
- * the shadow and __module_address use RCU, so we need to wake it
- * up if necessary.
- */
- RCU_NONIDLE({
- if (IS_ENABLED(CONFIG_CFI_CLANG_SHADOW))
- fn = find_shadow_check_fn(ptr);
+ found = is_trap(addr, mod->kcfi_traps, mod->kcfi_traps_end);
- if (!fn)
- fn = find_module_check_fn(ptr);
- });
-
- return fn;
+ return found;
}
-
-void __cfi_slowpath_diag(uint64_t id, void *ptr, void *diag)
+#else /* CONFIG_MODULES */
+static inline bool is_module_cfi_trap(unsigned long addr)
{
- cfi_check_fn fn = find_check_fn((unsigned long)ptr);
-
- if (likely(fn))
- fn(id, ptr, diag);
- else /* Don't allow unchecked modules */
- handle_cfi_failure(ptr);
+ return false;
}
-EXPORT_SYMBOL(__cfi_slowpath_diag);
+#endif /* CONFIG_MODULES */
-#else /* !CONFIG_MODULES */
+extern s32 __start___kcfi_traps[];
+extern s32 __stop___kcfi_traps[];
-void __cfi_slowpath_diag(uint64_t id, void *ptr, void *diag)
+bool is_cfi_trap(unsigned long addr)
{
- handle_cfi_failure(ptr); /* No modules */
-}
-EXPORT_SYMBOL(__cfi_slowpath_diag);
+ if (is_trap(addr, __start___kcfi_traps, __stop___kcfi_traps))
+ return true;
-#endif /* CONFIG_MODULES */
-
-void cfi_failure_handler(void *data, void *ptr, void *vtable)
-{
- handle_cfi_failure(ptr);
+ return is_module_cfi_trap(addr);
}
-EXPORT_SYMBOL(cfi_failure_handler);
+#endif /* CONFIG_ARCH_USES_CFI_TRAPS */
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index 12f8457ad1f9..ede31601a363 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -5,5 +5,7 @@ obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o
obj-$(CONFIG_CGROUP_PIDS) += pids.o
obj-$(CONFIG_CGROUP_RDMA) += rdma.o
obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CPUSETS_V1) += cpuset-v1.o
obj-$(CONFIG_CGROUP_MISC) += misc.o
+obj-$(CONFIG_CGROUP_DMEM) += dmem.o
obj-$(CONFIG_CGROUP_DEBUG) += debug.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 6e36e854b512..22051b4f1ccb 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -12,7 +12,6 @@
#define TRACE_CGROUP_PATH_LEN 1024
extern spinlock_t trace_cgroup_path_lock;
extern char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
-extern bool cgroup_debug;
extern void __init enable_debug_cgroup(void);
/*
@@ -82,6 +81,8 @@ struct cgroup_file_ctx {
struct {
struct cgroup_pidlist *pidlist;
} procs1;
+
+ struct cgroup_of_peak peak;
};
/*
@@ -165,15 +166,14 @@ struct cgroup_mgctx {
#define DEFINE_CGROUP_MGCTX(name) \
struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
-extern struct mutex cgroup_mutex;
-extern spinlock_t css_set_lock;
extern struct cgroup_subsys *cgroup_subsys[];
extern struct list_head cgroup_roots;
-extern struct file_system_type cgroup_fs_type;
+extern bool cgrp_dfl_visible;
/* iterate across the hierarchies */
#define for_each_root(root) \
- list_for_each_entry((root), &cgroup_roots, root_list)
+ list_for_each_entry_rcu((root), &cgroup_roots, root_list, \
+ lockdep_is_held(&cgroup_mutex))
/**
* for_each_subsys - iterate all enabled cgroup subsystems
@@ -223,8 +223,6 @@ static inline void get_css_set(struct css_set *cset)
bool cgroup_ssid_enabled(int ssid);
bool cgroup_on_dfl(const struct cgroup *cgrp);
-bool cgroup_is_thread_root(struct cgroup *cgrp);
-bool cgroup_is_threaded(struct cgroup *cgrp);
struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root);
struct cgroup *task_cgroup_from_root(struct task_struct *task,
@@ -234,6 +232,7 @@ void cgroup_kn_unlock(struct kernfs_node *kn);
int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
struct cgroup_namespace *ns);
+void cgroup_favor_dynmods(struct cgroup_root *root, bool favor);
void cgroup_free_root(struct cgroup_root *root);
void init_cgroup_root(struct cgroup_fs_context *ctx);
int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
@@ -250,10 +249,15 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
bool threadgroup);
+void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode,
+ struct task_struct *tsk);
+void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode,
+ struct task_struct *tsk);
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
- bool *locked)
+ enum cgroup_attach_lock_mode *lock_mode)
__acquires(&cgroup_threadgroup_rwsem);
-void cgroup_procs_write_finish(struct task_struct *task, bool locked)
+void cgroup_procs_write_finish(struct task_struct *task,
+ enum cgroup_attach_lock_mode lock_mode)
__releases(&cgroup_threadgroup_rwsem);
void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
@@ -269,9 +273,9 @@ int cgroup_task_count(const struct cgroup *cgrp);
/*
* rstat.c
*/
-int cgroup_rstat_init(struct cgroup *cgrp);
-void cgroup_rstat_exit(struct cgroup *cgrp);
-void cgroup_rstat_boot(void);
+int css_rstat_init(struct cgroup_subsys_state *css);
+void css_rstat_exit(struct cgroup_subsys_state *css);
+int ss_rstat_init(struct cgroup_subsys *ss);
void cgroup_base_stat_cputime_show(struct seq_file *seq);
/*
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 41e0837a5a0b..a9e029b570c8 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -10,6 +10,7 @@
#include <linux/sched/task.h>
#include <linux/magic.h>
#include <linux/slab.h>
+#include <linux/string.h>
#include <linux/vmalloc.h>
#include <linux/delayacct.h>
#include <linux/pid_namespace.h>
@@ -32,6 +33,9 @@ static u16 cgroup_no_v1_mask;
/* disable named v1 mounts */
static bool cgroup_no_v1_named;
+/* Show unavailable controllers in /proc/cgroups */
+static bool proc_show_all;
+
/*
* pidlist destructions need to be flushed on cgroup destruction. Use a
* separate workqueue as flush domain.
@@ -46,6 +50,12 @@ bool cgroup1_ssid_disabled(int ssid)
return cgroup_no_v1_mask & (1 << ssid);
}
+static bool cgroup1_subsys_absent(struct cgroup_subsys *ss)
+{
+ /* Check also dfl_cftypes for file-less controllers, i.e. perf_event */
+ return ss->legacy_cftypes == NULL && ss->dfl_cftypes;
+}
+
/**
* cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
* @from: attach to all cgroups of a given task
@@ -58,8 +68,8 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
struct cgroup_root *root;
int retval = 0;
- mutex_lock(&cgroup_mutex);
- percpu_down_write(&cgroup_threadgroup_rwsem);
+ cgroup_lock();
+ cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
for_each_root(root) {
struct cgroup *from_cgrp;
@@ -71,8 +81,8 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
if (retval)
break;
}
- percpu_up_write(&cgroup_threadgroup_rwsem);
- mutex_unlock(&cgroup_mutex);
+ cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
+ cgroup_unlock();
return retval;
}
@@ -106,9 +116,9 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
if (ret)
return ret;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
- percpu_down_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
/* all tasks in @from are being moved, all csets are source */
spin_lock_irq(&css_set_lock);
@@ -144,8 +154,8 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
} while (task && !ret);
out_err:
cgroup_migrate_finish(&mgctx);
- percpu_up_write(&cgroup_threadgroup_rwsem);
- mutex_unlock(&cgroup_mutex);
+ cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
+ cgroup_unlock();
return ret;
}
@@ -360,10 +370,9 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
}
css_task_iter_end(&it);
length = n;
- /* now sort & (if procs) strip out duplicates */
+ /* now sort & strip out duplicates (tgids or recycled thread PIDs) */
sort(array, length, sizeof(pid_t), cmppid, NULL);
- if (type == CGROUP_FILE_PROCS)
- length = pidlist_uniq(array, length);
+ length = pidlist_uniq(array, length);
l = cgroup_pidlist_find_create(cgrp, type);
if (!l) {
@@ -431,7 +440,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
if (l->list[mid] == pid) {
index = mid;
break;
- } else if (l->list[mid] <= pid)
+ } else if (l->list[mid] < pid)
index = mid + 1;
else
end = mid;
@@ -494,13 +503,13 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
struct task_struct *task;
const struct cred *cred, *tcred;
ssize_t ret;
- bool locked;
+ enum cgroup_attach_lock_mode lock_mode;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENODEV;
- task = cgroup_procs_write_start(buf, threadgroup, &locked);
+ task = cgroup_procs_write_start(buf, threadgroup, &lock_mode);
ret = PTR_ERR_OR_ZERO(task);
if (ret)
goto out_unlock;
@@ -523,7 +532,7 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
ret = cgroup_attach_task(cgrp, task, threadgroup);
out_finish:
- cgroup_procs_write_finish(task, locked);
+ cgroup_procs_write_finish(task, lock_mode);
out_unlock:
cgroup_kn_unlock(of->kn);
@@ -546,14 +555,24 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct cgroup *cgrp;
+ struct cgroup_file_ctx *ctx;
BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+ /*
+ * Release agent gets called with all capabilities,
+ * require capabilities to set release agent.
+ */
+ ctx = of->priv;
+ if ((ctx->ns->user_ns != &init_user_ns) ||
+ !file_ns_capable(of->file, &init_user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENODEV;
spin_lock(&release_agent_path_lock);
- strlcpy(cgrp->root->release_agent_path, strstrip(buf),
+ strscpy(cgrp->root->release_agent_path, strstrip(buf),
sizeof(cgrp->root->release_agent_path));
spin_unlock(&release_agent_path_lock);
cgroup_kn_unlock(of->kn);
@@ -658,6 +677,7 @@ struct cftype cgroup1_base_files[] = {
int proc_cgroupstats_show(struct seq_file *m, void *v)
{
struct cgroup_subsys *ss;
+ bool cgrp_v1_visible = false;
int i;
seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
@@ -666,11 +686,21 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
* cgroup_mutex contention.
*/
- for_each_subsys(ss, i)
+ for_each_subsys(ss, i) {
+ cgrp_v1_visible |= ss->root != &cgrp_dfl_root;
+
+ if (!proc_show_all && cgroup1_subsys_absent(ss))
+ continue;
+
seq_printf(m, "%s\t%d\t%d\t%d\n",
ss->legacy_name, ss->root->hierarchy_id,
atomic_read(&ss->root->nr_cgrps),
cgroup_ssid_enabled(i));
+ }
+
+ if (cgrp_dfl_visible && !cgrp_v1_visible)
+ pr_info_once("/proc/cgroups lists only v1 controllers, use cgroup.controllers of root cgroup for v2 info\n");
+
return 0;
}
@@ -787,13 +817,13 @@ void cgroup1_release_agent(struct work_struct *work)
goto out_free;
spin_lock(&release_agent_path_lock);
- strlcpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX);
+ strscpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX);
spin_unlock(&release_agent_path_lock);
if (!agentbuf[0])
goto out_free;
ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
- if (ret < 0 || ret >= PATH_MAX)
+ if (ret < 0)
goto out_free;
argv[0] = agentbuf;
@@ -826,7 +856,7 @@ static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent
if (kernfs_type(kn) != KERNFS_DIR)
return -ENOTDIR;
- if (kn->parent != new_parent)
+ if (rcu_access_pointer(kn->__parent) != new_parent)
return -EIO;
/*
@@ -837,13 +867,13 @@ static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent
kernfs_break_active_protection(new_parent);
kernfs_break_active_protection(kn);
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
ret = kernfs_rename(kn, new_parent, new_name_str);
if (!ret)
TRACE_CGROUP_PATH(rename, cgrp);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
kernfs_unbreak_active_protection(kn);
kernfs_unbreak_active_protection(new_parent);
@@ -865,6 +895,8 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo
seq_puts(seq, ",xattr");
if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
seq_puts(seq, ",cpuset_v2_mode");
+ if (root->flags & CGRP_ROOT_FAVOR_DYNMODS)
+ seq_puts(seq, ",favordynmods");
spin_lock(&release_agent_path_lock);
if (strlen(root->release_agent_path))
@@ -888,6 +920,8 @@ enum cgroup1_param {
Opt_noprefix,
Opt_release_agent,
Opt_xattr,
+ Opt_favordynmods,
+ Opt_nofavordynmods,
};
const struct fs_parameter_spec cgroup1_fs_parameters[] = {
@@ -899,6 +933,8 @@ const struct fs_parameter_spec cgroup1_fs_parameters[] = {
fsparam_flag ("noprefix", Opt_noprefix),
fsparam_string("release_agent", Opt_release_agent),
fsparam_flag ("xattr", Opt_xattr),
+ fsparam_flag ("favordynmods", Opt_favordynmods),
+ fsparam_flag ("nofavordynmods", Opt_nofavordynmods),
{}
};
@@ -917,7 +953,8 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
if (ret != -ENOPARAM)
return ret;
for_each_subsys(ss, i) {
- if (strcmp(param->key, ss->legacy_name))
+ if (strcmp(param->key, ss->legacy_name) ||
+ cgroup1_subsys_absent(ss))
continue;
if (!cgroup_ssid_enabled(i) || cgroup1_ssid_disabled(i))
return invalfc(fc, "Disabled controller '%s'",
@@ -950,10 +987,22 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_xattr:
ctx->flags |= CGRP_ROOT_XATTR;
break;
+ case Opt_favordynmods:
+ ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+ break;
+ case Opt_nofavordynmods:
+ ctx->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
+ break;
case Opt_release_agent:
/* Specifying two release agents is forbidden */
if (ctx->release_agent)
return invalfc(fc, "release_agent respecified");
+ /*
+ * Release agent gets called with all capabilities,
+ * require capabilities to set release agent.
+ */
+ if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN))
+ return invalfc(fc, "Setting release_agent not allowed");
ctx->release_agent = param->string;
param->string = NULL;
break;
@@ -997,7 +1046,8 @@ static int check_cgroupfs_options(struct fs_context *fc)
mask = ~((u16)1 << cpuset_cgrp_id);
#endif
for_each_subsys(ss, i)
- if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
+ if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i) &&
+ !cgroup1_subsys_absent(ss))
enabled |= 1 << i;
ctx->subsys_mask &= enabled;
@@ -1084,14 +1134,14 @@ int cgroup1_reconfigure(struct fs_context *fc)
if (ctx->release_agent) {
spin_lock(&release_agent_path_lock);
- strcpy(root->release_agent_path, ctx->release_agent);
+ strscpy(root->release_agent_path, ctx->release_agent);
spin_unlock(&release_agent_path_lock);
}
trace_cgroup_remount(root);
out_unlock:
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
return ret;
}
@@ -1195,8 +1245,11 @@ static int cgroup1_root_to_use(struct fs_context *fc)
init_cgroup_root(ctx);
ret = cgroup_setup_root(root, ctx->subsys_mask);
- if (ret)
+ if (!ret)
+ cgroup_favor_dynmods(root, ctx->flags & CGRP_ROOT_FAVOR_DYNMODS);
+ else
cgroup_free_root(root);
+
return ret;
}
@@ -1215,7 +1268,7 @@ int cgroup1_get_tree(struct fs_context *fc)
if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt))
ret = 1; /* restart */
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
if (!ret)
ret = cgroup_do_get_tree(fc);
@@ -1232,6 +1285,40 @@ int cgroup1_get_tree(struct fs_context *fc)
return ret;
}
+/**
+ * task_get_cgroup1 - Acquires the associated cgroup of a task within a
+ * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its
+ * hierarchy ID.
+ * @tsk: The target task
+ * @hierarchy_id: The ID of a cgroup1 hierarchy
+ *
+ * On success, the cgroup is returned. On failure, ERR_PTR is returned.
+ * We limit it to cgroup1 only.
+ */
+struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id)
+{
+ struct cgroup *cgrp = ERR_PTR(-ENOENT);
+ struct cgroup_root *root;
+ unsigned long flags;
+
+ rcu_read_lock();
+ for_each_root(root) {
+ /* cgroup1 only*/
+ if (root == &cgrp_dfl_root)
+ continue;
+ if (root->hierarchy_id != hierarchy_id)
+ continue;
+ spin_lock_irqsave(&css_set_lock, flags);
+ cgrp = task_cgroup_from_root(tsk, root);
+ if (!cgrp || !cgroup_tryget(cgrp))
+ cgrp = ERR_PTR(-ENOENT);
+ spin_unlock_irqrestore(&css_set_lock, flags);
+ break;
+ }
+ rcu_read_unlock();
+ return cgrp;
+}
+
static int __init cgroup1_wq_init(void)
{
/*
@@ -1239,7 +1326,7 @@ static int __init cgroup1_wq_init(void)
* Cap @max_active to 1 too.
*/
cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
- 0, 1);
+ WQ_PERCPU, 1);
BUG_ON(!cgroup_pidlist_destroy_wq);
return 0;
}
@@ -1271,8 +1358,15 @@ static int __init cgroup_no_v1(char *str)
continue;
cgroup_no_v1_mask |= 1 << i;
+ break;
}
}
return 1;
}
__setup("cgroup_no_v1=", cgroup_no_v1);
+
+static int __init cgroup_v1_proc(char *str)
+{
+ return (kstrtobool(str, &proc_show_all) == 0);
+}
+__setup("cgroup_v1_proc=", cgroup_v1_proc);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index b31e1465868a..e717208cfb18 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -57,7 +57,10 @@
#include <linux/file.h>
#include <linux/fs_parser.h>
#include <linux/sched/cputime.h>
+#include <linux/sched/deadline.h>
#include <linux/psi.h>
+#include <linux/nstree.h>
+#include <linux/irq_work.h>
#include <net/sock.h>
#define CREATE_TRACE_POINTS
@@ -89,14 +92,17 @@
DEFINE_MUTEX(cgroup_mutex);
DEFINE_SPINLOCK(css_set_lock);
-#ifdef CONFIG_PROVE_RCU
+#if (defined CONFIG_PROVE_RCU || defined CONFIG_LOCKDEP)
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
#endif
+struct blocking_notifier_head cgroup_lifetime_notifier =
+ BLOCKING_NOTIFIER_INIT(cgroup_lifetime_notifier);
+
DEFINE_SPINLOCK(trace_cgroup_path_lock);
char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
-bool cgroup_debug __read_mostly;
+static bool cgroup_debug __read_mostly;
/*
* Protects cgroup_idr and css_idr so that IDs can be released without
@@ -120,10 +126,33 @@ DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
/*
* cgroup destruction makes heavy use of work items and there can be a lot
* of concurrent destructions. Use a separate workqueue so that cgroup
- * destruction work items don't end up filling up max_active of system_wq
+ * destruction work items don't end up filling up max_active of system_percpu_wq
* which may lead to deadlock.
+ *
+ * A cgroup destruction should enqueue work sequentially to:
+ * cgroup_offline_wq: use for css offline work
+ * cgroup_release_wq: use for css release work
+ * cgroup_free_wq: use for free work
+ *
+ * Rationale for using separate workqueues:
+ * The cgroup root free work may depend on completion of other css offline
+ * operations. If all tasks were enqueued to a single workqueue, this could
+ * create a deadlock scenario where:
+ * - Free work waits for other css offline work to complete.
+ * - But other css offline work is queued after free work in the same queue.
+ *
+ * Example deadlock scenario with single workqueue (cgroup_destroy_wq):
+ * 1. umount net_prio
+ * 2. net_prio root destruction enqueues work to cgroup_destroy_wq (CPUx)
+ * 3. perf_event CSS A offline enqueues work to same cgroup_destroy_wq (CPUx)
+ * 4. net_prio cgroup_destroy_root->cgroup_lock_and_drain_offline.
+ * 5. net_prio root destruction blocks waiting for perf_event CSS A offline,
+ * which can never complete as it's behind in the same queue and
+ * workqueue's max_active is 1.
*/
-static struct workqueue_struct *cgroup_destroy_wq;
+static struct workqueue_struct *cgroup_offline_wq;
+static struct workqueue_struct *cgroup_release_wq;
+static struct workqueue_struct *cgroup_free_wq;
/* generate an array of cgroup subsystem pointers */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
@@ -160,17 +189,21 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
};
#undef SUBSYS
-static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);
+static DEFINE_PER_CPU(struct css_rstat_cpu, root_rstat_cpu);
+static DEFINE_PER_CPU(struct cgroup_rstat_base_cpu, root_rstat_base_cpu);
/* the default hierarchy */
-struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
+struct cgroup_root cgrp_dfl_root = {
+ .cgrp.self.rstat_cpu = &root_rstat_cpu,
+ .cgrp.rstat_base_cpu = &root_rstat_base_cpu,
+};
EXPORT_SYMBOL_GPL(cgrp_dfl_root);
/*
* The default hierarchy always exists but is hidden until mounted for the
* first time. This is for backward compatibility.
*/
-static bool cgrp_dfl_visible;
+bool cgrp_dfl_visible;
/* some controllers are not supported in the default hierarchy */
static u16 cgrp_dfl_inhibit_ss_mask;
@@ -206,17 +239,26 @@ static u16 have_exit_callback __read_mostly;
static u16 have_release_callback __read_mostly;
static u16 have_canfork_callback __read_mostly;
+static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS);
+
+/*
+ * Write protected by cgroup_mutex and write-lock of cgroup_threadgroup_rwsem,
+ * read protected by either.
+ *
+ * Can only be turned on, but not turned off.
+ */
+bool cgroup_enable_per_threadgroup_rwsem __read_mostly;
+
/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
- .ns.count = REFCOUNT_INIT(2),
+ .ns = NS_COMMON_INIT(init_cgroup_ns),
.user_ns = &init_user_ns,
- .ns.ops = &cgroupns_operations,
- .ns.inum = PROC_CGROUP_INIT_INO,
.root_cset = &init_css_set,
};
static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_base_files[];
+static struct cftype cgroup_psi_files[];
/* cgroup optional features */
enum cgroup_opt_features {
@@ -246,6 +288,13 @@ static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
+static void cgroup_rt_init(void);
+
+#ifdef CONFIG_DEBUG_CGROUP_REF
+#define CGROUP_REF_FN_ATTRS noinline
+#define CGROUP_REF_EXPORT(fn) EXPORT_SYMBOL_GPL(fn);
+#include <linux/cgroup_refcnt.h>
+#endif
/**
* cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
@@ -279,8 +328,6 @@ bool cgroup_ssid_enabled(int ssid)
*
* - When mounting an existing superblock, mount options should match.
*
- * - Remount is disallowed.
- *
* - rename(2) is disallowed.
*
* - "tasks" is removed. Everything should be at process granularity. Use
@@ -307,8 +354,6 @@ bool cgroup_ssid_enabled(int ssid)
* masks of ancestors.
*
* - blkcg: blk-throttle becomes properly hierarchical.
- *
- * - debug: disallowed on the default hierarchy.
*/
bool cgroup_on_dfl(const struct cgroup *cgrp)
{
@@ -351,7 +396,7 @@ static bool cgroup_has_tasks(struct cgroup *cgrp)
return cgrp->nr_populated_csets;
}
-bool cgroup_is_threaded(struct cgroup *cgrp)
+static bool cgroup_is_threaded(struct cgroup *cgrp)
{
return cgrp->dom_cgrp != cgrp;
}
@@ -390,7 +435,7 @@ static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
}
/* is @cgrp root of a threaded subtree? */
-bool cgroup_is_thread_root(struct cgroup *cgrp)
+static bool cgroup_is_thread_root(struct cgroup *cgrp)
{
/* thread root should be a domain */
if (cgroup_is_threaded(cgrp))
@@ -489,28 +534,6 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
}
/**
- * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
- * @cgrp: the cgroup of interest
- * @ss: the subsystem of interest
- *
- * Find and get @cgrp's css associated with @ss. If the css doesn't exist
- * or is offline, %NULL is returned.
- */
-static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
- struct cgroup_subsys *ss)
-{
- struct cgroup_subsys_state *css;
-
- rcu_read_lock();
- css = cgroup_css(cgrp, ss);
- if (css && !css_tryget_online(css))
- css = NULL;
- rcu_read_unlock();
-
- return css;
-}
-
-/**
* cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest (%NULL returns @cgrp->self)
@@ -613,7 +636,7 @@ EXPORT_SYMBOL_GPL(cgroup_get_e_css);
static void cgroup_get_live(struct cgroup *cgrp)
{
WARN_ON_ONCE(cgroup_is_dead(cgrp));
- css_get(&cgrp->self);
+ cgroup_get(cgrp);
}
/**
@@ -649,9 +672,22 @@ int cgroup_task_count(const struct cgroup *cgrp)
return count;
}
+static struct cgroup *kn_priv(struct kernfs_node *kn)
+{
+ struct kernfs_node *parent;
+ /*
+ * The parent can not be replaced due to KERNFS_ROOT_INVARIANT_PARENT.
+ * Therefore it is always safe to dereference this pointer outside of a
+ * RCU section.
+ */
+ parent = rcu_dereference_check(kn->__parent,
+ kernfs_root_flags(kn) & KERNFS_ROOT_INVARIANT_PARENT);
+ return parent->priv;
+}
+
struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
- struct cgroup *cgrp = of->kn->parent->priv;
+ struct cgroup *cgrp = kn_priv(of->kn);
struct cftype *cft = of_cft(of);
/*
@@ -675,7 +711,7 @@ EXPORT_SYMBOL_GPL(of_css);
* @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
* @cgrp: the target cgroup to iterate css's of
*
- * Should be called under cgroup_[tree_]mutex.
+ * Should be called under cgroup_mutex.
*/
#define for_each_css(css, ssid, cgrp) \
for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
@@ -685,21 +721,6 @@ EXPORT_SYMBOL_GPL(of_css);
else
/**
- * for_each_e_css - iterate all effective css's of a cgroup
- * @css: the iteration cursor
- * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
- * @cgrp: the target cgroup to iterate css's of
- *
- * Should be called under cgroup_[tree_]mutex.
- */
-#define for_each_e_css(css, ssid, cgrp) \
- for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
- if (!((css) = cgroup_e_css_by_mask(cgrp, \
- cgroup_subsys[(ssid)]))) \
- ; \
- else
-
-/**
* do_each_subsys_mask - filter for_each_subsys with a bitmask
* @ss: the iteration cursor
* @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
@@ -765,7 +786,8 @@ struct css_set init_css_set = {
.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
.threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
- .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
+ .mg_src_preload_node = LIST_HEAD_INIT(init_css_set.mg_src_preload_node),
+ .mg_dst_preload_node = LIST_HEAD_INIT(init_css_set.mg_dst_preload_node),
.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
/*
@@ -921,7 +943,8 @@ static void css_set_move_task(struct task_struct *task,
/*
* We are synchronized through cgroup_threadgroup_rwsem
* against PF_EXITING setting such that we can't race
- * against cgroup_exit()/cgroup_free() dropping the css_set.
+ * against cgroup_task_dead()/cgroup_task_free() dropping
+ * the css_set.
*/
WARN_ON_ONCE(task->flags & PF_EXITING);
@@ -939,7 +962,7 @@ static void css_set_move_task(struct task_struct *task,
#define CSS_SET_HASH_BITS 7
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
-static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
+static unsigned long css_set_hash(struct cgroup_subsys_state **css)
{
unsigned long key = 0UL;
struct cgroup_subsys *ss;
@@ -1080,7 +1103,7 @@ static bool compare_css_sets(struct css_set *cset,
*/
static struct css_set *find_existing_css_set(struct css_set *old_cset,
struct cgroup *cgrp,
- struct cgroup_subsys_state *template[])
+ struct cgroup_subsys_state **template)
{
struct cgroup_root *root = cgrp->root;
struct cgroup_subsys *ss;
@@ -1240,7 +1263,8 @@ static struct css_set *find_css_set(struct css_set *old_cset,
INIT_LIST_HEAD(&cset->threaded_csets);
INIT_HLIST_NODE(&cset->hlist);
INIT_LIST_HEAD(&cset->cgrp_links);
- INIT_LIST_HEAD(&cset->mg_preload_node);
+ INIT_LIST_HEAD(&cset->mg_src_preload_node);
+ INIT_LIST_HEAD(&cset->mg_dst_preload_node);
INIT_LIST_HEAD(&cset->mg_node);
/* Copy the set of subsystem state objects generated in
@@ -1302,11 +1326,41 @@ static struct css_set *find_css_set(struct css_set *old_cset,
struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
{
- struct cgroup *root_cgrp = kf_root->kn->priv;
+ struct cgroup *root_cgrp = kernfs_root_to_node(kf_root)->priv;
return root_cgrp->root;
}
+void cgroup_favor_dynmods(struct cgroup_root *root, bool favor)
+{
+ bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS;
+
+ /*
+ * see the comment above CGRP_ROOT_FAVOR_DYNMODS definition.
+ * favordynmods can flip while task is between
+ * cgroup_threadgroup_change_begin() and end(), so down_write global
+ * cgroup_threadgroup_rwsem to synchronize them.
+ *
+ * Once cgroup_enable_per_threadgroup_rwsem is enabled, holding
+ * cgroup_threadgroup_rwsem doesn't exlude tasks between
+ * cgroup_thread_group_change_begin() and end() and thus it's unsafe to
+ * turn off. As the scenario is unlikely, simply disallow disabling once
+ * enabled and print out a warning.
+ */
+ percpu_down_write(&cgroup_threadgroup_rwsem);
+ if (favor && !favoring) {
+ cgroup_enable_per_threadgroup_rwsem = true;
+ rcu_sync_enter(&cgroup_threadgroup_rwsem.rss);
+ root->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+ } else if (!favor && favoring) {
+ if (cgroup_enable_per_threadgroup_rwsem)
+ pr_warn_once("cgroup favordynmods: per threadgroup rwsem mechanism can't be disabled\n");
+ rcu_sync_exit(&cgroup_threadgroup_rwsem.rss);
+ root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
+ }
+ percpu_up_write(&cgroup_threadgroup_rwsem);
+}
+
static int cgroup_init_root_id(struct cgroup_root *root)
{
int id;
@@ -1330,13 +1384,14 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
void cgroup_free_root(struct cgroup_root *root)
{
- kfree(root);
+ kfree_rcu(root, rcu);
}
static void cgroup_destroy_root(struct cgroup_root *root)
{
struct cgroup *cgrp = &root->cgrp;
struct cgrp_cset_link *link, *tmp_link;
+ int ret;
trace_cgroup_destroy_root(root);
@@ -1345,6 +1400,10 @@ static void cgroup_destroy_root(struct cgroup_root *root)
BUG_ON(atomic_read(&root->nr_cgrps));
BUG_ON(!list_empty(&cgrp->self.children));
+ ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier,
+ CGROUP_LIFETIME_OFFLINE, cgrp);
+ WARN_ON_ONCE(notifier_to_errno(ret));
+
/* Rebind all subsystems back to the default hierarchy */
WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));
@@ -1362,90 +1421,129 @@ static void cgroup_destroy_root(struct cgroup_root *root)
spin_unlock_irq(&css_set_lock);
- if (!list_empty(&root->root_list)) {
- list_del(&root->root_list);
- cgroup_root_count--;
- }
+ WARN_ON_ONCE(list_empty(&root->root_list));
+ list_del_rcu(&root->root_list);
+ cgroup_root_count--;
+
+ if (!have_favordynmods)
+ cgroup_favor_dynmods(root, false);
cgroup_exit_root_id(root);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
- cgroup_rstat_exit(cgrp);
kernfs_destroy_root(root->kf_root);
cgroup_free_root(root);
}
/*
- * look up cgroup associated with current task's cgroup namespace on the
- * specified hierarchy
+ * Returned cgroup is without refcount but it's valid as long as cset pins it.
*/
-static struct cgroup *
-current_cgns_cgroup_from_root(struct cgroup_root *root)
+static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
+ struct cgroup_root *root)
{
- struct cgroup *res = NULL;
- struct css_set *cset;
-
- lockdep_assert_held(&css_set_lock);
+ struct cgroup *res_cgroup = NULL;
- rcu_read_lock();
-
- cset = current->nsproxy->cgroup_ns->root_cset;
if (cset == &init_css_set) {
- res = &root->cgrp;
+ res_cgroup = &root->cgrp;
} else if (root == &cgrp_dfl_root) {
- res = cset->dfl_cgrp;
+ res_cgroup = cset->dfl_cgrp;
} else {
struct cgrp_cset_link *link;
+ lockdep_assert_held(&css_set_lock);
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
if (c->root == root) {
- res = c;
+ res_cgroup = c;
break;
}
}
}
- rcu_read_unlock();
- BUG_ON(!res);
- return res;
+ /*
+ * If cgroup_mutex is not held, the cgrp_cset_link will be freed
+ * before we remove the cgroup root from the root_list. Consequently,
+ * when accessing a cgroup root, the cset_link may have already been
+ * freed, resulting in a NULL res_cgroup. However, by holding the
+ * cgroup_mutex, we ensure that res_cgroup can't be NULL.
+ * If we don't hold cgroup_mutex in the caller, we must do the NULL
+ * check.
+ */
+ return res_cgroup;
}
-/* look up cgroup associated with given css_set on the specified hierarchy */
-static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
- struct cgroup_root *root)
+/*
+ * look up cgroup associated with current task's cgroup namespace on the
+ * specified hierarchy
+ */
+static struct cgroup *
+current_cgns_cgroup_from_root(struct cgroup_root *root)
{
struct cgroup *res = NULL;
+ struct css_set *cset;
- lockdep_assert_held(&cgroup_mutex);
lockdep_assert_held(&css_set_lock);
- if (cset == &init_css_set) {
- res = &root->cgrp;
- } else if (root == &cgrp_dfl_root) {
- res = cset->dfl_cgrp;
- } else {
- struct cgrp_cset_link *link;
+ rcu_read_lock();
- list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
- struct cgroup *c = link->cgrp;
+ cset = current->nsproxy->cgroup_ns->root_cset;
+ res = __cset_cgroup_from_root(cset, root);
- if (c->root == root) {
- res = c;
- break;
- }
- }
- }
+ rcu_read_unlock();
- BUG_ON(!res);
+ /*
+ * The namespace_sem is held by current, so the root cgroup can't
+ * be umounted. Therefore, we can ensure that the res is non-NULL.
+ */
+ WARN_ON_ONCE(!res);
return res;
}
/*
+ * Look up cgroup associated with current task's cgroup namespace on the default
+ * hierarchy.
+ *
+ * Unlike current_cgns_cgroup_from_root(), this doesn't need locks:
+ * - Internal rcu_read_lock is unnecessary because we don't dereference any rcu
+ * pointers.
+ * - css_set_lock is not needed because we just read cset->dfl_cgrp.
+ * - As a bonus returned cgrp is pinned with the current because it cannot
+ * switch cgroup_ns asynchronously.
+ */
+static struct cgroup *current_cgns_cgroup_dfl(void)
+{
+ struct css_set *cset;
+
+ if (current->nsproxy) {
+ cset = current->nsproxy->cgroup_ns->root_cset;
+ return __cset_cgroup_from_root(cset, &cgrp_dfl_root);
+ } else {
+ /*
+ * NOTE: This function may be called from bpf_cgroup_from_id()
+ * on a task which has already passed exit_nsproxy_namespaces()
+ * and nsproxy == NULL. Fall back to cgrp_dfl_root which will
+ * make all cgroups visible for lookups.
+ */
+ return &cgrp_dfl_root.cgrp;
+ }
+}
+
+/* look up cgroup associated with given css_set on the specified hierarchy */
+static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
+ struct cgroup_root *root)
+{
+ lockdep_assert_held(&css_set_lock);
+
+ return __cset_cgroup_from_root(cset, root);
+}
+
+/*
* Return the cgroup for "task" from the given hierarchy. Must be
- * called with cgroup_mutex and css_set_lock held.
+ * called with css_set_lock held to prevent task's groups from being modified.
+ * Must be called with either cgroup_mutex or rcu read lock to prevent the
+ * cgroup root from being destroyed.
*/
struct cgroup *task_cgroup_from_root(struct task_struct *task,
struct cgroup_root *root)
@@ -1587,9 +1685,9 @@ void cgroup_kn_unlock(struct kernfs_node *kn)
if (kernfs_type(kn) == KERNFS_DIR)
cgrp = kn->priv;
else
- cgrp = kn->parent->priv;
+ cgrp = kn_priv(kn);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
kernfs_unbreak_active_protection(kn);
cgroup_put(cgrp);
@@ -1619,7 +1717,7 @@ struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
if (kernfs_type(kn) == KERNFS_DIR)
cgrp = kn->priv;
else
- cgrp = kn->parent->priv;
+ cgrp = kn_priv(kn);
/*
* We're gonna grab cgroup_mutex which nests outside kernfs
@@ -1634,7 +1732,7 @@ struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
if (drain_offline)
cgroup_lock_and_drain_offline(cgrp);
else
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
if (!cgroup_is_dead(cgrp))
return cgrp;
@@ -1657,7 +1755,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
cfile->kn = NULL;
spin_unlock_irq(&cgroup_file_kn_lock);
- del_timer_sync(&cfile->notify_timer);
+ timer_delete_sync(&cfile->notify_timer);
}
kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
@@ -1677,13 +1775,17 @@ static void css_clear_dir(struct cgroup_subsys_state *css)
css->flags &= ~CSS_VISIBLE;
- if (!css->ss) {
- if (cgroup_on_dfl(cgrp))
- cfts = cgroup_base_files;
- else
- cfts = cgroup1_base_files;
-
- cgroup_addrm_files(css, cgrp, cfts, false);
+ if (css_is_self(css)) {
+ if (cgroup_on_dfl(cgrp)) {
+ cgroup_addrm_files(css, cgrp,
+ cgroup_base_files, false);
+ if (cgroup_psi_enabled())
+ cgroup_addrm_files(css, cgrp,
+ cgroup_psi_files, false);
+ } else {
+ cgroup_addrm_files(css, cgrp,
+ cgroup1_base_files, false);
+ }
} else {
list_for_each_entry(cfts, &css->ss->cfts, node)
cgroup_addrm_files(css, cgrp, cfts, false);
@@ -1702,18 +1804,31 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
struct cftype *cfts, *failed_cfts;
int ret;
- if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
+ if (css->flags & CSS_VISIBLE)
return 0;
- if (!css->ss) {
- if (cgroup_on_dfl(cgrp))
- cfts = cgroup_base_files;
- else
- cfts = cgroup1_base_files;
-
- ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
- if (ret < 0)
- return ret;
+ if (css_is_self(css)) {
+ if (cgroup_on_dfl(cgrp)) {
+ ret = cgroup_addrm_files(css, cgrp,
+ cgroup_base_files, true);
+ if (ret < 0)
+ return ret;
+
+ if (cgroup_psi_enabled()) {
+ ret = cgroup_addrm_files(css, cgrp,
+ cgroup_psi_files, true);
+ if (ret < 0) {
+ cgroup_addrm_files(css, cgrp,
+ cgroup_base_files, false);
+ return ret;
+ }
+ }
+ } else {
+ ret = cgroup_addrm_files(css, cgrp,
+ cgroup1_base_files, true);
+ if (ret < 0)
+ return ret;
+ }
} else {
list_for_each_entry(cfts, &css->ss->cfts, node) {
ret = cgroup_addrm_files(css, cgrp, cfts, true);
@@ -1740,7 +1855,7 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
{
struct cgroup *dcgrp = &dst_root->cgrp;
struct cgroup_subsys *ss;
- int ssid, i, ret;
+ int ssid, ret;
u16 dfl_disable_ss_mask = 0;
lockdep_assert_held(&cgroup_mutex);
@@ -1784,7 +1899,8 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
struct cgroup_root *src_root = ss->root;
struct cgroup *scgrp = &src_root->cgrp;
struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
- struct css_set *cset;
+ struct css_set *cset, *cset_pos;
+ struct css_task_iter *it;
WARN_ON(!css || cgroup_css(dcgrp, ss));
@@ -1799,19 +1915,26 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
rcu_assign_pointer(dcgrp->subsys[ssid], css);
ss->root = dst_root;
- css->cgroup = dcgrp;
spin_lock_irq(&css_set_lock);
- hash_for_each(css_set_table, i, cset, hlist)
+ css->cgroup = dcgrp;
+ WARN_ON(!list_empty(&dcgrp->e_csets[ss->id]));
+ list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id],
+ e_cset_node[ss->id]) {
list_move_tail(&cset->e_cset_node[ss->id],
&dcgrp->e_csets[ss->id]);
- spin_unlock_irq(&css_set_lock);
-
- if (ss->css_rstat_flush) {
- list_del_rcu(&css->rstat_css_node);
- list_add_rcu(&css->rstat_css_node,
- &dcgrp->rstat_css_list);
+ /*
+ * all css_sets of scgrp together in same order to dcgrp,
+ * patch in-flight iterators to preserve correct iteration.
+ * since the iterator is always advanced right away and
+ * finished when it->cset_pos meets it->cset_head, so only
+ * update it->cset_head is enough here.
+ */
+ list_for_each_entry(it, &cset->task_iters, iters_node)
+ if (it->cset_head == &scgrp->e_csets[ss->id])
+ it->cset_head = &dcgrp->e_csets[ss->id];
}
+ spin_unlock_irq(&css_set_lock);
/* default hierarchy doesn't enable controllers by default */
dst_root->subsys_mask |= 1 << ssid;
@@ -1852,7 +1975,7 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
spin_unlock_irq(&css_set_lock);
- if (len >= PATH_MAX)
+ if (len == -E2BIG)
len = -ERANGE;
else if (len > 0) {
seq_escape(sf, buf, " \t\n\\");
@@ -1864,15 +1987,21 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
enum cgroup2_param {
Opt_nsdelegate,
+ Opt_favordynmods,
Opt_memory_localevents,
Opt_memory_recursiveprot,
+ Opt_memory_hugetlb_accounting,
+ Opt_pids_localevents,
nr__cgroup2_params
};
static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
fsparam_flag("nsdelegate", Opt_nsdelegate),
+ fsparam_flag("favordynmods", Opt_favordynmods),
fsparam_flag("memory_localevents", Opt_memory_localevents),
fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot),
+ fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting),
+ fsparam_flag("pids_localevents", Opt_pids_localevents),
{}
};
@@ -1890,16 +2019,32 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param
case Opt_nsdelegate:
ctx->flags |= CGRP_ROOT_NS_DELEGATE;
return 0;
+ case Opt_favordynmods:
+ ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+ return 0;
case Opt_memory_localevents:
ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
return 0;
case Opt_memory_recursiveprot:
ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
return 0;
+ case Opt_memory_hugetlb_accounting:
+ ctx->flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
+ return 0;
+ case Opt_pids_localevents:
+ ctx->flags |= CGRP_ROOT_PIDS_LOCAL_EVENTS;
+ return 0;
}
return -EINVAL;
}
+struct cgroup_of_peak *of_peak(struct kernfs_open_file *of)
+{
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ return &ctx->peak;
+}
+
static void apply_cgroup_root_flags(unsigned int root_flags)
{
if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
@@ -1908,6 +2053,9 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
else
cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
+ cgroup_favor_dynmods(&cgrp_dfl_root,
+ root_flags & CGRP_ROOT_FAVOR_DYNMODS);
+
if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
else
@@ -1917,6 +2065,16 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
else
cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;
+
+ if (root_flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
+ cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
+ else
+ cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
+
+ if (root_flags & CGRP_ROOT_PIDS_LOCAL_EVENTS)
+ cgrp_dfl_root.flags |= CGRP_ROOT_PIDS_LOCAL_EVENTS;
+ else
+ cgrp_dfl_root.flags &= ~CGRP_ROOT_PIDS_LOCAL_EVENTS;
}
}
@@ -1924,10 +2082,16 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
{
if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
seq_puts(seq, ",nsdelegate");
+ if (cgrp_dfl_root.flags & CGRP_ROOT_FAVOR_DYNMODS)
+ seq_puts(seq, ",favordynmods");
if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
seq_puts(seq, ",memory_localevents");
if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
seq_puts(seq, ",memory_recursiveprot");
+ if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
+ seq_puts(seq, ",memory_hugetlb_accounting");
+ if (cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS)
+ seq_puts(seq, ",pids_localevents");
return 0;
}
@@ -1954,12 +2118,16 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
cgrp->dom_cgrp = cgrp;
cgrp->max_descendants = INT_MAX;
cgrp->max_depth = INT_MAX;
- INIT_LIST_HEAD(&cgrp->rstat_css_list);
prev_cputime_init(&cgrp->prev_cputime);
for_each_subsys(ss, ssid)
INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
+#ifdef CONFIG_CGROUP_BPF
+ for (int i = 0; i < ARRAY_SIZE(cgrp->bpf.revisions); i++)
+ cgrp->bpf.revisions[i] = 1;
+#endif
+
init_waitqueue_head(&cgrp->offline_waitq);
INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
}
@@ -1969,12 +2137,13 @@ void init_cgroup_root(struct cgroup_fs_context *ctx)
struct cgroup_root *root = ctx->root;
struct cgroup *cgrp = &root->cgrp;
- INIT_LIST_HEAD(&root->root_list);
+ INIT_LIST_HEAD_RCU(&root->root_list);
atomic_set(&root->nr_cgrps, 1);
cgrp->root = root;
init_cgroup_housekeeping(cgrp);
- root->flags = ctx->flags;
+ /* DYNMODS must be modified through cgroup_favor_dynmods() */
+ root->flags = ctx->flags & ~CGRP_ROOT_FAVOR_DYNMODS;
if (ctx->release_agent)
strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
if (ctx->name)
@@ -2019,21 +2188,22 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
root->kf_root = kernfs_create_root(kf_sops,
KERNFS_ROOT_CREATE_DEACTIVATED |
KERNFS_ROOT_SUPPORT_EXPORTOP |
- KERNFS_ROOT_SUPPORT_USER_XATTR,
+ KERNFS_ROOT_SUPPORT_USER_XATTR |
+ KERNFS_ROOT_INVARIANT_PARENT,
root_cgrp);
if (IS_ERR(root->kf_root)) {
ret = PTR_ERR(root->kf_root);
goto exit_root_id;
}
- root_cgrp->kn = root->kf_root->kn;
+ root_cgrp->kn = kernfs_root_to_node(root->kf_root);
WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
- root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);
+ root_cgrp->ancestors[0] = root_cgrp;
ret = css_populate_dir(&root_cgrp->self);
if (ret)
goto destroy_root;
- ret = cgroup_rstat_init(root_cgrp);
+ ret = css_rstat_init(&root_cgrp->self);
if (ret)
goto destroy_root;
@@ -2041,8 +2211,9 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
if (ret)
goto exit_stats;
- ret = cgroup_bpf_inherit(root_cgrp);
- WARN_ON_ONCE(ret);
+ ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier,
+ CGROUP_LIFETIME_ONLINE, root_cgrp);
+ WARN_ON_ONCE(notifier_to_errno(ret));
trace_cgroup_setup_root(root);
@@ -2051,7 +2222,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
* care of subsystems' refcounts, which are explicitly dropped in
* the failure exit path.
*/
- list_add(&root->root_list, &cgroup_roots);
+ list_add_rcu(&root->root_list, &cgroup_roots);
cgroup_root_count++;
/*
@@ -2073,7 +2244,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
goto out;
exit_stats:
- cgroup_rstat_exit(root_cgrp);
+ css_rstat_exit(&root_cgrp->self);
destroy_root:
kernfs_destroy_root(root->kf_root);
root->kf_root = NULL;
@@ -2107,13 +2278,13 @@ int cgroup_do_get_tree(struct fs_context *fc)
struct super_block *sb = fc->root->d_sb;
struct cgroup *cgrp;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
spin_lock_irq(&css_set_lock);
cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root);
spin_unlock_irq(&css_set_lock);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
nsdentry = kernfs_node_dentry(cgrp->kn, sb);
dput(fc->root);
@@ -2150,7 +2321,7 @@ static int cgroup_get_tree(struct fs_context *fc)
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
int ret;
- cgrp_dfl_visible = true;
+ WRITE_ONCE(cgrp_dfl_visible, true);
cgroup_get_live(&cgrp_dfl_root.cgrp);
ctx->root = &cgrp_dfl_root;
@@ -2196,6 +2367,10 @@ static int cgroup_init_fs_context(struct fs_context *fc)
put_user_ns(fc->user_ns);
fc->user_ns = get_user_ns(ctx->ns->user_ns);
fc->global = true;
+
+ if (have_favordynmods)
+ ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+
return 0;
}
@@ -2211,10 +2386,8 @@ static void cgroup_kill_sb(struct super_block *sb)
* And don't kill the default root.
*/
if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
- !percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
- cgroup_bpf_offline(&root->cgrp);
+ !percpu_ref_is_dying(&root->cgrp.self.refcnt))
percpu_ref_kill(&root->cgrp.self.refcnt);
- }
cgroup_put(&root->cgrp);
kernfs_kill_sb(sb);
}
@@ -2235,10 +2408,38 @@ static struct file_system_type cgroup2_fs_type = {
.fs_flags = FS_USERNS_MOUNT,
};
-#ifdef CONFIG_CPUSETS
+#ifdef CONFIG_CPUSETS_V1
+enum cpuset_param {
+ Opt_cpuset_v2_mode,
+};
+
+static const struct fs_parameter_spec cpuset_fs_parameters[] = {
+ fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode),
+ {}
+};
+
+static int cpuset_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, cpuset_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_cpuset_v2_mode:
+ ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE;
+ return 0;
+ }
+ return -EINVAL;
+}
+
static const struct fs_context_operations cpuset_fs_context_ops = {
.get_tree = cgroup1_get_tree,
.free = cgroup_fs_context_free,
+ .parse_param = cpuset_parse_param,
};
/*
@@ -2275,6 +2476,7 @@ static int cpuset_init_fs_context(struct fs_context *fc)
static struct file_system_type cpuset_fs_type = {
.name = "cpuset",
.init_fs_context = cpuset_init_fs_context,
+ .parameters = cpuset_fs_parameters,
.fs_flags = FS_USERNS_MOUNT,
};
#endif
@@ -2292,56 +2494,92 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
{
int ret;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
spin_lock_irq(&css_set_lock);
ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
spin_unlock_irq(&css_set_lock);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
return ret;
}
EXPORT_SYMBOL_GPL(cgroup_path_ns);
/**
- * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
- * @task: target task
- * @buf: the buffer to write the path into
- * @buflen: the length of the buffer
- *
- * Determine @task's cgroup on the first (the one with the lowest non-zero
- * hierarchy_id) cgroup hierarchy and copy its path into @buf. This
- * function grabs cgroup_mutex and shouldn't be used inside locks used by
- * cgroup controller callbacks.
- *
- * Return value is the same as kernfs_path().
+ * cgroup_attach_lock - Lock for ->attach()
+ * @lock_mode: whether acquire and acquire which rwsem
+ * @tsk: thread group to lock
+ *
+ * cgroup migration sometimes needs to stabilize threadgroups against forks and
+ * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
+ * implementations (e.g. cpuset), also need to disable CPU hotplug.
+ * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can
+ * lead to deadlocks.
+ *
+ * Bringing up a CPU may involve creating and destroying tasks which requires
+ * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside
+ * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while
+ * write-locking threadgroup_rwsem, the locking order is reversed and we end up
+ * waiting for an on-going CPU hotplug operation which in turn is waiting for
+ * the threadgroup_rwsem to be released to create new tasks. For more details:
+ *
+ * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
+ *
+ * Resolve the situation by always acquiring cpus_read_lock() before optionally
+ * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
+ * CPU hotplug is disabled on entry.
+ *
+ * When favordynmods is enabled, take per threadgroup rwsem to reduce overhead
+ * on dynamic cgroup modifications. see the comment above
+ * CGRP_ROOT_FAVOR_DYNMODS definition.
+ *
+ * tsk is not NULL only when writing to cgroup.procs.
*/
-int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
+void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode,
+ struct task_struct *tsk)
{
- struct cgroup_root *root;
- struct cgroup *cgrp;
- int hierarchy_id = 1;
- int ret;
-
- mutex_lock(&cgroup_mutex);
- spin_lock_irq(&css_set_lock);
+ cpus_read_lock();
- root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
+ switch (lock_mode) {
+ case CGRP_ATTACH_LOCK_NONE:
+ break;
+ case CGRP_ATTACH_LOCK_GLOBAL:
+ percpu_down_write(&cgroup_threadgroup_rwsem);
+ break;
+ case CGRP_ATTACH_LOCK_PER_THREADGROUP:
+ down_write(&tsk->signal->cgroup_threadgroup_rwsem);
+ break;
+ default:
+ pr_warn("cgroup: Unexpected attach lock mode.");
+ break;
+ }
+}
- if (root) {
- cgrp = task_cgroup_from_root(task, root);
- ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
- } else {
- /* if no hierarchy exists, everyone is in "/" */
- ret = strlcpy(buf, "/", buflen);
+/**
+ * cgroup_attach_unlock - Undo cgroup_attach_lock()
+ * @lock_mode: whether release and release which rwsem
+ * @tsk: thread group to lock
+ */
+void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode,
+ struct task_struct *tsk)
+{
+ switch (lock_mode) {
+ case CGRP_ATTACH_LOCK_NONE:
+ break;
+ case CGRP_ATTACH_LOCK_GLOBAL:
+ percpu_up_write(&cgroup_threadgroup_rwsem);
+ break;
+ case CGRP_ATTACH_LOCK_PER_THREADGROUP:
+ up_write(&tsk->signal->cgroup_threadgroup_rwsem);
+ break;
+ default:
+ pr_warn("cgroup: Unexpected attach lock mode.");
+ break;
}
- spin_unlock_irq(&css_set_lock);
- mutex_unlock(&cgroup_mutex);
- return ret;
+ cpus_read_unlock();
}
-EXPORT_SYMBOL_GPL(task_cgroup_path);
/**
* cgroup_migrate_add_task - add a migration target task to a migration context
@@ -2425,7 +2663,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
/*
* This function may be called both before and
- * after cgroup_taskset_migrate(). The two cases
+ * after cgroup_migrate_execute(). The two cases
* can be distinguished by looking at whether @cset
* has its ->mg_dst_cset set.
*/
@@ -2570,10 +2808,6 @@ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
return -EOPNOTSUPP;
- /* mixables don't care */
- if (cgroup_is_mixable(dst_cgrp))
- return 0;
-
/*
* If @dst_cgrp is already or can become a thread root or is
* threaded, it doesn't matter.
@@ -2597,21 +2831,27 @@ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
*/
void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
{
- LIST_HEAD(preloaded);
struct css_set *cset, *tmp_cset;
lockdep_assert_held(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
- list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
- list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);
+ list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets,
+ mg_src_preload_node) {
+ cset->mg_src_cgrp = NULL;
+ cset->mg_dst_cgrp = NULL;
+ cset->mg_dst_cset = NULL;
+ list_del_init(&cset->mg_src_preload_node);
+ put_css_set_locked(cset);
+ }
- list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
+ list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets,
+ mg_dst_preload_node) {
cset->mg_src_cgrp = NULL;
cset->mg_dst_cgrp = NULL;
cset->mg_dst_cset = NULL;
- list_del_init(&cset->mg_preload_node);
+ list_del_init(&cset->mg_dst_preload_node);
put_css_set_locked(cset);
}
@@ -2651,7 +2891,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
if (src_cset->dead)
return;
- if (!list_empty(&src_cset->mg_preload_node))
+ if (!list_empty(&src_cset->mg_src_preload_node))
return;
src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
@@ -2664,7 +2904,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
src_cset->mg_src_cgrp = src_cgrp;
src_cset->mg_dst_cgrp = dst_cgrp;
get_css_set(src_cset);
- list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
+ list_add_tail(&src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets);
}
/**
@@ -2689,7 +2929,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
/* look up the dst cset for each src cset and link it to src */
list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
- mg_preload_node) {
+ mg_src_preload_node) {
struct css_set *dst_cset;
struct cgroup_subsys *ss;
int ssid;
@@ -2708,7 +2948,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
if (src_cset == dst_cset) {
src_cset->mg_src_cgrp = NULL;
src_cset->mg_dst_cgrp = NULL;
- list_del_init(&src_cset->mg_preload_node);
+ list_del_init(&src_cset->mg_src_preload_node);
put_css_set(src_cset);
put_css_set(dst_cset);
continue;
@@ -2716,8 +2956,8 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
src_cset->mg_dst_cset = dst_cset;
- if (list_empty(&dst_cset->mg_preload_node))
- list_add_tail(&dst_cset->mg_preload_node,
+ if (list_empty(&dst_cset->mg_dst_preload_node))
+ list_add_tail(&dst_cset->mg_dst_preload_node,
&mgctx->preloaded_dst_csets);
else
put_css_set(dst_cset);
@@ -2754,19 +2994,17 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
struct task_struct *task;
/*
- * Prevent freeing of tasks while we take a snapshot. Tasks that are
- * already PF_EXITING could be freed from underneath us unless we
- * take an rcu_read_lock.
+ * The following thread iteration should be inside an RCU critical
+ * section to prevent tasks from being freed while taking the snapshot.
+ * spin_lock_irq() implies RCU critical section here.
*/
spin_lock_irq(&css_set_lock);
- rcu_read_lock();
task = leader;
do {
cgroup_migrate_add_task(task, mgctx);
if (!threadgroup)
break;
} while_each_thread(leader, task);
- rcu_read_unlock();
spin_unlock_irq(&css_set_lock);
return cgroup_migrate_execute(mgctx);
@@ -2789,14 +3027,12 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
/* look up all src csets */
spin_lock_irq(&css_set_lock);
- rcu_read_lock();
task = leader;
do {
cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
if (!threadgroup)
break;
} while_each_thread(leader, task);
- rcu_read_unlock();
spin_unlock_irq(&css_set_lock);
/* prepare dst csets and commit */
@@ -2813,8 +3049,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
}
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
- bool *locked)
- __acquires(&cgroup_threadgroup_rwsem)
+ enum cgroup_attach_lock_mode *lock_mode)
{
struct task_struct *tsk;
pid_t pid;
@@ -2822,28 +3057,13 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
return ERR_PTR(-EINVAL);
- /*
- * If we migrate a single thread, we don't care about threadgroup
- * stability. If the thread is `current`, it won't exit(2) under our
- * hands or change PID through exec(2). We exclude
- * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write
- * callers by cgroup_mutex.
- * Therefore, we can skip the global lock.
- */
- lockdep_assert_held(&cgroup_mutex);
- if (pid || threadgroup) {
- percpu_down_write(&cgroup_threadgroup_rwsem);
- *locked = true;
- } else {
- *locked = false;
- }
-
+retry_find_task:
rcu_read_lock();
if (pid) {
tsk = find_task_by_vpid(pid);
if (!tsk) {
tsk = ERR_PTR(-ESRCH);
- goto out_unlock_threadgroup;
+ goto out_unlock_rcu;
}
} else {
tsk = current;
@@ -2860,36 +3080,58 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
*/
if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
tsk = ERR_PTR(-EINVAL);
- goto out_unlock_threadgroup;
+ goto out_unlock_rcu;
}
-
get_task_struct(tsk);
- goto out_unlock_rcu;
+ rcu_read_unlock();
-out_unlock_threadgroup:
- if (*locked) {
- percpu_up_write(&cgroup_threadgroup_rwsem);
- *locked = false;
+ /*
+ * If we migrate a single thread, we don't care about threadgroup
+ * stability. If the thread is `current`, it won't exit(2) under our
+ * hands or change PID through exec(2). We exclude
+ * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write callers
+ * by cgroup_mutex. Therefore, we can skip the global lock.
+ */
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (pid || threadgroup) {
+ if (cgroup_enable_per_threadgroup_rwsem)
+ *lock_mode = CGRP_ATTACH_LOCK_PER_THREADGROUP;
+ else
+ *lock_mode = CGRP_ATTACH_LOCK_GLOBAL;
+ } else {
+ *lock_mode = CGRP_ATTACH_LOCK_NONE;
+ }
+
+ cgroup_attach_lock(*lock_mode, tsk);
+
+ if (threadgroup) {
+ if (!thread_group_leader(tsk)) {
+ /*
+ * A race with de_thread from another thread's exec()
+ * may strip us of our leadership. If this happens,
+ * throw this task away and try again.
+ */
+ cgroup_attach_unlock(*lock_mode, tsk);
+ put_task_struct(tsk);
+ goto retry_find_task;
+ }
}
+
+ return tsk;
+
out_unlock_rcu:
rcu_read_unlock();
return tsk;
}
-void cgroup_procs_write_finish(struct task_struct *task, bool locked)
- __releases(&cgroup_threadgroup_rwsem)
+void cgroup_procs_write_finish(struct task_struct *task,
+ enum cgroup_attach_lock_mode lock_mode)
{
- struct cgroup_subsys *ss;
- int ssid;
+ cgroup_attach_unlock(lock_mode, task);
/* release reference from cgroup_procs_write_start() */
put_task_struct(task);
-
- if (locked)
- percpu_up_write(&cgroup_threadgroup_rwsem);
- for_each_subsys(ss, ssid)
- if (ss->post_attach)
- ss->post_attach();
}
static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
@@ -2941,29 +3183,54 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
struct cgroup_subsys_state *d_css;
struct cgroup *dsct;
struct css_set *src_cset;
+ enum cgroup_attach_lock_mode lock_mode;
+ bool has_tasks;
int ret;
lockdep_assert_held(&cgroup_mutex);
- percpu_down_write(&cgroup_threadgroup_rwsem);
-
/* look up all csses currently attached to @cgrp's subtree */
spin_lock_irq(&css_set_lock);
cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
struct cgrp_cset_link *link;
+ /*
+ * As cgroup_update_dfl_csses() is only called by
+ * cgroup_apply_control(). The csses associated with the
+ * given cgrp will not be affected by changes made to
+ * its subtree_control file. We can skip them.
+ */
+ if (dsct == cgrp)
+ continue;
+
list_for_each_entry(link, &dsct->cset_links, cset_link)
cgroup_migrate_add_src(link->cset, dsct, &mgctx);
}
spin_unlock_irq(&css_set_lock);
+ /*
+ * We need to write-lock threadgroup_rwsem while migrating tasks.
+ * However, if there are no source csets for @cgrp, changing its
+ * controllers isn't gonna produce any task migrations and the
+ * write-locking can be skipped safely.
+ */
+ has_tasks = !list_empty(&mgctx.preloaded_src_csets);
+
+ if (has_tasks)
+ lock_mode = CGRP_ATTACH_LOCK_GLOBAL;
+ else
+ lock_mode = CGRP_ATTACH_LOCK_NONE;
+
+ cgroup_attach_lock(lock_mode, NULL);
+
/* NULL dst indicates self on default hierarchy */
ret = cgroup_migrate_prepare_dst(&mgctx);
if (ret)
goto out_finish;
spin_lock_irq(&css_set_lock);
- list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
+ list_for_each_entry(src_cset, &mgctx.preloaded_src_csets,
+ mg_src_preload_node) {
struct task_struct *task, *ntask;
/* all tasks in src_csets need to be migrated */
@@ -2975,7 +3242,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
ret = cgroup_migrate_execute(&mgctx);
out_finish:
cgroup_migrate_finish(&mgctx);
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_unlock(lock_mode, NULL);
return ret;
}
@@ -2996,7 +3263,7 @@ void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
int ssid;
restart:
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
for_each_subsys(ss, ssid) {
@@ -3010,7 +3277,7 @@ restart:
prepare_to_wait(&dsct->offline_waitq, &wait,
TASK_UNINTERRUPTIBLE);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
schedule();
finish_wait(&dsct->offline_waitq, &wait);
@@ -3212,11 +3479,7 @@ static int cgroup_apply_control(struct cgroup *cgrp)
* making the following cgroup_update_dfl_csses() properly update
* css associations of all tasks in the subtree.
*/
- ret = cgroup_update_dfl_csses(cgrp);
- if (ret)
- return ret;
-
- return 0;
+ return cgroup_update_dfl_csses(cgrp);
}
/**
@@ -3565,18 +3828,90 @@ static int cgroup_events_show(struct seq_file *seq, void *v)
static int cgroup_stat_show(struct seq_file *seq, void *v)
{
struct cgroup *cgroup = seq_css(seq)->cgroup;
+ struct cgroup_subsys_state *css;
+ int dying_cnt[CGROUP_SUBSYS_COUNT];
+ int ssid;
seq_printf(seq, "nr_descendants %d\n",
cgroup->nr_descendants);
+
+ /*
+ * Show the number of live and dying csses associated with each of
+ * non-inhibited cgroup subsystems that is bound to cgroup v2.
+ *
+ * Without proper lock protection, racing is possible. So the
+ * numbers may not be consistent when that happens.
+ */
+ rcu_read_lock();
+ for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) {
+ dying_cnt[ssid] = -1;
+ if ((BIT(ssid) & cgrp_dfl_inhibit_ss_mask) ||
+ (cgroup_subsys[ssid]->root != &cgrp_dfl_root))
+ continue;
+ css = rcu_dereference_raw(cgroup->subsys[ssid]);
+ dying_cnt[ssid] = cgroup->nr_dying_subsys[ssid];
+ seq_printf(seq, "nr_subsys_%s %d\n", cgroup_subsys[ssid]->name,
+ css ? (css->nr_descendants + 1) : 0);
+ }
+
seq_printf(seq, "nr_dying_descendants %d\n",
cgroup->nr_dying_descendants);
+ for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) {
+ if (dying_cnt[ssid] >= 0)
+ seq_printf(seq, "nr_dying_subsys_%s %d\n",
+ cgroup_subsys[ssid]->name, dying_cnt[ssid]);
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+static int cgroup_core_local_stat_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ unsigned int sequence;
+ u64 freeze_time;
+
+ do {
+ sequence = read_seqcount_begin(&cgrp->freezer.freeze_seq);
+ freeze_time = cgrp->freezer.frozen_nsec;
+ /* Add in current freezer interval if the cgroup is freezing. */
+ if (test_bit(CGRP_FREEZE, &cgrp->flags))
+ freeze_time += (ktime_get_ns() -
+ cgrp->freezer.freeze_start_nsec);
+ } while (read_seqcount_retry(&cgrp->freezer.freeze_seq, sequence));
+
+ do_div(freeze_time, NSEC_PER_USEC);
+ seq_printf(seq, "frozen_usec %llu\n", freeze_time);
return 0;
}
-static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
- struct cgroup *cgrp, int ssid)
+#ifdef CONFIG_CGROUP_SCHED
+/**
+ * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest
+ *
+ * Find and get @cgrp's css associated with @ss. If the css doesn't exist
+ * or is offline, %NULL is returned.
+ */
+static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ struct cgroup_subsys_state *css;
+
+ rcu_read_lock();
+ css = cgroup_css(cgrp, ss);
+ if (css && !css_tryget_online(css))
+ css = NULL;
+ rcu_read_unlock();
+
+ return css;
+}
+
+static int cgroup_extra_stat_show(struct seq_file *seq, int ssid)
{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
struct cgroup_subsys *ss = cgroup_subsys[ssid];
struct cgroup_subsys_state *css;
int ret;
@@ -3593,14 +3928,44 @@ static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
return ret;
}
+static int cgroup_local_stat_show(struct seq_file *seq,
+ struct cgroup *cgrp, int ssid)
+{
+ struct cgroup_subsys *ss = cgroup_subsys[ssid];
+ struct cgroup_subsys_state *css;
+ int ret;
+
+ if (!ss->css_local_stat_show)
+ return 0;
+
+ css = cgroup_tryget_css(cgrp, ss);
+ if (!css)
+ return 0;
+
+ ret = ss->css_local_stat_show(seq, css);
+ css_put(css);
+ return ret;
+}
+#endif
+
static int cpu_stat_show(struct seq_file *seq, void *v)
{
- struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
int ret = 0;
cgroup_base_stat_cputime_show(seq);
#ifdef CONFIG_CGROUP_SCHED
- ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
+ ret = cgroup_extra_stat_show(seq, cpu_cgrp_id);
+#endif
+ return ret;
+}
+
+static int cpu_local_stat_show(struct seq_file *seq, void *v)
+{
+ struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
+ int ret = 0;
+
+#ifdef CONFIG_CGROUP_SCHED
+ ret = cgroup_local_stat_show(seq, cgrp, cpu_cgrp_id);
#endif
return ret;
}
@@ -3609,27 +3974,27 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+ struct psi_group *psi = cgroup_psi(cgrp);
return psi_show(seq, psi, PSI_IO);
}
static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+ struct psi_group *psi = cgroup_psi(cgrp);
return psi_show(seq, psi, PSI_MEM);
}
static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+ struct psi_group *psi = cgroup_psi(cgrp);
return psi_show(seq, psi, PSI_CPU);
}
-static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
- size_t nbytes, enum psi_res res)
+static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, enum psi_res res)
{
struct cgroup_file_ctx *ctx = of->priv;
struct psi_trigger *new;
@@ -3643,15 +4008,20 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
cgroup_get(cgrp);
cgroup_kn_unlock(of->kn);
- psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
- new = psi_trigger_create(psi, buf, nbytes, res);
+ /* Allow only one trigger per file descriptor */
+ if (ctx->psi.trigger) {
+ cgroup_put(cgrp);
+ return -EBUSY;
+ }
+
+ psi = cgroup_psi(cgrp);
+ new = psi_trigger_create(psi, buf, res, of->file, of);
if (IS_ERR(new)) {
cgroup_put(cgrp);
return PTR_ERR(new);
}
- psi_trigger_replace(&ctx->psi.trigger, new);
-
+ smp_store_release(&ctx->psi.trigger, new);
cgroup_put(cgrp);
return nbytes;
@@ -3661,21 +4031,86 @@ static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
+ return pressure_write(of, buf, nbytes, PSI_IO);
}
static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
+ return pressure_write(of, buf, nbytes, PSI_MEM);
}
static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
+ return pressure_write(of, buf, nbytes, PSI_CPU);
+}
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static int cgroup_irq_pressure_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup_psi(cgrp);
+
+ return psi_show(seq, psi, PSI_IRQ);
+}
+
+static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return pressure_write(of, buf, nbytes, PSI_IRQ);
+}
+#endif
+
+static int cgroup_pressure_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup_psi(cgrp);
+
+ seq_printf(seq, "%d\n", psi->enabled);
+
+ return 0;
+}
+
+static ssize_t cgroup_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ ssize_t ret;
+ int enable;
+ struct cgroup *cgrp;
+ struct psi_group *psi;
+
+ ret = kstrtoint(strstrip(buf), 0, &enable);
+ if (ret)
+ return ret;
+
+ if (enable < 0 || enable > 1)
+ return -ERANGE;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ psi = cgroup_psi(cgrp);
+ if (psi->enabled != enable) {
+ int i;
+
+ /* show or hide {cpu,memory,io,irq}.pressure files */
+ for (i = 0; i < NR_PSI_RESOURCES; i++)
+ cgroup_file_show(&cgrp->psi_files[i], enable);
+
+ psi->enabled = enable;
+ if (enable)
+ psi_cgroup_restart(psi);
+ }
+
+ cgroup_kn_unlock(of->kn);
+
+ return nbytes;
}
static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
@@ -3690,11 +4125,14 @@ static void cgroup_pressure_release(struct kernfs_open_file *of)
{
struct cgroup_file_ctx *ctx = of->priv;
- psi_trigger_replace(&ctx->psi.trigger, NULL);
+ psi_trigger_destroy(ctx->psi.trigger);
}
bool cgroup_psi_enabled(void)
{
+ if (static_branch_likely(&psi_disabled))
+ return false;
+
return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
}
@@ -3748,7 +4186,7 @@ static void __cgroup_kill(struct cgroup *cgrp)
lockdep_assert_held(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
- set_bit(CGRP_KILL, &cgrp->flags);
+ cgrp->kill_seq++;
spin_unlock_irq(&css_set_lock);
css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
@@ -3764,10 +4202,6 @@ static void __cgroup_kill(struct cgroup *cgrp)
send_sig(SIGKILL, task, 0);
}
css_task_iter_end(&it);
-
- spin_lock_irq(&css_set_lock);
- clear_bit(CGRP_KILL, &cgrp->flags);
- spin_unlock_irq(&css_set_lock);
}
static void cgroup_kill(struct cgroup *cgrp)
@@ -3848,13 +4282,14 @@ static void cgroup_file_release(struct kernfs_open_file *of)
cft->release(of);
put_cgroup_ns(ctx->ns);
kfree(ctx);
+ of->priv = NULL;
}
static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
struct cgroup_file_ctx *ctx = of->priv;
- struct cgroup *cgrp = of->kn->parent->priv;
+ struct cgroup *cgrp = kn_priv(of->kn);
struct cftype *cft = of_cft(of);
struct cgroup_subsys_state *css;
int ret;
@@ -3866,7 +4301,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
* If namespaces are delegation boundaries, disallow writes to
* files in an non-init namespace root from inside the namespace
* except for the files explicitly marked delegatable -
- * cgroup.procs and cgroup.subtree_control.
+ * eg. cgroup.procs, cgroup.threads and cgroup.subtree_control.
*/
if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
!(cft->flags & CFTYPE_NS_DELEGATABLE) &&
@@ -3967,20 +4402,6 @@ static struct kernfs_ops cgroup_kf_ops = {
.seq_show = cgroup_seqfile_show,
};
-/* set uid and gid of cgroup dirs and files to that of the creator */
-static int cgroup_kn_set_ugid(struct kernfs_node *kn)
-{
- struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
- .ia_uid = current_fsuid(),
- .ia_gid = current_fsgid(), };
-
- if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
- gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
- return 0;
-
- return kernfs_setattr(kn, &iattr);
-}
-
static void cgroup_file_notify_timer(struct timer_list *timer)
{
cgroup_file_notify(container_of(timer, struct cgroup_file,
@@ -3993,25 +4414,18 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
char name[CGROUP_FILE_NAME_MAX];
struct kernfs_node *kn;
struct lock_class_key *key = NULL;
- int ret;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
key = &cft->lockdep_key;
#endif
kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
cgroup_file_mode(cft),
- GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
+ current_fsuid(), current_fsgid(),
0, cft->kf_ops, cft,
NULL, key);
if (IS_ERR(kn))
return PTR_ERR(kn);
- ret = cgroup_kn_set_ugid(kn);
- if (ret) {
- kernfs_remove(kn);
- return ret;
- }
-
if (cft->file_offset) {
struct cgroup_file *cfile = (void *)css + cft->file_offset;
@@ -4047,8 +4461,6 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
restart:
for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
/* does cft->flags tell us to skip this file on @cgrp? */
- if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
- continue;
if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
continue;
if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
@@ -4113,21 +4525,25 @@ static void cgroup_exit_cftypes(struct cftype *cfts)
cft->ss = NULL;
/* revert flags set by cgroup core while adding @cfts */
- cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
+ cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL |
+ __CFTYPE_ADDED);
}
}
static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
struct cftype *cft;
+ int ret = 0;
for (cft = cfts; cft->name[0] != '\0'; cft++) {
struct kernfs_ops *kf_ops;
WARN_ON(cft->ss || cft->kf_ops);
- if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
- continue;
+ if (cft->flags & __CFTYPE_ADDED) {
+ ret = -EBUSY;
+ break;
+ }
if (cft->seq_start)
kf_ops = &cgroup_kf_ops;
@@ -4141,30 +4557,29 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
if (!kf_ops) {
- cgroup_exit_cftypes(cfts);
- return -ENOMEM;
+ ret = -ENOMEM;
+ break;
}
kf_ops->atomic_write_len = cft->max_write_len;
}
cft->kf_ops = kf_ops;
cft->ss = ss;
+ cft->flags |= __CFTYPE_ADDED;
}
- return 0;
+ if (ret)
+ cgroup_exit_cftypes(cfts);
+ return ret;
}
-static int cgroup_rm_cftypes_locked(struct cftype *cfts)
+static void cgroup_rm_cftypes_locked(struct cftype *cfts)
{
lockdep_assert_held(&cgroup_mutex);
- if (!cfts || !cfts[0].ss)
- return -ENOENT;
-
list_del(&cfts->node);
cgroup_apply_cftypes(cfts, false);
cgroup_exit_cftypes(cfts);
- return 0;
}
/**
@@ -4180,12 +4595,16 @@ static int cgroup_rm_cftypes_locked(struct cftype *cfts)
*/
int cgroup_rm_cftypes(struct cftype *cfts)
{
- int ret;
+ if (!cfts || cfts[0].name[0] == '\0')
+ return 0;
- mutex_lock(&cgroup_mutex);
- ret = cgroup_rm_cftypes_locked(cfts);
- mutex_unlock(&cgroup_mutex);
- return ret;
+ if (!(cfts[0].flags & __CFTYPE_ADDED))
+ return -ENOENT;
+
+ cgroup_lock();
+ cgroup_rm_cftypes_locked(cfts);
+ cgroup_unlock();
+ return 0;
}
/**
@@ -4202,7 +4621,7 @@ int cgroup_rm_cftypes(struct cftype *cfts)
* function currently returns 0 as long as @cfts registration is successful
* even if some file creation attempts on existing cgroups fail.
*/
-static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
int ret;
@@ -4216,14 +4635,14 @@ static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
if (ret)
return ret;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
list_add_tail(&cfts->node, &ss->cfts);
ret = cgroup_apply_cftypes(cfts, true);
if (ret)
cgroup_rm_cftypes_locked(cfts);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
return ret;
}
@@ -4285,6 +4704,27 @@ void cgroup_file_notify(struct cgroup_file *cfile)
}
spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
}
+EXPORT_SYMBOL_GPL(cgroup_file_notify);
+
+/**
+ * cgroup_file_show - show or hide a hidden cgroup file
+ * @cfile: target cgroup_file obtained by setting cftype->file_offset
+ * @show: whether to show or hide
+ */
+void cgroup_file_show(struct cgroup_file *cfile, bool show)
+{
+ struct kernfs_node *kn;
+
+ spin_lock_irq(&cgroup_file_kn_lock);
+ kn = cfile->kn;
+ kernfs_get(kn);
+ spin_unlock_irq(&cgroup_file_kn_lock);
+
+ if (kn)
+ kernfs_show(kn, show);
+
+ kernfs_put(kn);
+}
/**
* css_next_child - find the next child of a given css
@@ -4361,8 +4801,9 @@ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
*
* While this function requires cgroup_mutex or RCU read locking, it
* doesn't require the whole traversal to be contained in a single critical
- * section. This function will return the correct next descendant as long
- * as both @pos and @root are accessible and @pos is a descendant of @root.
+ * section. Additionally, it isn't necessary to hold onto a reference to @pos.
+ * This function will return the correct next descendant as long as both @pos
+ * and @root are accessible and @pos is a descendant of @root.
*
* If a subsystem synchronizes ->css_online() and the start of iteration, a
* css which finished ->css_online() is guaranteed to be visible in the
@@ -4410,8 +4851,9 @@ EXPORT_SYMBOL_GPL(css_next_descendant_pre);
*
* While this function requires cgroup_mutex or RCU read locking, it
* doesn't require the whole traversal to be contained in a single critical
- * section. This function will return the correct rightmost descendant as
- * long as @pos is accessible.
+ * section. Additionally, it isn't necessary to hold onto a reference to @pos.
+ * This function will return the correct rightmost descendant as long as @pos
+ * is accessible.
*/
struct cgroup_subsys_state *
css_rightmost_descendant(struct cgroup_subsys_state *pos)
@@ -4455,9 +4897,9 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos)
*
* While this function requires cgroup_mutex or RCU read locking, it
* doesn't require the whole traversal to be contained in a single critical
- * section. This function will return the correct next descendant as long
- * as both @pos and @cgroup are accessible and @pos is a descendant of
- * @cgroup.
+ * section. Additionally, it isn't necessary to hold onto a reference to @pos.
+ * This function will return the correct next descendant as long as both @pos
+ * and @cgroup are accessible and @pos is a descendant of @cgroup.
*
* If a subsystem synchronizes ->css_online() and the start of iteration, a
* css which finished ->css_online() is guaranteed to be visible in the
@@ -4700,9 +5142,11 @@ repeat:
void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
struct css_task_iter *it)
{
+ unsigned long irqflags;
+
memset(it, 0, sizeof(*it));
- spin_lock_irq(&css_set_lock);
+ spin_lock_irqsave(&css_set_lock, irqflags);
it->ss = css->ss;
it->flags = flags;
@@ -4716,7 +5160,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
css_task_iter_advance(it);
- spin_unlock_irq(&css_set_lock);
+ spin_unlock_irqrestore(&css_set_lock, irqflags);
}
/**
@@ -4729,12 +5173,14 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
*/
struct task_struct *css_task_iter_next(struct css_task_iter *it)
{
+ unsigned long irqflags;
+
if (it->cur_task) {
put_task_struct(it->cur_task);
it->cur_task = NULL;
}
- spin_lock_irq(&css_set_lock);
+ spin_lock_irqsave(&css_set_lock, irqflags);
/* @it may be half-advanced by skips, finish advancing */
if (it->flags & CSS_TASK_ITER_SKIPPED)
@@ -4747,7 +5193,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
css_task_iter_advance(it);
}
- spin_unlock_irq(&css_set_lock);
+ spin_unlock_irqrestore(&css_set_lock, irqflags);
return it->cur_task;
}
@@ -4760,11 +5206,13 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
*/
void css_task_iter_end(struct css_task_iter *it)
{
+ unsigned long irqflags;
+
if (it->cur_cset) {
- spin_lock_irq(&css_set_lock);
+ spin_lock_irqsave(&css_set_lock, irqflags);
list_del(&it->iters_node);
put_css_set_locked(it->cur_cset);
- spin_unlock_irq(&css_set_lock);
+ spin_unlock_irqrestore(&css_set_lock, irqflags);
}
if (it->cur_dcset)
@@ -4853,7 +5301,7 @@ static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
if (!inode)
return -ENOMEM;
- ret = inode_permission(&init_user_ns, inode, MAY_WRITE);
+ ret = inode_permission(&nop_mnt_idmap, inode, MAY_WRITE);
iput(inode);
return ret;
}
@@ -4916,15 +5364,14 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *src_cgrp, *dst_cgrp;
struct task_struct *task;
- const struct cred *saved_cred;
ssize_t ret;
- bool locked;
+ enum cgroup_attach_lock_mode lock_mode;
dst_cgrp = cgroup_kn_lock_live(of->kn, false);
if (!dst_cgrp)
return -ENODEV;
- task = cgroup_procs_write_start(buf, threadgroup, &locked);
+ task = cgroup_procs_write_start(buf, threadgroup, &lock_mode);
ret = PTR_ERR_OR_ZERO(task);
if (ret)
goto out_unlock;
@@ -4939,18 +5386,17 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
* permissions using the credentials from file open to protect against
* inherited fd attacks.
*/
- saved_cred = override_creds(of->file->f_cred);
- ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
- of->file->f_path.dentry->d_sb,
- threadgroup, ctx->ns);
- revert_creds(saved_cred);
+ scoped_with_creds(of->file->f_cred)
+ ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
+ of->file->f_path.dentry->d_sb,
+ threadgroup, ctx->ns);
if (ret)
goto out_finish;
ret = cgroup_attach_task(dst_cgrp, task, threadgroup);
out_finish:
- cgroup_procs_write_finish(task, locked);
+ cgroup_procs_write_finish(task, lock_mode);
out_unlock:
cgroup_kn_unlock(of->kn);
@@ -5032,6 +5478,11 @@ static struct cftype cgroup_base_files[] = {
.seq_show = cgroup_stat_show,
},
{
+ .name = "cgroup.stat.local",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_core_local_stat_show,
+ },
+ {
.name = "cgroup.freeze",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_freeze_show,
@@ -5046,10 +5497,18 @@ static struct cftype cgroup_base_files[] = {
.name = "cpu.stat",
.seq_show = cpu_stat_show,
},
+ {
+ .name = "cpu.stat.local",
+ .seq_show = cpu_local_stat_show,
+ },
+ { } /* terminate */
+};
+
+static struct cftype cgroup_psi_files[] = {
#ifdef CONFIG_PSI
{
.name = "io.pressure",
- .flags = CFTYPE_PRESSURE,
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
.seq_show = cgroup_io_pressure_show,
.write = cgroup_io_pressure_write,
.poll = cgroup_pressure_poll,
@@ -5057,7 +5516,7 @@ static struct cftype cgroup_base_files[] = {
},
{
.name = "memory.pressure",
- .flags = CFTYPE_PRESSURE,
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
.seq_show = cgroup_memory_pressure_show,
.write = cgroup_memory_pressure_write,
.poll = cgroup_pressure_poll,
@@ -5065,12 +5524,27 @@ static struct cftype cgroup_base_files[] = {
},
{
.name = "cpu.pressure",
- .flags = CFTYPE_PRESSURE,
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
.seq_show = cgroup_cpu_pressure_show,
.write = cgroup_cpu_pressure_write,
.poll = cgroup_pressure_poll,
.release = cgroup_pressure_release,
},
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ {
+ .name = "irq.pressure",
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
+ .seq_show = cgroup_irq_pressure_show,
+ .write = cgroup_irq_pressure_write,
+ .poll = cgroup_pressure_poll,
+ .release = cgroup_pressure_release,
+ },
+#endif
+ {
+ .name = "cgroup.pressure",
+ .seq_show = cgroup_pressure_show,
+ .write = cgroup_pressure_write,
+ },
#endif /* CONFIG_PSI */
{ } /* terminate */
};
@@ -5091,7 +5565,7 @@ static struct cftype cgroup_base_files[] = {
* RCU callback.
*
* 4. After the grace period, the css can be freed. Implemented in
- * css_free_work_fn().
+ * css_free_rwork_fn().
*
* It is actually hairier because both step 2 and 4 require process context
* and thus involve punting to css->destroy_work adding two additional
@@ -5105,8 +5579,9 @@ static void css_free_rwork_fn(struct work_struct *work)
struct cgroup *cgrp = css->cgroup;
percpu_ref_exit(&css->refcnt);
+ css_rstat_exit(css);
- if (ss) {
+ if (!css_is_self(css)) {
/* css free path */
struct cgroup_subsys_state *parent = css->parent;
int id = css->id;
@@ -5120,8 +5595,10 @@ static void css_free_rwork_fn(struct work_struct *work)
} else {
/* cgroup free path */
atomic_dec(&cgrp->root->nr_cgrps);
- cgroup1_pidlist_destroy_all(cgrp);
+ if (!cgroup_on_dfl(cgrp))
+ cgroup1_pidlist_destroy_all(cgrp);
cancel_work_sync(&cgrp->release_agent_work);
+ bpf_cgrp_storage_free(cgrp);
if (cgroup_parent(cgrp)) {
/*
@@ -5133,7 +5610,6 @@ static void css_free_rwork_fn(struct work_struct *work)
cgroup_put(cgroup_parent(cgrp));
kernfs_put(cgrp->kn);
psi_cgroup_free(cgrp);
- cgroup_rstat_exit(cgrp);
kfree(cgrp);
} else {
/*
@@ -5153,28 +5629,41 @@ static void css_release_work_fn(struct work_struct *work)
struct cgroup_subsys *ss = css->ss;
struct cgroup *cgrp = css->cgroup;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
css->flags |= CSS_RELEASED;
list_del_rcu(&css->sibling);
- if (ss) {
- /* css release path */
- if (!list_empty(&css->rstat_css_node)) {
- cgroup_rstat_flush(cgrp);
- list_del_rcu(&css->rstat_css_node);
- }
+ if (!css_is_self(css)) {
+ struct cgroup *parent_cgrp;
+
+ css_rstat_flush(css);
cgroup_idr_replace(&ss->css_idr, NULL, css->id);
if (ss->css_released)
ss->css_released(css);
+
+ cgrp->nr_dying_subsys[ss->id]--;
+ /*
+ * When a css is released and ready to be freed, its
+ * nr_descendants must be zero. However, the corresponding
+ * cgrp->nr_dying_subsys[ss->id] may not be 0 if a subsystem
+ * is activated and deactivated multiple times with one or
+ * more of its previous activation leaving behind dying csses.
+ */
+ WARN_ON_ONCE(css->nr_descendants);
+ parent_cgrp = cgroup_parent(cgrp);
+ while (parent_cgrp) {
+ parent_cgrp->nr_dying_subsys[ss->id]--;
+ parent_cgrp = cgroup_parent(parent_cgrp);
+ }
} else {
struct cgroup *tcgrp;
/* cgroup release path */
TRACE_CGROUP_PATH(release, cgrp);
- cgroup_rstat_flush(cgrp);
+ css_rstat_flush(&cgrp->self);
spin_lock_irq(&css_set_lock);
for (tcgrp = cgroup_parent(cgrp); tcgrp;
@@ -5194,10 +5683,10 @@ static void css_release_work_fn(struct work_struct *work)
NULL);
}
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
- queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
+ queue_rcu_work(cgroup_free_wq, &css->destroy_rwork);
}
static void css_release(struct percpu_ref *ref)
@@ -5206,7 +5695,7 @@ static void css_release(struct percpu_ref *ref)
container_of(ref, struct cgroup_subsys_state, refcnt);
INIT_WORK(&css->destroy_work, css_release_work_fn);
- queue_work(cgroup_destroy_wq, &css->destroy_work);
+ queue_work(cgroup_release_wq, &css->destroy_work);
}
static void init_and_link_css(struct cgroup_subsys_state *css,
@@ -5222,7 +5711,6 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
css->id = -1;
INIT_LIST_HEAD(&css->sibling);
INIT_LIST_HEAD(&css->children);
- INIT_LIST_HEAD(&css->rstat_css_node);
css->serial_nr = css_serial_nr_next++;
atomic_set(&css->online_cnt, 0);
@@ -5231,9 +5719,6 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
css_get(css->parent);
}
- if (ss->css_rstat_flush)
- list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list);
-
BUG_ON(cgroup_css(cgrp, ss));
}
@@ -5252,8 +5737,11 @@ static int online_css(struct cgroup_subsys_state *css)
rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
atomic_inc(&css->online_cnt);
- if (css->parent)
+ if (css->parent) {
atomic_inc(&css->parent->online_cnt);
+ while ((css = css->parent))
+ css->nr_descendants++;
+ }
}
return ret;
}
@@ -5275,6 +5763,16 @@ static void offline_css(struct cgroup_subsys_state *css)
RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
wake_up_all(&css->cgroup->offline_waitq);
+
+ css->cgroup->nr_dying_subsys[ss->id]++;
+ /*
+ * Parent css and cgroup cannot be freed until after the freeing
+ * of child css, see css_free_rwork_fn().
+ */
+ while ((css = css->parent)) {
+ css->nr_descendants--;
+ css->cgroup->nr_dying_subsys[ss->id]++;
+ }
}
/**
@@ -5313,6 +5811,10 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
goto err_free_css;
css->id = err;
+ err = css_rstat_init(css);
+ if (err)
+ goto err_free_css;
+
/* @css is ready to be brought online now, make it visible */
list_add_tail_rcu(&css->sibling, &parent_css->children);
cgroup_idr_replace(&ss->css_idr, css, css->id);
@@ -5326,16 +5828,14 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
err_list_del:
list_del_rcu(&css->sibling);
err_free_css:
- list_del_rcu(&css->rstat_css_node);
INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
- queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
+ queue_rcu_work(cgroup_free_wq, &css->destroy_rwork);
return ERR_PTR(err);
}
/*
* The returned cgroup is fully initialized including its control mask, but
- * it isn't associated with its kernfs_node and doesn't have the control
- * mask applied.
+ * it doesn't have the control mask applied.
*/
static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
umode_t mode)
@@ -5343,12 +5843,11 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
struct cgroup_root *root = parent->root;
struct cgroup *cgrp, *tcgrp;
struct kernfs_node *kn;
- int level = parent->level + 1;
+ int i, level = parent->level + 1;
int ret;
/* allocate the cgroup and its ID, 0 is reserved for the root */
- cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)),
- GFP_KERNEL);
+ cgrp = kzalloc(struct_size(cgrp, ancestors, (level + 1)), GFP_KERNEL);
if (!cgrp)
return ERR_PTR(-ENOMEM);
@@ -5356,15 +5855,13 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
if (ret)
goto out_free_cgrp;
- ret = cgroup_rstat_init(cgrp);
- if (ret)
- goto out_cancel_ref;
-
/* create the directory */
- kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
+ kn = kernfs_create_dir_ns(parent->kn, name, mode,
+ current_fsuid(), current_fsgid(),
+ cgrp, NULL);
if (IS_ERR(kn)) {
ret = PTR_ERR(kn);
- goto out_stat_exit;
+ goto out_cancel_ref;
}
cgrp->kn = kn;
@@ -5374,19 +5871,27 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
cgrp->root = root;
cgrp->level = level;
- ret = psi_cgroup_alloc(cgrp);
+ /*
+ * Now that init_cgroup_housekeeping() has been called and cgrp->self
+ * is setup, it is safe to perform rstat initialization on it.
+ */
+ ret = css_rstat_init(&cgrp->self);
if (ret)
goto out_kernfs_remove;
- ret = cgroup_bpf_inherit(cgrp);
+ ret = psi_cgroup_alloc(cgrp);
if (ret)
- goto out_psi_free;
+ goto out_stat_exit;
+
+ for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp))
+ cgrp->ancestors[tcgrp->level] = tcgrp;
/*
* New cgroup inherits effective freeze counter, and
* if the parent has to be frozen, the child has too.
*/
cgrp->freezer.e_freeze = parent->freezer.e_freeze;
+ seqcount_spinlock_init(&cgrp->freezer.freeze_seq, &css_set_lock);
if (cgrp->freezer.e_freeze) {
/*
* Set the CGRP_FREEZE flag, so when a process will be
@@ -5395,27 +5900,10 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
* consider it frozen immediately.
*/
set_bit(CGRP_FREEZE, &cgrp->flags);
+ cgrp->freezer.freeze_start_nsec = ktime_get_ns();
set_bit(CGRP_FROZEN, &cgrp->flags);
}
- spin_lock_irq(&css_set_lock);
- for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
- cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp);
-
- if (tcgrp != cgrp) {
- tcgrp->nr_descendants++;
-
- /*
- * If the new cgroup is frozen, all ancestor cgroups
- * get a new frozen descendant, but their state can't
- * change because of this.
- */
- if (cgrp->freezer.e_freeze)
- tcgrp->freezer.nr_frozen_descendants++;
- }
- }
- spin_unlock_irq(&css_set_lock);
-
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -5424,7 +5912,29 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
cgrp->self.serial_nr = css_serial_nr_next++;
+ ret = blocking_notifier_call_chain_robust(&cgroup_lifetime_notifier,
+ CGROUP_LIFETIME_ONLINE,
+ CGROUP_LIFETIME_OFFLINE, cgrp);
+ ret = notifier_to_errno(ret);
+ if (ret)
+ goto out_psi_free;
+
/* allocation complete, commit to creation */
+ spin_lock_irq(&css_set_lock);
+ for (i = 0; i < level; i++) {
+ tcgrp = cgrp->ancestors[i];
+ tcgrp->nr_descendants++;
+
+ /*
+ * If the new cgroup is frozen, all ancestor cgroups get a new
+ * frozen descendant, but their state can't change because of
+ * this.
+ */
+ if (cgrp->freezer.e_freeze)
+ tcgrp->freezer.nr_frozen_descendants++;
+ }
+ spin_unlock_irq(&css_set_lock);
+
list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
atomic_inc(&root->nr_cgrps);
cgroup_get_live(parent);
@@ -5442,10 +5952,10 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
out_psi_free:
psi_cgroup_free(cgrp);
+out_stat_exit:
+ css_rstat_exit(&cgrp->self);
out_kernfs_remove:
kernfs_remove(cgrp->kn);
-out_stat_exit:
- cgroup_rstat_exit(cgrp);
out_cancel_ref:
percpu_ref_exit(&cgrp->self.refcnt);
out_free_cgrp:
@@ -5457,7 +5967,7 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
{
struct cgroup *cgroup;
int ret = false;
- int level = 1;
+ int level = 0;
lockdep_assert_held(&cgroup_mutex);
@@ -5465,7 +5975,7 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
if (cgroup->nr_descendants >= cgroup->max_descendants)
goto fail;
- if (level > cgroup->max_depth)
+ if (level >= cgroup->max_depth)
goto fail;
level++;
@@ -5501,15 +6011,11 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
}
/*
- * This extra ref will be put in cgroup_free_fn() and guarantees
+ * This extra ref will be put in css_free_rwork_fn() and guarantees
* that @cgrp->kn is always accessible.
*/
kernfs_get(cgrp->kn);
- ret = cgroup_kn_set_ugid(cgrp->kn);
- if (ret)
- goto out_destroy;
-
ret = css_populate_dir(&cgrp->self);
if (ret)
goto out_destroy;
@@ -5543,7 +6049,7 @@ static void css_killed_work_fn(struct work_struct *work)
struct cgroup_subsys_state *css =
container_of(work, struct cgroup_subsys_state, destroy_work);
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
do {
offline_css(css);
@@ -5552,7 +6058,7 @@ static void css_killed_work_fn(struct work_struct *work)
css = css->parent;
} while (css && atomic_dec_and_test(&css->online_cnt));
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
}
/* css kill confirmation processing requires process context, bounce */
@@ -5563,7 +6069,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
if (atomic_dec_and_test(&css->online_cnt)) {
INIT_WORK(&css->destroy_work, css_killed_work_fn);
- queue_work(cgroup_destroy_wq, &css->destroy_work);
+ queue_work(cgroup_offline_wq, &css->destroy_work);
}
}
@@ -5583,6 +6089,12 @@ static void kill_css(struct cgroup_subsys_state *css)
if (css->flags & CSS_DYING)
return;
+ /*
+ * Call css_killed(), if defined, before setting the CSS_DYING flag
+ */
+ if (css->ss->css_killed)
+ css->ss->css_killed(css);
+
css->flags |= CSS_DYING;
/*
@@ -5640,7 +6152,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
struct cgroup_subsys_state *css;
struct cgrp_cset_link *link;
- int ssid;
+ int ssid, ret;
lockdep_assert_held(&cgroup_mutex);
@@ -5662,7 +6174,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
/*
* Mark @cgrp and the associated csets dead. The former prevents
* further task migration and child creation by disabling
- * cgroup_lock_live_group(). The latter makes the csets ignored by
+ * cgroup_kn_lock_live(). The latter makes the csets ignored by
* the migration path.
*/
cgrp->self.flags &= ~CSS_ONLINE;
@@ -5680,11 +6192,11 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
css_clear_dir(&cgrp->self);
kernfs_remove(cgrp->kn);
- if (parent && cgroup_is_threaded(cgrp))
+ if (cgroup_is_threaded(cgrp))
parent->nr_threaded_children--;
spin_lock_irq(&css_set_lock);
- for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
+ for (tcgrp = parent; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
tcgrp->nr_descendants--;
tcgrp->nr_dying_descendants++;
/*
@@ -5698,7 +6210,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
cgroup1_check_for_release(parent);
- cgroup_bpf_offline(cgrp);
+ ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier,
+ CGROUP_LIFETIME_OFFLINE, cgrp);
+ WARN_ON_ONCE(notifier_to_errno(ret));
/* put the base reference */
percpu_ref_kill(&cgrp->self.refcnt);
@@ -5736,7 +6250,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
pr_debug("Initializing cgroup subsys %s\n", ss->name);
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
idr_init(&ss->css_idr);
INIT_LIST_HEAD(&ss->cfts);
@@ -5760,6 +6274,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
} else {
css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
BUG_ON(css->id < 0);
+
+ BUG_ON(ss_rstat_init(ss));
+ BUG_ON(css_rstat_init(css));
}
/* Update the init_css_set to contain a subsys
@@ -5780,7 +6297,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
BUG_ON(online_css(css));
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
}
/**
@@ -5808,6 +6325,8 @@ int __init cgroup_init_early(void)
ss->id, ss->name);
WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
"cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
+ WARN(ss->early_init && ss->css_rstat_flush,
+ "cgroup rstat cannot be used with early init subsystem\n");
ss->id = i;
ss->name = cgroup_subsys_name[i];
@@ -5833,19 +6352,15 @@ int __init cgroup_init(void)
BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
- cgroup_rstat_boot();
-
- /*
- * The latency of the synchronize_rcu() is too high for cgroups,
- * avoid it at the cost of forcing all readers into the slow path.
- */
- rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
+ BUG_ON(ss_rstat_init(NULL));
get_user_ns(init_cgroup_ns.user_ns);
+ cgroup_rt_init();
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
/*
* Add init_css_set to the hash table so that dfl_root can link to
@@ -5854,9 +6369,11 @@ int __init cgroup_init(void)
hash_add(css_set_table, &init_css_set.hlist,
css_set_hash(init_css_set.subsys));
+ cgroup_bpf_lifetime_notifier_init();
+
BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
for_each_subsys(ss, ssid) {
if (ss->early_init) {
@@ -5882,8 +6399,8 @@ int __init cgroup_init(void)
continue;
if (cgroup1_ssid_disabled(ssid))
- printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
- ss->name);
+ pr_info("Disabling %s control group subsystem in v1 mounts\n",
+ ss->legacy_name);
cgrp_dfl_root.subsys_mask |= 1 << ss->id;
@@ -5908,9 +6425,9 @@ int __init cgroup_init(void)
if (ss->bind)
ss->bind(init_css_set.subsys[ssid]);
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
css_populate_dir(init_css_set.subsys[ssid]);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
}
/* init_css_set.subsys[] has been updated, re-hash */
@@ -5922,10 +6439,11 @@ int __init cgroup_init(void)
WARN_ON(register_filesystem(&cgroup_fs_type));
WARN_ON(register_filesystem(&cgroup2_fs_type));
WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
-#ifdef CONFIG_CPUSETS
+#ifdef CONFIG_CPUSETS_V1
WARN_ON(register_filesystem(&cpuset_fs_type));
#endif
+ ns_tree_add(&init_cgroup_ns);
return 0;
}
@@ -5939,8 +6457,14 @@ static int __init cgroup_wq_init(void)
* We would prefer to do this in cgroup_init() above, but that
* is called before init_workqueues(): so leave this until after.
*/
- cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
- BUG_ON(!cgroup_destroy_wq);
+ cgroup_offline_wq = alloc_workqueue("cgroup_offline", WQ_PERCPU, 1);
+ BUG_ON(!cgroup_offline_wq);
+
+ cgroup_release_wq = alloc_workqueue("cgroup_release", WQ_PERCPU, 1);
+ BUG_ON(!cgroup_release_wq);
+
+ cgroup_free_wq = alloc_workqueue("cgroup_free", WQ_PERCPU, 1);
+ BUG_ON(!cgroup_free_wq);
return 0;
}
core_initcall(cgroup_wq_init);
@@ -5957,18 +6481,24 @@ void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
}
/*
- * cgroup_get_from_id : get the cgroup associated with cgroup id
+ * __cgroup_get_from_id : get the cgroup associated with cgroup id
* @id: cgroup id
- * On success return the cgrp, on failure return NULL
+ * On success return the cgrp or ERR_PTR on failure
+ * There are no cgroup NS restrictions.
*/
-struct cgroup *cgroup_get_from_id(u64 id)
+struct cgroup *__cgroup_get_from_id(u64 id)
{
struct kernfs_node *kn;
- struct cgroup *cgrp = NULL;
+ struct cgroup *cgrp;
kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
if (!kn)
- goto out;
+ return ERR_PTR(-ENOENT);
+
+ if (kernfs_type(kn) != KERNFS_DIR) {
+ kernfs_put(kn);
+ return ERR_PTR(-ENOENT);
+ }
rcu_read_lock();
@@ -5977,9 +6507,33 @@ struct cgroup *cgroup_get_from_id(u64 id)
cgrp = NULL;
rcu_read_unlock();
-
kernfs_put(kn);
-out:
+
+ if (!cgrp)
+ return ERR_PTR(-ENOENT);
+ return cgrp;
+}
+
+/*
+ * cgroup_get_from_id : get the cgroup associated with cgroup id
+ * @id: cgroup id
+ * On success return the cgrp or ERR_PTR on failure
+ * Only cgroups within current task's cgroup NS are valid.
+ */
+struct cgroup *cgroup_get_from_id(u64 id)
+{
+ struct cgroup *cgrp, *root_cgrp;
+
+ cgrp = __cgroup_get_from_id(id);
+ if (IS_ERR(cgrp))
+ return cgrp;
+
+ root_cgrp = current_cgns_cgroup_dfl();
+ if (!cgroup_is_descendant(cgrp, root_cgrp)) {
+ cgroup_put(cgrp);
+ return ERR_PTR(-ENOENT);
+ }
+
return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_id);
@@ -6001,7 +6555,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
if (!buf)
goto out;
- mutex_lock(&cgroup_mutex);
+ rcu_read_lock();
spin_lock_irq(&css_set_lock);
for_each_root(root) {
@@ -6009,7 +6563,12 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
struct cgroup *cgrp;
int ssid, count = 0;
- if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
+ if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible))
+ continue;
+
+ cgrp = task_cgroup_from_root(tsk, root);
+ /* The root has already been unmounted. */
+ if (!cgrp)
continue;
seq_printf(m, "%d:", root->hierarchy_id);
@@ -6022,9 +6581,6 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
seq_printf(m, "%sname=%s", count ? "," : "",
root->name);
seq_putc(m, ':');
-
- cgrp = task_cgroup_from_root(tsk, root);
-
/*
* On traditional hierarchies, all zombie tasks show up as
* belonging to the root cgroup. On the default hierarchy,
@@ -6037,7 +6593,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
current->nsproxy->cgroup_ns);
- if (retval >= PATH_MAX)
+ if (retval == -E2BIG)
retval = -ENAMETOOLONG;
if (retval < 0)
goto out_unlock;
@@ -6056,7 +6612,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
retval = 0;
out_unlock:
spin_unlock_irq(&css_set_lock);
- mutex_unlock(&cgroup_mutex);
+ rcu_read_unlock();
kfree(buf);
out:
return retval;
@@ -6075,16 +6631,37 @@ void cgroup_fork(struct task_struct *child)
INIT_LIST_HEAD(&child->cg_list);
}
-static struct cgroup *cgroup_get_from_file(struct file *f)
+/**
+ * cgroup_v1v2_get_from_file - get a cgroup pointer from a file pointer
+ * @f: file corresponding to cgroup_dir
+ *
+ * Find the cgroup from a file pointer associated with a cgroup directory.
+ * Returns a pointer to the cgroup on success. ERR_PTR is returned if the
+ * cgroup cannot be found.
+ */
+static struct cgroup *cgroup_v1v2_get_from_file(struct file *f)
{
struct cgroup_subsys_state *css;
- struct cgroup *cgrp;
css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
if (IS_ERR(css))
return ERR_CAST(css);
- cgrp = css->cgroup;
+ return css->cgroup;
+}
+
+/**
+ * cgroup_get_from_file - same as cgroup_v1v2_get_from_file, but only supports
+ * cgroup2.
+ * @f: file corresponding to cgroup2_dir
+ */
+static struct cgroup *cgroup_get_from_file(struct file *f)
+{
+ struct cgroup *cgrp = cgroup_v1v2_get_from_file(f);
+
+ if (IS_ERR(cgrp))
+ return ERR_CAST(cgrp);
+
if (!cgroup_on_dfl(cgrp)) {
cgroup_put(cgrp);
return ERR_PTR(-EBADF);
@@ -6116,16 +6693,19 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
struct cgroup *dst_cgrp = NULL;
struct css_set *cset;
struct super_block *sb;
- struct file *f;
if (kargs->flags & CLONE_INTO_CGROUP)
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
cgroup_threadgroup_change_begin(current);
spin_lock_irq(&css_set_lock);
cset = task_css_set(current);
get_css_set(cset);
+ if (kargs->cgrp)
+ kargs->kill_seq = kargs->cgrp->kill_seq;
+ else
+ kargs->kill_seq = cset->dfl_cgrp->kill_seq;
spin_unlock_irq(&css_set_lock);
if (!(kargs->flags & CLONE_INTO_CGROUP)) {
@@ -6133,14 +6713,14 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
return 0;
}
- f = fget_raw(kargs->cgroup);
- if (!f) {
+ CLASS(fd_raw, f)(kargs->cgroup);
+ if (fd_empty(f)) {
ret = -EBADF;
goto err;
}
- sb = f->f_path.dentry->d_sb;
+ sb = fd_file(f)->f_path.dentry->d_sb;
- dst_cgrp = cgroup_get_from_file(f);
+ dst_cgrp = cgroup_get_from_file(fd_file(f));
if (IS_ERR(dst_cgrp)) {
ret = PTR_ERR(dst_cgrp);
dst_cgrp = NULL;
@@ -6161,6 +6741,20 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
if (ret)
goto err;
+ /*
+ * Spawning a task directly into a cgroup works by passing a file
+ * descriptor to the target cgroup directory. This can even be an O_PATH
+ * file descriptor. But it can never be a cgroup.procs file descriptor.
+ * This was done on purpose so spawning into a cgroup could be
+ * conceptualized as an atomic
+ *
+ * fd = openat(dfd_cgroup, "cgroup.procs", ...);
+ * write(fd, <child-pid>, ...);
+ *
+ * sequence, i.e. it's a shorthand for the caller opening and writing
+ * cgroup.procs of the cgroup indicated by @dfd_cgroup. This allows us
+ * to always use the caller's credentials.
+ */
ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
!(kargs->flags & CLONE_THREAD),
current->nsproxy->cgroup_ns);
@@ -6174,15 +6768,12 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
}
put_css_set(cset);
- fput(f);
kargs->cgrp = dst_cgrp;
return ret;
err:
cgroup_threadgroup_change_end(current);
- mutex_unlock(&cgroup_mutex);
- if (f)
- fput(f);
+ cgroup_unlock();
if (dst_cgrp)
cgroup_put(dst_cgrp);
put_css_set(cset);
@@ -6201,19 +6792,18 @@ err:
static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
- cgroup_threadgroup_change_end(current);
+ struct cgroup *cgrp = kargs->cgrp;
+ struct css_set *cset = kargs->cset;
- if (kargs->flags & CLONE_INTO_CGROUP) {
- struct cgroup *cgrp = kargs->cgrp;
- struct css_set *cset = kargs->cset;
-
- mutex_unlock(&cgroup_mutex);
+ cgroup_threadgroup_change_end(current);
- if (cset) {
- put_css_set(cset);
- kargs->cset = NULL;
- }
+ if (cset) {
+ put_css_set(cset);
+ kargs->cset = NULL;
+ }
+ if (kargs->flags & CLONE_INTO_CGROUP) {
+ cgroup_unlock();
if (cgrp) {
cgroup_put(cgrp);
kargs->cgrp = NULL;
@@ -6224,6 +6814,7 @@ static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
/**
* cgroup_can_fork - called on a new task before the process is exposed
* @child: the child process
+ * @kargs: the arguments passed to create the child process
*
* This prepares a new css_set for the child process which the child will
* be attached to in cgroup_post_fork().
@@ -6286,6 +6877,7 @@ void cgroup_cancel_fork(struct task_struct *child,
/**
* cgroup_post_fork - finalize cgroup setup for the child process
* @child: the child process
+ * @kargs: the arguments passed to create the child process
*
* Attach the child process to its css_set calling the subsystem fork()
* callbacks.
@@ -6294,6 +6886,7 @@ void cgroup_post_fork(struct task_struct *child,
struct kernel_clone_args *kargs)
__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
+ unsigned int cgrp_kill_seq = 0;
unsigned long cgrp_flags = 0;
bool kill = false;
struct cgroup_subsys *ss;
@@ -6307,10 +6900,13 @@ void cgroup_post_fork(struct task_struct *child,
/* init tasks are special, only link regular threads */
if (likely(child->pid)) {
- if (kargs->cgrp)
+ if (kargs->cgrp) {
cgrp_flags = kargs->cgrp->flags;
- else
+ cgrp_kill_seq = kargs->cgrp->kill_seq;
+ } else {
cgrp_flags = cset->dfl_cgrp->flags;
+ cgrp_kill_seq = cset->dfl_cgrp->kill_seq;
+ }
WARN_ON_ONCE(!list_empty(&child->cg_list));
cset->nr_tasks++;
@@ -6345,7 +6941,7 @@ void cgroup_post_fork(struct task_struct *child,
* child down right after we finished preparing it for
* userspace.
*/
- kill = test_bit(CGRP_KILL, &cgrp_flags);
+ kill = kargs->kill_seq != cgrp_kill_seq;
}
spin_unlock_irq(&css_set_lock);
@@ -6376,40 +6972,101 @@ void cgroup_post_fork(struct task_struct *child,
}
/**
- * cgroup_exit - detach cgroup from exiting task
+ * cgroup_task_exit - detach cgroup from exiting task
* @tsk: pointer to task_struct of exiting process
*
* Description: Detach cgroup from @tsk.
*
*/
-void cgroup_exit(struct task_struct *tsk)
+void cgroup_task_exit(struct task_struct *tsk)
{
struct cgroup_subsys *ss;
- struct css_set *cset;
int i;
- spin_lock_irq(&css_set_lock);
+ /* see cgroup_post_fork() for details */
+ do_each_subsys_mask(ss, i, have_exit_callback) {
+ ss->exit(tsk);
+ } while_each_subsys_mask();
+}
+
+static void do_cgroup_task_dead(struct task_struct *tsk)
+{
+ struct css_set *cset;
+ unsigned long flags;
+
+ spin_lock_irqsave(&css_set_lock, flags);
WARN_ON_ONCE(list_empty(&tsk->cg_list));
cset = task_css_set(tsk);
css_set_move_task(tsk, cset, NULL, false);
- list_add_tail(&tsk->cg_list, &cset->dying_tasks);
cset->nr_tasks--;
+ /* matches the signal->live check in css_task_iter_advance() */
+ if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live))
+ list_add_tail(&tsk->cg_list, &cset->dying_tasks);
+
+ if (dl_task(tsk))
+ dec_dl_tasks_cs(tsk);
WARN_ON_ONCE(cgroup_task_frozen(tsk));
if (unlikely(!(tsk->flags & PF_KTHREAD) &&
test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
cgroup_update_frozen(task_dfl_cgroup(tsk));
- spin_unlock_irq(&css_set_lock);
+ spin_unlock_irqrestore(&css_set_lock, flags);
+}
- /* see cgroup_post_fork() for details */
- do_each_subsys_mask(ss, i, have_exit_callback) {
- ss->exit(tsk);
- } while_each_subsys_mask();
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * cgroup_task_dead() is called from finish_task_switch() which doesn't allow
+ * scheduling even in RT. As the task_dead path requires grabbing css_set_lock,
+ * this lead to sleeping in the invalid context warning bug. css_set_lock is too
+ * big to become a raw_spinlock. The task_dead path doesn't need to run
+ * synchronously but can't be delayed indefinitely either as the dead task pins
+ * the cgroup and task_struct can be pinned indefinitely. Bounce through lazy
+ * irq_work to allow batching while ensuring timely completion.
+ */
+static DEFINE_PER_CPU(struct llist_head, cgrp_dead_tasks);
+static DEFINE_PER_CPU(struct irq_work, cgrp_dead_tasks_iwork);
+
+static void cgrp_dead_tasks_iwork_fn(struct irq_work *iwork)
+{
+ struct llist_node *lnode;
+ struct task_struct *task, *next;
+
+ lnode = llist_del_all(this_cpu_ptr(&cgrp_dead_tasks));
+ llist_for_each_entry_safe(task, next, lnode, cg_dead_lnode) {
+ do_cgroup_task_dead(task);
+ put_task_struct(task);
+ }
}
-void cgroup_release(struct task_struct *task)
+static void __init cgroup_rt_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ init_llist_head(per_cpu_ptr(&cgrp_dead_tasks, cpu));
+ per_cpu(cgrp_dead_tasks_iwork, cpu) =
+ IRQ_WORK_INIT_LAZY(cgrp_dead_tasks_iwork_fn);
+ }
+}
+
+void cgroup_task_dead(struct task_struct *task)
+{
+ get_task_struct(task);
+ llist_add(&task->cg_dead_lnode, this_cpu_ptr(&cgrp_dead_tasks));
+ irq_work_queue(this_cpu_ptr(&cgrp_dead_tasks_iwork));
+}
+#else /* CONFIG_PREEMPT_RT */
+static void __init cgroup_rt_init(void) {}
+
+void cgroup_task_dead(struct task_struct *task)
+{
+ do_cgroup_task_dead(task);
+}
+#endif /* CONFIG_PREEMPT_RT */
+
+void cgroup_task_release(struct task_struct *task)
{
struct cgroup_subsys *ss;
int ssid;
@@ -6417,16 +7074,19 @@ void cgroup_release(struct task_struct *task)
do_each_subsys_mask(ss, ssid, have_release_callback) {
ss->release(task);
} while_each_subsys_mask();
-
- spin_lock_irq(&css_set_lock);
- css_set_skip_task_iters(task_css_set(task), task);
- list_del_init(&task->cg_list);
- spin_unlock_irq(&css_set_lock);
}
-void cgroup_free(struct task_struct *task)
+void cgroup_task_free(struct task_struct *task)
{
struct css_set *cset = task_css_set(task);
+
+ if (!list_empty(&task->cg_list)) {
+ spin_lock_irq(&css_set_lock);
+ css_set_skip_task_iters(task_css_set(task), task);
+ list_del_init(&task->cg_list);
+ spin_unlock_irq(&css_set_lock);
+ }
+
put_css_set(cset);
}
@@ -6473,6 +7133,12 @@ static int __init enable_cgroup_debug(char *str)
}
__setup("cgroup_debug", enable_cgroup_debug);
+static int __init cgroup_favordynmods_setup(char *str)
+{
+ return (kstrtobool(str, &have_favordynmods) == 0);
+}
+__setup("cgroup_favordynmods=", cgroup_favordynmods_setup);
+
/**
* css_tryget_online_from_dir - get corresponding css from a cgroup dentry
* @dentry: directory dentry of interest
@@ -6540,8 +7206,10 @@ struct cgroup *cgroup_get_from_path(const char *path)
{
struct kernfs_node *kn;
struct cgroup *cgrp = ERR_PTR(-ENOENT);
+ struct cgroup *root_cgrp;
- kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
+ root_cgrp = current_cgns_cgroup_dfl();
+ kn = kernfs_walk_and_get(root_cgrp->kn, path);
if (!kn)
goto out;
@@ -6566,25 +7234,39 @@ out:
EXPORT_SYMBOL_GPL(cgroup_get_from_path);
/**
- * cgroup_get_from_fd - get a cgroup pointer from a fd
- * @fd: fd obtained by open(cgroup2_dir)
+ * cgroup_v1v2_get_from_fd - get a cgroup pointer from a fd
+ * @fd: fd obtained by open(cgroup_dir)
*
* Find the cgroup from a fd which should be obtained
* by opening a cgroup directory. Returns a pointer to the
* cgroup on success. ERR_PTR is returned if the cgroup
* cannot be found.
*/
+struct cgroup *cgroup_v1v2_get_from_fd(int fd)
+{
+ CLASS(fd_raw, f)(fd);
+ if (fd_empty(f))
+ return ERR_PTR(-EBADF);
+
+ return cgroup_v1v2_get_from_file(fd_file(f));
+}
+
+/**
+ * cgroup_get_from_fd - same as cgroup_v1v2_get_from_fd, but only supports
+ * cgroup2.
+ * @fd: fd obtained by open(cgroup2_dir)
+ */
struct cgroup *cgroup_get_from_fd(int fd)
{
- struct cgroup *cgrp;
- struct file *f;
+ struct cgroup *cgrp = cgroup_v1v2_get_from_fd(fd);
- f = fget_raw(fd);
- if (!f)
- return ERR_PTR(-EBADF);
+ if (IS_ERR(cgrp))
+ return ERR_CAST(cgrp);
- cgrp = cgroup_get_from_file(f);
- fput(f);
+ if (!cgroup_on_dfl(cgrp)) {
+ cgroup_put(cgrp);
+ return ERR_PTR(-EBADF);
+ }
return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
@@ -6699,9 +7381,6 @@ static ssize_t show_delegatable_files(struct cftype *files, char *buf,
if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
continue;
- if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
- continue;
-
if (prefix)
ret += snprintf(buf + ret, size - ret, "%s.", prefix);
@@ -6721,8 +7400,11 @@ static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
int ssid;
ssize_t ret = 0;
- ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret,
- NULL);
+ ret = show_delegatable_files(cgroup_base_files, buf + ret,
+ PAGE_SIZE - ret, NULL);
+ if (cgroup_psi_enabled())
+ ret += show_delegatable_files(cgroup_psi_files, buf + ret,
+ PAGE_SIZE - ret, NULL);
for_each_subsys(ss, ssid)
ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
@@ -6738,8 +7420,11 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
{
return snprintf(buf, PAGE_SIZE,
"nsdelegate\n"
+ "favordynmods\n"
"memory_localevents\n"
- "memory_recursiveprot\n");
+ "memory_recursiveprot\n"
+ "memory_hugetlb_accounting\n"
+ "pids_localevents\n");
}
static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
new file mode 100644
index 000000000000..01976c8e7d49
--- /dev/null
+++ b/kernel/cgroup/cpuset-internal.h
@@ -0,0 +1,308 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __CPUSET_INTERNAL_H
+#define __CPUSET_INTERNAL_H
+
+#include <linux/cgroup.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/cpuset.h>
+#include <linux/spinlock.h>
+#include <linux/union_find.h>
+
+/* See "Frequency meter" comments, below. */
+
+struct fmeter {
+ int cnt; /* unprocessed events count */
+ int val; /* most recent output value */
+ time64_t time; /* clock (secs) when val computed */
+ spinlock_t lock; /* guards read or write of above */
+};
+
+/*
+ * Invalid partition error code
+ */
+enum prs_errcode {
+ PERR_NONE = 0,
+ PERR_INVCPUS,
+ PERR_INVPARENT,
+ PERR_NOTPART,
+ PERR_NOTEXCL,
+ PERR_NOCPUS,
+ PERR_HOTPLUG,
+ PERR_CPUSEMPTY,
+ PERR_HKEEPING,
+ PERR_ACCESS,
+ PERR_REMOTE,
+};
+
+/* bits in struct cpuset flags field */
+typedef enum {
+ CS_CPU_EXCLUSIVE,
+ CS_MEM_EXCLUSIVE,
+ CS_MEM_HARDWALL,
+ CS_MEMORY_MIGRATE,
+ CS_SCHED_LOAD_BALANCE,
+ CS_SPREAD_PAGE,
+ CS_SPREAD_SLAB,
+} cpuset_flagbits_t;
+
+/* The various types of files and directories in a cpuset file system */
+
+typedef enum {
+ FILE_MEMORY_MIGRATE,
+ FILE_CPULIST,
+ FILE_MEMLIST,
+ FILE_EFFECTIVE_CPULIST,
+ FILE_EFFECTIVE_MEMLIST,
+ FILE_SUBPARTS_CPULIST,
+ FILE_EXCLUSIVE_CPULIST,
+ FILE_EFFECTIVE_XCPULIST,
+ FILE_ISOLATED_CPULIST,
+ FILE_CPU_EXCLUSIVE,
+ FILE_MEM_EXCLUSIVE,
+ FILE_MEM_HARDWALL,
+ FILE_SCHED_LOAD_BALANCE,
+ FILE_PARTITION_ROOT,
+ FILE_SCHED_RELAX_DOMAIN_LEVEL,
+ FILE_MEMORY_PRESSURE_ENABLED,
+ FILE_MEMORY_PRESSURE,
+ FILE_SPREAD_PAGE,
+ FILE_SPREAD_SLAB,
+} cpuset_filetype_t;
+
+struct cpuset {
+ struct cgroup_subsys_state css;
+
+ unsigned long flags; /* "unsigned long" so bitops work */
+
+ /*
+ * On default hierarchy:
+ *
+ * The user-configured masks can only be changed by writing to
+ * cpuset.cpus and cpuset.mems, and won't be limited by the
+ * parent masks.
+ *
+ * The effective masks is the real masks that apply to the tasks
+ * in the cpuset. They may be changed if the configured masks are
+ * changed or hotplug happens.
+ *
+ * effective_mask == configured_mask & parent's effective_mask,
+ * and if it ends up empty, it will inherit the parent's mask.
+ *
+ *
+ * On legacy hierarchy:
+ *
+ * The user-configured masks are always the same with effective masks.
+ */
+
+ /* user-configured CPUs and Memory Nodes allow to tasks */
+ cpumask_var_t cpus_allowed;
+ nodemask_t mems_allowed;
+
+ /* effective CPUs and Memory Nodes allow to tasks */
+ cpumask_var_t effective_cpus;
+ nodemask_t effective_mems;
+
+ /*
+ * Exclusive CPUs dedicated to current cgroup (default hierarchy only)
+ *
+ * The effective_cpus of a valid partition root comes solely from its
+ * effective_xcpus and some of the effective_xcpus may be distributed
+ * to sub-partitions below & hence excluded from its effective_cpus.
+ * For a valid partition root, its effective_cpus have no relationship
+ * with cpus_allowed unless its exclusive_cpus isn't set.
+ *
+ * This value will only be set if either exclusive_cpus is set or
+ * when this cpuset becomes a local partition root.
+ */
+ cpumask_var_t effective_xcpus;
+
+ /*
+ * Exclusive CPUs as requested by the user (default hierarchy only)
+ *
+ * Its value is independent of cpus_allowed and designates the set of
+ * CPUs that can be granted to the current cpuset or its children when
+ * it becomes a valid partition root. The effective set of exclusive
+ * CPUs granted (effective_xcpus) depends on whether those exclusive
+ * CPUs are passed down by its ancestors and not yet taken up by
+ * another sibling partition root along the way.
+ *
+ * If its value isn't set, it defaults to cpus_allowed.
+ */
+ cpumask_var_t exclusive_cpus;
+
+ /*
+ * This is old Memory Nodes tasks took on.
+ *
+ * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
+ * - A new cpuset's old_mems_allowed is initialized when some
+ * task is moved into it.
+ * - old_mems_allowed is used in cpuset_migrate_mm() when we change
+ * cpuset.mems_allowed and have tasks' nodemask updated, and
+ * then old_mems_allowed is updated to mems_allowed.
+ */
+ nodemask_t old_mems_allowed;
+
+ struct fmeter fmeter; /* memory_pressure filter */
+
+ /*
+ * Tasks are being attached to this cpuset. Used to prevent
+ * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
+ */
+ int attach_in_progress;
+
+ /* for custom sched domain */
+ int relax_domain_level;
+
+ /* partition root state */
+ int partition_root_state;
+
+ /*
+ * Whether cpuset is a remote partition.
+ * It used to be a list anchoring all remote partitions — we can switch back
+ * to a list if we need to iterate over the remote partitions.
+ */
+ bool remote_partition;
+
+ /*
+ * number of SCHED_DEADLINE tasks attached to this cpuset, so that we
+ * know when to rebuild associated root domain bandwidth information.
+ */
+ int nr_deadline_tasks;
+ int nr_migrate_dl_tasks;
+ u64 sum_migrate_dl_bw;
+
+ /* Invalid partition error code, not lock protected */
+ enum prs_errcode prs_err;
+
+ /* Handle for cpuset.cpus.partition */
+ struct cgroup_file partition_file;
+
+ /* Used to merge intersecting subsets for generate_sched_domains */
+ struct uf_node node;
+};
+
+static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct cpuset, css) : NULL;
+}
+
+/* Retrieve the cpuset for a task */
+static inline struct cpuset *task_cs(struct task_struct *task)
+{
+ return css_cs(task_css(task, cpuset_cgrp_id));
+}
+
+static inline struct cpuset *parent_cs(struct cpuset *cs)
+{
+ return css_cs(cs->css.parent);
+}
+
+/* convenient tests for these bits */
+static inline bool is_cpuset_online(struct cpuset *cs)
+{
+ return css_is_online(&cs->css) && !css_is_dying(&cs->css);
+}
+
+static inline int is_cpu_exclusive(const struct cpuset *cs)
+{
+ return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
+}
+
+static inline int is_mem_exclusive(const struct cpuset *cs)
+{
+ return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
+}
+
+static inline int is_mem_hardwall(const struct cpuset *cs)
+{
+ return test_bit(CS_MEM_HARDWALL, &cs->flags);
+}
+
+static inline int is_sched_load_balance(const struct cpuset *cs)
+{
+ return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+}
+
+static inline int is_memory_migrate(const struct cpuset *cs)
+{
+ return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
+}
+
+static inline int is_spread_page(const struct cpuset *cs)
+{
+ return test_bit(CS_SPREAD_PAGE, &cs->flags);
+}
+
+static inline int is_spread_slab(const struct cpuset *cs)
+{
+ return test_bit(CS_SPREAD_SLAB, &cs->flags);
+}
+
+/**
+ * cpuset_for_each_child - traverse online children of a cpuset
+ * @child_cs: loop cursor pointing to the current child
+ * @pos_css: used for iteration
+ * @parent_cs: target cpuset to walk children of
+ *
+ * Walk @child_cs through the online children of @parent_cs. Must be used
+ * with RCU read locked.
+ */
+#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \
+ css_for_each_child((pos_css), &(parent_cs)->css) \
+ if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
+
+/**
+ * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
+ * @des_cs: loop cursor pointing to the current descendant
+ * @pos_css: used for iteration
+ * @root_cs: target cpuset to walk ancestor of
+ *
+ * Walk @des_cs through the online descendants of @root_cs. Must be used
+ * with RCU read locked. The caller may modify @pos_css by calling
+ * css_rightmost_descendant() to skip subtree. @root_cs is included in the
+ * iteration and the first node to be visited.
+ */
+#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \
+ css_for_each_descendant_pre((pos_css), &(root_cs)->css) \
+ if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
+
+void rebuild_sched_domains_locked(void);
+void cpuset_callback_lock_irq(void);
+void cpuset_callback_unlock_irq(void);
+void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus);
+void cpuset_update_tasks_nodemask(struct cpuset *cs);
+int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on);
+ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off);
+int cpuset_common_seq_show(struct seq_file *sf, void *v);
+void cpuset_full_lock(void);
+void cpuset_full_unlock(void);
+
+/*
+ * cpuset-v1.c
+ */
+#ifdef CONFIG_CPUSETS_V1
+extern struct cftype cpuset1_files[];
+void fmeter_init(struct fmeter *fmp);
+void cpuset1_update_task_spread_flags(struct cpuset *cs,
+ struct task_struct *tsk);
+void cpuset1_update_tasks_flags(struct cpuset *cs);
+void cpuset1_hotplug_update_tasks(struct cpuset *cs,
+ struct cpumask *new_cpus, nodemask_t *new_mems,
+ bool cpus_updated, bool mems_updated);
+int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial);
+#else
+static inline void fmeter_init(struct fmeter *fmp) {}
+static inline void cpuset1_update_task_spread_flags(struct cpuset *cs,
+ struct task_struct *tsk) {}
+static inline void cpuset1_update_tasks_flags(struct cpuset *cs) {}
+static inline void cpuset1_hotplug_update_tasks(struct cpuset *cs,
+ struct cpumask *new_cpus, nodemask_t *new_mems,
+ bool cpus_updated, bool mems_updated) {}
+static inline int cpuset1_validate_change(struct cpuset *cur,
+ struct cpuset *trial) { return 0; }
+#endif /* CONFIG_CPUSETS_V1 */
+
+#endif /* __CPUSET_INTERNAL_H */
diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c
new file mode 100644
index 000000000000..12e76774c75b
--- /dev/null
+++ b/kernel/cgroup/cpuset-v1.c
@@ -0,0 +1,607 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "cgroup-internal.h"
+#include "cpuset-internal.h"
+
+/*
+ * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchrously
+ */
+struct cpuset_remove_tasks_struct {
+ struct work_struct work;
+ struct cpuset *cs;
+};
+
+/*
+ * Frequency meter - How fast is some event occurring?
+ *
+ * These routines manage a digitally filtered, constant time based,
+ * event frequency meter. There are four routines:
+ * fmeter_init() - initialize a frequency meter.
+ * fmeter_markevent() - called each time the event happens.
+ * fmeter_getrate() - returns the recent rate of such events.
+ * fmeter_update() - internal routine used to update fmeter.
+ *
+ * A common data structure is passed to each of these routines,
+ * which is used to keep track of the state required to manage the
+ * frequency meter and its digital filter.
+ *
+ * The filter works on the number of events marked per unit time.
+ * The filter is single-pole low-pass recursive (IIR). The time unit
+ * is 1 second. Arithmetic is done using 32-bit integers scaled to
+ * simulate 3 decimal digits of precision (multiplied by 1000).
+ *
+ * With an FM_COEF of 933, and a time base of 1 second, the filter
+ * has a half-life of 10 seconds, meaning that if the events quit
+ * happening, then the rate returned from the fmeter_getrate()
+ * will be cut in half each 10 seconds, until it converges to zero.
+ *
+ * It is not worth doing a real infinitely recursive filter. If more
+ * than FM_MAXTICKS ticks have elapsed since the last filter event,
+ * just compute FM_MAXTICKS ticks worth, by which point the level
+ * will be stable.
+ *
+ * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
+ * arithmetic overflow in the fmeter_update() routine.
+ *
+ * Given the simple 32 bit integer arithmetic used, this meter works
+ * best for reporting rates between one per millisecond (msec) and
+ * one per 32 (approx) seconds. At constant rates faster than one
+ * per msec it maxes out at values just under 1,000,000. At constant
+ * rates between one per msec, and one per second it will stabilize
+ * to a value N*1000, where N is the rate of events per second.
+ * At constant rates between one per second and one per 32 seconds,
+ * it will be choppy, moving up on the seconds that have an event,
+ * and then decaying until the next event. At rates slower than
+ * about one in 32 seconds, it decays all the way back to zero between
+ * each event.
+ */
+
+#define FM_COEF 933 /* coefficient for half-life of 10 secs */
+#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */
+#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */
+#define FM_SCALE 1000 /* faux fixed point scale */
+
+/* Initialize a frequency meter */
+void fmeter_init(struct fmeter *fmp)
+{
+ fmp->cnt = 0;
+ fmp->val = 0;
+ fmp->time = 0;
+ spin_lock_init(&fmp->lock);
+}
+
+/* Internal meter update - process cnt events and update value */
+static void fmeter_update(struct fmeter *fmp)
+{
+ time64_t now;
+ u32 ticks;
+
+ now = ktime_get_seconds();
+ ticks = now - fmp->time;
+
+ if (ticks == 0)
+ return;
+
+ ticks = min(FM_MAXTICKS, ticks);
+ while (ticks-- > 0)
+ fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
+ fmp->time = now;
+
+ fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
+ fmp->cnt = 0;
+}
+
+/* Process any previous ticks, then bump cnt by one (times scale). */
+static void fmeter_markevent(struct fmeter *fmp)
+{
+ spin_lock(&fmp->lock);
+ fmeter_update(fmp);
+ fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
+ spin_unlock(&fmp->lock);
+}
+
+/* Process any previous ticks, then return current value. */
+static int fmeter_getrate(struct fmeter *fmp)
+{
+ int val;
+
+ spin_lock(&fmp->lock);
+ fmeter_update(fmp);
+ val = fmp->val;
+ spin_unlock(&fmp->lock);
+ return val;
+}
+
+/*
+ * Collection of memory_pressure is suppressed unless
+ * this flag is enabled by writing "1" to the special
+ * cpuset file 'memory_pressure_enabled' in the root cpuset.
+ */
+
+int cpuset_memory_pressure_enabled __read_mostly;
+
+/*
+ * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
+ *
+ * Keep a running average of the rate of synchronous (direct)
+ * page reclaim efforts initiated by tasks in each cpuset.
+ *
+ * This represents the rate at which some task in the cpuset
+ * ran low on memory on all nodes it was allowed to use, and
+ * had to enter the kernels page reclaim code in an effort to
+ * create more free memory by tossing clean pages or swapping
+ * or writing dirty pages.
+ *
+ * Display to user space in the per-cpuset read-only file
+ * "memory_pressure". Value displayed is an integer
+ * representing the recent rate of entry into the synchronous
+ * (direct) page reclaim by any task attached to the cpuset.
+ */
+
+void __cpuset_memory_pressure_bump(void)
+{
+ rcu_read_lock();
+ fmeter_markevent(&task_cs(current)->fmeter);
+ rcu_read_unlock();
+}
+
+static int update_relax_domain_level(struct cpuset *cs, s64 val)
+{
+#ifdef CONFIG_SMP
+ if (val < -1 || val > sched_domain_level_max + 1)
+ return -EINVAL;
+#endif
+
+ if (val != cs->relax_domain_level) {
+ cs->relax_domain_level = val;
+ if (!cpumask_empty(cs->cpus_allowed) &&
+ is_sched_load_balance(cs))
+ rebuild_sched_domains_locked();
+ }
+
+ return 0;
+}
+
+static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
+ s64 val)
+{
+ struct cpuset *cs = css_cs(css);
+ cpuset_filetype_t type = cft->private;
+ int retval = -ENODEV;
+
+ cpuset_full_lock();
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
+
+ switch (type) {
+ case FILE_SCHED_RELAX_DOMAIN_LEVEL:
+ pr_info_once("cpuset.%s is deprecated\n", cft->name);
+ retval = update_relax_domain_level(cs, val);
+ break;
+ default:
+ retval = -EINVAL;
+ break;
+ }
+out_unlock:
+ cpuset_full_unlock();
+ return retval;
+}
+
+static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ struct cpuset *cs = css_cs(css);
+ cpuset_filetype_t type = cft->private;
+
+ switch (type) {
+ case FILE_SCHED_RELAX_DOMAIN_LEVEL:
+ return cs->relax_domain_level;
+ default:
+ BUG();
+ }
+
+ /* Unreachable but makes gcc happy */
+ return 0;
+}
+
+/*
+ * update task's spread flag if cpuset's page/slab spread flag is set
+ *
+ * Call with callback_lock or cpuset_mutex held. The check can be skipped
+ * if on default hierarchy.
+ */
+void cpuset1_update_task_spread_flags(struct cpuset *cs,
+ struct task_struct *tsk)
+{
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+ return;
+
+ if (is_spread_page(cs))
+ task_set_spread_page(tsk);
+ else
+ task_clear_spread_page(tsk);
+
+ if (is_spread_slab(cs))
+ task_set_spread_slab(tsk);
+ else
+ task_clear_spread_slab(tsk);
+}
+
+/**
+ * cpuset1_update_tasks_flags - update the spread flags of tasks in the cpuset.
+ * @cs: the cpuset in which each task's spread flags needs to be changed
+ *
+ * Iterate through each task of @cs updating its spread flags. As this
+ * function is called with cpuset_mutex held, cpuset membership stays
+ * stable.
+ */
+void cpuset1_update_tasks_flags(struct cpuset *cs)
+{
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ css_task_iter_start(&cs->css, 0, &it);
+ while ((task = css_task_iter_next(&it)))
+ cpuset1_update_task_spread_flags(cs, task);
+ css_task_iter_end(&it);
+}
+
+/*
+ * If CPU and/or memory hotplug handlers, below, unplug any CPUs
+ * or memory nodes, we need to walk over the cpuset hierarchy,
+ * removing that CPU or node from all cpusets. If this removes the
+ * last CPU or node from a cpuset, then move the tasks in the empty
+ * cpuset to its next-highest non-empty parent.
+ */
+static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
+{
+ struct cpuset *parent;
+
+ /*
+ * Find its next-highest non-empty parent, (top cpuset
+ * has online cpus, so can't be empty).
+ */
+ parent = parent_cs(cs);
+ while (cpumask_empty(parent->cpus_allowed) ||
+ nodes_empty(parent->mems_allowed))
+ parent = parent_cs(parent);
+
+ if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
+ pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
+ pr_cont_cgroup_name(cs->css.cgroup);
+ pr_cont("\n");
+ }
+}
+
+static void cpuset_migrate_tasks_workfn(struct work_struct *work)
+{
+ struct cpuset_remove_tasks_struct *s;
+
+ s = container_of(work, struct cpuset_remove_tasks_struct, work);
+ remove_tasks_in_empty_cpuset(s->cs);
+ css_put(&s->cs->css);
+ kfree(s);
+}
+
+void cpuset1_hotplug_update_tasks(struct cpuset *cs,
+ struct cpumask *new_cpus, nodemask_t *new_mems,
+ bool cpus_updated, bool mems_updated)
+{
+ bool is_empty;
+
+ cpuset_callback_lock_irq();
+ cpumask_copy(cs->cpus_allowed, new_cpus);
+ cpumask_copy(cs->effective_cpus, new_cpus);
+ cs->mems_allowed = *new_mems;
+ cs->effective_mems = *new_mems;
+ cpuset_callback_unlock_irq();
+
+ /*
+ * Don't call cpuset_update_tasks_cpumask() if the cpuset becomes empty,
+ * as the tasks will be migrated to an ancestor.
+ */
+ if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
+ cpuset_update_tasks_cpumask(cs, new_cpus);
+ if (mems_updated && !nodes_empty(cs->mems_allowed))
+ cpuset_update_tasks_nodemask(cs);
+
+ is_empty = cpumask_empty(cs->cpus_allowed) ||
+ nodes_empty(cs->mems_allowed);
+
+ /*
+ * Move tasks to the nearest ancestor with execution resources,
+ * This is full cgroup operation which will also call back into
+ * cpuset. Execute it asynchronously using workqueue.
+ */
+ if (is_empty && cs->css.cgroup->nr_populated_csets &&
+ css_tryget_online(&cs->css)) {
+ struct cpuset_remove_tasks_struct *s;
+
+ s = kzalloc(sizeof(*s), GFP_KERNEL);
+ if (WARN_ON_ONCE(!s)) {
+ css_put(&cs->css);
+ return;
+ }
+
+ s->cs = cs;
+ INIT_WORK(&s->work, cpuset_migrate_tasks_workfn);
+ schedule_work(&s->work);
+ }
+}
+
+/*
+ * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
+ *
+ * One cpuset is a subset of another if all its allowed CPUs and
+ * Memory Nodes are a subset of the other, and its exclusive flags
+ * are only set if the other's are set. Call holding cpuset_mutex.
+ */
+
+static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
+{
+ return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
+ nodes_subset(p->mems_allowed, q->mems_allowed) &&
+ is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
+ is_mem_exclusive(p) <= is_mem_exclusive(q);
+}
+
+/*
+ * cpuset1_validate_change() - Validate conditions specific to legacy (v1)
+ * behavior.
+ */
+int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial)
+{
+ struct cgroup_subsys_state *css;
+ struct cpuset *c, *par;
+ int ret;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ /* Each of our child cpusets must be a subset of us */
+ ret = -EBUSY;
+ cpuset_for_each_child(c, css, cur)
+ if (!is_cpuset_subset(c, trial))
+ goto out;
+
+ /* On legacy hierarchy, we must be a subset of our parent cpuset. */
+ ret = -EACCES;
+ par = parent_cs(cur);
+ if (par && !is_cpuset_subset(trial, par))
+ goto out;
+
+ ret = 0;
+out:
+ return ret;
+}
+
+#ifdef CONFIG_PROC_PID_CPUSET
+/*
+ * proc_cpuset_show()
+ * - Print tasks cpuset path into seq_file.
+ * - Used for /proc/<pid>/cpuset.
+ */
+int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *tsk)
+{
+ char *buf;
+ struct cgroup_subsys_state *css;
+ int retval;
+
+ retval = -ENOMEM;
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!buf)
+ goto out;
+
+ rcu_read_lock();
+ spin_lock_irq(&css_set_lock);
+ css = task_css(tsk, cpuset_cgrp_id);
+ retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX,
+ current->nsproxy->cgroup_ns);
+ spin_unlock_irq(&css_set_lock);
+ rcu_read_unlock();
+
+ if (retval == -E2BIG)
+ retval = -ENAMETOOLONG;
+ if (retval < 0)
+ goto out_free;
+ seq_puts(m, buf);
+ seq_putc(m, '\n');
+ retval = 0;
+out_free:
+ kfree(buf);
+out:
+ return retval;
+}
+#endif /* CONFIG_PROC_PID_CPUSET */
+
+static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ struct cpuset *cs = css_cs(css);
+ cpuset_filetype_t type = cft->private;
+
+ switch (type) {
+ case FILE_CPU_EXCLUSIVE:
+ return is_cpu_exclusive(cs);
+ case FILE_MEM_EXCLUSIVE:
+ return is_mem_exclusive(cs);
+ case FILE_MEM_HARDWALL:
+ return is_mem_hardwall(cs);
+ case FILE_SCHED_LOAD_BALANCE:
+ return is_sched_load_balance(cs);
+ case FILE_MEMORY_MIGRATE:
+ return is_memory_migrate(cs);
+ case FILE_MEMORY_PRESSURE_ENABLED:
+ return cpuset_memory_pressure_enabled;
+ case FILE_MEMORY_PRESSURE:
+ return fmeter_getrate(&cs->fmeter);
+ case FILE_SPREAD_PAGE:
+ return is_spread_page(cs);
+ case FILE_SPREAD_SLAB:
+ return is_spread_slab(cs);
+ default:
+ BUG();
+ }
+
+ /* Unreachable but makes gcc happy */
+ return 0;
+}
+
+static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
+ u64 val)
+{
+ struct cpuset *cs = css_cs(css);
+ cpuset_filetype_t type = cft->private;
+ int retval = 0;
+
+ cpuset_full_lock();
+ if (!is_cpuset_online(cs)) {
+ retval = -ENODEV;
+ goto out_unlock;
+ }
+
+ switch (type) {
+ case FILE_CPU_EXCLUSIVE:
+ retval = cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, val);
+ break;
+ case FILE_MEM_EXCLUSIVE:
+ pr_info_once("cpuset.%s is deprecated\n", cft->name);
+ retval = cpuset_update_flag(CS_MEM_EXCLUSIVE, cs, val);
+ break;
+ case FILE_MEM_HARDWALL:
+ pr_info_once("cpuset.%s is deprecated\n", cft->name);
+ retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val);
+ break;
+ case FILE_SCHED_LOAD_BALANCE:
+ pr_info_once("cpuset.%s is deprecated, use cpuset.cpus.partition instead\n", cft->name);
+ retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
+ break;
+ case FILE_MEMORY_MIGRATE:
+ pr_info_once("cpuset.%s is deprecated\n", cft->name);
+ retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val);
+ break;
+ case FILE_MEMORY_PRESSURE_ENABLED:
+ pr_info_once("cpuset.%s is deprecated, use memory.pressure with CONFIG_PSI instead\n", cft->name);
+ cpuset_memory_pressure_enabled = !!val;
+ break;
+ case FILE_SPREAD_PAGE:
+ pr_info_once("cpuset.%s is deprecated\n", cft->name);
+ retval = cpuset_update_flag(CS_SPREAD_PAGE, cs, val);
+ break;
+ case FILE_SPREAD_SLAB:
+ pr_warn_once("cpuset.%s is deprecated\n", cft->name);
+ retval = cpuset_update_flag(CS_SPREAD_SLAB, cs, val);
+ break;
+ default:
+ retval = -EINVAL;
+ break;
+ }
+out_unlock:
+ cpuset_full_unlock();
+ return retval;
+}
+
+/*
+ * for the common functions, 'private' gives the type of file
+ */
+
+struct cftype cpuset1_files[] = {
+ {
+ .name = "cpus",
+ .seq_show = cpuset_common_seq_show,
+ .write = cpuset_write_resmask,
+ .max_write_len = (100U + 6 * NR_CPUS),
+ .private = FILE_CPULIST,
+ },
+
+ {
+ .name = "mems",
+ .seq_show = cpuset_common_seq_show,
+ .write = cpuset_write_resmask,
+ .max_write_len = (100U + 6 * MAX_NUMNODES),
+ .private = FILE_MEMLIST,
+ },
+
+ {
+ .name = "effective_cpus",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_CPULIST,
+ },
+
+ {
+ .name = "effective_mems",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_MEMLIST,
+ },
+
+ {
+ .name = "cpu_exclusive",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_CPU_EXCLUSIVE,
+ },
+
+ {
+ .name = "mem_exclusive",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_MEM_EXCLUSIVE,
+ },
+
+ {
+ .name = "mem_hardwall",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_MEM_HARDWALL,
+ },
+
+ {
+ .name = "sched_load_balance",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_SCHED_LOAD_BALANCE,
+ },
+
+ {
+ .name = "sched_relax_domain_level",
+ .read_s64 = cpuset_read_s64,
+ .write_s64 = cpuset_write_s64,
+ .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
+ },
+
+ {
+ .name = "memory_migrate",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_MEMORY_MIGRATE,
+ },
+
+ {
+ .name = "memory_pressure",
+ .read_u64 = cpuset_read_u64,
+ .private = FILE_MEMORY_PRESSURE,
+ },
+
+ {
+ .name = "memory_spread_page",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_SPREAD_PAGE,
+ },
+
+ {
+ /* obsolete, may be removed in the future */
+ .name = "memory_spread_slab",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_SPREAD_SLAB,
+ },
+
+ {
+ .name = "memory_pressure_enabled",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_MEMORY_PRESSURE_ENABLED,
+ },
+
+ { } /* terminate */
+};
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index dc653ab26e50..6e6eb09b8db6 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -21,174 +21,120 @@
* License. See the file COPYING in the main directory of the Linux
* distribution for more details.
*/
+#include "cpuset-internal.h"
-#include <linux/cpu.h>
-#include <linux/cpumask.h>
-#include <linux/cpuset.h>
-#include <linux/err.h>
-#include <linux/errno.h>
-#include <linux/file.h>
-#include <linux/fs.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
-#include <linux/kmod.h>
-#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/export.h>
-#include <linux/mount.h>
-#include <linux/fs_context.h>
-#include <linux/namei.h>
-#include <linux/pagemap.h>
-#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/deadline.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
-#include <linux/seq_file.h>
#include <linux/security.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/stat.h>
-#include <linux/string.h>
-#include <linux/time.h>
-#include <linux/time64.h>
-#include <linux/backing-dev.h>
-#include <linux/sort.h>
#include <linux/oom.h>
#include <linux/sched/isolation.h>
-#include <linux/uaccess.h>
-#include <linux/atomic.h>
-#include <linux/mutex.h>
-#include <linux/cgroup.h>
#include <linux/wait.h>
+#include <linux/workqueue.h>
+#include <linux/task_work.h>
DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
/*
* There could be abnormal cpuset configurations for cpu or memory
- * node binding, add this key to provide a quick low-cost judgement
+ * node binding, add this key to provide a quick low-cost judgment
* of the situation.
*/
DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);
-/* See "Frequency meter" comments, below. */
-
-struct fmeter {
- int cnt; /* unprocessed events count */
- int val; /* most recent output value */
- time64_t time; /* clock (secs) when val computed */
- spinlock_t lock; /* guards read or write of above */
+static const char * const perr_strings[] = {
+ [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus.exclusive",
+ [PERR_INVPARENT] = "Parent is an invalid partition root",
+ [PERR_NOTPART] = "Parent is not a partition root",
+ [PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive",
+ [PERR_NOCPUS] = "Parent unable to distribute cpu downstream",
+ [PERR_HOTPLUG] = "No cpu available due to hotplug",
+ [PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty",
+ [PERR_HKEEPING] = "partition config conflicts with housekeeping setup",
+ [PERR_ACCESS] = "Enable partition not permitted",
+ [PERR_REMOTE] = "Have remote partition underneath",
};
-struct cpuset {
- struct cgroup_subsys_state css;
-
- unsigned long flags; /* "unsigned long" so bitops work */
-
- /*
- * On default hierarchy:
- *
- * The user-configured masks can only be changed by writing to
- * cpuset.cpus and cpuset.mems, and won't be limited by the
- * parent masks.
- *
- * The effective masks is the real masks that apply to the tasks
- * in the cpuset. They may be changed if the configured masks are
- * changed or hotplug happens.
- *
- * effective_mask == configured_mask & parent's effective_mask,
- * and if it ends up empty, it will inherit the parent's mask.
- *
- *
- * On legacy hierarchy:
- *
- * The user-configured masks are always the same with effective masks.
- */
-
- /* user-configured CPUs and Memory Nodes allow to tasks */
- cpumask_var_t cpus_allowed;
- nodemask_t mems_allowed;
-
- /* effective CPUs and Memory Nodes allow to tasks */
- cpumask_var_t effective_cpus;
- nodemask_t effective_mems;
-
- /*
- * CPUs allocated to child sub-partitions (default hierarchy only)
- * - CPUs granted by the parent = effective_cpus U subparts_cpus
- * - effective_cpus and subparts_cpus are mutually exclusive.
- *
- * effective_cpus contains only onlined CPUs, but subparts_cpus
- * may have offlined ones.
- */
- cpumask_var_t subparts_cpus;
-
- /*
- * This is old Memory Nodes tasks took on.
- *
- * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
- * - A new cpuset's old_mems_allowed is initialized when some
- * task is moved into it.
- * - old_mems_allowed is used in cpuset_migrate_mm() when we change
- * cpuset.mems_allowed and have tasks' nodemask updated, and
- * then old_mems_allowed is updated to mems_allowed.
- */
- nodemask_t old_mems_allowed;
-
- struct fmeter fmeter; /* memory_pressure filter */
-
- /*
- * Tasks are being attached to this cpuset. Used to prevent
- * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
- */
- int attach_in_progress;
-
- /* partition number for rebuild_sched_domains() */
- int pn;
-
- /* for custom sched domain */
- int relax_domain_level;
+/*
+ * For local partitions, update to subpartitions_cpus & isolated_cpus is done
+ * in update_parent_effective_cpumask(). For remote partitions, it is done in
+ * the remote_partition_*() and remote_cpus_update() helpers.
+ */
+/*
+ * Exclusive CPUs distributed out to local or remote sub-partitions of
+ * top_cpuset
+ */
+static cpumask_var_t subpartitions_cpus;
- /* number of CPUs in subparts_cpus */
- int nr_subparts_cpus;
+/*
+ * Exclusive CPUs in isolated partitions
+ */
+static cpumask_var_t isolated_cpus;
- /* partition root state */
- int partition_root_state;
+/*
+ * isolated_cpus updating flag (protected by cpuset_mutex)
+ * Set if isolated_cpus is going to be updated in the current
+ * cpuset_mutex crtical section.
+ */
+static bool isolated_cpus_updating;
- /*
- * Default hierarchy only:
- * use_parent_ecpus - set if using parent's effective_cpus
- * child_ecpus_count - # of children with use_parent_ecpus set
- */
- int use_parent_ecpus;
- int child_ecpus_count;
+/*
+ * Housekeeping (HK_TYPE_DOMAIN) CPUs at boot
+ */
+static cpumask_var_t boot_hk_cpus;
+static bool have_boot_isolcpus;
- /* Handle for cpuset.cpus.partition */
- struct cgroup_file partition_file;
-};
+/*
+ * A flag to force sched domain rebuild at the end of an operation.
+ * It can be set in
+ * - update_partition_sd_lb()
+ * - update_cpumasks_hier()
+ * - cpuset_update_flag()
+ * - cpuset_hotplug_update_tasks()
+ * - cpuset_handle_hotplug()
+ *
+ * Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock.
+ *
+ * Note that update_relax_domain_level() in cpuset-v1.c can still call
+ * rebuild_sched_domains_locked() directly without using this flag.
+ */
+static bool force_sd_rebuild;
/*
* Partition root states:
*
- * 0 - not a partition root
- *
+ * 0 - member (not a partition root)
* 1 - partition root
- *
+ * 2 - partition root without load balancing (isolated)
* -1 - invalid partition root
- * None of the cpus in cpus_allowed can be put into the parent's
- * subparts_cpus. In this case, the cpuset is not a real partition
- * root anymore. However, the CPU_EXCLUSIVE bit will still be set
- * and the cpuset can be restored back to a partition root if the
- * parent cpuset can give more CPUs back to this child cpuset.
+ * -2 - invalid isolated partition root
+ *
+ * There are 2 types of partitions - local or remote. Local partitions are
+ * those whose parents are partition root themselves. Setting of
+ * cpuset.cpus.exclusive are optional in setting up local partitions.
+ * Remote partitions are those whose parents are not partition roots. Passing
+ * down exclusive CPUs by setting cpuset.cpus.exclusive along its ancestor
+ * nodes are mandatory in creating a remote partition.
+ *
+ * For simplicity, a local partition can be created under a local or remote
+ * partition but a remote partition cannot have any partition root in its
+ * ancestor chain except the cgroup root.
*/
-#define PRS_DISABLED 0
-#define PRS_ENABLED 1
-#define PRS_ERROR -1
+#define PRS_MEMBER 0
+#define PRS_ROOT 1
+#define PRS_ISOLATED 2
+#define PRS_INVALID_ROOT -1
+#define PRS_INVALID_ISOLATED -2
/*
* Temporary cpumasks for working with partitions that are passed among
@@ -199,141 +145,96 @@ struct tmpmasks {
cpumask_var_t new_cpus; /* For update_cpumasks_hier() */
};
-static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
-{
- return css ? container_of(css, struct cpuset, css) : NULL;
-}
-
-/* Retrieve the cpuset for a task */
-static inline struct cpuset *task_cs(struct task_struct *task)
-{
- return css_cs(task_css(task, cpuset_cgrp_id));
-}
-
-static inline struct cpuset *parent_cs(struct cpuset *cs)
-{
- return css_cs(cs->css.parent);
-}
-
-/* bits in struct cpuset flags field */
-typedef enum {
- CS_ONLINE,
- CS_CPU_EXCLUSIVE,
- CS_MEM_EXCLUSIVE,
- CS_MEM_HARDWALL,
- CS_MEMORY_MIGRATE,
- CS_SCHED_LOAD_BALANCE,
- CS_SPREAD_PAGE,
- CS_SPREAD_SLAB,
-} cpuset_flagbits_t;
-
-/* convenient tests for these bits */
-static inline bool is_cpuset_online(struct cpuset *cs)
+void inc_dl_tasks_cs(struct task_struct *p)
{
- return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
-}
+ struct cpuset *cs = task_cs(p);
-static inline int is_cpu_exclusive(const struct cpuset *cs)
-{
- return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
+ cs->nr_deadline_tasks++;
}
-static inline int is_mem_exclusive(const struct cpuset *cs)
+void dec_dl_tasks_cs(struct task_struct *p)
{
- return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
-}
+ struct cpuset *cs = task_cs(p);
-static inline int is_mem_hardwall(const struct cpuset *cs)
-{
- return test_bit(CS_MEM_HARDWALL, &cs->flags);
+ cs->nr_deadline_tasks--;
}
-static inline int is_sched_load_balance(const struct cpuset *cs)
+static inline bool is_partition_valid(const struct cpuset *cs)
{
- return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
-}
-
-static inline int is_memory_migrate(const struct cpuset *cs)
-{
- return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
+ return cs->partition_root_state > 0;
}
-static inline int is_spread_page(const struct cpuset *cs)
+static inline bool is_partition_invalid(const struct cpuset *cs)
{
- return test_bit(CS_SPREAD_PAGE, &cs->flags);
+ return cs->partition_root_state < 0;
}
-static inline int is_spread_slab(const struct cpuset *cs)
+static inline bool cs_is_member(const struct cpuset *cs)
{
- return test_bit(CS_SPREAD_SLAB, &cs->flags);
+ return cs->partition_root_state == PRS_MEMBER;
}
-static inline int is_partition_root(const struct cpuset *cs)
+/*
+ * Callers should hold callback_lock to modify partition_root_state.
+ */
+static inline void make_partition_invalid(struct cpuset *cs)
{
- return cs->partition_root_state > 0;
+ if (cs->partition_root_state > 0)
+ cs->partition_root_state = -cs->partition_root_state;
}
/*
* Send notification event of whenever partition_root_state changes.
*/
-static inline void notify_partition_change(struct cpuset *cs,
- int old_prs, int new_prs)
+static inline void notify_partition_change(struct cpuset *cs, int old_prs)
{
- if (old_prs != new_prs)
- cgroup_file_notify(&cs->partition_file);
-}
-
-static struct cpuset top_cpuset = {
- .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
- (1 << CS_MEM_EXCLUSIVE)),
- .partition_root_state = PRS_ENABLED,
-};
+ if (old_prs == cs->partition_root_state)
+ return;
+ cgroup_file_notify(&cs->partition_file);
-/**
- * cpuset_for_each_child - traverse online children of a cpuset
- * @child_cs: loop cursor pointing to the current child
- * @pos_css: used for iteration
- * @parent_cs: target cpuset to walk children of
- *
- * Walk @child_cs through the online children of @parent_cs. Must be used
- * with RCU read locked.
- */
-#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \
- css_for_each_child((pos_css), &(parent_cs)->css) \
- if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
+ /* Reset prs_err if not invalid */
+ if (is_partition_valid(cs))
+ WRITE_ONCE(cs->prs_err, PERR_NONE);
+}
-/**
- * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
- * @des_cs: loop cursor pointing to the current descendant
- * @pos_css: used for iteration
- * @root_cs: target cpuset to walk ancestor of
+/*
+ * The top_cpuset is always synchronized to cpu_active_mask and we should avoid
+ * using cpu_online_mask as much as possible. An active CPU is always an online
+ * CPU, but not vice versa. cpu_active_mask and cpu_online_mask can differ
+ * during hotplug operations. A CPU is marked active at the last stage of CPU
+ * bringup (CPUHP_AP_ACTIVE). It is also the stage where cpuset hotplug code
+ * will be called to update the sched domains so that the scheduler can move
+ * a normal task to a newly active CPU or remove tasks away from a newly
+ * inactivated CPU. The online bit is set much earlier in the CPU bringup
+ * process and cleared much later in CPU teardown.
*
- * Walk @des_cs through the online descendants of @root_cs. Must be used
- * with RCU read locked. The caller may modify @pos_css by calling
- * css_rightmost_descendant() to skip subtree. @root_cs is included in the
- * iteration and the first node to be visited.
+ * If cpu_online_mask is used while a hotunplug operation is happening in
+ * parallel, we may leave an offline CPU in cpu_allowed or some other masks.
*/
-#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \
- css_for_each_descendant_pre((pos_css), &(root_cs)->css) \
- if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
+static struct cpuset top_cpuset = {
+ .flags = BIT(CS_CPU_EXCLUSIVE) |
+ BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
+ .partition_root_state = PRS_ROOT,
+ .relax_domain_level = -1,
+ .remote_partition = false,
+};
/*
- * There are two global locks guarding cpuset structures - cpuset_rwsem and
- * callback_lock. We also require taking task_lock() when dereferencing a
- * task's cpuset pointer. See "The task_lock() exception", at the end of this
- * comment. The cpuset code uses only cpuset_rwsem write lock. Other
- * kernel subsystems can use cpuset_read_lock()/cpuset_read_unlock() to
- * prevent change to cpuset structures.
+ * There are two global locks guarding cpuset structures - cpuset_mutex and
+ * callback_lock. The cpuset code uses only cpuset_mutex. Other kernel
+ * subsystems can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset
+ * structures. Note that cpuset_mutex needs to be a mutex as it is used in
+ * paths that rely on priority inheritance (e.g. scheduler - on RT) for
+ * correctness.
*
* A task must hold both locks to modify cpusets. If a task holds
- * cpuset_rwsem, it blocks others wanting that rwsem, ensuring that it
- * is the only task able to also acquire callback_lock and be able to
- * modify cpusets. It can perform various checks on the cpuset structure
- * first, knowing nothing will change. It can also allocate memory while
- * just holding cpuset_rwsem. While it is performing these checks, various
- * callback routines can briefly acquire callback_lock to query cpusets.
- * Once it is ready to make the changes, it takes callback_lock, blocking
- * everyone else.
+ * cpuset_mutex, it blocks others, ensuring that it is the only task able to
+ * also acquire callback_lock and be able to modify cpusets. It can perform
+ * various checks on the cpuset structure first, knowing nothing will change.
+ * It can also allocate memory while just holding cpuset_mutex. While it is
+ * performing these checks, various callback routines can briefly acquire
+ * callback_lock to query cpusets. Once it is ready to make the changes, it
+ * takes callback_lock, blocking everyone else.
*
* Calls to the kernel memory allocator can not be made while holding
* callback_lock, as that would risk double tripping on callback_lock
@@ -347,35 +248,60 @@ static struct cpuset top_cpuset = {
* by other task, we use alloc_lock in the task_struct fields to protect
* them.
*
- * The cpuset_common_file_read() handlers only hold callback_lock across
+ * The cpuset_common_seq_show() handlers only hold callback_lock across
* small pieces of code, such as when reading out possibly multi-word
* cpumasks and nodemasks.
+ */
+
+static DEFINE_MUTEX(cpuset_mutex);
+
+/**
+ * cpuset_lock - Acquire the global cpuset mutex
*
- * Accessing a task's cpuset should be done in accordance with the
- * guidelines for accessing subsystem state in kernel/cgroup.c
+ * This locks the global cpuset mutex to prevent modifications to cpuset
+ * hierarchy and configurations. This helper is not enough to make modification.
*/
+void cpuset_lock(void)
+{
+ mutex_lock(&cpuset_mutex);
+}
-DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem);
+void cpuset_unlock(void)
+{
+ mutex_unlock(&cpuset_mutex);
+}
-void cpuset_read_lock(void)
+/**
+ * cpuset_full_lock - Acquire full protection for cpuset modification
+ *
+ * Takes both CPU hotplug read lock (cpus_read_lock()) and cpuset mutex
+ * to safely modify cpuset data.
+ */
+void cpuset_full_lock(void)
{
- percpu_down_read(&cpuset_rwsem);
+ cpus_read_lock();
+ mutex_lock(&cpuset_mutex);
}
-void cpuset_read_unlock(void)
+void cpuset_full_unlock(void)
{
- percpu_up_read(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
+ cpus_read_unlock();
}
static DEFINE_SPINLOCK(callback_lock);
-static struct workqueue_struct *cpuset_migrate_mm_wq;
+void cpuset_callback_lock_irq(void)
+{
+ spin_lock_irq(&callback_lock);
+}
-/*
- * CPU / memory hotplug is handled asynchronously.
- */
-static void cpuset_hotplug_workfn(struct work_struct *work);
-static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
+void cpuset_callback_unlock_irq(void)
+{
+ spin_unlock_irq(&callback_lock);
+}
+
+static struct workqueue_struct *cpuset_migrate_mm_wq;
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
@@ -383,7 +309,7 @@ static inline void check_insane_mems_config(nodemask_t *nodes)
{
if (!cpusets_insane_config() &&
movable_only_nodes(nodes)) {
- static_branch_enable(&cpusets_insane_config_key);
+ static_branch_enable_cpuslocked(&cpusets_insane_config_key);
pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n"
"Cpuset allocations might fail even with a lot of memory available.\n",
nodemask_pr_args(nodes));
@@ -391,6 +317,32 @@ static inline void check_insane_mems_config(nodemask_t *nodes)
}
/*
+ * decrease cs->attach_in_progress.
+ * wake_up cpuset_attach_wq if cs->attach_in_progress==0.
+ */
+static inline void dec_attach_in_progress_locked(struct cpuset *cs)
+{
+ lockdep_assert_held(&cpuset_mutex);
+
+ cs->attach_in_progress--;
+ if (!cs->attach_in_progress)
+ wake_up(&cpuset_attach_wq);
+}
+
+static inline void dec_attach_in_progress(struct cpuset *cs)
+{
+ mutex_lock(&cpuset_mutex);
+ dec_attach_in_progress_locked(cs);
+ mutex_unlock(&cpuset_mutex);
+}
+
+static inline bool cpuset_v2(void)
+{
+ return !IS_ENABLED(CONFIG_CPUSETS_V1) ||
+ cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
+}
+
+/*
* Cgroup v2 behavior is used on the "cpus" and "mems" control files when
* on default hierarchy or when the cpuset_v2_mode flag is set by mounting
* the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option.
@@ -400,10 +352,67 @@ static inline void check_insane_mems_config(nodemask_t *nodes)
*/
static inline bool is_in_v2_mode(void)
{
- return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+ return cpuset_v2() ||
(cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
}
+static inline bool cpuset_is_populated(struct cpuset *cs)
+{
+ lockdep_assert_held(&cpuset_mutex);
+
+ /* Cpusets in the process of attaching should be considered as populated */
+ return cgroup_is_populated(cs->css.cgroup) ||
+ cs->attach_in_progress;
+}
+
+/**
+ * partition_is_populated - check if partition has tasks
+ * @cs: partition root to be checked
+ * @excluded_child: a child cpuset to be excluded in task checking
+ * Return: true if there are tasks, false otherwise
+ *
+ * @cs should be a valid partition root or going to become a partition root.
+ * @excluded_child should be non-NULL when this cpuset is going to become a
+ * partition itself.
+ *
+ * Note that a remote partition is not allowed underneath a valid local
+ * or remote partition. So if a non-partition root child is populated,
+ * the whole partition is considered populated.
+ */
+static inline bool partition_is_populated(struct cpuset *cs,
+ struct cpuset *excluded_child)
+{
+ struct cpuset *cp;
+ struct cgroup_subsys_state *pos_css;
+
+ /*
+ * We cannot call cs_is_populated(cs) directly, as
+ * nr_populated_domain_children may include populated
+ * csets from descendants that are partitions.
+ */
+ if (cs->css.cgroup->nr_populated_csets ||
+ cs->attach_in_progress)
+ return true;
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, cs) {
+ if (cp == cs || cp == excluded_child)
+ continue;
+
+ if (is_partition_valid(cp)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+
+ if (cpuset_is_populated(cp)) {
+ rcu_read_unlock();
+ return true;
+ }
+ }
+ rcu_read_unlock();
+ return false;
+}
+
/*
* Return in pmask the portion of a task's cpusets's cpus_allowed that
* are online and are capable of running the task. If none are found,
@@ -411,38 +420,26 @@ static inline bool is_in_v2_mode(void)
* appropriate cpus.
*
* One way or another, we guarantee to return some non-empty subset
- * of cpu_online_mask.
+ * of cpu_active_mask.
*
- * Call with callback_lock or cpuset_rwsem held.
+ * Call with callback_lock or cpuset_mutex held.
*/
-static void guarantee_online_cpus(struct task_struct *tsk,
+static void guarantee_active_cpus(struct task_struct *tsk,
struct cpumask *pmask)
{
const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
struct cpuset *cs;
- if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask)))
- cpumask_copy(pmask, cpu_online_mask);
+ if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_active_mask)))
+ cpumask_copy(pmask, cpu_active_mask);
rcu_read_lock();
cs = task_cs(tsk);
- while (!cpumask_intersects(cs->effective_cpus, pmask)) {
+ while (!cpumask_intersects(cs->effective_cpus, pmask))
cs = parent_cs(cs);
- if (unlikely(!cs)) {
- /*
- * The top cpuset doesn't have any online cpu as a
- * consequence of a race between cpuset_hotplug_work
- * and cpu hotplug notifier. But we know the top
- * cpuset's effective_cpus is on its way to be
- * identical to cpu_online_mask.
- */
- goto out_unlock;
- }
- }
- cpumask_and(pmask, pmask, cs->effective_cpus);
-out_unlock:
+ cpumask_and(pmask, pmask, cs->effective_cpus);
rcu_read_unlock();
}
@@ -455,7 +452,7 @@ out_unlock:
* One way or another, we guarantee to return some non-empty subset
* of node_states[N_MEMORY].
*
- * Call with callback_lock or cpuset_rwsem held.
+ * Call with callback_lock or cpuset_mutex held.
*/
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
@@ -464,119 +461,105 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
}
-/*
- * update task's spread flag if cpuset's page/slab spread flag is set
+/**
+ * alloc_cpumasks - Allocate an array of cpumask variables
+ * @pmasks: Pointer to array of cpumask_var_t pointers
+ * @size: Number of cpumasks to allocate
+ * Return: 0 if successful, -ENOMEM otherwise.
*
- * Call with callback_lock or cpuset_rwsem held.
+ * Allocates @size cpumasks and initializes them to empty. Returns 0 on
+ * success, -ENOMEM on allocation failure. On failure, any previously
+ * allocated cpumasks are freed.
*/
-static void cpuset_update_task_spread_flag(struct cpuset *cs,
- struct task_struct *tsk)
+static inline int alloc_cpumasks(cpumask_var_t *pmasks[], u32 size)
{
- if (is_spread_page(cs))
- task_set_spread_page(tsk);
- else
- task_clear_spread_page(tsk);
-
- if (is_spread_slab(cs))
- task_set_spread_slab(tsk);
- else
- task_clear_spread_slab(tsk);
-}
+ int i;
-/*
- * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
- *
- * One cpuset is a subset of another if all its allowed CPUs and
- * Memory Nodes are a subset of the other, and its exclusive flags
- * are only set if the other's are set. Call holding cpuset_rwsem.
- */
-
-static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
-{
- return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
- nodes_subset(p->mems_allowed, q->mems_allowed) &&
- is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
- is_mem_exclusive(p) <= is_mem_exclusive(q);
+ for (i = 0; i < size; i++) {
+ if (!zalloc_cpumask_var(pmasks[i], GFP_KERNEL)) {
+ while (--i >= 0)
+ free_cpumask_var(*pmasks[i]);
+ return -ENOMEM;
+ }
+ }
+ return 0;
}
/**
- * alloc_cpumasks - allocate three cpumasks for cpuset
- * @cs: the cpuset that have cpumasks to be allocated.
- * @tmp: the tmpmasks structure pointer
- * Return: 0 if successful, -ENOMEM otherwise.
- *
- * Only one of the two input arguments should be non-NULL.
+ * alloc_tmpmasks - Allocate temporary cpumasks for cpuset operations.
+ * @tmp: Pointer to tmpmasks structure to populate
+ * Return: 0 on success, -ENOMEM on allocation failure
*/
-static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
+static inline int alloc_tmpmasks(struct tmpmasks *tmp)
{
- cpumask_var_t *pmask1, *pmask2, *pmask3;
-
- if (cs) {
- pmask1 = &cs->cpus_allowed;
- pmask2 = &cs->effective_cpus;
- pmask3 = &cs->subparts_cpus;
- } else {
- pmask1 = &tmp->new_cpus;
- pmask2 = &tmp->addmask;
- pmask3 = &tmp->delmask;
- }
-
- if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
- return -ENOMEM;
-
- if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
- goto free_one;
-
- if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
- goto free_two;
-
- return 0;
+ /*
+ * Array of pointers to the three cpumask_var_t fields in tmpmasks.
+ * Note: Array size must match actual number of masks (3)
+ */
+ cpumask_var_t *pmask[3] = {
+ &tmp->new_cpus,
+ &tmp->addmask,
+ &tmp->delmask
+ };
-free_two:
- free_cpumask_var(*pmask2);
-free_one:
- free_cpumask_var(*pmask1);
- return -ENOMEM;
+ return alloc_cpumasks(pmask, ARRAY_SIZE(pmask));
}
/**
- * free_cpumasks - free cpumasks in a tmpmasks structure
- * @cs: the cpuset that have cpumasks to be free.
+ * free_tmpmasks - free cpumasks in a tmpmasks structure
* @tmp: the tmpmasks structure pointer
*/
-static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
+static inline void free_tmpmasks(struct tmpmasks *tmp)
{
- if (cs) {
- free_cpumask_var(cs->cpus_allowed);
- free_cpumask_var(cs->effective_cpus);
- free_cpumask_var(cs->subparts_cpus);
- }
- if (tmp) {
- free_cpumask_var(tmp->new_cpus);
- free_cpumask_var(tmp->addmask);
- free_cpumask_var(tmp->delmask);
- }
+ if (!tmp)
+ return;
+
+ free_cpumask_var(tmp->new_cpus);
+ free_cpumask_var(tmp->addmask);
+ free_cpumask_var(tmp->delmask);
}
/**
- * alloc_trial_cpuset - allocate a trial cpuset
- * @cs: the cpuset that the trial cpuset duplicates
+ * dup_or_alloc_cpuset - Duplicate or allocate a new cpuset
+ * @cs: Source cpuset to duplicate (NULL for a fresh allocation)
+ *
+ * Creates a new cpuset by either:
+ * 1. Duplicating an existing cpuset (if @cs is non-NULL), or
+ * 2. Allocating a fresh cpuset with zero-initialized masks (if @cs is NULL)
+ *
+ * Return: Pointer to newly allocated cpuset on success, NULL on failure
*/
-static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
+static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs)
{
struct cpuset *trial;
- trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
+ /* Allocate base structure */
+ trial = cs ? kmemdup(cs, sizeof(*cs), GFP_KERNEL) :
+ kzalloc(sizeof(*cs), GFP_KERNEL);
if (!trial)
return NULL;
- if (alloc_cpumasks(trial, NULL)) {
+ /* Setup cpumask pointer array */
+ cpumask_var_t *pmask[4] = {
+ &trial->cpus_allowed,
+ &trial->effective_cpus,
+ &trial->effective_xcpus,
+ &trial->exclusive_cpus
+ };
+
+ if (alloc_cpumasks(pmask, ARRAY_SIZE(pmask))) {
kfree(trial);
return NULL;
}
- cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
- cpumask_copy(trial->effective_cpus, cs->effective_cpus);
+ /* Copy masks if duplicating */
+ if (cs) {
+ cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
+ cpumask_copy(trial->effective_cpus, cs->effective_cpus);
+ cpumask_copy(trial->effective_xcpus, cs->effective_xcpus);
+ cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus);
+ }
+
return trial;
}
@@ -586,10 +569,82 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
*/
static inline void free_cpuset(struct cpuset *cs)
{
- free_cpumasks(cs, NULL);
+ free_cpumask_var(cs->cpus_allowed);
+ free_cpumask_var(cs->effective_cpus);
+ free_cpumask_var(cs->effective_xcpus);
+ free_cpumask_var(cs->exclusive_cpus);
kfree(cs);
}
+/* Return user specified exclusive CPUs */
+static inline struct cpumask *user_xcpus(struct cpuset *cs)
+{
+ return cpumask_empty(cs->exclusive_cpus) ? cs->cpus_allowed
+ : cs->exclusive_cpus;
+}
+
+static inline bool xcpus_empty(struct cpuset *cs)
+{
+ return cpumask_empty(cs->cpus_allowed) &&
+ cpumask_empty(cs->exclusive_cpus);
+}
+
+/*
+ * cpusets_are_exclusive() - check if two cpusets are exclusive
+ *
+ * Return true if exclusive, false if not
+ */
+static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)
+{
+ struct cpumask *xcpus1 = user_xcpus(cs1);
+ struct cpumask *xcpus2 = user_xcpus(cs2);
+
+ if (cpumask_intersects(xcpus1, xcpus2))
+ return false;
+ return true;
+}
+
+/**
+ * cpus_excl_conflict - Check if two cpusets have exclusive CPU conflicts
+ * @cs1: first cpuset to check
+ * @cs2: second cpuset to check
+ *
+ * Returns: true if CPU exclusivity conflict exists, false otherwise
+ *
+ * Conflict detection rules:
+ * 1. If either cpuset is CPU exclusive, they must be mutually exclusive
+ * 2. exclusive_cpus masks cannot intersect between cpusets
+ * 3. The allowed CPUs of one cpuset cannot be a subset of another's exclusive CPUs
+ */
+static inline bool cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
+{
+ /* If either cpuset is exclusive, check if they are mutually exclusive */
+ if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2))
+ return !cpusets_are_exclusive(cs1, cs2);
+
+ /* Exclusive_cpus cannot intersect */
+ if (cpumask_intersects(cs1->exclusive_cpus, cs2->exclusive_cpus))
+ return true;
+
+ /* The cpus_allowed of one cpuset cannot be a subset of another cpuset's exclusive_cpus */
+ if (!cpumask_empty(cs1->cpus_allowed) &&
+ cpumask_subset(cs1->cpus_allowed, cs2->exclusive_cpus))
+ return true;
+
+ if (!cpumask_empty(cs2->cpus_allowed) &&
+ cpumask_subset(cs2->cpus_allowed, cs1->exclusive_cpus))
+ return true;
+
+ return false;
+}
+
+static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
+{
+ if ((is_mem_exclusive(cs1) || is_mem_exclusive(cs2)))
+ return nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
+ return false;
+}
+
/*
* validate_change() - Used to validate that any proposed cpuset change
* follows the structural rules for cpusets.
@@ -597,7 +652,7 @@ static inline void free_cpuset(struct cpuset *cs)
* If we replaced the flag and mask values of the current cpuset
* (cur) with those values in the trial cpuset (trial), would
* our various subset and exclusive rules still be valid? Presumes
- * cpuset_rwsem held.
+ * cpuset_mutex held.
*
* 'cur' is the address of an actual, in-use cpuset. Operations
* such as list traversal that depend on the actual address of the
@@ -614,42 +669,27 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
struct cgroup_subsys_state *css;
struct cpuset *c, *par;
- int ret;
-
- /* The checks don't apply to root cpuset */
- if (cur == &top_cpuset)
- return 0;
+ int ret = 0;
rcu_read_lock();
- par = parent_cs(cur);
- /* On legacy hierarchy, we must be a subset of our parent cpuset. */
- ret = -EACCES;
- if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
+ if (!is_in_v2_mode())
+ ret = cpuset1_validate_change(cur, trial);
+ if (ret)
goto out;
- /*
- * If either I or some sibling (!= me) is exclusive, we can't
- * overlap
- */
- ret = -EINVAL;
- cpuset_for_each_child(c, css, par) {
- if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
- c != cur &&
- cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
- goto out;
- if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
- c != cur &&
- nodes_intersects(trial->mems_allowed, c->mems_allowed))
- goto out;
- }
+ /* Remaining checks don't apply to root cpuset */
+ if (cur == &top_cpuset)
+ goto out;
+
+ par = parent_cs(cur);
/*
* Cpusets with tasks - existing or newly being attached - can't
* be changed to have empty cpus_allowed or mems_allowed.
*/
ret = -ENOSPC;
- if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
+ if (cpuset_is_populated(cur)) {
if (!cpumask_empty(cur->cpus_allowed) &&
cpumask_empty(trial->cpus_allowed))
goto out;
@@ -660,14 +700,40 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
/*
* We can't shrink if we won't have enough room for SCHED_DEADLINE
- * tasks.
+ * tasks. This check is not done when scheduling is disabled as the
+ * users should know what they are doing.
+ *
+ * For v1, effective_cpus == cpus_allowed & user_xcpus() returns
+ * cpus_allowed.
+ *
+ * For v2, is_cpu_exclusive() & is_sched_load_balance() are true only
+ * for non-isolated partition root. At this point, the target
+ * effective_cpus isn't computed yet. user_xcpus() is the best
+ * approximation.
+ *
+ * TBD: May need to precompute the real effective_cpus here in case
+ * incorrect scheduling of SCHED_DEADLINE tasks in a partition
+ * becomes an issue.
*/
ret = -EBUSY;
- if (is_cpu_exclusive(cur) &&
- !cpuset_cpumask_can_shrink(cur->cpus_allowed,
- trial->cpus_allowed))
+ if (is_cpu_exclusive(cur) && is_sched_load_balance(cur) &&
+ !cpuset_cpumask_can_shrink(cur->effective_cpus, user_xcpus(trial)))
goto out;
+ /*
+ * If either I or some sibling (!= me) is exclusive, we can't
+ * overlap. exclusive_cpus cannot overlap with each other if set.
+ */
+ ret = -EINVAL;
+ cpuset_for_each_child(c, css, par) {
+ if (c == cur)
+ continue;
+ if (cpus_excl_conflict(trial, c))
+ goto out;
+ if (mems_excl_conflict(trial, c))
+ goto out;
+ }
+
ret = 0;
out:
rcu_read_unlock();
@@ -712,7 +778,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
rcu_read_unlock();
}
-/* Must be called with cpuset_rwsem held. */
+/* Must be called with cpuset_mutex held. */
static inline int nr_cpusets(void)
{
/* jump label reference count + the top-level cpuset */
@@ -738,7 +804,7 @@ static inline int nr_cpusets(void)
* domains when operating in the severe memory shortage situations
* that could cause allocation failures below.
*
- * Must be called with cpuset_rwsem held.
+ * Must be called with cpuset_mutex held.
*
* The three key local variables below are:
* cp - cpuset pointer, used (together with pos_css) to perform a
@@ -759,18 +825,15 @@ static inline int nr_cpusets(void)
* were changed (added or removed.)
*
* Finding the best partition (set of domains):
- * The triple nested loops below over i, j, k scan over the
- * load balanced cpusets (using the array of cpuset pointers in
- * csa[]) looking for pairs of cpusets that have overlapping
- * cpus_allowed, but which don't have the same 'pn' partition
- * number and gives them in the same partition number. It keeps
- * looping on the 'restart' label until it can no longer find
- * any such pairs.
- *
- * The union of the cpus_allowed masks from the set of
- * all cpusets having the same 'pn' value then form the one
- * element of the partition (one sched domain) to be passed to
- * partition_sched_domains().
+ * The double nested loops below over i, j scan over the load
+ * balanced cpusets (using the array of cpuset pointers in csa[])
+ * looking for pairs of cpusets that have overlapping cpus_allowed
+ * and merging them using a union-find algorithm.
+ *
+ * The union of the cpus_allowed masks from the set of all cpusets
+ * having the same root then form the one element of the partition
+ * (one sched domain) to be passed to partition_sched_domains().
+ *
*/
static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr **attributes)
@@ -778,20 +841,23 @@ static int generate_sched_domains(cpumask_var_t **domains,
struct cpuset *cp; /* top-down scan of cpusets */
struct cpuset **csa; /* array of all cpuset ptrs */
int csn; /* how many cpuset ptrs in csa so far */
- int i, j, k; /* indices for partition finding loops */
+ int i, j; /* indices for partition finding loops */
cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] struct cpumask slot */
struct cgroup_subsys_state *pos_css;
bool root_load_balance = is_sched_load_balance(&top_cpuset);
+ bool cgrpv2 = cpuset_v2();
+ int nslot_update;
doms = NULL;
dattr = NULL;
csa = NULL;
/* Special case for the 99% of systems with one, full, sched domain */
- if (root_load_balance && !top_cpuset.nr_subparts_cpus) {
+ if (root_load_balance && cpumask_empty(subpartitions_cpus)) {
+single_root_domain:
ndoms = 1;
doms = alloc_sched_domains(ndoms);
if (!doms)
@@ -803,7 +869,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
update_domain_attr_tree(dattr, &top_cpuset);
}
cpumask_and(doms[0], top_cpuset.effective_cpus,
- housekeeping_cpumask(HK_FLAG_DOMAIN));
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
goto done;
}
@@ -819,64 +885,81 @@ static int generate_sched_domains(cpumask_var_t **domains,
cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
if (cp == &top_cpuset)
continue;
+
+ if (cgrpv2)
+ goto v2;
+
/*
+ * v1:
* Continue traversing beyond @cp iff @cp has some CPUs and
* isn't load balancing. The former is obvious. The
* latter: All child cpusets contain a subset of the
* parent's cpus, so just skip them, and then we call
* update_domain_attr_tree() to calc relax_domain_level of
* the corresponding sched domain.
- *
- * If root is load-balancing, we can skip @cp if it
- * is a subset of the root's effective_cpus.
*/
if (!cpumask_empty(cp->cpus_allowed) &&
!(is_sched_load_balance(cp) &&
cpumask_intersects(cp->cpus_allowed,
- housekeeping_cpumask(HK_FLAG_DOMAIN))))
- continue;
-
- if (root_load_balance &&
- cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
+ housekeeping_cpumask(HK_TYPE_DOMAIN))))
continue;
if (is_sched_load_balance(cp) &&
!cpumask_empty(cp->effective_cpus))
csa[csn++] = cp;
- /* skip @cp's subtree if not a partition root */
- if (!is_partition_root(cp))
+ /* skip @cp's subtree */
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+
+v2:
+ /*
+ * Only valid partition roots that are not isolated and with
+ * non-empty effective_cpus will be saved into csn[].
+ */
+ if ((cp->partition_root_state == PRS_ROOT) &&
+ !cpumask_empty(cp->effective_cpus))
+ csa[csn++] = cp;
+
+ /*
+ * Skip @cp's subtree if not a partition root and has no
+ * exclusive CPUs to be granted to child cpusets.
+ */
+ if (!is_partition_valid(cp) && cpumask_empty(cp->exclusive_cpus))
pos_css = css_rightmost_descendant(pos_css);
}
rcu_read_unlock();
+ /*
+ * If there are only isolated partitions underneath the cgroup root,
+ * we can optimize out unneeded sched domains scanning.
+ */
+ if (root_load_balance && (csn == 1))
+ goto single_root_domain;
+
for (i = 0; i < csn; i++)
- csa[i]->pn = i;
- ndoms = csn;
+ uf_node_init(&csa[i]->node);
-restart:
- /* Find the best partition (set of sched domains) */
+ /* Merge overlapping cpusets */
for (i = 0; i < csn; i++) {
- struct cpuset *a = csa[i];
- int apn = a->pn;
-
- for (j = 0; j < csn; j++) {
- struct cpuset *b = csa[j];
- int bpn = b->pn;
-
- if (apn != bpn && cpusets_overlap(a, b)) {
- for (k = 0; k < csn; k++) {
- struct cpuset *c = csa[k];
-
- if (c->pn == bpn)
- c->pn = apn;
- }
- ndoms--; /* one less element */
- goto restart;
+ for (j = i + 1; j < csn; j++) {
+ if (cpusets_overlap(csa[i], csa[j])) {
+ /*
+ * Cgroup v2 shouldn't pass down overlapping
+ * partition root cpusets.
+ */
+ WARN_ON_ONCE(cgrpv2);
+ uf_union(&csa[i]->node, &csa[j]->node);
}
}
}
+ /* Count the total number of domains */
+ for (i = 0; i < csn; i++) {
+ if (uf_find(&csa[i]->node) == &csa[i]->node)
+ ndoms++;
+ }
+
/*
* Now we know how many domains to create.
* Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
@@ -892,45 +975,48 @@ restart:
dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
GFP_KERNEL);
- for (nslot = 0, i = 0; i < csn; i++) {
- struct cpuset *a = csa[i];
- struct cpumask *dp;
- int apn = a->pn;
-
- if (apn < 0) {
- /* Skip completed partitions */
- continue;
- }
-
- dp = doms[nslot];
-
- if (nslot == ndoms) {
- static int warnings = 10;
- if (warnings) {
- pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
- nslot, ndoms, csn, i, apn);
- warnings--;
- }
- continue;
+ /*
+ * Cgroup v2 doesn't support domain attributes, just set all of them
+ * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a
+ * subset of HK_TYPE_DOMAIN housekeeping CPUs.
+ */
+ if (cgrpv2) {
+ for (i = 0; i < ndoms; i++) {
+ /*
+ * The top cpuset may contain some boot time isolated
+ * CPUs that need to be excluded from the sched domain.
+ */
+ if (csa[i] == &top_cpuset)
+ cpumask_and(doms[i], csa[i]->effective_cpus,
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
+ else
+ cpumask_copy(doms[i], csa[i]->effective_cpus);
+ if (dattr)
+ dattr[i] = SD_ATTR_INIT;
}
+ goto done;
+ }
- cpumask_clear(dp);
- if (dattr)
- *(dattr + nslot) = SD_ATTR_INIT;
+ for (nslot = 0, i = 0; i < csn; i++) {
+ nslot_update = 0;
for (j = i; j < csn; j++) {
- struct cpuset *b = csa[j];
-
- if (apn == b->pn) {
- cpumask_or(dp, dp, b->effective_cpus);
- cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
+ if (uf_find(&csa[j]->node) == &csa[i]->node) {
+ struct cpumask *dp = doms[nslot];
+
+ if (i == j) {
+ nslot_update = 1;
+ cpumask_clear(dp);
+ if (dattr)
+ *(dattr + nslot) = SD_ATTR_INIT;
+ }
+ cpumask_or(dp, dp, csa[j]->effective_cpus);
+ cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
if (dattr)
- update_domain_attr_tree(dattr + nslot, b);
-
- /* Done with this partition */
- b->pn = -1;
+ update_domain_attr_tree(dattr + nslot, csa[j]);
}
}
- nslot++;
+ if (nslot_update)
+ nslot++;
}
BUG_ON(nslot != ndoms);
@@ -949,11 +1035,14 @@ done:
return ndoms;
}
-static void update_tasks_root_domain(struct cpuset *cs)
+static void dl_update_tasks_root_domain(struct cpuset *cs)
{
struct css_task_iter it;
struct task_struct *task;
+ if (cs->nr_deadline_tasks == 0)
+ return;
+
css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it)))
@@ -962,22 +1051,25 @@ static void update_tasks_root_domain(struct cpuset *cs)
css_task_iter_end(&it);
}
-static void rebuild_root_domains(void)
+void dl_rebuild_rd_accounting(void)
{
struct cpuset *cs = NULL;
struct cgroup_subsys_state *pos_css;
+ int cpu;
+ u64 cookie = ++dl_cookie;
- percpu_rwsem_assert_held(&cpuset_rwsem);
+ lockdep_assert_held(&cpuset_mutex);
lockdep_assert_cpus_held();
lockdep_assert_held(&sched_domains_mutex);
rcu_read_lock();
- /*
- * Clear default root domain DL accounting, it will be computed again
- * if a task belongs to it.
- */
- dl_clear_root_domain(&def_root_domain);
+ for_each_possible_cpu(cpu) {
+ if (dl_bw_visited(cpu, cookie))
+ continue;
+
+ dl_clear_root_domain_cpu(cpu);
+ }
cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
@@ -990,7 +1082,7 @@ static void rebuild_root_domains(void)
rcu_read_unlock();
- update_tasks_root_domain(cs);
+ dl_update_tasks_root_domain(cs);
rcu_read_lock();
css_put(&cs->css);
@@ -998,16 +1090,6 @@ static void rebuild_root_domains(void)
rcu_read_unlock();
}
-static void
-partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
- struct sched_domain_attr *dattr_new)
-{
- mutex_lock(&sched_domains_mutex);
- partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
- rebuild_root_domains();
- mutex_unlock(&sched_domains_mutex);
-}
-
/*
* Rebuild scheduler domains.
*
@@ -1017,9 +1099,9 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
* 'cpus' is removed, then call this routine to rebuild the
* scheduler's dynamic sched domains.
*
- * Call with cpuset_rwsem held. Takes cpus_read_lock().
+ * Call with cpuset_mutex held. Takes cpus_read_lock().
*/
-static void rebuild_sched_domains_locked(void)
+void rebuild_sched_domains_locked(void)
{
struct cgroup_subsys_state *pos_css;
struct sched_domain_attr *attr;
@@ -1028,18 +1110,19 @@ static void rebuild_sched_domains_locked(void)
int ndoms;
lockdep_assert_cpus_held();
- percpu_rwsem_assert_held(&cpuset_rwsem);
+ lockdep_assert_held(&cpuset_mutex);
+ force_sd_rebuild = false;
/*
* If we have raced with CPU hotplug, return early to avoid
* passing doms with offlined cpu to partition_sched_domains().
- * Anyways, cpuset_hotplug_workfn() will rebuild sched domains.
+ * Anyways, cpuset_handle_hotplug() will rebuild sched domains.
*
* With no CPUs in any subpartitions, top_cpuset's effective CPUs
* should be the same as the active CPUs, so checking only top_cpuset
* is enough to detect racing CPU offlines.
*/
- if (!top_cpuset.nr_subparts_cpus &&
+ if (cpumask_empty(subpartitions_cpus) &&
!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
return;
@@ -1048,10 +1131,10 @@ static void rebuild_sched_domains_locked(void)
* root should be only a subset of the active CPUs. Since a CPU in any
* partition root could be offlined, all must be checked.
*/
- if (top_cpuset.nr_subparts_cpus) {
+ if (!cpumask_empty(subpartitions_cpus)) {
rcu_read_lock();
cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
- if (!is_partition_root(cs)) {
+ if (!is_partition_valid(cs)) {
pos_css = css_rightmost_descendant(pos_css);
continue;
}
@@ -1068,39 +1151,75 @@ static void rebuild_sched_domains_locked(void)
ndoms = generate_sched_domains(&doms, &attr);
/* Have scheduler rebuild the domains */
- partition_and_rebuild_sched_domains(ndoms, doms, attr);
+ partition_sched_domains(ndoms, doms, attr);
}
#else /* !CONFIG_SMP */
-static void rebuild_sched_domains_locked(void)
+void rebuild_sched_domains_locked(void)
{
}
#endif /* CONFIG_SMP */
+static void rebuild_sched_domains_cpuslocked(void)
+{
+ mutex_lock(&cpuset_mutex);
+ rebuild_sched_domains_locked();
+ mutex_unlock(&cpuset_mutex);
+}
+
void rebuild_sched_domains(void)
{
cpus_read_lock();
- percpu_down_write(&cpuset_rwsem);
- rebuild_sched_domains_locked();
- percpu_up_write(&cpuset_rwsem);
+ rebuild_sched_domains_cpuslocked();
cpus_read_unlock();
}
+void cpuset_reset_sched_domains(void)
+{
+ mutex_lock(&cpuset_mutex);
+ partition_sched_domains(1, NULL, NULL);
+ mutex_unlock(&cpuset_mutex);
+}
+
/**
- * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
+ * cpuset_update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
+ * @new_cpus: the temp variable for the new effective_cpus mask
*
* Iterate through each task of @cs updating its cpus_allowed to the
- * effective cpuset's. As this function is called with cpuset_rwsem held,
+ * effective cpuset's. As this function is called with cpuset_mutex held,
* cpuset membership stays stable.
+ *
+ * For top_cpuset, task_cpu_possible_mask() is used instead of effective_cpus
+ * to make sure all offline CPUs are also included as hotplug code won't
+ * update cpumasks for tasks in top_cpuset.
+ *
+ * As task_cpu_possible_mask() can be task dependent in arm64, we have to
+ * do cpu masking per task instead of doing it once for all.
*/
-static void update_tasks_cpumask(struct cpuset *cs)
+void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
{
struct css_task_iter it;
struct task_struct *task;
+ bool top_cs = cs == &top_cpuset;
css_task_iter_start(&cs->css, 0, &it);
- while ((task = css_task_iter_next(&it)))
- set_cpus_allowed_ptr(task, cs->effective_cpus);
+ while ((task = css_task_iter_next(&it))) {
+ const struct cpumask *possible_mask = task_cpu_possible_mask(task);
+
+ if (top_cs) {
+ /*
+ * PF_NO_SETAFFINITY tasks are ignored.
+ * All per cpu kthreads should have PF_NO_SETAFFINITY
+ * flag set, see kthread_set_per_cpu().
+ */
+ if (task->flags & PF_NO_SETAFFINITY)
+ continue;
+ cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus);
+ } else {
+ cpumask_and(new_cpus, possible_mask, cs->effective_cpus);
+ }
+ set_cpus_allowed_ptr(task, new_cpus);
+ }
css_task_iter_end(&it);
}
@@ -1110,384 +1229,1137 @@ static void update_tasks_cpumask(struct cpuset *cs)
* @cs: the cpuset the need to recompute the new effective_cpus mask
* @parent: the parent cpuset
*
- * If the parent has subpartition CPUs, include them in the list of
- * allowable CPUs in computing the new effective_cpus mask. Since offlined
- * CPUs are not removed from subparts_cpus, we have to use cpu_active_mask
- * to mask those out.
+ * The result is valid only if the given cpuset isn't a partition root.
*/
static void compute_effective_cpumask(struct cpumask *new_cpus,
struct cpuset *cs, struct cpuset *parent)
{
- if (parent->nr_subparts_cpus) {
- cpumask_or(new_cpus, parent->effective_cpus,
- parent->subparts_cpus);
- cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);
- cpumask_and(new_cpus, new_cpus, cpu_active_mask);
+ cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
+}
+
+/*
+ * Commands for update_parent_effective_cpumask
+ */
+enum partition_cmd {
+ partcmd_enable, /* Enable partition root */
+ partcmd_enablei, /* Enable isolated partition root */
+ partcmd_disable, /* Disable partition root */
+ partcmd_update, /* Update parent's effective_cpus */
+ partcmd_invalidate, /* Make partition invalid */
+};
+
+static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
+ struct tmpmasks *tmp);
+
+/*
+ * Update partition exclusive flag
+ *
+ * Return: 0 if successful, an error code otherwise
+ */
+static int update_partition_exclusive_flag(struct cpuset *cs, int new_prs)
+{
+ bool exclusive = (new_prs > PRS_MEMBER);
+
+ if (exclusive && !is_cpu_exclusive(cs)) {
+ if (cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 1))
+ return PERR_NOTEXCL;
+ } else if (!exclusive && is_cpu_exclusive(cs)) {
+ /* Turning off CS_CPU_EXCLUSIVE will not return error */
+ cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+ }
+ return 0;
+}
+
+/*
+ * Update partition load balance flag and/or rebuild sched domain
+ *
+ * Changing load balance flag will automatically call
+ * rebuild_sched_domains_locked().
+ * This function is for cgroup v2 only.
+ */
+static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
+{
+ int new_prs = cs->partition_root_state;
+ bool rebuild_domains = (new_prs > 0) || (old_prs > 0);
+ bool new_lb;
+
+ /*
+ * If cs is not a valid partition root, the load balance state
+ * will follow its parent.
+ */
+ if (new_prs > 0) {
+ new_lb = (new_prs != PRS_ISOLATED);
} else {
- cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
+ new_lb = is_sched_load_balance(parent_cs(cs));
}
+ if (new_lb != !!is_sched_load_balance(cs)) {
+ rebuild_domains = true;
+ if (new_lb)
+ set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+ else
+ clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+ }
+
+ if (rebuild_domains)
+ cpuset_force_rebuild();
}
/*
- * Commands for update_parent_subparts_cpumask
+ * tasks_nocpu_error - Return true if tasks will have no effective_cpus
*/
-enum subparts_cmd {
- partcmd_enable, /* Enable partition root */
- partcmd_disable, /* Disable partition root */
- partcmd_update, /* Update parent's subparts_cpus */
-};
+static bool tasks_nocpu_error(struct cpuset *parent, struct cpuset *cs,
+ struct cpumask *xcpus)
+{
+ /*
+ * A populated partition (cs or parent) can't have empty effective_cpus
+ */
+ return (cpumask_subset(parent->effective_cpus, xcpus) &&
+ partition_is_populated(parent, cs)) ||
+ (!cpumask_intersects(xcpus, cpu_active_mask) &&
+ partition_is_populated(cs, NULL));
+}
+
+static void reset_partition_data(struct cpuset *cs)
+{
+ struct cpuset *parent = parent_cs(cs);
+
+ if (!cpuset_v2())
+ return;
+
+ lockdep_assert_held(&callback_lock);
+
+ if (cpumask_empty(cs->exclusive_cpus)) {
+ cpumask_clear(cs->effective_xcpus);
+ if (is_cpu_exclusive(cs))
+ clear_bit(CS_CPU_EXCLUSIVE, &cs->flags);
+ }
+ if (!cpumask_and(cs->effective_cpus, parent->effective_cpus, cs->cpus_allowed))
+ cpumask_copy(cs->effective_cpus, parent->effective_cpus);
+}
+
+/*
+ * isolated_cpus_update - Update the isolated_cpus mask
+ * @old_prs: old partition_root_state
+ * @new_prs: new partition_root_state
+ * @xcpus: exclusive CPUs with state change
+ */
+static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus)
+{
+ WARN_ON_ONCE(old_prs == new_prs);
+ if (new_prs == PRS_ISOLATED)
+ cpumask_or(isolated_cpus, isolated_cpus, xcpus);
+ else
+ cpumask_andnot(isolated_cpus, isolated_cpus, xcpus);
+
+ isolated_cpus_updating = true;
+}
+
+/*
+ * partition_xcpus_add - Add new exclusive CPUs to partition
+ * @new_prs: new partition_root_state
+ * @parent: parent cpuset
+ * @xcpus: exclusive CPUs to be added
+ *
+ * Remote partition if parent == NULL
+ */
+static void partition_xcpus_add(int new_prs, struct cpuset *parent,
+ struct cpumask *xcpus)
+{
+ WARN_ON_ONCE(new_prs < 0);
+ lockdep_assert_held(&callback_lock);
+ if (!parent)
+ parent = &top_cpuset;
+
+
+ if (parent == &top_cpuset)
+ cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus);
+
+ if (new_prs != parent->partition_root_state)
+ isolated_cpus_update(parent->partition_root_state, new_prs,
+ xcpus);
+
+ cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus);
+}
+
+/*
+ * partition_xcpus_del - Remove exclusive CPUs from partition
+ * @old_prs: old partition_root_state
+ * @parent: parent cpuset
+ * @xcpus: exclusive CPUs to be removed
+ *
+ * Remote partition if parent == NULL
+ */
+static void partition_xcpus_del(int old_prs, struct cpuset *parent,
+ struct cpumask *xcpus)
+{
+ WARN_ON_ONCE(old_prs < 0);
+ lockdep_assert_held(&callback_lock);
+ if (!parent)
+ parent = &top_cpuset;
+
+ if (parent == &top_cpuset)
+ cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus);
+
+ if (old_prs != parent->partition_root_state)
+ isolated_cpus_update(old_prs, parent->partition_root_state,
+ xcpus);
+
+ cpumask_and(xcpus, xcpus, cpu_active_mask);
+ cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus);
+}
+
+/*
+ * isolated_cpus_can_update - check for isolated & nohz_full conflicts
+ * @add_cpus: cpu mask for cpus that are going to be isolated
+ * @del_cpus: cpu mask for cpus that are no longer isolated, can be NULL
+ * Return: false if there is conflict, true otherwise
+ *
+ * If nohz_full is enabled and we have isolated CPUs, their combination must
+ * still leave housekeeping CPUs.
+ *
+ * TBD: Should consider merging this function into
+ * prstate_housekeeping_conflict().
+ */
+static bool isolated_cpus_can_update(struct cpumask *add_cpus,
+ struct cpumask *del_cpus)
+{
+ cpumask_var_t full_hk_cpus;
+ int res = true;
+
+ if (!housekeeping_enabled(HK_TYPE_KERNEL_NOISE))
+ return true;
+
+ if (del_cpus && cpumask_weight_and(del_cpus,
+ housekeeping_cpumask(HK_TYPE_KERNEL_NOISE)))
+ return true;
+
+ if (!alloc_cpumask_var(&full_hk_cpus, GFP_KERNEL))
+ return false;
+
+ cpumask_and(full_hk_cpus, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE),
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
+ cpumask_andnot(full_hk_cpus, full_hk_cpus, isolated_cpus);
+ cpumask_and(full_hk_cpus, full_hk_cpus, cpu_active_mask);
+ if (!cpumask_weight_andnot(full_hk_cpus, add_cpus))
+ res = false;
+
+ free_cpumask_var(full_hk_cpus);
+ return res;
+}
+
+/*
+ * prstate_housekeeping_conflict - check for partition & housekeeping conflicts
+ * @prstate: partition root state to be checked
+ * @new_cpus: cpu mask
+ * Return: true if there is conflict, false otherwise
+ *
+ * CPUs outside of boot_hk_cpus, if defined, can only be used in an
+ * isolated partition.
+ */
+static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
+{
+ if (!have_boot_isolcpus)
+ return false;
+
+ if ((prstate != PRS_ISOLATED) && !cpumask_subset(new_cpus, boot_hk_cpus))
+ return true;
+
+ return false;
+}
+
+/*
+ * update_isolation_cpumasks - Update external isolation related CPU masks
+ *
+ * The following external CPU masks will be updated if necessary:
+ * - workqueue unbound cpumask
+ */
+static void update_isolation_cpumasks(void)
+{
+ int ret;
+
+ if (!isolated_cpus_updating)
+ return;
+
+ lockdep_assert_cpus_held();
+
+ ret = workqueue_unbound_exclude_cpumask(isolated_cpus);
+ WARN_ON_ONCE(ret < 0);
+
+ ret = tmigr_isolated_exclude_cpumask(isolated_cpus);
+ WARN_ON_ONCE(ret < 0);
+
+ isolated_cpus_updating = false;
+}
/**
- * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
- * @cpuset: The cpuset that requests change in partition root state
- * @cmd: Partition root state change command
- * @newmask: Optional new cpumask for partcmd_update
- * @tmp: Temporary addmask and delmask
- * Return: 0, 1 or an error code
- *
- * For partcmd_enable, the cpuset is being transformed from a non-partition
- * root to a partition root. The cpus_allowed mask of the given cpuset will
- * be put into parent's subparts_cpus and taken away from parent's
- * effective_cpus. The function will return 0 if all the CPUs listed in
- * cpus_allowed can be granted or an error code will be returned.
- *
- * For partcmd_disable, the cpuset is being transofrmed from a partition
- * root back to a non-partition root. Any CPUs in cpus_allowed that are in
- * parent's subparts_cpus will be taken away from that cpumask and put back
- * into parent's effective_cpus. 0 should always be returned.
- *
- * For partcmd_update, if the optional newmask is specified, the cpu
- * list is to be changed from cpus_allowed to newmask. Otherwise,
- * cpus_allowed is assumed to remain the same. The cpuset should either
- * be a partition root or an invalid partition root. The partition root
- * state may change if newmask is NULL and none of the requested CPUs can
- * be granted by the parent. The function will return 1 if changes to
- * parent's subparts_cpus and effective_cpus happen or 0 otherwise.
- * Error code should only be returned when newmask is non-NULL.
- *
- * The partcmd_enable and partcmd_disable commands are used by
- * update_prstate(). The partcmd_update command is used by
- * update_cpumasks_hier() with newmask NULL and update_cpumask() with
- * newmask set.
- *
- * The checking is more strict when enabling partition root than the
- * other two commands.
- *
- * Because of the implicit cpu exclusive nature of a partition root,
- * cpumask changes that violates the cpu exclusivity rule will not be
- * permitted when checked by validate_change(). The validate_change()
- * function will also prevent any changes to the cpu list if it is not
- * a superset of children's cpu lists.
- */
-static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
- struct cpumask *newmask,
- struct tmpmasks *tmp)
-{
- struct cpuset *parent = parent_cs(cpuset);
- int adding; /* Moving cpus from effective_cpus to subparts_cpus */
- int deleting; /* Moving cpus from subparts_cpus to effective_cpus */
- int old_prs, new_prs;
- bool part_error = false; /* Partition error? */
+ * cpuset_cpu_is_isolated - Check if the given CPU is isolated
+ * @cpu: the CPU number to be checked
+ * Return: true if CPU is used in an isolated partition, false otherwise
+ */
+bool cpuset_cpu_is_isolated(int cpu)
+{
+ return cpumask_test_cpu(cpu, isolated_cpus);
+}
+EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);
- percpu_rwsem_assert_held(&cpuset_rwsem);
+/**
+ * rm_siblings_excl_cpus - Remove exclusive CPUs that are used by sibling cpusets
+ * @parent: Parent cpuset containing all siblings
+ * @cs: Current cpuset (will be skipped)
+ * @excpus: exclusive effective CPU mask to modify
+ *
+ * This function ensures the given @excpus mask doesn't include any CPUs that
+ * are exclusively allocated to sibling cpusets. It walks through all siblings
+ * of @cs under @parent and removes their exclusive CPUs from @excpus.
+ */
+static int rm_siblings_excl_cpus(struct cpuset *parent, struct cpuset *cs,
+ struct cpumask *excpus)
+{
+ struct cgroup_subsys_state *css;
+ struct cpuset *sibling;
+ int retval = 0;
+
+ if (cpumask_empty(excpus))
+ return retval;
/*
- * The parent must be a partition root.
- * The new cpumask, if present, or the current cpus_allowed must
- * not be empty.
+ * Exclude exclusive CPUs from siblings
*/
- if (!is_partition_root(parent) ||
- (newmask && cpumask_empty(newmask)) ||
- (!newmask && cpumask_empty(cpuset->cpus_allowed)))
- return -EINVAL;
+ rcu_read_lock();
+ cpuset_for_each_child(sibling, css, parent) {
+ if (sibling == cs)
+ continue;
+
+ if (cpumask_intersects(excpus, sibling->exclusive_cpus)) {
+ cpumask_andnot(excpus, excpus, sibling->exclusive_cpus);
+ retval++;
+ continue;
+ }
+ if (cpumask_intersects(excpus, sibling->effective_xcpus)) {
+ cpumask_andnot(excpus, excpus, sibling->effective_xcpus);
+ retval++;
+ }
+ }
+ rcu_read_unlock();
+
+ return retval;
+}
+
+/*
+ * compute_excpus - compute effective exclusive CPUs
+ * @cs: cpuset
+ * @xcpus: effective exclusive CPUs value to be set
+ * Return: 0 if there is no sibling conflict, > 0 otherwise
+ *
+ * If exclusive_cpus isn't explicitly set , we have to scan the sibling cpusets
+ * and exclude their exclusive_cpus or effective_xcpus as well.
+ */
+static int compute_excpus(struct cpuset *cs, struct cpumask *excpus)
+{
+ struct cpuset *parent = parent_cs(cs);
+ cpumask_and(excpus, user_xcpus(cs), parent->effective_xcpus);
+
+ if (!cpumask_empty(cs->exclusive_cpus))
+ return 0;
+
+ return rm_siblings_excl_cpus(parent, cs, excpus);
+}
+
+/*
+ * compute_trialcs_excpus - Compute effective exclusive CPUs for a trial cpuset
+ * @trialcs: The trial cpuset containing the proposed new configuration
+ * @cs: The original cpuset that the trial configuration is based on
+ * Return: 0 if successful with no sibling conflict, >0 if a conflict is found
+ *
+ * Computes the effective_xcpus for a trial configuration. @cs is provided to represent
+ * the real cs.
+ */
+static int compute_trialcs_excpus(struct cpuset *trialcs, struct cpuset *cs)
+{
+ struct cpuset *parent = parent_cs(trialcs);
+ struct cpumask *excpus = trialcs->effective_xcpus;
+
+ /* trialcs is member, cpuset.cpus has no impact to excpus */
+ if (cs_is_member(cs))
+ cpumask_and(excpus, trialcs->exclusive_cpus,
+ parent->effective_xcpus);
+ else
+ cpumask_and(excpus, user_xcpus(trialcs), parent->effective_xcpus);
+
+ return rm_siblings_excl_cpus(parent, cs, excpus);
+}
+
+static inline bool is_remote_partition(struct cpuset *cs)
+{
+ return cs->remote_partition;
+}
+
+static inline bool is_local_partition(struct cpuset *cs)
+{
+ return is_partition_valid(cs) && !is_remote_partition(cs);
+}
+
+/*
+ * remote_partition_enable - Enable current cpuset as a remote partition root
+ * @cs: the cpuset to update
+ * @new_prs: new partition_root_state
+ * @tmp: temporary masks
+ * Return: 0 if successful, errcode if error
+ *
+ * Enable the current cpuset to become a remote partition root taking CPUs
+ * directly from the top cpuset. cpuset_mutex must be held by the caller.
+ */
+static int remote_partition_enable(struct cpuset *cs, int new_prs,
+ struct tmpmasks *tmp)
+{
/*
- * Enabling/disabling partition root is not allowed if there are
- * online children.
+ * The user must have sysadmin privilege.
*/
- if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css))
- return -EBUSY;
+ if (!capable(CAP_SYS_ADMIN))
+ return PERR_ACCESS;
/*
- * Enabling partition root is not allowed if not all the CPUs
- * can be granted from parent's effective_cpus or at least one
- * CPU will be left after that.
+ * The requested exclusive_cpus must not be allocated to other
+ * partitions and it can't use up all the root's effective_cpus.
+ *
+ * The effective_xcpus mask can contain offline CPUs, but there must
+ * be at least one or more online CPUs present before it can be enabled.
+ *
+ * Note that creating a remote partition with any local partition root
+ * above it or remote partition root underneath it is not allowed.
*/
- if ((cmd == partcmd_enable) &&
- (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) ||
- cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus)))
- return -EINVAL;
+ compute_excpus(cs, tmp->new_cpus);
+ WARN_ON_ONCE(cpumask_intersects(tmp->new_cpus, subpartitions_cpus));
+ if (!cpumask_intersects(tmp->new_cpus, cpu_active_mask) ||
+ cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus))
+ return PERR_INVCPUS;
+ if (((new_prs == PRS_ISOLATED) &&
+ !isolated_cpus_can_update(tmp->new_cpus, NULL)) ||
+ prstate_housekeeping_conflict(new_prs, tmp->new_cpus))
+ return PERR_HKEEPING;
+
+ spin_lock_irq(&callback_lock);
+ partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
+ cs->remote_partition = true;
+ cpumask_copy(cs->effective_xcpus, tmp->new_cpus);
+ spin_unlock_irq(&callback_lock);
+ update_isolation_cpumasks();
+ cpuset_force_rebuild();
+ cs->prs_err = 0;
+
+ /*
+ * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
+ */
+ cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
+ update_sibling_cpumasks(&top_cpuset, NULL, tmp);
+ return 0;
+}
+
+/*
+ * remote_partition_disable - Remove current cpuset from remote partition list
+ * @cs: the cpuset to update
+ * @tmp: temporary masks
+ *
+ * The effective_cpus is also updated.
+ *
+ * cpuset_mutex must be held by the caller.
+ */
+static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
+{
+ WARN_ON_ONCE(!is_remote_partition(cs));
+ WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus));
+
+ spin_lock_irq(&callback_lock);
+ cs->remote_partition = false;
+ partition_xcpus_del(cs->partition_root_state, NULL, cs->effective_xcpus);
+ if (cs->prs_err)
+ cs->partition_root_state = -cs->partition_root_state;
+ else
+ cs->partition_root_state = PRS_MEMBER;
+ /* effective_xcpus may need to be changed */
+ compute_excpus(cs, cs->effective_xcpus);
+ reset_partition_data(cs);
+ spin_unlock_irq(&callback_lock);
+ update_isolation_cpumasks();
+ cpuset_force_rebuild();
+
+ /*
+ * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
+ */
+ cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
+ update_sibling_cpumasks(&top_cpuset, NULL, tmp);
+}
+
+/*
+ * remote_cpus_update - cpus_exclusive change of remote partition
+ * @cs: the cpuset to be updated
+ * @xcpus: the new exclusive_cpus mask, if non-NULL
+ * @excpus: the new effective_xcpus mask
+ * @tmp: temporary masks
+ *
+ * top_cpuset and subpartitions_cpus will be updated or partition can be
+ * invalidated.
+ */
+static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus,
+ struct cpumask *excpus, struct tmpmasks *tmp)
+{
+ bool adding, deleting;
+ int prs = cs->partition_root_state;
+
+ if (WARN_ON_ONCE(!is_remote_partition(cs)))
+ return;
+
+ WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus));
+
+ if (cpumask_empty(excpus)) {
+ cs->prs_err = PERR_CPUSEMPTY;
+ goto invalidate;
+ }
+
+ adding = cpumask_andnot(tmp->addmask, excpus, cs->effective_xcpus);
+ deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, excpus);
+
+ /*
+ * Additions of remote CPUs is only allowed if those CPUs are
+ * not allocated to other partitions and there are effective_cpus
+ * left in the top cpuset.
+ */
+ if (adding) {
+ WARN_ON_ONCE(cpumask_intersects(tmp->addmask, subpartitions_cpus));
+ if (!capable(CAP_SYS_ADMIN))
+ cs->prs_err = PERR_ACCESS;
+ else if (cpumask_intersects(tmp->addmask, subpartitions_cpus) ||
+ cpumask_subset(top_cpuset.effective_cpus, tmp->addmask))
+ cs->prs_err = PERR_NOCPUS;
+ else if ((prs == PRS_ISOLATED) &&
+ !isolated_cpus_can_update(tmp->addmask, tmp->delmask))
+ cs->prs_err = PERR_HKEEPING;
+ if (cs->prs_err)
+ goto invalidate;
+ }
+
+ spin_lock_irq(&callback_lock);
+ if (adding)
+ partition_xcpus_add(prs, NULL, tmp->addmask);
+ if (deleting)
+ partition_xcpus_del(prs, NULL, tmp->delmask);
/*
- * A cpumask update cannot make parent's effective_cpus become empty.
+ * Need to update effective_xcpus and exclusive_cpus now as
+ * update_sibling_cpumasks() below may iterate back to the same cs.
+ */
+ cpumask_copy(cs->effective_xcpus, excpus);
+ if (xcpus)
+ cpumask_copy(cs->exclusive_cpus, xcpus);
+ spin_unlock_irq(&callback_lock);
+ update_isolation_cpumasks();
+ if (adding || deleting)
+ cpuset_force_rebuild();
+
+ /*
+ * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
+ */
+ cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
+ update_sibling_cpumasks(&top_cpuset, NULL, tmp);
+ return;
+
+invalidate:
+ remote_partition_disable(cs, tmp);
+}
+
+/**
+ * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset
+ * @cs: The cpuset that requests change in partition root state
+ * @cmd: Partition root state change command
+ * @newmask: Optional new cpumask for partcmd_update
+ * @tmp: Temporary addmask and delmask
+ * Return: 0 or a partition root state error code
+ *
+ * For partcmd_enable*, the cpuset is being transformed from a non-partition
+ * root to a partition root. The effective_xcpus (cpus_allowed if
+ * effective_xcpus not set) mask of the given cpuset will be taken away from
+ * parent's effective_cpus. The function will return 0 if all the CPUs listed
+ * in effective_xcpus can be granted or an error code will be returned.
+ *
+ * For partcmd_disable, the cpuset is being transformed from a partition
+ * root back to a non-partition root. Any CPUs in effective_xcpus will be
+ * given back to parent's effective_cpus. 0 will always be returned.
+ *
+ * For partcmd_update, if the optional newmask is specified, the cpu list is
+ * to be changed from effective_xcpus to newmask. Otherwise, effective_xcpus is
+ * assumed to remain the same. The cpuset should either be a valid or invalid
+ * partition root. The partition root state may change from valid to invalid
+ * or vice versa. An error code will be returned if transitioning from
+ * invalid to valid violates the exclusivity rule.
+ *
+ * For partcmd_invalidate, the current partition will be made invalid.
+ *
+ * The partcmd_enable* and partcmd_disable commands are used by
+ * update_prstate(). An error code may be returned and the caller will check
+ * for error.
+ *
+ * The partcmd_update command is used by update_cpumasks_hier() with newmask
+ * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used
+ * by update_cpumask() with NULL newmask. In both cases, the callers won't
+ * check for error and so partition_root_state and prs_err will be updated
+ * directly.
+ */
+static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
+ struct cpumask *newmask,
+ struct tmpmasks *tmp)
+{
+ struct cpuset *parent = parent_cs(cs);
+ int adding; /* Adding cpus to parent's effective_cpus */
+ int deleting; /* Deleting cpus from parent's effective_cpus */
+ int old_prs, new_prs;
+ int part_error = PERR_NONE; /* Partition error? */
+ struct cpumask *xcpus = user_xcpus(cs);
+ int parent_prs = parent->partition_root_state;
+ bool nocpu;
+
+ lockdep_assert_held(&cpuset_mutex);
+ WARN_ON_ONCE(is_remote_partition(cs)); /* For local partition only */
+
+ /*
+ * new_prs will only be changed for the partcmd_update and
+ * partcmd_invalidate commands.
*/
adding = deleting = false;
- old_prs = new_prs = cpuset->partition_root_state;
- if (cmd == partcmd_enable) {
- cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
- adding = true;
+ old_prs = new_prs = cs->partition_root_state;
+
+ if (cmd == partcmd_invalidate) {
+ if (is_partition_invalid(cs))
+ return 0;
+
+ /*
+ * Make the current partition invalid.
+ */
+ if (is_partition_valid(parent))
+ adding = cpumask_and(tmp->addmask,
+ xcpus, parent->effective_xcpus);
+ if (old_prs > 0)
+ new_prs = -old_prs;
+
+ goto write_error;
+ }
+
+ /*
+ * The parent must be a partition root.
+ * The new cpumask, if present, or the current cpus_allowed must
+ * not be empty.
+ */
+ if (!is_partition_valid(parent)) {
+ return is_partition_invalid(parent)
+ ? PERR_INVPARENT : PERR_NOTPART;
+ }
+ if (!newmask && xcpus_empty(cs))
+ return PERR_CPUSEMPTY;
+
+ nocpu = tasks_nocpu_error(parent, cs, xcpus);
+
+ if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) {
+ /*
+ * Need to call compute_excpus() in case
+ * exclusive_cpus not set. Sibling conflict should only happen
+ * if exclusive_cpus isn't set.
+ */
+ xcpus = tmp->delmask;
+ if (compute_excpus(cs, xcpus))
+ WARN_ON_ONCE(!cpumask_empty(cs->exclusive_cpus));
+ new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
+
+ /*
+ * Enabling partition root is not allowed if its
+ * effective_xcpus is empty.
+ */
+ if (cpumask_empty(xcpus))
+ return PERR_INVCPUS;
+
+ if (prstate_housekeeping_conflict(new_prs, xcpus))
+ return PERR_HKEEPING;
+
+ if ((new_prs == PRS_ISOLATED) && (new_prs != parent_prs) &&
+ !isolated_cpus_can_update(xcpus, NULL))
+ return PERR_HKEEPING;
+
+ if (tasks_nocpu_error(parent, cs, xcpus))
+ return PERR_NOCPUS;
+
+ /*
+ * This function will only be called when all the preliminary
+ * checks have passed. At this point, the following condition
+ * should hold.
+ *
+ * (cs->effective_xcpus & cpu_active_mask) ⊆ parent->effective_cpus
+ *
+ * Warn if it is not the case.
+ */
+ cpumask_and(tmp->new_cpus, xcpus, cpu_active_mask);
+ WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus));
+
+ deleting = true;
} else if (cmd == partcmd_disable) {
- deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
- parent->subparts_cpus);
+ /*
+ * May need to add cpus back to parent's effective_cpus
+ * (and maybe removed from subpartitions_cpus/isolated_cpus)
+ * for valid partition root. xcpus may contain CPUs that
+ * shouldn't be removed from the two global cpumasks.
+ */
+ if (is_partition_valid(cs)) {
+ cpumask_copy(tmp->addmask, cs->effective_xcpus);
+ adding = true;
+ }
+ new_prs = PRS_MEMBER;
} else if (newmask) {
/*
+ * Empty cpumask is not allowed
+ */
+ if (cpumask_empty(newmask)) {
+ part_error = PERR_CPUSEMPTY;
+ goto write_error;
+ }
+
+ /* Check newmask again, whether cpus are available for parent/cs */
+ nocpu |= tasks_nocpu_error(parent, cs, newmask);
+
+ /*
* partcmd_update with newmask:
*
- * delmask = cpus_allowed & ~newmask & parent->subparts_cpus
- * addmask = newmask & parent->effective_cpus
- * & ~parent->subparts_cpus
+ * Compute add/delete mask to/from effective_cpus
+ *
+ * For valid partition:
+ * addmask = exclusive_cpus & ~newmask
+ * & parent->effective_xcpus
+ * delmask = newmask & ~exclusive_cpus
+ * & parent->effective_xcpus
+ *
+ * For invalid partition:
+ * delmask = newmask & parent->effective_xcpus
+ * The partition may become valid soon.
*/
- cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask);
- deleting = cpumask_and(tmp->delmask, tmp->delmask,
- parent->subparts_cpus);
+ if (is_partition_invalid(cs)) {
+ adding = false;
+ deleting = cpumask_and(tmp->delmask,
+ newmask, parent->effective_xcpus);
+ } else {
+ cpumask_andnot(tmp->addmask, xcpus, newmask);
+ adding = cpumask_and(tmp->addmask, tmp->addmask,
+ parent->effective_xcpus);
+
+ cpumask_andnot(tmp->delmask, newmask, xcpus);
+ deleting = cpumask_and(tmp->delmask, tmp->delmask,
+ parent->effective_xcpus);
+ }
- cpumask_and(tmp->addmask, newmask, parent->effective_cpus);
- adding = cpumask_andnot(tmp->addmask, tmp->addmask,
- parent->subparts_cpus);
/*
- * Return error if the new effective_cpus could become empty.
+ * TBD: Invalidate a currently valid child root partition may
+ * still break isolated_cpus_can_update() rule if parent is an
+ * isolated partition.
*/
- if (adding &&
- cpumask_equal(parent->effective_cpus, tmp->addmask)) {
- if (!deleting)
- return -EINVAL;
- /*
- * As some of the CPUs in subparts_cpus might have
- * been offlined, we need to compute the real delmask
- * to confirm that.
- */
- if (!cpumask_and(tmp->addmask, tmp->delmask,
- cpu_active_mask))
- return -EINVAL;
- cpumask_copy(tmp->addmask, parent->effective_cpus);
+ if (is_partition_valid(cs) && (old_prs != parent_prs)) {
+ if ((parent_prs == PRS_ROOT) &&
+ /* Adding to parent means removing isolated CPUs */
+ !isolated_cpus_can_update(tmp->delmask, tmp->addmask))
+ part_error = PERR_HKEEPING;
+ if ((parent_prs == PRS_ISOLATED) &&
+ /* Adding to parent means adding isolated CPUs */
+ !isolated_cpus_can_update(tmp->addmask, tmp->delmask))
+ part_error = PERR_HKEEPING;
+ }
+
+ /*
+ * The new CPUs to be removed from parent's effective CPUs
+ * must be present.
+ */
+ if (deleting) {
+ cpumask_and(tmp->new_cpus, tmp->delmask, cpu_active_mask);
+ WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus));
+ }
+
+ /*
+ * Make partition invalid if parent's effective_cpus could
+ * become empty and there are tasks in the parent.
+ */
+ if (nocpu && (!adding ||
+ !cpumask_intersects(tmp->addmask, cpu_active_mask))) {
+ part_error = PERR_NOCPUS;
+ deleting = false;
+ adding = cpumask_and(tmp->addmask,
+ xcpus, parent->effective_xcpus);
}
} else {
/*
- * partcmd_update w/o newmask:
+ * partcmd_update w/o newmask
*
- * addmask = cpus_allowed & parent->effective_cpus
+ * delmask = effective_xcpus & parent->effective_cpus
*
- * Note that parent's subparts_cpus may have been
- * pre-shrunk in case there is a change in the cpu list.
- * So no deletion is needed.
+ * This can be called from:
+ * 1) update_cpumasks_hier()
+ * 2) cpuset_hotplug_update_tasks()
+ *
+ * Check to see if it can be transitioned from valid to
+ * invalid partition or vice versa.
+ *
+ * A partition error happens when parent has tasks and all
+ * its effective CPUs will have to be distributed out.
*/
- adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
- parent->effective_cpus);
- part_error = cpumask_equal(tmp->addmask,
- parent->effective_cpus);
+ if (nocpu) {
+ part_error = PERR_NOCPUS;
+ if (is_partition_valid(cs))
+ adding = cpumask_and(tmp->addmask,
+ xcpus, parent->effective_xcpus);
+ } else if (is_partition_invalid(cs) && !cpumask_empty(xcpus) &&
+ cpumask_subset(xcpus, parent->effective_xcpus)) {
+ struct cgroup_subsys_state *css;
+ struct cpuset *child;
+ bool exclusive = true;
+
+ /*
+ * Convert invalid partition to valid has to
+ * pass the cpu exclusivity test.
+ */
+ rcu_read_lock();
+ cpuset_for_each_child(child, css, parent) {
+ if (child == cs)
+ continue;
+ if (!cpusets_are_exclusive(cs, child)) {
+ exclusive = false;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ if (exclusive)
+ deleting = cpumask_and(tmp->delmask,
+ xcpus, parent->effective_cpus);
+ else
+ part_error = PERR_NOTEXCL;
+ }
}
- if (cmd == partcmd_update) {
- int prev_prs = cpuset->partition_root_state;
+write_error:
+ if (part_error)
+ WRITE_ONCE(cs->prs_err, part_error);
+ if (cmd == partcmd_update) {
/*
- * Check for possible transition between PRS_ENABLED
- * and PRS_ERROR.
+ * Check for possible transition between valid and invalid
+ * partition root.
*/
- switch (cpuset->partition_root_state) {
- case PRS_ENABLED:
+ switch (cs->partition_root_state) {
+ case PRS_ROOT:
+ case PRS_ISOLATED:
if (part_error)
- new_prs = PRS_ERROR;
+ new_prs = -old_prs;
break;
- case PRS_ERROR:
+ case PRS_INVALID_ROOT:
+ case PRS_INVALID_ISOLATED:
if (!part_error)
- new_prs = PRS_ENABLED;
+ new_prs = -old_prs;
break;
}
- /*
- * Set part_error if previously in invalid state.
- */
- part_error = (prev_prs == PRS_ERROR);
- }
-
- if (!part_error && (new_prs == PRS_ERROR))
- return 0; /* Nothing need to be done */
-
- if (new_prs == PRS_ERROR) {
- /*
- * Remove all its cpus from parent's subparts_cpus.
- */
- adding = false;
- deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
- parent->subparts_cpus);
}
if (!adding && !deleting && (new_prs == old_prs))
return 0;
/*
- * Change the parent's subparts_cpus.
+ * Transitioning between invalid to valid or vice versa may require
+ * changing CS_CPU_EXCLUSIVE. In the case of partcmd_update,
+ * validate_change() has already been successfully called and
+ * CPU lists in cs haven't been updated yet. So defer it to later.
+ */
+ if ((old_prs != new_prs) && (cmd != partcmd_update)) {
+ int err = update_partition_exclusive_flag(cs, new_prs);
+
+ if (err)
+ return err;
+ }
+
+ /*
+ * Change the parent's effective_cpus & effective_xcpus (top cpuset
+ * only).
+ *
* Newly added CPUs will be removed from effective_cpus and
* newly deleted ones will be added back to effective_cpus.
*/
spin_lock_irq(&callback_lock);
- if (adding) {
- cpumask_or(parent->subparts_cpus,
- parent->subparts_cpus, tmp->addmask);
- cpumask_andnot(parent->effective_cpus,
- parent->effective_cpus, tmp->addmask);
- }
- if (deleting) {
- cpumask_andnot(parent->subparts_cpus,
- parent->subparts_cpus, tmp->delmask);
- /*
- * Some of the CPUs in subparts_cpus might have been offlined.
- */
- cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask);
- cpumask_or(parent->effective_cpus,
- parent->effective_cpus, tmp->delmask);
+ if (old_prs != new_prs)
+ cs->partition_root_state = new_prs;
+
+ /*
+ * Adding to parent's effective_cpus means deletion CPUs from cs
+ * and vice versa.
+ */
+ if (adding)
+ partition_xcpus_del(old_prs, parent, tmp->addmask);
+ if (deleting)
+ partition_xcpus_add(new_prs, parent, tmp->delmask);
+
+ spin_unlock_irq(&callback_lock);
+ update_isolation_cpumasks();
+
+ if ((old_prs != new_prs) && (cmd == partcmd_update))
+ update_partition_exclusive_flag(cs, new_prs);
+
+ if (adding || deleting) {
+ cpuset_update_tasks_cpumask(parent, tmp->addmask);
+ update_sibling_cpumasks(parent, cs, tmp);
}
- parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
+ /*
+ * For partcmd_update without newmask, it is being called from
+ * cpuset_handle_hotplug(). Update the load balance flag and
+ * scheduling domain accordingly.
+ */
+ if ((cmd == partcmd_update) && !newmask)
+ update_partition_sd_lb(cs, old_prs);
- if (old_prs != new_prs)
- cpuset->partition_root_state = new_prs;
+ notify_partition_change(cs, old_prs);
+ return 0;
+}
- spin_unlock_irq(&callback_lock);
- notify_partition_change(cpuset, old_prs, new_prs);
+/**
+ * compute_partition_effective_cpumask - compute effective_cpus for partition
+ * @cs: partition root cpuset
+ * @new_ecpus: previously computed effective_cpus to be updated
+ *
+ * Compute the effective_cpus of a partition root by scanning effective_xcpus
+ * of child partition roots and excluding their effective_xcpus.
+ *
+ * This has the side effect of invalidating valid child partition roots,
+ * if necessary. Since it is called from either cpuset_hotplug_update_tasks()
+ * or update_cpumasks_hier() where parent and children are modified
+ * successively, we don't need to call update_parent_effective_cpumask()
+ * and the child's effective_cpus will be updated in later iterations.
+ *
+ * Note that rcu_read_lock() is assumed to be held.
+ */
+static void compute_partition_effective_cpumask(struct cpuset *cs,
+ struct cpumask *new_ecpus)
+{
+ struct cgroup_subsys_state *css;
+ struct cpuset *child;
+ bool populated = partition_is_populated(cs, NULL);
+
+ /*
+ * Check child partition roots to see if they should be
+ * invalidated when
+ * 1) child effective_xcpus not a subset of new
+ * excluisve_cpus
+ * 2) All the effective_cpus will be used up and cp
+ * has tasks
+ */
+ compute_excpus(cs, new_ecpus);
+ cpumask_and(new_ecpus, new_ecpus, cpu_active_mask);
- return cmd == partcmd_update;
+ rcu_read_lock();
+ cpuset_for_each_child(child, css, cs) {
+ if (!is_partition_valid(child))
+ continue;
+
+ /*
+ * There shouldn't be a remote partition underneath another
+ * partition root.
+ */
+ WARN_ON_ONCE(is_remote_partition(child));
+ child->prs_err = 0;
+ if (!cpumask_subset(child->effective_xcpus,
+ cs->effective_xcpus))
+ child->prs_err = PERR_INVCPUS;
+ else if (populated &&
+ cpumask_subset(new_ecpus, child->effective_xcpus))
+ child->prs_err = PERR_NOCPUS;
+
+ if (child->prs_err) {
+ int old_prs = child->partition_root_state;
+
+ /*
+ * Invalidate child partition
+ */
+ spin_lock_irq(&callback_lock);
+ make_partition_invalid(child);
+ spin_unlock_irq(&callback_lock);
+ notify_partition_change(child, old_prs);
+ continue;
+ }
+ cpumask_andnot(new_ecpus, new_ecpus,
+ child->effective_xcpus);
+ }
+ rcu_read_unlock();
}
/*
* update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
* @cs: the cpuset to consider
* @tmp: temp variables for calculating effective_cpus & partition setup
+ * @force: don't skip any descendant cpusets if set
*
* When configured cpumask is changed, the effective cpumasks of this cpuset
* and all its descendants need to be updated.
*
* On legacy hierarchy, effective_cpus will be the same with cpu_allowed.
*
- * Called with cpuset_rwsem held
+ * Called with cpuset_mutex held
*/
-static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
+static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
+ bool force)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
- bool need_rebuild_sched_domains = false;
int old_prs, new_prs;
rcu_read_lock();
cpuset_for_each_descendant_pre(cp, pos_css, cs) {
struct cpuset *parent = parent_cs(cp);
+ bool remote = is_remote_partition(cp);
+ bool update_parent = false;
- compute_effective_cpumask(tmp->new_cpus, cp, parent);
+ old_prs = new_prs = cp->partition_root_state;
/*
- * If it becomes empty, inherit the effective mask of the
- * parent, which is guaranteed to have some CPUs.
+ * For child remote partition root (!= cs), we need to call
+ * remote_cpus_update() if effective_xcpus will be changed.
+ * Otherwise, we can skip the whole subtree.
+ *
+ * remote_cpus_update() will reuse tmp->new_cpus only after
+ * its value is being processed.
*/
- if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
- cpumask_copy(tmp->new_cpus, parent->effective_cpus);
- if (!cp->use_parent_ecpus) {
- cp->use_parent_ecpus = true;
- parent->child_ecpus_count++;
+ if (remote && (cp != cs)) {
+ compute_excpus(cp, tmp->new_cpus);
+ if (cpumask_equal(cp->effective_xcpus, tmp->new_cpus)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
}
- } else if (cp->use_parent_ecpus) {
- cp->use_parent_ecpus = false;
- WARN_ON_ONCE(!parent->child_ecpus_count);
- parent->child_ecpus_count--;
+ rcu_read_unlock();
+ remote_cpus_update(cp, NULL, tmp->new_cpus, tmp);
+ rcu_read_lock();
+
+ /* Remote partition may be invalidated */
+ new_prs = cp->partition_root_state;
+ remote = (new_prs == old_prs);
+ }
+
+ if (remote || (is_partition_valid(parent) && is_partition_valid(cp)))
+ compute_partition_effective_cpumask(cp, tmp->new_cpus);
+ else
+ compute_effective_cpumask(tmp->new_cpus, cp, parent);
+
+ if (remote)
+ goto get_css; /* Ready to update cpuset data */
+
+ /*
+ * A partition with no effective_cpus is allowed as long as
+ * there is no task associated with it. Call
+ * update_parent_effective_cpumask() to check it.
+ */
+ if (is_partition_valid(cp) && cpumask_empty(tmp->new_cpus)) {
+ update_parent = true;
+ goto update_parent_effective;
}
/*
- * Skip the whole subtree if the cpumask remains the same
- * and has no partition root state.
+ * If it becomes empty, inherit the effective mask of the
+ * parent, which is guaranteed to have some CPUs unless
+ * it is a partition root that has explicitly distributed
+ * out all its CPUs.
*/
- if (!cp->partition_root_state &&
- cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
+ if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus))
+ cpumask_copy(tmp->new_cpus, parent->effective_cpus);
+
+ /*
+ * Skip the whole subtree if
+ * 1) the cpumask remains the same,
+ * 2) has no partition root state,
+ * 3) force flag not set, and
+ * 4) for v2 load balance state same as its parent.
+ */
+ if (!cp->partition_root_state && !force &&
+ cpumask_equal(tmp->new_cpus, cp->effective_cpus) &&
+ (!cpuset_v2() ||
+ (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) {
pos_css = css_rightmost_descendant(pos_css);
continue;
}
+update_parent_effective:
/*
- * update_parent_subparts_cpumask() should have been called
+ * update_parent_effective_cpumask() should have been called
* for cs already in update_cpumask(). We should also call
- * update_tasks_cpumask() again for tasks in the parent
- * cpuset if the parent's subparts_cpus changes.
+ * cpuset_update_tasks_cpumask() again for tasks in the parent
+ * cpuset if the parent's effective_cpus changes.
*/
- old_prs = new_prs = cp->partition_root_state;
if ((cp != cs) && old_prs) {
switch (parent->partition_root_state) {
- case PRS_DISABLED:
- /*
- * If parent is not a partition root or an
- * invalid partition root, clear its state
- * and its CS_CPU_EXCLUSIVE flag.
- */
- WARN_ON_ONCE(cp->partition_root_state
- != PRS_ERROR);
- new_prs = PRS_DISABLED;
-
- /*
- * clear_bit() is an atomic operation and
- * readers aren't interested in the state
- * of CS_CPU_EXCLUSIVE anyway. So we can
- * just update the flag without holding
- * the callback_lock.
- */
- clear_bit(CS_CPU_EXCLUSIVE, &cp->flags);
- break;
-
- case PRS_ENABLED:
- if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp))
- update_tasks_cpumask(parent);
+ case PRS_ROOT:
+ case PRS_ISOLATED:
+ update_parent = true;
break;
- case PRS_ERROR:
+ default:
/*
- * When parent is invalid, it has to be too.
+ * When parent is not a partition root or is
+ * invalid, child partition roots become
+ * invalid too.
*/
- new_prs = PRS_ERROR;
+ if (is_partition_valid(cp))
+ new_prs = -cp->partition_root_state;
+ WRITE_ONCE(cp->prs_err,
+ is_partition_invalid(parent)
+ ? PERR_INVPARENT : PERR_NOTPART);
break;
}
}
-
+get_css:
if (!css_tryget_online(&cp->css))
continue;
rcu_read_unlock();
- spin_lock_irq(&callback_lock);
-
- cpumask_copy(cp->effective_cpus, tmp->new_cpus);
- if (cp->nr_subparts_cpus && (new_prs != PRS_ENABLED)) {
- cp->nr_subparts_cpus = 0;
- cpumask_clear(cp->subparts_cpus);
- } else if (cp->nr_subparts_cpus) {
+ if (update_parent) {
+ update_parent_effective_cpumask(cp, partcmd_update, NULL, tmp);
/*
- * Make sure that effective_cpus & subparts_cpus
- * are mutually exclusive.
- *
- * In the unlikely event that effective_cpus
- * becomes empty. we clear cp->nr_subparts_cpus and
- * let its child partition roots to compete for
- * CPUs again.
+ * The cpuset partition_root_state may become
+ * invalid. Capture it.
*/
- cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
- cp->subparts_cpus);
- if (cpumask_empty(cp->effective_cpus)) {
- cpumask_copy(cp->effective_cpus, tmp->new_cpus);
- cpumask_clear(cp->subparts_cpus);
- cp->nr_subparts_cpus = 0;
- } else if (!cpumask_subset(cp->subparts_cpus,
- tmp->new_cpus)) {
- cpumask_andnot(cp->subparts_cpus,
- cp->subparts_cpus, tmp->new_cpus);
- cp->nr_subparts_cpus
- = cpumask_weight(cp->subparts_cpus);
- }
+ new_prs = cp->partition_root_state;
}
- if (new_prs != old_prs)
- cp->partition_root_state = new_prs;
+ spin_lock_irq(&callback_lock);
+ cpumask_copy(cp->effective_cpus, tmp->new_cpus);
+ cp->partition_root_state = new_prs;
+ if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs))
+ compute_excpus(cp, cp->effective_xcpus);
+ /*
+ * Make sure effective_xcpus is properly set for a valid
+ * partition root.
+ */
+ if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus))
+ cpumask_and(cp->effective_xcpus,
+ cp->cpus_allowed, parent->effective_xcpus);
+ else if (new_prs < 0)
+ reset_partition_data(cp);
spin_unlock_irq(&callback_lock);
- notify_partition_change(cp, old_prs, new_prs);
+
+ notify_partition_change(cp, old_prs);
WARN_ON(!is_in_v2_mode() &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
- update_tasks_cpumask(cp);
+ cpuset_update_tasks_cpumask(cp, cp->effective_cpus);
+
+ /*
+ * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE
+ * from parent if current cpuset isn't a valid partition root
+ * and their load balance states differ.
+ */
+ if (cpuset_v2() && !is_partition_valid(cp) &&
+ (is_sched_load_balance(parent) != is_sched_load_balance(cp))) {
+ if (is_sched_load_balance(parent))
+ set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
+ else
+ clear_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
+ }
/*
* On legacy hierarchy, if the effective cpumask of any non-
@@ -1497,17 +2369,13 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
*/
if (!cpumask_empty(cp->cpus_allowed) &&
is_sched_load_balance(cp) &&
- (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
- is_partition_root(cp)))
- need_rebuild_sched_domains = true;
+ (!cpuset_v2() || is_partition_valid(cp)))
+ cpuset_force_rebuild();
rcu_read_lock();
css_put(&cp->css);
}
rcu_read_unlock();
-
- if (need_rebuild_sched_domains)
- rebuild_sched_domains_locked();
}
/**
@@ -1522,23 +2390,165 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
struct cpuset *sibling;
struct cgroup_subsys_state *pos_css;
+ lockdep_assert_held(&cpuset_mutex);
+
/*
* Check all its siblings and call update_cpumasks_hier()
- * if their use_parent_ecpus flag is set in order for them
- * to use the right effective_cpus value.
+ * if their effective_cpus will need to be changed.
+ *
+ * It is possible a change in parent's effective_cpus
+ * due to a change in a child partition's effective_xcpus will impact
+ * its siblings even if they do not inherit parent's effective_cpus
+ * directly.
+ *
+ * The update_cpumasks_hier() function may sleep. So we have to
+ * release the RCU read lock before calling it.
*/
rcu_read_lock();
cpuset_for_each_child(sibling, pos_css, parent) {
if (sibling == cs)
continue;
- if (!sibling->use_parent_ecpus)
+ if (!is_partition_valid(sibling)) {
+ compute_effective_cpumask(tmp->new_cpus, sibling,
+ parent);
+ if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus))
+ continue;
+ } else if (is_remote_partition(sibling)) {
+ /*
+ * Change in a sibling cpuset won't affect a remote
+ * partition root.
+ */
continue;
+ }
- update_cpumasks_hier(sibling, tmp);
+ if (!css_tryget_online(&sibling->css))
+ continue;
+
+ rcu_read_unlock();
+ update_cpumasks_hier(sibling, tmp, false);
+ rcu_read_lock();
+ css_put(&sibling->css);
}
rcu_read_unlock();
}
+static int parse_cpuset_cpulist(const char *buf, struct cpumask *out_mask)
+{
+ int retval;
+
+ retval = cpulist_parse(buf, out_mask);
+ if (retval < 0)
+ return retval;
+ if (!cpumask_subset(out_mask, top_cpuset.cpus_allowed))
+ return -EINVAL;
+
+ return 0;
+}
+
+/**
+ * validate_partition - Validate a cpuset partition configuration
+ * @cs: The cpuset to validate
+ * @trialcs: The trial cpuset containing proposed configuration changes
+ *
+ * If any validation check fails, the appropriate error code is set in the
+ * cpuset's prs_err field.
+ *
+ * Return: PRS error code (0 if valid, non-zero error code if invalid)
+ */
+static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *trialcs)
+{
+ struct cpuset *parent = parent_cs(cs);
+
+ if (cs_is_member(trialcs))
+ return PERR_NONE;
+
+ if (cpumask_empty(trialcs->effective_xcpus))
+ return PERR_INVCPUS;
+
+ if (prstate_housekeeping_conflict(trialcs->partition_root_state,
+ trialcs->effective_xcpus))
+ return PERR_HKEEPING;
+
+ if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus))
+ return PERR_NOCPUS;
+
+ return PERR_NONE;
+}
+
+static int cpus_allowed_validate_change(struct cpuset *cs, struct cpuset *trialcs,
+ struct tmpmasks *tmp)
+{
+ int retval;
+ struct cpuset *parent = parent_cs(cs);
+
+ retval = validate_change(cs, trialcs);
+
+ if ((retval == -EINVAL) && cpuset_v2()) {
+ struct cgroup_subsys_state *css;
+ struct cpuset *cp;
+
+ /*
+ * The -EINVAL error code indicates that partition sibling
+ * CPU exclusivity rule has been violated. We still allow
+ * the cpumask change to proceed while invalidating the
+ * partition. However, any conflicting sibling partitions
+ * have to be marked as invalid too.
+ */
+ trialcs->prs_err = PERR_NOTEXCL;
+ rcu_read_lock();
+ cpuset_for_each_child(cp, css, parent) {
+ struct cpumask *xcpus = user_xcpus(trialcs);
+
+ if (is_partition_valid(cp) &&
+ cpumask_intersects(xcpus, cp->effective_xcpus)) {
+ rcu_read_unlock();
+ update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, tmp);
+ rcu_read_lock();
+ }
+ }
+ rcu_read_unlock();
+ retval = 0;
+ }
+ return retval;
+}
+
+/**
+ * partition_cpus_change - Handle partition state changes due to CPU mask updates
+ * @cs: The target cpuset being modified
+ * @trialcs: The trial cpuset containing proposed configuration changes
+ * @tmp: Temporary masks for intermediate calculations
+ *
+ * This function handles partition state transitions triggered by CPU mask changes.
+ * CPU modifications may cause a partition to be disabled or require state updates.
+ */
+static void partition_cpus_change(struct cpuset *cs, struct cpuset *trialcs,
+ struct tmpmasks *tmp)
+{
+ enum prs_errcode prs_err;
+
+ if (cs_is_member(cs))
+ return;
+
+ prs_err = validate_partition(cs, trialcs);
+ if (prs_err)
+ trialcs->prs_err = cs->prs_err = prs_err;
+
+ if (is_remote_partition(cs)) {
+ if (trialcs->prs_err)
+ remote_partition_disable(cs, tmp);
+ else
+ remote_cpus_update(cs, trialcs->exclusive_cpus,
+ trialcs->effective_xcpus, tmp);
+ } else {
+ if (trialcs->prs_err)
+ update_parent_effective_cpumask(cs, partcmd_invalidate,
+ NULL, tmp);
+ else
+ update_parent_effective_cpumask(cs, partcmd_update,
+ trialcs->effective_xcpus, tmp);
+ }
+}
+
/**
* update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
* @cs: the cpuset to consider
@@ -1550,81 +2560,120 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
{
int retval;
struct tmpmasks tmp;
+ bool force = false;
+ int old_prs = cs->partition_root_state;
- /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
- if (cs == &top_cpuset)
- return -EACCES;
-
- /*
- * An empty cpus_allowed is ok only if the cpuset has no tasks.
- * Since cpulist_parse() fails on an empty mask, we special case
- * that parsing. The validate_change() call ensures that cpusets
- * with tasks have cpus.
- */
- if (!*buf) {
- cpumask_clear(trialcs->cpus_allowed);
- } else {
- retval = cpulist_parse(buf, trialcs->cpus_allowed);
- if (retval < 0)
- return retval;
-
- if (!cpumask_subset(trialcs->cpus_allowed,
- top_cpuset.cpus_allowed))
- return -EINVAL;
- }
+ retval = parse_cpuset_cpulist(buf, trialcs->cpus_allowed);
+ if (retval < 0)
+ return retval;
/* Nothing to do if the cpus didn't change */
if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
return 0;
- retval = validate_change(cs, trialcs);
+ if (alloc_tmpmasks(&tmp))
+ return -ENOMEM;
+
+ compute_trialcs_excpus(trialcs, cs);
+ trialcs->prs_err = PERR_NONE;
+
+ retval = cpus_allowed_validate_change(cs, trialcs, &tmp);
if (retval < 0)
- return retval;
+ goto out_free;
-#ifdef CONFIG_CPUMASK_OFFSTACK
/*
- * Use the cpumasks in trialcs for tmpmasks when they are pointers
- * to allocated cpumasks.
+ * Check all the descendants in update_cpumasks_hier() if
+ * effective_xcpus is to be changed.
*/
- tmp.addmask = trialcs->subparts_cpus;
- tmp.delmask = trialcs->effective_cpus;
- tmp.new_cpus = trialcs->cpus_allowed;
-#endif
+ force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);
- if (cs->partition_root_state) {
- /* Cpumask of a partition root cannot be empty */
- if (cpumask_empty(trialcs->cpus_allowed))
- return -EINVAL;
- if (update_parent_subparts_cpumask(cs, partcmd_update,
- trialcs->cpus_allowed, &tmp) < 0)
- return -EINVAL;
- }
+ partition_cpus_change(cs, trialcs, &tmp);
spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
+ cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
+ if ((old_prs > 0) && !is_partition_valid(cs))
+ reset_partition_data(cs);
+ spin_unlock_irq(&callback_lock);
+
+ /* effective_cpus/effective_xcpus will be updated here */
+ update_cpumasks_hier(cs, &tmp, force);
+
+ /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
+ if (cs->partition_root_state)
+ update_partition_sd_lb(cs, old_prs);
+out_free:
+ free_tmpmasks(&tmp);
+ return retval;
+}
+
+/**
+ * update_exclusive_cpumask - update the exclusive_cpus mask of a cpuset
+ * @cs: the cpuset to consider
+ * @trialcs: trial cpuset
+ * @buf: buffer of cpu numbers written to this cpuset
+ *
+ * The tasks' cpumask will be updated if cs is a valid partition root.
+ */
+static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
+ const char *buf)
+{
+ int retval;
+ struct tmpmasks tmp;
+ bool force = false;
+ int old_prs = cs->partition_root_state;
+
+ retval = parse_cpuset_cpulist(buf, trialcs->exclusive_cpus);
+ if (retval < 0)
+ return retval;
+
+ /* Nothing to do if the CPUs didn't change */
+ if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus))
+ return 0;
/*
- * Make sure that subparts_cpus is a subset of cpus_allowed.
+ * Reject the change if there is exclusive CPUs conflict with
+ * the siblings.
*/
- if (cs->nr_subparts_cpus) {
- cpumask_andnot(cs->subparts_cpus, cs->subparts_cpus,
- cs->cpus_allowed);
- cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
- }
+ if (compute_trialcs_excpus(trialcs, cs))
+ return -EINVAL;
+
+ /*
+ * Check all the descendants in update_cpumasks_hier() if
+ * effective_xcpus is to be changed.
+ */
+ force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);
+
+ retval = validate_change(cs, trialcs);
+ if (retval)
+ return retval;
+
+ if (alloc_tmpmasks(&tmp))
+ return -ENOMEM;
+
+ trialcs->prs_err = PERR_NONE;
+ partition_cpus_change(cs, trialcs, &tmp);
+
+ spin_lock_irq(&callback_lock);
+ cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus);
+ cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
+ if ((old_prs > 0) && !is_partition_valid(cs))
+ reset_partition_data(cs);
spin_unlock_irq(&callback_lock);
- update_cpumasks_hier(cs, &tmp);
+ /*
+ * Call update_cpumasks_hier() to update effective_cpus/effective_xcpus
+ * of the subtree when it is a valid partition root or effective_xcpus
+ * is updated.
+ */
+ if (is_partition_valid(cs) || force)
+ update_cpumasks_hier(cs, &tmp, force);
- if (cs->partition_root_state) {
- struct cpuset *parent = parent_cs(cs);
+ /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
+ if (cs->partition_root_state)
+ update_partition_sd_lb(cs, old_prs);
- /*
- * For partition root, update the cpumasks of sibling
- * cpusets if they use parent's effective_cpus.
- */
- if (parent->child_ecpus_count)
- update_sibling_cpumasks(parent, cs, &tmp);
- }
+ free_tmpmasks(&tmp);
return 0;
}
@@ -1676,9 +2725,24 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
}
}
-static void cpuset_post_attach(void)
+static void flush_migrate_mm_task_workfn(struct callback_head *head)
{
flush_workqueue(cpuset_migrate_mm_wq);
+ kfree(head);
+}
+
+static void schedule_flush_migrate_mm(void)
+{
+ struct callback_head *flush_cb;
+
+ flush_cb = kzalloc(sizeof(struct callback_head), GFP_KERNEL);
+ if (!flush_cb)
+ return;
+
+ init_task_work(flush_cb, flush_migrate_mm_task_workfn);
+
+ if (task_work_add(current, flush_cb, TWA_RESUME))
+ kfree(flush_cb);
}
/*
@@ -1712,16 +2776,16 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
static void *cpuset_being_rebound;
/**
- * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
+ * cpuset_update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
* @cs: the cpuset in which each task's mems_allowed mask needs to be changed
*
* Iterate through each task of @cs updating its mems_allowed to the
- * effective cpuset's. As this function is called with cpuset_rwsem held,
+ * effective cpuset's. As this function is called with cpuset_mutex held,
* cpuset membership stays stable.
*/
-static void update_tasks_nodemask(struct cpuset *cs)
+void cpuset_update_tasks_nodemask(struct cpuset *cs)
{
- static nodemask_t newmems; /* protected by cpuset_rwsem */
+ static nodemask_t newmems; /* protected by cpuset_mutex */
struct css_task_iter it;
struct task_struct *task;
@@ -1734,7 +2798,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
* take while holding tasklist_lock. Forks can happen - the
* mpol_dup() cpuset_being_rebound check will catch such forks,
* and rebind their vma mempolicies too. Because we still hold
- * the global cpuset_rwsem, we know that no other rebind effort
+ * the global cpuset_mutex, we know that no other rebind effort
* will be contending for the global variable cpuset_being_rebound.
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes.
@@ -1780,7 +2844,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
*
* On legacy hierarchy, effective_mems will be the same with mems_allowed.
*
- * Called with cpuset_rwsem held
+ * Called with cpuset_mutex held
*/
static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
{
@@ -1817,7 +2881,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
WARN_ON(!is_in_v2_mode() &&
!nodes_equal(cp->mems_allowed, cp->effective_mems));
- update_tasks_nodemask(cp);
+ cpuset_update_tasks_nodemask(cp);
rcu_read_lock();
css_put(&cp->css);
@@ -1833,7 +2897,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
* mempolicies and if the cpuset is marked 'memory_migrate',
* migrate the tasks pages to the new memory.
*
- * Call with cpuset_rwsem held. May take callback_lock during call.
+ * Call with cpuset_mutex held. May take callback_lock during call.
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
* lock each such tasks mm->mmap_lock, scan its vma's and rebind
* their mempolicies to the cpusets new mems_allowed.
@@ -1844,41 +2908,24 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
int retval;
/*
- * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
- * it's read-only
- */
- if (cs == &top_cpuset) {
- retval = -EACCES;
- goto done;
- }
-
- /*
* An empty mems_allowed is ok iff there are no tasks in the cpuset.
- * Since nodelist_parse() fails on an empty mask, we special case
- * that parsing. The validate_change() call ensures that cpusets
- * with tasks have memory.
+ * The validate_change() call ensures that cpusets with tasks have memory.
*/
- if (!*buf) {
- nodes_clear(trialcs->mems_allowed);
- } else {
- retval = nodelist_parse(buf, trialcs->mems_allowed);
- if (retval < 0)
- goto done;
+ retval = nodelist_parse(buf, trialcs->mems_allowed);
+ if (retval < 0)
+ return retval;
- if (!nodes_subset(trialcs->mems_allowed,
- top_cpuset.mems_allowed)) {
- retval = -EINVAL;
- goto done;
- }
- }
+ if (!nodes_subset(trialcs->mems_allowed,
+ top_cpuset.mems_allowed))
+ return -EINVAL;
+
+ /* No change? nothing to do */
+ if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed))
+ return 0;
- if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
- retval = 0; /* Too easy - nothing to do */
- goto done;
- }
retval = validate_change(cs, trialcs);
if (retval < 0)
- goto done;
+ return retval;
check_insane_mems_config(&trialcs->mems_allowed);
@@ -1888,8 +2935,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
/* use trialcs->mems_allowed as a temp variable */
update_nodemasks_hier(cs, &trialcs->mems_allowed);
-done:
- return retval;
+ return 0;
}
bool current_cpuset_is_being_rebound(void)
@@ -1903,52 +2949,16 @@ bool current_cpuset_is_being_rebound(void)
return ret;
}
-static int update_relax_domain_level(struct cpuset *cs, s64 val)
-{
-#ifdef CONFIG_SMP
- if (val < -1 || val >= sched_domain_level_max)
- return -EINVAL;
-#endif
-
- if (val != cs->relax_domain_level) {
- cs->relax_domain_level = val;
- if (!cpumask_empty(cs->cpus_allowed) &&
- is_sched_load_balance(cs))
- rebuild_sched_domains_locked();
- }
-
- return 0;
-}
-
-/**
- * update_tasks_flags - update the spread flags of tasks in the cpuset.
- * @cs: the cpuset in which each task's spread flags needs to be changed
- *
- * Iterate through each task of @cs updating its spread flags. As this
- * function is called with cpuset_rwsem held, cpuset membership stays
- * stable.
- */
-static void update_tasks_flags(struct cpuset *cs)
-{
- struct css_task_iter it;
- struct task_struct *task;
-
- css_task_iter_start(&cs->css, 0, &it);
- while ((task = css_task_iter_next(&it)))
- cpuset_update_task_spread_flag(cs, task);
- css_task_iter_end(&it);
-}
-
/*
- * update_flag - read a 0 or a 1 in a file and update associated flag
+ * cpuset_update_flag - read a 0 or a 1 in a file and update associated flag
* bit: the bit to update (see cpuset_flagbits_t)
* cs: the cpuset to update
* turning_on: whether the flag is being set or cleared
*
- * Call with cpuset_rwsem held.
+ * Call with cpuset_mutex held.
*/
-static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
+int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
int turning_on)
{
struct cpuset *trialcs;
@@ -1956,7 +2966,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
int spread_flag_changed;
int err;
- trialcs = alloc_trial_cpuset(cs);
+ trialcs = dup_or_alloc_cpuset(cs);
if (!trialcs)
return -ENOMEM;
@@ -1979,303 +2989,340 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
cs->flags = trialcs->flags;
spin_unlock_irq(&callback_lock);
- if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
- rebuild_sched_domains_locked();
+ if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) {
+ if (cpuset_v2())
+ cpuset_force_rebuild();
+ else
+ rebuild_sched_domains_locked();
+ }
if (spread_flag_changed)
- update_tasks_flags(cs);
+ cpuset1_update_tasks_flags(cs);
out:
free_cpuset(trialcs);
return err;
}
-/*
- * update_prstate - update partititon_root_state
- * cs: the cpuset to update
- * new_prs: new partition root state
+/**
+ * update_prstate - update partition_root_state
+ * @cs: the cpuset to update
+ * @new_prs: new partition root state
+ * Return: 0 if successful, != 0 if error
*
- * Call with cpuset_rwsem held.
+ * Call with cpuset_mutex held.
*/
static int update_prstate(struct cpuset *cs, int new_prs)
{
- int err, old_prs = cs->partition_root_state;
+ int err = PERR_NONE, old_prs = cs->partition_root_state;
struct cpuset *parent = parent_cs(cs);
struct tmpmasks tmpmask;
+ bool isolcpus_updated = false;
if (old_prs == new_prs)
return 0;
/*
- * Cannot force a partial or invalid partition root to a full
- * partition root.
+ * Treat a previously invalid partition root as if it is a "member".
*/
- if (new_prs && (old_prs == PRS_ERROR))
- return -EINVAL;
+ if (new_prs && is_partition_invalid(cs))
+ old_prs = PRS_MEMBER;
- if (alloc_cpumasks(NULL, &tmpmask))
+ if (alloc_tmpmasks(&tmpmask))
return -ENOMEM;
- err = -EINVAL;
+ err = update_partition_exclusive_flag(cs, new_prs);
+ if (err)
+ goto out;
+
if (!old_prs) {
/*
- * Turning on partition root requires setting the
- * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
- * cannot be NULL.
+ * cpus_allowed and exclusive_cpus cannot be both empty.
*/
- if (cpumask_empty(cs->cpus_allowed))
+ if (xcpus_empty(cs)) {
+ err = PERR_CPUSEMPTY;
goto out;
+ }
- err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
- if (err)
+ /*
+ * We don't support the creation of a new local partition with
+ * a remote partition underneath it. This unsupported
+ * setting can happen only if parent is the top_cpuset because
+ * a remote partition cannot be created underneath an existing
+ * local or remote partition.
+ */
+ if ((parent == &top_cpuset) &&
+ cpumask_intersects(cs->exclusive_cpus, subpartitions_cpus)) {
+ err = PERR_REMOTE;
goto out;
+ }
- err = update_parent_subparts_cpumask(cs, partcmd_enable,
- NULL, &tmpmask);
- if (err) {
- update_flag(CS_CPU_EXCLUSIVE, cs, 0);
- goto out;
+ /*
+ * If parent is valid partition, enable local partiion.
+ * Otherwise, enable a remote partition.
+ */
+ if (is_partition_valid(parent)) {
+ enum partition_cmd cmd = (new_prs == PRS_ROOT)
+ ? partcmd_enable : partcmd_enablei;
+
+ err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask);
+ } else {
+ err = remote_partition_enable(cs, new_prs, &tmpmask);
}
+ } else if (old_prs && new_prs) {
+ /*
+ * A change in load balance state only, no change in cpumasks.
+ * Need to update isolated_cpus.
+ */
+ if (((new_prs == PRS_ISOLATED) &&
+ !isolated_cpus_can_update(cs->effective_xcpus, NULL)) ||
+ prstate_housekeeping_conflict(new_prs, cs->effective_xcpus))
+ err = PERR_HKEEPING;
+ else
+ isolcpus_updated = true;
} else {
/*
- * Turning off partition root will clear the
- * CS_CPU_EXCLUSIVE bit.
+ * Switching back to member is always allowed even if it
+ * disables child partitions.
*/
- if (old_prs == PRS_ERROR) {
- update_flag(CS_CPU_EXCLUSIVE, cs, 0);
- err = 0;
- goto out;
- }
-
- err = update_parent_subparts_cpumask(cs, partcmd_disable,
- NULL, &tmpmask);
- if (err)
- goto out;
+ if (is_remote_partition(cs))
+ remote_partition_disable(cs, &tmpmask);
+ else
+ update_parent_effective_cpumask(cs, partcmd_disable,
+ NULL, &tmpmask);
- /* Turning off CS_CPU_EXCLUSIVE will not return error */
- update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+ /*
+ * Invalidation of child partitions will be done in
+ * update_cpumasks_hier().
+ */
}
-
+out:
/*
- * Update cpumask of parent's tasks except when it is the top
- * cpuset as some system daemons cannot be mapped to other CPUs.
+ * Make partition invalid & disable CS_CPU_EXCLUSIVE if an error
+ * happens.
*/
- if (parent != &top_cpuset)
- update_tasks_cpumask(parent);
-
- if (parent->child_ecpus_count)
- update_sibling_cpumasks(parent, cs, &tmpmask);
-
- rebuild_sched_domains_locked();
-out:
- if (!err) {
- spin_lock_irq(&callback_lock);
- cs->partition_root_state = new_prs;
- spin_unlock_irq(&callback_lock);
- notify_partition_change(cs, old_prs, new_prs);
+ if (err) {
+ new_prs = -new_prs;
+ update_partition_exclusive_flag(cs, new_prs);
}
- free_cpumasks(NULL, &tmpmask);
- return err;
-}
+ spin_lock_irq(&callback_lock);
+ cs->partition_root_state = new_prs;
+ WRITE_ONCE(cs->prs_err, err);
+ if (!is_partition_valid(cs))
+ reset_partition_data(cs);
+ else if (isolcpus_updated)
+ isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus);
+ spin_unlock_irq(&callback_lock);
+ update_isolation_cpumasks();
-/*
- * Frequency meter - How fast is some event occurring?
- *
- * These routines manage a digitally filtered, constant time based,
- * event frequency meter. There are four routines:
- * fmeter_init() - initialize a frequency meter.
- * fmeter_markevent() - called each time the event happens.
- * fmeter_getrate() - returns the recent rate of such events.
- * fmeter_update() - internal routine used to update fmeter.
- *
- * A common data structure is passed to each of these routines,
- * which is used to keep track of the state required to manage the
- * frequency meter and its digital filter.
- *
- * The filter works on the number of events marked per unit time.
- * The filter is single-pole low-pass recursive (IIR). The time unit
- * is 1 second. Arithmetic is done using 32-bit integers scaled to
- * simulate 3 decimal digits of precision (multiplied by 1000).
- *
- * With an FM_COEF of 933, and a time base of 1 second, the filter
- * has a half-life of 10 seconds, meaning that if the events quit
- * happening, then the rate returned from the fmeter_getrate()
- * will be cut in half each 10 seconds, until it converges to zero.
- *
- * It is not worth doing a real infinitely recursive filter. If more
- * than FM_MAXTICKS ticks have elapsed since the last filter event,
- * just compute FM_MAXTICKS ticks worth, by which point the level
- * will be stable.
- *
- * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
- * arithmetic overflow in the fmeter_update() routine.
- *
- * Given the simple 32 bit integer arithmetic used, this meter works
- * best for reporting rates between one per millisecond (msec) and
- * one per 32 (approx) seconds. At constant rates faster than one
- * per msec it maxes out at values just under 1,000,000. At constant
- * rates between one per msec, and one per second it will stabilize
- * to a value N*1000, where N is the rate of events per second.
- * At constant rates between one per second and one per 32 seconds,
- * it will be choppy, moving up on the seconds that have an event,
- * and then decaying until the next event. At rates slower than
- * about one in 32 seconds, it decays all the way back to zero between
- * each event.
- */
-
-#define FM_COEF 933 /* coefficient for half-life of 10 secs */
-#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */
-#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */
-#define FM_SCALE 1000 /* faux fixed point scale */
-
-/* Initialize a frequency meter */
-static void fmeter_init(struct fmeter *fmp)
-{
- fmp->cnt = 0;
- fmp->val = 0;
- fmp->time = 0;
- spin_lock_init(&fmp->lock);
-}
-
-/* Internal meter update - process cnt events and update value */
-static void fmeter_update(struct fmeter *fmp)
-{
- time64_t now;
- u32 ticks;
-
- now = ktime_get_seconds();
- ticks = now - fmp->time;
-
- if (ticks == 0)
- return;
+ /* Force update if switching back to member & update effective_xcpus */
+ update_cpumasks_hier(cs, &tmpmask, !new_prs);
+
+ /* A newly created partition must have effective_xcpus set */
+ WARN_ON_ONCE(!old_prs && (new_prs > 0)
+ && cpumask_empty(cs->effective_xcpus));
- ticks = min(FM_MAXTICKS, ticks);
- while (ticks-- > 0)
- fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
- fmp->time = now;
+ /* Update sched domains and load balance flag */
+ update_partition_sd_lb(cs, old_prs);
- fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
- fmp->cnt = 0;
+ notify_partition_change(cs, old_prs);
+ if (force_sd_rebuild)
+ rebuild_sched_domains_locked();
+ free_tmpmasks(&tmpmask);
+ return 0;
}
-/* Process any previous ticks, then bump cnt by one (times scale). */
-static void fmeter_markevent(struct fmeter *fmp)
+static struct cpuset *cpuset_attach_old_cs;
+
+/*
+ * Check to see if a cpuset can accept a new task
+ * For v1, cpus_allowed and mems_allowed can't be empty.
+ * For v2, effective_cpus can't be empty.
+ * Note that in v1, effective_cpus = cpus_allowed.
+ */
+static int cpuset_can_attach_check(struct cpuset *cs)
{
- spin_lock(&fmp->lock);
- fmeter_update(fmp);
- fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
- spin_unlock(&fmp->lock);
+ if (cpumask_empty(cs->effective_cpus) ||
+ (!is_in_v2_mode() && nodes_empty(cs->mems_allowed)))
+ return -ENOSPC;
+ return 0;
}
-/* Process any previous ticks, then return current value. */
-static int fmeter_getrate(struct fmeter *fmp)
+static void reset_migrate_dl_data(struct cpuset *cs)
{
- int val;
-
- spin_lock(&fmp->lock);
- fmeter_update(fmp);
- val = fmp->val;
- spin_unlock(&fmp->lock);
- return val;
+ cs->nr_migrate_dl_tasks = 0;
+ cs->sum_migrate_dl_bw = 0;
}
-static struct cpuset *cpuset_attach_old_cs;
-
-/* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */
+/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
static int cpuset_can_attach(struct cgroup_taskset *tset)
{
struct cgroup_subsys_state *css;
- struct cpuset *cs;
+ struct cpuset *cs, *oldcs;
struct task_struct *task;
+ bool cpus_updated, mems_updated;
int ret;
/* used later by cpuset_attach() */
cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
+ oldcs = cpuset_attach_old_cs;
cs = css_cs(css);
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
- /* allow moving tasks into an empty cpuset if on default hierarchy */
- ret = -ENOSPC;
- if (!is_in_v2_mode() &&
- (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
+ /* Check to see if task is allowed in the cpuset */
+ ret = cpuset_can_attach_check(cs);
+ if (ret)
goto out_unlock;
+ cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus);
+ mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
+
cgroup_taskset_for_each(task, css, tset) {
- ret = task_can_attach(task, cs->cpus_allowed);
+ ret = task_can_attach(task);
if (ret)
goto out_unlock;
- ret = security_task_setscheduler(task);
- if (ret)
+
+ /*
+ * Skip rights over task check in v2 when nothing changes,
+ * migration permission derives from hierarchy ownership in
+ * cgroup_procs_write_permission()).
+ */
+ if (!cpuset_v2() || (cpus_updated || mems_updated)) {
+ ret = security_task_setscheduler(task);
+ if (ret)
+ goto out_unlock;
+ }
+
+ if (dl_task(task)) {
+ cs->nr_migrate_dl_tasks++;
+ cs->sum_migrate_dl_bw += task->dl.dl_bw;
+ }
+ }
+
+ if (!cs->nr_migrate_dl_tasks)
+ goto out_success;
+
+ if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) {
+ int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);
+
+ if (unlikely(cpu >= nr_cpu_ids)) {
+ reset_migrate_dl_data(cs);
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
+ if (ret) {
+ reset_migrate_dl_data(cs);
goto out_unlock;
+ }
}
+out_success:
/*
* Mark attach is in progress. This makes validate_change() fail
* changes which zero cpus/mems_allowed.
*/
cs->attach_in_progress++;
- ret = 0;
out_unlock:
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
return ret;
}
static void cpuset_cancel_attach(struct cgroup_taskset *tset)
{
struct cgroup_subsys_state *css;
+ struct cpuset *cs;
cgroup_taskset_first(tset, &css);
+ cs = css_cs(css);
+
+ mutex_lock(&cpuset_mutex);
+ dec_attach_in_progress_locked(cs);
- percpu_down_write(&cpuset_rwsem);
- css_cs(css)->attach_in_progress--;
- percpu_up_write(&cpuset_rwsem);
+ if (cs->nr_migrate_dl_tasks) {
+ int cpu = cpumask_any(cs->effective_cpus);
+
+ dl_bw_free(cpu, cs->sum_migrate_dl_bw);
+ reset_migrate_dl_data(cs);
+ }
+
+ mutex_unlock(&cpuset_mutex);
}
/*
- * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach()
+ * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach_task()
* but we can't allocate it dynamically there. Define it global and
* allocate from cpuset_init().
*/
static cpumask_var_t cpus_attach;
+static nodemask_t cpuset_attach_nodemask_to;
+
+static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
+{
+ lockdep_assert_held(&cpuset_mutex);
+
+ if (cs != &top_cpuset)
+ guarantee_active_cpus(task, cpus_attach);
+ else
+ cpumask_andnot(cpus_attach, task_cpu_possible_mask(task),
+ subpartitions_cpus);
+ /*
+ * can_attach beforehand should guarantee that this doesn't
+ * fail. TODO: have a better way to handle failure here
+ */
+ WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
+
+ cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
+ cpuset1_update_task_spread_flags(cs, task);
+}
static void cpuset_attach(struct cgroup_taskset *tset)
{
- /* static buf protected by cpuset_rwsem */
- static nodemask_t cpuset_attach_nodemask_to;
struct task_struct *task;
struct task_struct *leader;
struct cgroup_subsys_state *css;
struct cpuset *cs;
struct cpuset *oldcs = cpuset_attach_old_cs;
+ bool cpus_updated, mems_updated;
+ bool queue_task_work = false;
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
- percpu_down_write(&cpuset_rwsem);
+ lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */
+ mutex_lock(&cpuset_mutex);
+ cpus_updated = !cpumask_equal(cs->effective_cpus,
+ oldcs->effective_cpus);
+ mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
- guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+ /*
+ * In the default hierarchy, enabling cpuset in the child cgroups
+ * will trigger a number of cpuset_attach() calls with no change
+ * in effective cpus and mems. In that case, we can optimize out
+ * by skipping the task iteration and update.
+ */
+ if (cpuset_v2() && !cpus_updated && !mems_updated) {
+ cpuset_attach_nodemask_to = cs->effective_mems;
+ goto out;
+ }
- cgroup_taskset_for_each(task, css, tset) {
- if (cs != &top_cpuset)
- guarantee_online_cpus(task, cpus_attach);
- else
- cpumask_copy(cpus_attach, task_cpu_possible_mask(task));
- /*
- * can_attach beforehand should guarantee that this doesn't
- * fail. TODO: have a better way to handle failure here
- */
- WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
+ guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
- cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
- cpuset_update_task_spread_flag(cs, task);
- }
+ cgroup_taskset_for_each(task, css, tset)
+ cpuset_attach_task(cs, task);
/*
* Change mm for all threadgroup leaders. This is expensive and may
- * sleep and should be moved outside migration path proper.
+ * sleep and should be moved outside migration path proper. Skip it
+ * if there is no change in effective_mems and CS_MEMORY_MIGRATE is
+ * not set.
*/
cpuset_attach_nodemask_to = cs->effective_mems;
+ if (!is_memory_migrate(cs) && !mems_updated)
+ goto out;
+
cgroup_taskset_for_each_leader(leader, css, tset) {
struct mm_struct *mm = get_task_mm(leader);
@@ -2290,160 +3337,51 @@ static void cpuset_attach(struct cgroup_taskset *tset)
* @old_mems_allowed is the right nodesets that we
* migrate mm from.
*/
- if (is_memory_migrate(cs))
+ if (is_memory_migrate(cs)) {
cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
&cpuset_attach_nodemask_to);
- else
+ queue_task_work = true;
+ } else
mmput(mm);
}
}
+out:
+ if (queue_task_work)
+ schedule_flush_migrate_mm();
cs->old_mems_allowed = cpuset_attach_nodemask_to;
- cs->attach_in_progress--;
- if (!cs->attach_in_progress)
- wake_up(&cpuset_attach_wq);
-
- percpu_up_write(&cpuset_rwsem);
-}
-
-/* The various types of files and directories in a cpuset file system */
-
-typedef enum {
- FILE_MEMORY_MIGRATE,
- FILE_CPULIST,
- FILE_MEMLIST,
- FILE_EFFECTIVE_CPULIST,
- FILE_EFFECTIVE_MEMLIST,
- FILE_SUBPARTS_CPULIST,
- FILE_CPU_EXCLUSIVE,
- FILE_MEM_EXCLUSIVE,
- FILE_MEM_HARDWALL,
- FILE_SCHED_LOAD_BALANCE,
- FILE_PARTITION_ROOT,
- FILE_SCHED_RELAX_DOMAIN_LEVEL,
- FILE_MEMORY_PRESSURE_ENABLED,
- FILE_MEMORY_PRESSURE,
- FILE_SPREAD_PAGE,
- FILE_SPREAD_SLAB,
-} cpuset_filetype_t;
-
-static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
- u64 val)
-{
- struct cpuset *cs = css_cs(css);
- cpuset_filetype_t type = cft->private;
- int retval = 0;
-
- cpus_read_lock();
- percpu_down_write(&cpuset_rwsem);
- if (!is_cpuset_online(cs)) {
- retval = -ENODEV;
- goto out_unlock;
- }
-
- switch (type) {
- case FILE_CPU_EXCLUSIVE:
- retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
- break;
- case FILE_MEM_EXCLUSIVE:
- retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
- break;
- case FILE_MEM_HARDWALL:
- retval = update_flag(CS_MEM_HARDWALL, cs, val);
- break;
- case FILE_SCHED_LOAD_BALANCE:
- retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
- break;
- case FILE_MEMORY_MIGRATE:
- retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
- break;
- case FILE_MEMORY_PRESSURE_ENABLED:
- cpuset_memory_pressure_enabled = !!val;
- break;
- case FILE_SPREAD_PAGE:
- retval = update_flag(CS_SPREAD_PAGE, cs, val);
- break;
- case FILE_SPREAD_SLAB:
- retval = update_flag(CS_SPREAD_SLAB, cs, val);
- break;
- default:
- retval = -EINVAL;
- break;
+ if (cs->nr_migrate_dl_tasks) {
+ cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks;
+ oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks;
+ reset_migrate_dl_data(cs);
}
-out_unlock:
- percpu_up_write(&cpuset_rwsem);
- cpus_read_unlock();
- return retval;
-}
-static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
- s64 val)
-{
- struct cpuset *cs = css_cs(css);
- cpuset_filetype_t type = cft->private;
- int retval = -ENODEV;
-
- cpus_read_lock();
- percpu_down_write(&cpuset_rwsem);
- if (!is_cpuset_online(cs))
- goto out_unlock;
+ dec_attach_in_progress_locked(cs);
- switch (type) {
- case FILE_SCHED_RELAX_DOMAIN_LEVEL:
- retval = update_relax_domain_level(cs, val);
- break;
- default:
- retval = -EINVAL;
- break;
- }
-out_unlock:
- percpu_up_write(&cpuset_rwsem);
- cpus_read_unlock();
- return retval;
+ mutex_unlock(&cpuset_mutex);
}
/*
* Common handling for a write to a "cpus" or "mems" file.
*/
-static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
+ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct cpuset *cs = css_cs(of_css(of));
struct cpuset *trialcs;
int retval = -ENODEV;
- buf = strstrip(buf);
-
- /*
- * CPU or memory hotunplug may leave @cs w/o any execution
- * resources, in which case the hotplug code asynchronously updates
- * configuration and transfers all tasks to the nearest ancestor
- * which can execute.
- *
- * As writes to "cpus" or "mems" may restore @cs's execution
- * resources, wait for the previously scheduled operations before
- * proceeding, so that we don't end up keep removing tasks added
- * after execution capability is restored.
- *
- * cpuset_hotplug_work calls back into cgroup core via
- * cgroup_transfer_tasks() and waiting for it from a cgroupfs
- * operation like this one can lead to a deadlock through kernfs
- * active_ref protection. Let's break the protection. Losing the
- * protection is okay as we check whether @cs is online after
- * grabbing cpuset_rwsem anyway. This only happens on the legacy
- * hierarchies.
- */
- css_get(&cs->css);
- kernfs_break_active_protection(of->kn);
- flush_work(&cpuset_hotplug_work);
+ /* root is read-only */
+ if (cs == &top_cpuset)
+ return -EACCES;
- cpus_read_lock();
- percpu_down_write(&cpuset_rwsem);
+ buf = strstrip(buf);
+ cpuset_full_lock();
if (!is_cpuset_online(cs))
goto out_unlock;
- trialcs = alloc_trial_cpuset(cs);
+ trialcs = dup_or_alloc_cpuset(cs);
if (!trialcs) {
retval = -ENOMEM;
goto out_unlock;
@@ -2453,6 +3391,9 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
case FILE_CPULIST:
retval = update_cpumask(cs, trialcs, buf);
break;
+ case FILE_EXCLUSIVE_CPULIST:
+ retval = update_exclusive_cpumask(cs, trialcs, buf);
+ break;
case FILE_MEMLIST:
retval = update_nodemask(cs, trialcs, buf);
break;
@@ -2462,12 +3403,12 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
}
free_cpuset(trialcs);
+ if (force_sd_rebuild)
+ rebuild_sched_domains_locked();
out_unlock:
- percpu_up_write(&cpuset_rwsem);
- cpus_read_unlock();
- kernfs_unbreak_active_protection(of->kn);
- css_put(&cs->css);
- flush_workqueue(cpuset_migrate_mm_wq);
+ cpuset_full_unlock();
+ if (of_cft(of)->private == FILE_MEMLIST)
+ schedule_flush_migrate_mm();
return retval ?: nbytes;
}
@@ -2479,7 +3420,7 @@ out_unlock:
* and since these maps can change value dynamically, one could read
* gibberish by doing partial reads while a list was changing.
*/
-static int cpuset_common_seq_show(struct seq_file *sf, void *v)
+int cpuset_common_seq_show(struct seq_file *sf, void *v)
{
struct cpuset *cs = css_cs(seq_css(sf));
cpuset_filetype_t type = seq_cft(sf)->private;
@@ -2500,8 +3441,17 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
case FILE_EFFECTIVE_MEMLIST:
seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
break;
+ case FILE_EXCLUSIVE_CPULIST:
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->exclusive_cpus));
+ break;
+ case FILE_EFFECTIVE_XCPULIST:
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_xcpus));
+ break;
case FILE_SUBPARTS_CPULIST:
- seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus));
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus));
+ break;
+ case FILE_ISOLATED_CPULIST:
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(isolated_cpus));
break;
default:
ret = -EINVAL;
@@ -2511,71 +3461,38 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
return ret;
}
-static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
-{
- struct cpuset *cs = css_cs(css);
- cpuset_filetype_t type = cft->private;
- switch (type) {
- case FILE_CPU_EXCLUSIVE:
- return is_cpu_exclusive(cs);
- case FILE_MEM_EXCLUSIVE:
- return is_mem_exclusive(cs);
- case FILE_MEM_HARDWALL:
- return is_mem_hardwall(cs);
- case FILE_SCHED_LOAD_BALANCE:
- return is_sched_load_balance(cs);
- case FILE_MEMORY_MIGRATE:
- return is_memory_migrate(cs);
- case FILE_MEMORY_PRESSURE_ENABLED:
- return cpuset_memory_pressure_enabled;
- case FILE_MEMORY_PRESSURE:
- return fmeter_getrate(&cs->fmeter);
- case FILE_SPREAD_PAGE:
- return is_spread_page(cs);
- case FILE_SPREAD_SLAB:
- return is_spread_slab(cs);
- default:
- BUG();
- }
-
- /* Unreachable but makes gcc happy */
- return 0;
-}
-
-static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
-{
- struct cpuset *cs = css_cs(css);
- cpuset_filetype_t type = cft->private;
- switch (type) {
- case FILE_SCHED_RELAX_DOMAIN_LEVEL:
- return cs->relax_domain_level;
- default:
- BUG();
- }
-
- /* Unreachable but makes gcc happy */
- return 0;
-}
-
-static int sched_partition_show(struct seq_file *seq, void *v)
+static int cpuset_partition_show(struct seq_file *seq, void *v)
{
struct cpuset *cs = css_cs(seq_css(seq));
+ const char *err, *type = NULL;
switch (cs->partition_root_state) {
- case PRS_ENABLED:
+ case PRS_ROOT:
seq_puts(seq, "root\n");
break;
- case PRS_DISABLED:
+ case PRS_ISOLATED:
+ seq_puts(seq, "isolated\n");
+ break;
+ case PRS_MEMBER:
seq_puts(seq, "member\n");
break;
- case PRS_ERROR:
- seq_puts(seq, "root invalid\n");
+ case PRS_INVALID_ROOT:
+ type = "root";
+ fallthrough;
+ case PRS_INVALID_ISOLATED:
+ if (!type)
+ type = "isolated";
+ err = perr_strings[READ_ONCE(cs->prs_err)];
+ if (err)
+ seq_printf(seq, "%s invalid (%s)\n", type, err);
+ else
+ seq_printf(seq, "%s invalid\n", type);
break;
}
return 0;
}
-static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
+static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
struct cpuset *cs = css_cs(of_css(of));
@@ -2584,41 +3501,34 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
buf = strstrip(buf);
- /*
- * Convert "root" to ENABLED, and convert "member" to DISABLED.
- */
if (!strcmp(buf, "root"))
- val = PRS_ENABLED;
+ val = PRS_ROOT;
else if (!strcmp(buf, "member"))
- val = PRS_DISABLED;
+ val = PRS_MEMBER;
+ else if (!strcmp(buf, "isolated"))
+ val = PRS_ISOLATED;
else
return -EINVAL;
- css_get(&cs->css);
- cpus_read_lock();
- percpu_down_write(&cpuset_rwsem);
- if (!is_cpuset_online(cs))
- goto out_unlock;
-
- retval = update_prstate(cs, val);
-out_unlock:
- percpu_up_write(&cpuset_rwsem);
- cpus_read_unlock();
- css_put(&cs->css);
+ cpuset_full_lock();
+ if (is_cpuset_online(cs))
+ retval = update_prstate(cs, val);
+ cpuset_full_unlock();
return retval ?: nbytes;
}
/*
- * for the common functions, 'private' gives the type of file
+ * This is currently a minimal set for the default hierarchy. It can be
+ * expanded later on by migrating more features and control files from v1.
*/
-
-static struct cftype legacy_files[] = {
+static struct cftype dfl_files[] = {
{
.name = "cpus",
.seq_show = cpuset_common_seq_show,
.write = cpuset_write_resmask,
.max_write_len = (100U + 6 * NR_CPUS),
.private = FILE_CPULIST,
+ .flags = CFTYPE_NOT_ON_ROOT,
},
{
@@ -2627,153 +3537,73 @@ static struct cftype legacy_files[] = {
.write = cpuset_write_resmask,
.max_write_len = (100U + 6 * MAX_NUMNODES),
.private = FILE_MEMLIST,
+ .flags = CFTYPE_NOT_ON_ROOT,
},
{
- .name = "effective_cpus",
+ .name = "cpus.effective",
.seq_show = cpuset_common_seq_show,
.private = FILE_EFFECTIVE_CPULIST,
},
{
- .name = "effective_mems",
+ .name = "mems.effective",
.seq_show = cpuset_common_seq_show,
.private = FILE_EFFECTIVE_MEMLIST,
},
{
- .name = "cpu_exclusive",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_CPU_EXCLUSIVE,
- },
-
- {
- .name = "mem_exclusive",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_MEM_EXCLUSIVE,
- },
-
- {
- .name = "mem_hardwall",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_MEM_HARDWALL,
- },
-
- {
- .name = "sched_load_balance",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_SCHED_LOAD_BALANCE,
- },
-
- {
- .name = "sched_relax_domain_level",
- .read_s64 = cpuset_read_s64,
- .write_s64 = cpuset_write_s64,
- .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
- },
-
- {
- .name = "memory_migrate",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_MEMORY_MIGRATE,
- },
-
- {
- .name = "memory_pressure",
- .read_u64 = cpuset_read_u64,
- .private = FILE_MEMORY_PRESSURE,
- },
-
- {
- .name = "memory_spread_page",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_SPREAD_PAGE,
- },
-
- {
- .name = "memory_spread_slab",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_SPREAD_SLAB,
- },
-
- {
- .name = "memory_pressure_enabled",
- .flags = CFTYPE_ONLY_ON_ROOT,
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_MEMORY_PRESSURE_ENABLED,
+ .name = "cpus.partition",
+ .seq_show = cpuset_partition_show,
+ .write = cpuset_partition_write,
+ .private = FILE_PARTITION_ROOT,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .file_offset = offsetof(struct cpuset, partition_file),
},
- { } /* terminate */
-};
-
-/*
- * This is currently a minimal set for the default hierarchy. It can be
- * expanded later on by migrating more features and control files from v1.
- */
-static struct cftype dfl_files[] = {
{
- .name = "cpus",
+ .name = "cpus.exclusive",
.seq_show = cpuset_common_seq_show,
.write = cpuset_write_resmask,
.max_write_len = (100U + 6 * NR_CPUS),
- .private = FILE_CPULIST,
+ .private = FILE_EXCLUSIVE_CPULIST,
.flags = CFTYPE_NOT_ON_ROOT,
},
{
- .name = "mems",
+ .name = "cpus.exclusive.effective",
.seq_show = cpuset_common_seq_show,
- .write = cpuset_write_resmask,
- .max_write_len = (100U + 6 * MAX_NUMNODES),
- .private = FILE_MEMLIST,
+ .private = FILE_EFFECTIVE_XCPULIST,
.flags = CFTYPE_NOT_ON_ROOT,
},
{
- .name = "cpus.effective",
- .seq_show = cpuset_common_seq_show,
- .private = FILE_EFFECTIVE_CPULIST,
- },
-
- {
- .name = "mems.effective",
+ .name = "cpus.subpartitions",
.seq_show = cpuset_common_seq_show,
- .private = FILE_EFFECTIVE_MEMLIST,
- },
-
- {
- .name = "cpus.partition",
- .seq_show = sched_partition_show,
- .write = sched_partition_write,
- .private = FILE_PARTITION_ROOT,
- .flags = CFTYPE_NOT_ON_ROOT,
- .file_offset = offsetof(struct cpuset, partition_file),
+ .private = FILE_SUBPARTS_CPULIST,
+ .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG,
},
{
- .name = "cpus.subpartitions",
+ .name = "cpus.isolated",
.seq_show = cpuset_common_seq_show,
- .private = FILE_SUBPARTS_CPULIST,
- .flags = CFTYPE_DEBUG,
+ .private = FILE_ISOLATED_CPULIST,
+ .flags = CFTYPE_ONLY_ON_ROOT,
},
{ } /* terminate */
};
-/*
- * cpuset_css_alloc - allocate a cpuset css
- * cgrp: control group that the new cpuset will be part of
+/**
+ * cpuset_css_alloc - Allocate a cpuset css
+ * @parent_css: Parent css of the control group that the new cpuset will be
+ * part of
+ * Return: cpuset css on success, -ENOMEM on failure.
+ *
+ * Allocate and initialize a new cpuset css, for non-NULL @parent_css, return
+ * top cpuset css otherwise.
*/
-
static struct cgroup_subsys_state *
cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
{
@@ -2782,23 +3612,16 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
if (!parent_css)
return &top_cpuset.css;
- cs = kzalloc(sizeof(*cs), GFP_KERNEL);
+ cs = dup_or_alloc_cpuset(NULL);
if (!cs)
return ERR_PTR(-ENOMEM);
- if (alloc_cpumasks(cs, NULL)) {
- kfree(cs);
- return ERR_PTR(-ENOMEM);
- }
-
__set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
- nodes_clear(cs->mems_allowed);
- nodes_clear(cs->effective_mems);
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;
/* Set CS_MEMORY_MIGRATE for default hierarchy */
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+ if (cpuset_v2())
__set_bit(CS_MEMORY_MIGRATE, &cs->flags);
return &cs->css;
@@ -2814,14 +3637,16 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
if (!parent)
return 0;
- cpus_read_lock();
- percpu_down_write(&cpuset_rwsem);
-
- set_bit(CS_ONLINE, &cs->flags);
+ cpuset_full_lock();
if (is_spread_page(parent))
set_bit(CS_SPREAD_PAGE, &cs->flags);
if (is_spread_slab(parent))
set_bit(CS_SPREAD_SLAB, &cs->flags);
+ /*
+ * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
+ */
+ if (cpuset_v2() && !is_sched_load_balance(parent))
+ clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpuset_inc();
@@ -2829,8 +3654,6 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
if (is_in_v2_mode()) {
cpumask_copy(cs->effective_cpus, parent->effective_cpus);
cs->effective_mems = parent->effective_mems;
- cs->use_parent_ecpus = true;
- parent->child_ecpus_count++;
}
spin_unlock_irq(&callback_lock);
@@ -2840,7 +3663,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
/*
* Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
* set. This flag handling is implemented in cgroup core for
- * histrical reasons - the flag may be specified during mount.
+ * historical reasons - the flag may be specified during mount.
*
* Currently, if any sibling cpusets have exclusive cpus or mem, we
* refuse to clone the configuration - thereby refusing the task to
@@ -2866,8 +3689,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
spin_unlock_irq(&callback_lock);
out_unlock:
- percpu_up_write(&cpuset_rwsem);
- cpus_read_unlock();
+ cpuset_full_unlock();
return 0;
}
@@ -2877,37 +3699,33 @@ out_unlock:
* will call rebuild_sched_domains_locked(). That is not needed
* in the default hierarchy where only changes in partition
* will cause repartitioning.
- *
- * If the cpuset has the 'sched.partition' flag enabled, simulate
- * turning 'sched.partition" off.
*/
-
static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
- cpus_read_lock();
- percpu_down_write(&cpuset_rwsem);
-
- if (is_partition_root(cs))
- update_prstate(cs, 0);
-
- if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
- is_sched_load_balance(cs))
- update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
-
- if (cs->use_parent_ecpus) {
- struct cpuset *parent = parent_cs(cs);
-
- cs->use_parent_ecpus = false;
- parent->child_ecpus_count--;
- }
+ cpuset_full_lock();
+ if (!cpuset_v2() && is_sched_load_balance(cs))
+ cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
cpuset_dec();
- clear_bit(CS_ONLINE, &cs->flags);
+ cpuset_full_unlock();
+}
- percpu_up_write(&cpuset_rwsem);
- cpus_read_unlock();
+/*
+ * If a dying cpuset has the 'cpus.partition' enabled, turn it off by
+ * changing it back to member to free its exclusive CPUs back to the pool to
+ * be used by other online cpusets.
+ */
+static void cpuset_css_killed(struct cgroup_subsys_state *css)
+{
+ struct cpuset *cs = css_cs(css);
+
+ cpuset_full_lock();
+ /* Reset valid partition back to member */
+ if (is_partition_valid(cs))
+ update_prstate(cs, PRS_MEMBER);
+ cpuset_full_unlock();
}
static void cpuset_css_free(struct cgroup_subsys_state *css)
@@ -2919,11 +3737,12 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
static void cpuset_bind(struct cgroup_subsys_state *root_css)
{
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
spin_lock_irq(&callback_lock);
if (is_in_v2_mode()) {
cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
+ cpumask_copy(top_cpuset.effective_xcpus, cpu_possible_mask);
top_cpuset.mems_allowed = node_possible_map;
} else {
cpumask_copy(top_cpuset.cpus_allowed,
@@ -2932,7 +3751,65 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
}
spin_unlock_irq(&callback_lock);
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
+}
+
+/*
+ * In case the child is cloned into a cpuset different from its parent,
+ * additional checks are done to see if the move is allowed.
+ */
+static int cpuset_can_fork(struct task_struct *task, struct css_set *cset)
+{
+ struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
+ bool same_cs;
+ int ret;
+
+ rcu_read_lock();
+ same_cs = (cs == task_cs(current));
+ rcu_read_unlock();
+
+ if (same_cs)
+ return 0;
+
+ lockdep_assert_held(&cgroup_mutex);
+ mutex_lock(&cpuset_mutex);
+
+ /* Check to see if task is allowed in the cpuset */
+ ret = cpuset_can_attach_check(cs);
+ if (ret)
+ goto out_unlock;
+
+ ret = task_can_attach(task);
+ if (ret)
+ goto out_unlock;
+
+ ret = security_task_setscheduler(task);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * Mark attach is in progress. This makes validate_change() fail
+ * changes which zero cpus/mems_allowed.
+ */
+ cs->attach_in_progress++;
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ return ret;
+}
+
+static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset)
+{
+ struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
+ bool same_cs;
+
+ rcu_read_lock();
+ same_cs = (cs == task_cs(current));
+ rcu_read_unlock();
+
+ if (same_cs)
+ return;
+
+ dec_attach_in_progress(cs);
}
/*
@@ -2942,25 +3819,48 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
*/
static void cpuset_fork(struct task_struct *task)
{
- if (task_css_is_root(task, cpuset_cgrp_id))
+ struct cpuset *cs;
+ bool same_cs;
+
+ rcu_read_lock();
+ cs = task_cs(task);
+ same_cs = (cs == task_cs(current));
+ rcu_read_unlock();
+
+ if (same_cs) {
+ if (cs == &top_cpuset)
+ return;
+
+ set_cpus_allowed_ptr(task, current->cpus_ptr);
+ task->mems_allowed = current->mems_allowed;
return;
+ }
- set_cpus_allowed_ptr(task, current->cpus_ptr);
- task->mems_allowed = current->mems_allowed;
+ /* CLONE_INTO_CGROUP */
+ mutex_lock(&cpuset_mutex);
+ guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+ cpuset_attach_task(cs, task);
+
+ dec_attach_in_progress_locked(cs);
+ mutex_unlock(&cpuset_mutex);
}
struct cgroup_subsys cpuset_cgrp_subsys = {
.css_alloc = cpuset_css_alloc,
.css_online = cpuset_css_online,
.css_offline = cpuset_css_offline,
+ .css_killed = cpuset_css_killed,
.css_free = cpuset_css_free,
.can_attach = cpuset_can_attach,
.cancel_attach = cpuset_cancel_attach,
.attach = cpuset_attach,
- .post_attach = cpuset_post_attach,
.bind = cpuset_bind,
+ .can_fork = cpuset_can_fork,
+ .cancel_fork = cpuset_cancel_fork,
.fork = cpuset_fork,
- .legacy_cftypes = legacy_files,
+#ifdef CONFIG_CPUSETS_V1
+ .legacy_cftypes = cpuset1_files,
+#endif
.dfl_cftypes = dfl_files,
.early_init = true,
.threaded = true,
@@ -2974,90 +3874,32 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
int __init cpuset_init(void)
{
- BUG_ON(percpu_init_rwsem(&cpuset_rwsem));
-
BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
- BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
+ BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL));
+ BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL));
cpumask_setall(top_cpuset.cpus_allowed);
nodes_setall(top_cpuset.mems_allowed);
cpumask_setall(top_cpuset.effective_cpus);
+ cpumask_setall(top_cpuset.effective_xcpus);
+ cpumask_setall(top_cpuset.exclusive_cpus);
nodes_setall(top_cpuset.effective_mems);
fmeter_init(&top_cpuset.fmeter);
- set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
- top_cpuset.relax_domain_level = -1;
BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
- return 0;
-}
-
-/*
- * If CPU and/or memory hotplug handlers, below, unplug any CPUs
- * or memory nodes, we need to walk over the cpuset hierarchy,
- * removing that CPU or node from all cpusets. If this removes the
- * last CPU or node from a cpuset, then move the tasks in the empty
- * cpuset to its next-highest non-empty parent.
- */
-static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
-{
- struct cpuset *parent;
-
- /*
- * Find its next-highest non-empty parent, (top cpuset
- * has online cpus, so can't be empty).
- */
- parent = parent_cs(cs);
- while (cpumask_empty(parent->cpus_allowed) ||
- nodes_empty(parent->mems_allowed))
- parent = parent_cs(parent);
-
- if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
- pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
- pr_cont_cgroup_name(cs->css.cgroup);
- pr_cont("\n");
+ have_boot_isolcpus = housekeeping_enabled(HK_TYPE_DOMAIN);
+ if (have_boot_isolcpus) {
+ BUG_ON(!alloc_cpumask_var(&boot_hk_cpus, GFP_KERNEL));
+ cpumask_copy(boot_hk_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN));
+ cpumask_andnot(isolated_cpus, cpu_possible_mask, boot_hk_cpus);
}
-}
-static void
-hotplug_update_tasks_legacy(struct cpuset *cs,
- struct cpumask *new_cpus, nodemask_t *new_mems,
- bool cpus_updated, bool mems_updated)
-{
- bool is_empty;
-
- spin_lock_irq(&callback_lock);
- cpumask_copy(cs->cpus_allowed, new_cpus);
- cpumask_copy(cs->effective_cpus, new_cpus);
- cs->mems_allowed = *new_mems;
- cs->effective_mems = *new_mems;
- spin_unlock_irq(&callback_lock);
-
- /*
- * Don't call update_tasks_cpumask() if the cpuset becomes empty,
- * as the tasks will be migratecd to an ancestor.
- */
- if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
- update_tasks_cpumask(cs);
- if (mems_updated && !nodes_empty(cs->mems_allowed))
- update_tasks_nodemask(cs);
-
- is_empty = cpumask_empty(cs->cpus_allowed) ||
- nodes_empty(cs->mems_allowed);
-
- percpu_up_write(&cpuset_rwsem);
-
- /*
- * Move tasks to the nearest ancestor with execution resources,
- * This is full cgroup operation which will also call back into
- * cpuset. Should be done outside any lock.
- */
- if (is_empty)
- remove_tasks_in_empty_cpuset(cs);
-
- percpu_down_write(&cpuset_rwsem);
+ return 0;
}
static void
@@ -3065,7 +3907,8 @@ hotplug_update_tasks(struct cpuset *cs,
struct cpumask *new_cpus, nodemask_t *new_mems,
bool cpus_updated, bool mems_updated)
{
- if (cpumask_empty(new_cpus))
+ /* A partition root is allowed to have empty effective cpus */
+ if (cpumask_empty(new_cpus) && !is_partition_valid(cs))
cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
if (nodes_empty(*new_mems))
*new_mems = parent_cs(cs)->effective_mems;
@@ -3076,16 +3919,14 @@ hotplug_update_tasks(struct cpuset *cs,
spin_unlock_irq(&callback_lock);
if (cpus_updated)
- update_tasks_cpumask(cs);
+ cpuset_update_tasks_cpumask(cs, new_cpus);
if (mems_updated)
- update_tasks_nodemask(cs);
+ cpuset_update_tasks_nodemask(cs);
}
-static bool force_rebuild;
-
void cpuset_force_rebuild(void)
{
- force_rebuild = true;
+ force_sd_rebuild = true;
}
/**
@@ -3103,18 +3944,20 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
static nodemask_t new_mems;
bool cpus_updated;
bool mems_updated;
+ bool remote;
+ int partcmd = -1;
struct cpuset *parent;
retry:
wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
/*
* We have raced with task attaching. We wait until attaching
* is finished, so we won't attach a task to an empty cpuset.
*/
if (cs->attach_in_progress) {
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
goto retry;
}
@@ -3122,68 +3965,56 @@ retry:
compute_effective_cpumask(&new_cpus, cs, parent);
nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
- if (cs->nr_subparts_cpus)
- /*
- * Make sure that CPUs allocated to child partitions
- * do not show up in effective_cpus.
- */
- cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus);
-
if (!tmp || !cs->partition_root_state)
goto update_tasks;
/*
- * In the unlikely event that a partition root has empty
- * effective_cpus or its parent becomes erroneous, we have to
- * transition it to the erroneous state.
+ * Compute effective_cpus for valid partition root, may invalidate
+ * child partition roots if necessary.
*/
- if (is_partition_root(cs) && (cpumask_empty(&new_cpus) ||
- (parent->partition_root_state == PRS_ERROR))) {
- if (cs->nr_subparts_cpus) {
- spin_lock_irq(&callback_lock);
- cs->nr_subparts_cpus = 0;
- cpumask_clear(cs->subparts_cpus);
- spin_unlock_irq(&callback_lock);
- compute_effective_cpumask(&new_cpus, cs, parent);
- }
-
- /*
- * If the effective_cpus is empty because the child
- * partitions take away all the CPUs, we can keep
- * the current partition and let the child partitions
- * fight for available CPUs.
- */
- if ((parent->partition_root_state == PRS_ERROR) ||
- cpumask_empty(&new_cpus)) {
- int old_prs;
-
- update_parent_subparts_cpumask(cs, partcmd_disable,
- NULL, tmp);
- old_prs = cs->partition_root_state;
- if (old_prs != PRS_ERROR) {
- spin_lock_irq(&callback_lock);
- cs->partition_root_state = PRS_ERROR;
- spin_unlock_irq(&callback_lock);
- notify_partition_change(cs, old_prs, PRS_ERROR);
- }
- }
- cpuset_force_rebuild();
+ remote = is_remote_partition(cs);
+ if (remote || (is_partition_valid(cs) && is_partition_valid(parent)))
+ compute_partition_effective_cpumask(cs, &new_cpus);
+
+ if (remote && cpumask_empty(&new_cpus) &&
+ partition_is_populated(cs, NULL)) {
+ cs->prs_err = PERR_HOTPLUG;
+ remote_partition_disable(cs, tmp);
+ compute_effective_cpumask(&new_cpus, cs, parent);
+ remote = false;
}
/*
- * On the other hand, an erroneous partition root may be transitioned
- * back to a regular one or a partition root with no CPU allocated
- * from the parent may change to erroneous.
+ * Force the partition to become invalid if either one of
+ * the following conditions hold:
+ * 1) empty effective cpus but not valid empty partition.
+ * 2) parent is invalid or doesn't grant any cpus to child
+ * partitions.
*/
- if (is_partition_root(parent) &&
- ((cs->partition_root_state == PRS_ERROR) ||
- !cpumask_intersects(&new_cpus, parent->subparts_cpus)) &&
- update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
- cpuset_force_rebuild();
+ if (is_local_partition(cs) && (!is_partition_valid(parent) ||
+ tasks_nocpu_error(parent, cs, &new_cpus)))
+ partcmd = partcmd_invalidate;
+ /*
+ * On the other hand, an invalid partition root may be transitioned
+ * back to a regular one with a non-empty effective xcpus.
+ */
+ else if (is_partition_valid(parent) && is_partition_invalid(cs) &&
+ !cpumask_empty(cs->effective_xcpus))
+ partcmd = partcmd_update;
+
+ if (partcmd >= 0) {
+ update_parent_effective_cpumask(cs, partcmd, NULL, tmp);
+ if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) {
+ compute_partition_effective_cpumask(cs, &new_cpus);
+ cpuset_force_rebuild();
+ }
+ }
update_tasks:
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
mems_updated = !nodes_equal(new_mems, cs->effective_mems);
+ if (!cpus_updated && !mems_updated)
+ goto unlock; /* Hotplug doesn't affect this cpuset */
if (mems_updated)
check_insane_mems_config(&new_mems);
@@ -3192,14 +4023,15 @@ update_tasks:
hotplug_update_tasks(cs, &new_cpus, &new_mems,
cpus_updated, mems_updated);
else
- hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
+ cpuset1_hotplug_update_tasks(cs, &new_cpus, &new_mems,
cpus_updated, mems_updated);
- percpu_up_write(&cpuset_rwsem);
+unlock:
+ mutex_unlock(&cpuset_mutex);
}
/**
- * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
+ * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset
*
* This function is called after either CPU or memory configuration has
* changed and updates cpuset accordingly. The top_cpuset is always
@@ -3213,8 +4045,10 @@ update_tasks:
*
* Note that CPU offlining during suspend is ignored. We don't modify
* cpusets across suspend/resume cycles at all.
+ *
+ * CPU / memory hotplug is handled synchronously.
*/
-static void cpuset_hotplug_workfn(struct work_struct *work)
+static void cpuset_handle_hotplug(void)
{
static cpumask_t new_cpus;
static nodemask_t new_mems;
@@ -3222,49 +4056,43 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
bool on_dfl = is_in_v2_mode();
struct tmpmasks tmp, *ptmp = NULL;
- if (on_dfl && !alloc_cpumasks(NULL, &tmp))
+ if (on_dfl && !alloc_tmpmasks(&tmp))
ptmp = &tmp;
- percpu_down_write(&cpuset_rwsem);
+ lockdep_assert_cpus_held();
+ mutex_lock(&cpuset_mutex);
/* fetch the available cpus/mems and find out which changed how */
cpumask_copy(&new_cpus, cpu_active_mask);
new_mems = node_states[N_MEMORY];
/*
- * If subparts_cpus is populated, it is likely that the check below
- * will produce a false positive on cpus_updated when the cpu list
- * isn't changed. It is extra work, but it is better to be safe.
+ * If subpartitions_cpus is populated, it is likely that the check
+ * below will produce a false positive on cpus_updated when the cpu
+ * list isn't changed. It is extra work, but it is better to be safe.
*/
- cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
+ cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus) ||
+ !cpumask_empty(subpartitions_cpus);
mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
- /*
- * In the rare case that hotplug removes all the cpus in subparts_cpus,
- * we assumed that cpus are updated.
- */
- if (!cpus_updated && top_cpuset.nr_subparts_cpus)
- cpus_updated = true;
-
- /* synchronize cpus_allowed to cpu_active_mask */
+ /* For v1, synchronize cpus_allowed to cpu_active_mask */
if (cpus_updated) {
+ cpuset_force_rebuild();
spin_lock_irq(&callback_lock);
if (!on_dfl)
cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
/*
* Make sure that CPUs allocated to child partitions
* do not show up in effective_cpus. If no CPU is left,
- * we clear the subparts_cpus & let the child partitions
+ * we clear the subpartitions_cpus & let the child partitions
* fight for the CPUs again.
*/
- if (top_cpuset.nr_subparts_cpus) {
- if (cpumask_subset(&new_cpus,
- top_cpuset.subparts_cpus)) {
- top_cpuset.nr_subparts_cpus = 0;
- cpumask_clear(top_cpuset.subparts_cpus);
+ if (!cpumask_empty(subpartitions_cpus)) {
+ if (cpumask_subset(&new_cpus, subpartitions_cpus)) {
+ cpumask_clear(subpartitions_cpus);
} else {
cpumask_andnot(&new_cpus, &new_cpus,
- top_cpuset.subparts_cpus);
+ subpartitions_cpus);
}
}
cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
@@ -3279,10 +4107,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
top_cpuset.mems_allowed = new_mems;
top_cpuset.effective_mems = new_mems;
spin_unlock_irq(&callback_lock);
- update_tasks_nodemask(&top_cpuset);
+ cpuset_update_tasks_nodemask(&top_cpuset);
}
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
/* if cpus or mems changed, we need to propagate to descendants */
if (cpus_updated || mems_updated) {
@@ -3303,13 +4131,11 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
rcu_read_unlock();
}
- /* rebuild sched domains if cpus_allowed has changed */
- if (cpus_updated || force_rebuild) {
- force_rebuild = false;
- rebuild_sched_domains();
- }
+ /* rebuild sched domains if necessary */
+ if (force_sd_rebuild)
+ rebuild_sched_domains_cpuslocked();
- free_cpumasks(NULL, ptmp);
+ free_tmpmasks(ptmp);
}
void cpuset_update_active_cpus(void)
@@ -3319,12 +4145,7 @@ void cpuset_update_active_cpus(void)
* inside cgroup synchronization. Bounce actual hotplug processing
* to a work item to avoid reverse locking order.
*/
- schedule_work(&cpuset_hotplug_work);
-}
-
-void cpuset_wait_for_hotplug(void)
-{
- flush_work(&cpuset_hotplug_work);
+ cpuset_handle_hotplug();
}
/*
@@ -3335,15 +4156,10 @@ void cpuset_wait_for_hotplug(void)
static int cpuset_track_online_nodes(struct notifier_block *self,
unsigned long action, void *arg)
{
- schedule_work(&cpuset_hotplug_work);
+ cpuset_handle_hotplug();
return NOTIFY_OK;
}
-static struct notifier_block cpuset_track_online_nodes_nb = {
- .notifier_call = cpuset_track_online_nodes,
- .priority = 10, /* ??! */
-};
-
/**
* cpuset_init_smp - initialize cpus_allowed
*
@@ -3351,28 +4167,73 @@ static struct notifier_block cpuset_track_online_nodes_nb = {
*/
void __init cpuset_init_smp(void)
{
- cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
- top_cpuset.mems_allowed = node_states[N_MEMORY];
+ /*
+ * cpus_allowd/mems_allowed set to v2 values in the initial
+ * cpuset_bind() call will be reset to v1 values in another
+ * cpuset_bind() call when v1 cpuset is mounted.
+ */
top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
top_cpuset.effective_mems = node_states[N_MEMORY];
- register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
+ hotplug_node_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);
cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
BUG_ON(!cpuset_migrate_mm_wq);
}
+/*
+ * Return cpus_allowed mask from a task's cpuset.
+ */
+static void __cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
+{
+ struct cpuset *cs;
+
+ cs = task_cs(tsk);
+ if (cs != &top_cpuset)
+ guarantee_active_cpus(tsk, pmask);
+ /*
+ * Tasks in the top cpuset won't get update to their cpumasks
+ * when a hotplug online/offline event happens. So we include all
+ * offline cpus in the allowed cpu list.
+ */
+ if ((cs == &top_cpuset) || cpumask_empty(pmask)) {
+ const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
+
+ /*
+ * We first exclude cpus allocated to partitions. If there is no
+ * allowable online cpu left, we fall back to all possible cpus.
+ */
+ cpumask_andnot(pmask, possible_mask, subpartitions_cpus);
+ if (!cpumask_intersects(pmask, cpu_active_mask))
+ cpumask_copy(pmask, possible_mask);
+ }
+}
+
+/**
+ * cpuset_cpus_allowed_locked - return cpus_allowed mask from a task's cpuset.
+ * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
+ * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
+ *
+ * Similir to cpuset_cpus_allowed() except that the caller must have acquired
+ * cpuset_mutex.
+ */
+void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
+{
+ lockdep_assert_held(&cpuset_mutex);
+ __cpuset_cpus_allowed_locked(tsk, pmask);
+}
+
/**
- * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
+ * cpuset_cpus_allowed - return cpus_allowed mask from a task's cpuset.
* @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
* @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
*
* Description: Returns the cpumask_var_t cpus_allowed of the cpuset
* attached to the specified @tsk. Guaranteed to return some non-empty
- * subset of cpu_online_mask, even if this means going outside the
- * tasks cpuset.
+ * subset of cpu_active_mask, even if this means going outside the
+ * tasks cpuset, except when the task is in the top cpuset.
**/
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
@@ -3380,7 +4241,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
unsigned long flags;
spin_lock_irqsave(&callback_lock, flags);
- guarantee_online_cpus(tsk, pmask);
+ __cpuset_cpus_allowed_locked(tsk, pmask);
spin_unlock_irqrestore(&callback_lock, flags);
}
@@ -3407,7 +4268,7 @@ bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
rcu_read_lock();
cs_mask = task_cs(tsk)->cpus_allowed;
if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {
- do_set_cpus_allowed(tsk, cs_mask);
+ set_cpus_allowed_force(tsk, cs_mask);
changed = true;
}
rcu_read_unlock();
@@ -3453,9 +4314,7 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
unsigned long flags;
spin_lock_irqsave(&callback_lock, flags);
- rcu_read_lock();
guarantee_online_mems(task_cs(tsk), &mask);
- rcu_read_unlock();
spin_unlock_irqrestore(&callback_lock, flags);
return mask;
@@ -3485,8 +4344,8 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
return cs;
}
-/**
- * cpuset_node_allowed - Can we allocate on a memory node?
+/*
+ * cpuset_current_node_allowed - Can current task allocate on a memory node?
* @node: is this an allowed node?
* @gfp_mask: memory allocation flags
*
@@ -3525,7 +4384,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* GFP_KERNEL - any node in enclosing hardwalled cpuset ok
* GFP_USER - only nodes in current tasks mems allowed ok.
*/
-bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
+bool cpuset_current_node_allowed(int node, gfp_t gfp_mask)
{
struct cpuset *cs; /* current cpuset ancestors */
bool allowed; /* is allocation in zone z allowed? */
@@ -3550,18 +4409,52 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
/* Not hardwall and node outside mems_allowed: scan up cpusets */
spin_lock_irqsave(&callback_lock, flags);
- rcu_read_lock();
cs = nearest_hardwall_ancestor(task_cs(current));
allowed = node_isset(node, cs->mems_allowed);
- rcu_read_unlock();
spin_unlock_irqrestore(&callback_lock, flags);
return allowed;
}
+bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
+{
+ struct cgroup_subsys_state *css;
+ struct cpuset *cs;
+ bool allowed;
+
+ /*
+ * In v1, mem_cgroup and cpuset are unlikely in the same hierarchy
+ * and mems_allowed is likely to be empty even if we could get to it,
+ * so return true to avoid taking a global lock on the empty check.
+ */
+ if (!cpuset_v2())
+ return true;
+
+ css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys);
+ if (!css)
+ return true;
+
+ /*
+ * Normally, accessing effective_mems would require the cpuset_mutex
+ * or callback_lock - but node_isset is atomic and the reference
+ * taken via cgroup_get_e_css is sufficient to protect css.
+ *
+ * Since this interface is intended for use by migration paths, we
+ * relax locking here to avoid taking global locks - while accepting
+ * there may be rare scenarios where the result may be innaccurate.
+ *
+ * Reclaim and migration are subject to these same race conditions, and
+ * cannot make strong isolation guarantees, so this is acceptable.
+ */
+ cs = container_of(css, struct cpuset, css);
+ allowed = node_isset(nid, cs->effective_mems);
+ css_put(css);
+ return allowed;
+}
+
/**
- * cpuset_mem_spread_node() - On which node to begin search for a file page
- * cpuset_slab_spread_node() - On which node to begin search for a slab page
+ * cpuset_spread_node() - On which node to begin search for a page
+ * @rotor: round robin rotor
*
* If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
* tasks in a cpuset with is_spread_page or is_spread_slab set),
@@ -3585,12 +4478,14 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
* is passed an offline node, it will fall back to the local node.
* See kmem_cache_alloc_node().
*/
-
static int cpuset_spread_node(int *rotor)
{
return *rotor = next_node_in(*rotor, current->mems_allowed);
}
+/**
+ * cpuset_mem_spread_node() - On which node to begin search for a file page
+ */
int cpuset_mem_spread_node(void)
{
if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
@@ -3600,17 +4495,6 @@ int cpuset_mem_spread_node(void)
return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
}
-int cpuset_slab_spread_node(void)
-{
- if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
- current->cpuset_slab_spread_rotor =
- node_random(&current->mems_allowed);
-
- return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
-}
-
-EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
-
/**
* cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
* @tsk1: pointer to task_struct of some task.
@@ -3649,79 +4533,6 @@ void cpuset_print_current_mems_allowed(void)
rcu_read_unlock();
}
-/*
- * Collection of memory_pressure is suppressed unless
- * this flag is enabled by writing "1" to the special
- * cpuset file 'memory_pressure_enabled' in the root cpuset.
- */
-
-int cpuset_memory_pressure_enabled __read_mostly;
-
-/**
- * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
- *
- * Keep a running average of the rate of synchronous (direct)
- * page reclaim efforts initiated by tasks in each cpuset.
- *
- * This represents the rate at which some task in the cpuset
- * ran low on memory on all nodes it was allowed to use, and
- * had to enter the kernels page reclaim code in an effort to
- * create more free memory by tossing clean pages or swapping
- * or writing dirty pages.
- *
- * Display to user space in the per-cpuset read-only file
- * "memory_pressure". Value displayed is an integer
- * representing the recent rate of entry into the synchronous
- * (direct) page reclaim by any task attached to the cpuset.
- **/
-
-void __cpuset_memory_pressure_bump(void)
-{
- rcu_read_lock();
- fmeter_markevent(&task_cs(current)->fmeter);
- rcu_read_unlock();
-}
-
-#ifdef CONFIG_PROC_PID_CPUSET
-/*
- * proc_cpuset_show()
- * - Print tasks cpuset path into seq_file.
- * - Used for /proc/<pid>/cpuset.
- * - No need to task_lock(tsk) on this tsk->cpuset reference, as it
- * doesn't really matter if tsk->cpuset changes after we read it,
- * and we take cpuset_rwsem, keeping cpuset_attach() from changing it
- * anyway.
- */
-int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
- struct pid *pid, struct task_struct *tsk)
-{
- char *buf;
- struct cgroup_subsys_state *css;
- int retval;
-
- retval = -ENOMEM;
- buf = kmalloc(PATH_MAX, GFP_KERNEL);
- if (!buf)
- goto out;
-
- css = task_get_css(tsk, cpuset_cgrp_id);
- retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
- current->nsproxy->cgroup_ns);
- css_put(css);
- if (retval >= PATH_MAX)
- retval = -ENAMETOOLONG;
- if (retval < 0)
- goto out_free;
- seq_puts(m, buf);
- seq_putc(m, '\n');
- retval = 0;
-out_free:
- kfree(buf);
-out:
- return retval;
-}
-#endif /* CONFIG_PROC_PID_CPUSET */
-
/* Display task mems_allowed in /proc/<pid>/status file. */
void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
{
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index 80aa3f027ac3..81ea38dd6f9d 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -49,7 +49,6 @@ static int current_css_set_read(struct seq_file *seq, void *v)
return -ENODEV;
spin_lock_irq(&css_set_lock);
- rcu_read_lock();
cset = task_css_set(current);
refcnt = refcount_read(&cset->refcount);
seq_printf(seq, "css_set %pK %d", cset, refcnt);
@@ -67,7 +66,6 @@ static int current_css_set_read(struct seq_file *seq, void *v)
seq_printf(seq, "%2d: %-4s\t- %p[%d]\n", ss->id, ss->name,
css, css->id);
}
- rcu_read_unlock();
spin_unlock_irq(&css_set_lock);
cgroup_kn_unlock(of->kn);
return 0;
@@ -95,7 +93,6 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
return -ENOMEM;
spin_lock_irq(&css_set_lock);
- rcu_read_lock();
cset = task_css_set(current);
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
@@ -104,7 +101,6 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
seq_printf(seq, "Root %d group %s\n",
c->root->hierarchy_id, name_buf);
}
- rcu_read_unlock();
spin_unlock_irq(&css_set_lock);
kfree(name_buf);
return 0;
diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c
new file mode 100644
index 000000000000..e12b946278b6
--- /dev/null
+++ b/kernel/cgroup/dmem.c
@@ -0,0 +1,830 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2023-2024 Intel Corporation (Maarten Lankhorst <dev@lankhorst.se>)
+ * Copyright 2024 Red Hat (Maxime Ripard <mripard@kernel.org>)
+ * Partially based on the rdma and misc controllers, which bear the following copyrights:
+ *
+ * Copyright 2020 Google LLC
+ * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
+ */
+
+#include <linux/cgroup.h>
+#include <linux/cgroup_dmem.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/page_counter.h>
+#include <linux/parser.h>
+#include <linux/rculist.h>
+#include <linux/slab.h>
+
+struct dmem_cgroup_region {
+ /**
+ * @ref: References keeping the region alive.
+ * Keeps the region reference alive after a succesful RCU lookup.
+ */
+ struct kref ref;
+
+ /** @rcu: RCU head for freeing */
+ struct rcu_head rcu;
+
+ /**
+ * @region_node: Linked into &dmem_cgroup_regions list.
+ * Protected by RCU and global spinlock.
+ */
+ struct list_head region_node;
+
+ /**
+ * @pools: List of pools linked to this region.
+ * Protected by global spinlock only
+ */
+ struct list_head pools;
+
+ /** @size: Size of region, in bytes */
+ u64 size;
+
+ /** @name: Name describing the node, set by dmem_cgroup_register_region */
+ char *name;
+
+ /**
+ * @unregistered: Whether the region is unregistered by its caller.
+ * No new pools should be added to the region afterwards.
+ */
+ bool unregistered;
+};
+
+struct dmemcg_state {
+ struct cgroup_subsys_state css;
+
+ struct list_head pools;
+};
+
+struct dmem_cgroup_pool_state {
+ struct dmem_cgroup_region *region;
+ struct dmemcg_state *cs;
+
+ /* css node, RCU protected against region teardown */
+ struct list_head css_node;
+
+ /* dev node, no RCU protection required */
+ struct list_head region_node;
+
+ struct rcu_head rcu;
+
+ struct page_counter cnt;
+
+ bool inited;
+};
+
+/*
+ * 3 operations require locking protection:
+ * - Registering and unregistering region to/from list, requires global lock.
+ * - Adding a dmem_cgroup_pool_state to a CSS, removing when CSS is freed.
+ * - Adding a dmem_cgroup_pool_state to a region list.
+ *
+ * Since for the most common operations RCU provides enough protection, I
+ * do not think more granular locking makes sense. Most protection is offered
+ * by RCU and the lockless operating page_counter.
+ */
+static DEFINE_SPINLOCK(dmemcg_lock);
+static LIST_HEAD(dmem_cgroup_regions);
+
+static inline struct dmemcg_state *
+css_to_dmemcs(struct cgroup_subsys_state *css)
+{
+ return container_of(css, struct dmemcg_state, css);
+}
+
+static inline struct dmemcg_state *get_current_dmemcs(void)
+{
+ return css_to_dmemcs(task_get_css(current, dmem_cgrp_id));
+}
+
+static struct dmemcg_state *parent_dmemcs(struct dmemcg_state *cg)
+{
+ return cg->css.parent ? css_to_dmemcs(cg->css.parent) : NULL;
+}
+
+static void free_cg_pool(struct dmem_cgroup_pool_state *pool)
+{
+ list_del(&pool->region_node);
+ kfree(pool);
+}
+
+static void
+set_resource_min(struct dmem_cgroup_pool_state *pool, u64 val)
+{
+ page_counter_set_min(&pool->cnt, val);
+}
+
+static void
+set_resource_low(struct dmem_cgroup_pool_state *pool, u64 val)
+{
+ page_counter_set_low(&pool->cnt, val);
+}
+
+static void
+set_resource_max(struct dmem_cgroup_pool_state *pool, u64 val)
+{
+ page_counter_set_max(&pool->cnt, val);
+}
+
+static u64 get_resource_low(struct dmem_cgroup_pool_state *pool)
+{
+ return pool ? READ_ONCE(pool->cnt.low) : 0;
+}
+
+static u64 get_resource_min(struct dmem_cgroup_pool_state *pool)
+{
+ return pool ? READ_ONCE(pool->cnt.min) : 0;
+}
+
+static u64 get_resource_max(struct dmem_cgroup_pool_state *pool)
+{
+ return pool ? READ_ONCE(pool->cnt.max) : PAGE_COUNTER_MAX;
+}
+
+static u64 get_resource_current(struct dmem_cgroup_pool_state *pool)
+{
+ return pool ? page_counter_read(&pool->cnt) : 0;
+}
+
+static void reset_all_resource_limits(struct dmem_cgroup_pool_state *rpool)
+{
+ set_resource_min(rpool, 0);
+ set_resource_low(rpool, 0);
+ set_resource_max(rpool, PAGE_COUNTER_MAX);
+}
+
+static void dmemcs_offline(struct cgroup_subsys_state *css)
+{
+ struct dmemcg_state *dmemcs = css_to_dmemcs(css);
+ struct dmem_cgroup_pool_state *pool;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pool, &dmemcs->pools, css_node)
+ reset_all_resource_limits(pool);
+ rcu_read_unlock();
+}
+
+static void dmemcs_free(struct cgroup_subsys_state *css)
+{
+ struct dmemcg_state *dmemcs = css_to_dmemcs(css);
+ struct dmem_cgroup_pool_state *pool, *next;
+
+ spin_lock(&dmemcg_lock);
+ list_for_each_entry_safe(pool, next, &dmemcs->pools, css_node) {
+ /*
+ *The pool is dead and all references are 0,
+ * no need for RCU protection with list_del_rcu or freeing.
+ */
+ list_del(&pool->css_node);
+ free_cg_pool(pool);
+ }
+ spin_unlock(&dmemcg_lock);
+
+ kfree(dmemcs);
+}
+
+static struct cgroup_subsys_state *
+dmemcs_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct dmemcg_state *dmemcs = kzalloc(sizeof(*dmemcs), GFP_KERNEL);
+ if (!dmemcs)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&dmemcs->pools);
+ return &dmemcs->css;
+}
+
+static struct dmem_cgroup_pool_state *
+find_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region)
+{
+ struct dmem_cgroup_pool_state *pool;
+
+ list_for_each_entry_rcu(pool, &dmemcs->pools, css_node, spin_is_locked(&dmemcg_lock))
+ if (pool->region == region)
+ return pool;
+
+ return NULL;
+}
+
+static struct dmem_cgroup_pool_state *pool_parent(struct dmem_cgroup_pool_state *pool)
+{
+ if (!pool->cnt.parent)
+ return NULL;
+
+ return container_of(pool->cnt.parent, typeof(*pool), cnt);
+}
+
+static void
+dmem_cgroup_calculate_protection(struct dmem_cgroup_pool_state *limit_pool,
+ struct dmem_cgroup_pool_state *test_pool)
+{
+ struct page_counter *climit;
+ struct cgroup_subsys_state *css;
+ struct dmemcg_state *dmemcg_iter;
+ struct dmem_cgroup_pool_state *pool, *found_pool;
+
+ climit = &limit_pool->cnt;
+
+ rcu_read_lock();
+
+ css_for_each_descendant_pre(css, &limit_pool->cs->css) {
+ dmemcg_iter = container_of(css, struct dmemcg_state, css);
+ found_pool = NULL;
+
+ list_for_each_entry_rcu(pool, &dmemcg_iter->pools, css_node) {
+ if (pool->region == limit_pool->region) {
+ found_pool = pool;
+ break;
+ }
+ }
+ if (!found_pool)
+ continue;
+
+ page_counter_calculate_protection(
+ climit, &found_pool->cnt, true);
+
+ if (found_pool == test_pool)
+ break;
+ }
+ rcu_read_unlock();
+}
+
+/**
+ * dmem_cgroup_state_evict_valuable() - Check if we should evict from test_pool
+ * @limit_pool: The pool for which we hit limits
+ * @test_pool: The pool for which to test
+ * @ignore_low: Whether we have to respect low watermarks.
+ * @ret_hit_low: Pointer to whether it makes sense to consider low watermark.
+ *
+ * This function returns true if we can evict from @test_pool, false if not.
+ * When returning false and @ignore_low is false, @ret_hit_low may
+ * be set to true to indicate this function can be retried with @ignore_low
+ * set to true.
+ *
+ * Return: bool
+ */
+bool dmem_cgroup_state_evict_valuable(struct dmem_cgroup_pool_state *limit_pool,
+ struct dmem_cgroup_pool_state *test_pool,
+ bool ignore_low, bool *ret_hit_low)
+{
+ struct dmem_cgroup_pool_state *pool = test_pool;
+ struct page_counter *ctest;
+ u64 used, min, low;
+
+ /* Can always evict from current pool, despite limits */
+ if (limit_pool == test_pool)
+ return true;
+
+ if (limit_pool) {
+ if (!parent_dmemcs(limit_pool->cs))
+ return true;
+
+ for (pool = test_pool; pool && limit_pool != pool; pool = pool_parent(pool))
+ {}
+
+ if (!pool)
+ return false;
+ } else {
+ /*
+ * If there is no cgroup limiting memory usage, use the root
+ * cgroup instead for limit calculations.
+ */
+ for (limit_pool = test_pool; pool_parent(limit_pool); limit_pool = pool_parent(limit_pool))
+ {}
+ }
+
+ ctest = &test_pool->cnt;
+
+ dmem_cgroup_calculate_protection(limit_pool, test_pool);
+
+ used = page_counter_read(ctest);
+ min = READ_ONCE(ctest->emin);
+
+ if (used <= min)
+ return false;
+
+ if (!ignore_low) {
+ low = READ_ONCE(ctest->elow);
+ if (used > low)
+ return true;
+
+ *ret_hit_low = true;
+ return false;
+ }
+ return true;
+}
+EXPORT_SYMBOL_GPL(dmem_cgroup_state_evict_valuable);
+
+static struct dmem_cgroup_pool_state *
+alloc_pool_single(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region,
+ struct dmem_cgroup_pool_state **allocpool)
+{
+ struct dmemcg_state *parent = parent_dmemcs(dmemcs);
+ struct dmem_cgroup_pool_state *pool, *ppool = NULL;
+
+ if (!*allocpool) {
+ pool = kzalloc(sizeof(*pool), GFP_NOWAIT);
+ if (!pool)
+ return ERR_PTR(-ENOMEM);
+ } else {
+ pool = *allocpool;
+ *allocpool = NULL;
+ }
+
+ pool->region = region;
+ pool->cs = dmemcs;
+
+ if (parent)
+ ppool = find_cg_pool_locked(parent, region);
+
+ page_counter_init(&pool->cnt,
+ ppool ? &ppool->cnt : NULL, true);
+ reset_all_resource_limits(pool);
+
+ list_add_tail_rcu(&pool->css_node, &dmemcs->pools);
+ list_add_tail(&pool->region_node, &region->pools);
+
+ if (!parent)
+ pool->inited = true;
+ else
+ pool->inited = ppool ? ppool->inited : false;
+ return pool;
+}
+
+static struct dmem_cgroup_pool_state *
+get_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region,
+ struct dmem_cgroup_pool_state **allocpool)
+{
+ struct dmem_cgroup_pool_state *pool, *ppool, *retpool;
+ struct dmemcg_state *p, *pp;
+
+ /*
+ * Recursively create pool, we may not initialize yet on
+ * recursion, this is done as a separate step.
+ */
+ for (p = dmemcs; p; p = parent_dmemcs(p)) {
+ pool = find_cg_pool_locked(p, region);
+ if (!pool)
+ pool = alloc_pool_single(p, region, allocpool);
+
+ if (IS_ERR(pool))
+ return pool;
+
+ if (p == dmemcs && pool->inited)
+ return pool;
+
+ if (pool->inited)
+ break;
+ }
+
+ retpool = pool = find_cg_pool_locked(dmemcs, region);
+ for (p = dmemcs, pp = parent_dmemcs(dmemcs); pp; p = pp, pp = parent_dmemcs(p)) {
+ if (pool->inited)
+ break;
+
+ /* ppool was created if it didn't exist by above loop. */
+ ppool = find_cg_pool_locked(pp, region);
+
+ /* Fix up parent links, mark as inited. */
+ pool->cnt.parent = &ppool->cnt;
+ pool->inited = true;
+
+ pool = ppool;
+ }
+
+ return retpool;
+}
+
+static void dmemcg_free_rcu(struct rcu_head *rcu)
+{
+ struct dmem_cgroup_region *region = container_of(rcu, typeof(*region), rcu);
+ struct dmem_cgroup_pool_state *pool, *next;
+
+ list_for_each_entry_safe(pool, next, &region->pools, region_node)
+ free_cg_pool(pool);
+ kfree(region->name);
+ kfree(region);
+}
+
+static void dmemcg_free_region(struct kref *ref)
+{
+ struct dmem_cgroup_region *cgregion = container_of(ref, typeof(*cgregion), ref);
+
+ call_rcu(&cgregion->rcu, dmemcg_free_rcu);
+}
+
+/**
+ * dmem_cgroup_unregister_region() - Unregister a previously registered region.
+ * @region: The region to unregister.
+ *
+ * This function undoes dmem_cgroup_register_region.
+ */
+void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region)
+{
+ struct list_head *entry;
+
+ if (!region)
+ return;
+
+ spin_lock(&dmemcg_lock);
+
+ /* Remove from global region list */
+ list_del_rcu(&region->region_node);
+
+ list_for_each_rcu(entry, &region->pools) {
+ struct dmem_cgroup_pool_state *pool =
+ container_of(entry, typeof(*pool), region_node);
+
+ list_del_rcu(&pool->css_node);
+ }
+
+ /*
+ * Ensure any RCU based lookups fail. Additionally,
+ * no new pools should be added to the dead region
+ * by get_cg_pool_unlocked.
+ */
+ region->unregistered = true;
+ spin_unlock(&dmemcg_lock);
+
+ kref_put(&region->ref, dmemcg_free_region);
+}
+EXPORT_SYMBOL_GPL(dmem_cgroup_unregister_region);
+
+/**
+ * dmem_cgroup_register_region() - Register a regions for dev cgroup.
+ * @size: Size of region to register, in bytes.
+ * @fmt: Region parameters to register
+ *
+ * This function registers a node in the dmem cgroup with the
+ * name given. After calling this function, the region can be
+ * used for allocations.
+ *
+ * Return: NULL or a struct on success, PTR_ERR on failure.
+ */
+struct dmem_cgroup_region *dmem_cgroup_register_region(u64 size, const char *fmt, ...)
+{
+ struct dmem_cgroup_region *ret;
+ char *region_name;
+ va_list ap;
+
+ if (!size)
+ return NULL;
+
+ va_start(ap, fmt);
+ region_name = kvasprintf(GFP_KERNEL, fmt, ap);
+ va_end(ap);
+ if (!region_name)
+ return ERR_PTR(-ENOMEM);
+
+ ret = kzalloc(sizeof(*ret), GFP_KERNEL);
+ if (!ret) {
+ kfree(region_name);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ INIT_LIST_HEAD(&ret->pools);
+ ret->name = region_name;
+ ret->size = size;
+ kref_init(&ret->ref);
+
+ spin_lock(&dmemcg_lock);
+ list_add_tail_rcu(&ret->region_node, &dmem_cgroup_regions);
+ spin_unlock(&dmemcg_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dmem_cgroup_register_region);
+
+static struct dmem_cgroup_region *dmemcg_get_region_by_name(const char *name)
+{
+ struct dmem_cgroup_region *region;
+
+ list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node, spin_is_locked(&dmemcg_lock))
+ if (!strcmp(name, region->name) &&
+ kref_get_unless_zero(&region->ref))
+ return region;
+
+ return NULL;
+}
+
+/**
+ * dmem_cgroup_pool_state_put() - Drop a reference to a dmem_cgroup_pool_state
+ * @pool: &dmem_cgroup_pool_state
+ *
+ * Called to drop a reference to the limiting pool returned by
+ * dmem_cgroup_try_charge().
+ */
+void dmem_cgroup_pool_state_put(struct dmem_cgroup_pool_state *pool)
+{
+ if (pool)
+ css_put(&pool->cs->css);
+}
+EXPORT_SYMBOL_GPL(dmem_cgroup_pool_state_put);
+
+static struct dmem_cgroup_pool_state *
+get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region)
+{
+ struct dmem_cgroup_pool_state *pool, *allocpool = NULL;
+
+ /* fastpath lookup? */
+ rcu_read_lock();
+ pool = find_cg_pool_locked(cg, region);
+ if (pool && !READ_ONCE(pool->inited))
+ pool = NULL;
+ rcu_read_unlock();
+
+ while (!pool) {
+ spin_lock(&dmemcg_lock);
+ if (!region->unregistered)
+ pool = get_cg_pool_locked(cg, region, &allocpool);
+ else
+ pool = ERR_PTR(-ENODEV);
+ spin_unlock(&dmemcg_lock);
+
+ if (pool == ERR_PTR(-ENOMEM)) {
+ pool = NULL;
+ if (WARN_ON(allocpool))
+ continue;
+
+ allocpool = kzalloc(sizeof(*allocpool), GFP_KERNEL);
+ if (allocpool) {
+ pool = NULL;
+ continue;
+ }
+ }
+ }
+
+ kfree(allocpool);
+ return pool;
+}
+
+/**
+ * dmem_cgroup_uncharge() - Uncharge a pool.
+ * @pool: Pool to uncharge.
+ * @size: Size to uncharge.
+ *
+ * Undoes the effects of dmem_cgroup_try_charge.
+ * Must be called with the returned pool as argument,
+ * and same @index and @size.
+ */
+void dmem_cgroup_uncharge(struct dmem_cgroup_pool_state *pool, u64 size)
+{
+ if (!pool)
+ return;
+
+ page_counter_uncharge(&pool->cnt, size);
+ css_put(&pool->cs->css);
+}
+EXPORT_SYMBOL_GPL(dmem_cgroup_uncharge);
+
+/**
+ * dmem_cgroup_try_charge() - Try charging a new allocation to a region.
+ * @region: dmem region to charge
+ * @size: Size (in bytes) to charge.
+ * @ret_pool: On succesfull allocation, the pool that is charged.
+ * @ret_limit_pool: On a failed allocation, the limiting pool.
+ *
+ * This function charges the @region region for a size of @size bytes.
+ *
+ * If the function succeeds, @ret_pool is set, which must be passed to
+ * dmem_cgroup_uncharge() when undoing the allocation.
+ *
+ * When this function fails with -EAGAIN and @ret_limit_pool is non-null, it
+ * will be set to the pool for which the limit is hit. This can be used for
+ * eviction as argument to dmem_cgroup_evict_valuable(). This reference must be freed
+ * with @dmem_cgroup_pool_state_put().
+ *
+ * Return: 0 on success, -EAGAIN on hitting a limit, or a negative errno on failure.
+ */
+int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size,
+ struct dmem_cgroup_pool_state **ret_pool,
+ struct dmem_cgroup_pool_state **ret_limit_pool)
+{
+ struct dmemcg_state *cg;
+ struct dmem_cgroup_pool_state *pool;
+ struct page_counter *fail;
+ int ret;
+
+ *ret_pool = NULL;
+ if (ret_limit_pool)
+ *ret_limit_pool = NULL;
+
+ /*
+ * hold on to css, as cgroup can be removed but resource
+ * accounting happens on css.
+ */
+ cg = get_current_dmemcs();
+
+ pool = get_cg_pool_unlocked(cg, region);
+ if (IS_ERR(pool)) {
+ ret = PTR_ERR(pool);
+ goto err;
+ }
+
+ if (!page_counter_try_charge(&pool->cnt, size, &fail)) {
+ if (ret_limit_pool) {
+ *ret_limit_pool = container_of(fail, struct dmem_cgroup_pool_state, cnt);
+ css_get(&(*ret_limit_pool)->cs->css);
+ }
+ ret = -EAGAIN;
+ goto err;
+ }
+
+ /* On success, reference from get_current_dmemcs is transferred to *ret_pool */
+ *ret_pool = pool;
+ return 0;
+
+err:
+ css_put(&cg->css);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dmem_cgroup_try_charge);
+
+static int dmem_cgroup_region_capacity_show(struct seq_file *sf, void *v)
+{
+ struct dmem_cgroup_region *region;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node) {
+ seq_puts(sf, region->name);
+ seq_printf(sf, " %llu\n", region->size);
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+static int dmemcg_parse_limit(char *options, struct dmem_cgroup_region *region,
+ u64 *new_limit)
+{
+ char *end;
+
+ if (!strcmp(options, "max")) {
+ *new_limit = PAGE_COUNTER_MAX;
+ return 0;
+ }
+
+ *new_limit = memparse(options, &end);
+ if (*end != '\0')
+ return -EINVAL;
+
+ return 0;
+}
+
+static ssize_t dmemcg_limit_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off,
+ void (*apply)(struct dmem_cgroup_pool_state *, u64))
+{
+ struct dmemcg_state *dmemcs = css_to_dmemcs(of_css(of));
+ int err = 0;
+
+ while (buf && !err) {
+ struct dmem_cgroup_pool_state *pool = NULL;
+ char *options, *region_name;
+ struct dmem_cgroup_region *region;
+ u64 new_limit;
+
+ options = buf;
+ buf = strchr(buf, '\n');
+ if (buf)
+ *buf++ = '\0';
+
+ options = strstrip(options);
+
+ /* eat empty lines */
+ if (!options[0])
+ continue;
+
+ region_name = strsep(&options, " \t");
+ if (!region_name[0])
+ continue;
+
+ rcu_read_lock();
+ region = dmemcg_get_region_by_name(region_name);
+ rcu_read_unlock();
+
+ if (!region)
+ return -EINVAL;
+
+ err = dmemcg_parse_limit(options, region, &new_limit);
+ if (err < 0)
+ goto out_put;
+
+ pool = get_cg_pool_unlocked(dmemcs, region);
+ if (IS_ERR(pool)) {
+ err = PTR_ERR(pool);
+ goto out_put;
+ }
+
+ /* And commit */
+ apply(pool, new_limit);
+
+out_put:
+ kref_put(&region->ref, dmemcg_free_region);
+ }
+
+
+ return err ?: nbytes;
+}
+
+static int dmemcg_limit_show(struct seq_file *sf, void *v,
+ u64 (*fn)(struct dmem_cgroup_pool_state *))
+{
+ struct dmemcg_state *dmemcs = css_to_dmemcs(seq_css(sf));
+ struct dmem_cgroup_region *region;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node) {
+ struct dmem_cgroup_pool_state *pool = find_cg_pool_locked(dmemcs, region);
+ u64 val;
+
+ seq_puts(sf, region->name);
+
+ val = fn(pool);
+ if (val < PAGE_COUNTER_MAX)
+ seq_printf(sf, " %lld\n", val);
+ else
+ seq_puts(sf, " max\n");
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static int dmem_cgroup_region_current_show(struct seq_file *sf, void *v)
+{
+ return dmemcg_limit_show(sf, v, get_resource_current);
+}
+
+static int dmem_cgroup_region_min_show(struct seq_file *sf, void *v)
+{
+ return dmemcg_limit_show(sf, v, get_resource_min);
+}
+
+static ssize_t dmem_cgroup_region_min_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return dmemcg_limit_write(of, buf, nbytes, off, set_resource_min);
+}
+
+static int dmem_cgroup_region_low_show(struct seq_file *sf, void *v)
+{
+ return dmemcg_limit_show(sf, v, get_resource_low);
+}
+
+static ssize_t dmem_cgroup_region_low_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return dmemcg_limit_write(of, buf, nbytes, off, set_resource_low);
+}
+
+static int dmem_cgroup_region_max_show(struct seq_file *sf, void *v)
+{
+ return dmemcg_limit_show(sf, v, get_resource_max);
+}
+
+static ssize_t dmem_cgroup_region_max_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return dmemcg_limit_write(of, buf, nbytes, off, set_resource_max);
+}
+
+static struct cftype files[] = {
+ {
+ .name = "capacity",
+ .seq_show = dmem_cgroup_region_capacity_show,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+ {
+ .name = "current",
+ .seq_show = dmem_cgroup_region_current_show,
+ },
+ {
+ .name = "min",
+ .write = dmem_cgroup_region_min_write,
+ .seq_show = dmem_cgroup_region_min_show,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "low",
+ .write = dmem_cgroup_region_low_write,
+ .seq_show = dmem_cgroup_region_low_show,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "max",
+ .write = dmem_cgroup_region_max_write,
+ .seq_show = dmem_cgroup_region_max_show,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ { } /* Zero entry terminates. */
+};
+
+struct cgroup_subsys dmem_cgrp_subsys = {
+ .css_alloc = dmemcs_alloc,
+ .css_free = dmemcs_free,
+ .css_offline = dmemcs_offline,
+ .legacy_cftypes = files,
+ .dfl_cftypes = files,
+};
diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c
index 3984dd6b8ddb..6c18854bff34 100644
--- a/kernel/cgroup/freezer.c
+++ b/kernel/cgroup/freezer.c
@@ -1,4 +1,4 @@
-//SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0
#include <linux/cgroup.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
@@ -9,6 +9,28 @@
#include <trace/events/cgroup.h>
/*
+ * Update CGRP_FROZEN of cgroup.flag
+ * Return true if flags is updated; false if flags has no change
+ */
+static bool cgroup_update_frozen_flag(struct cgroup *cgrp, bool frozen)
+{
+ lockdep_assert_held(&css_set_lock);
+
+ /* Already there? */
+ if (test_bit(CGRP_FROZEN, &cgrp->flags) == frozen)
+ return false;
+
+ if (frozen)
+ set_bit(CGRP_FROZEN, &cgrp->flags);
+ else
+ clear_bit(CGRP_FROZEN, &cgrp->flags);
+
+ cgroup_file_notify(&cgrp->events_file);
+ TRACE_CGROUP_PATH(notify_frozen, cgrp, frozen);
+ return true;
+}
+
+/*
* Propagate the cgroup frozen state upwards by the cgroup tree.
*/
static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen)
@@ -24,24 +46,16 @@ static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen)
while ((cgrp = cgroup_parent(cgrp))) {
if (frozen) {
cgrp->freezer.nr_frozen_descendants += desc;
- if (!test_bit(CGRP_FROZEN, &cgrp->flags) &&
- test_bit(CGRP_FREEZE, &cgrp->flags) &&
- cgrp->freezer.nr_frozen_descendants ==
- cgrp->nr_descendants) {
- set_bit(CGRP_FROZEN, &cgrp->flags);
- cgroup_file_notify(&cgrp->events_file);
- TRACE_CGROUP_PATH(notify_frozen, cgrp, 1);
- desc++;
- }
+ if (!test_bit(CGRP_FREEZE, &cgrp->flags) ||
+ (cgrp->freezer.nr_frozen_descendants !=
+ cgrp->nr_descendants))
+ continue;
} else {
cgrp->freezer.nr_frozen_descendants -= desc;
- if (test_bit(CGRP_FROZEN, &cgrp->flags)) {
- clear_bit(CGRP_FROZEN, &cgrp->flags);
- cgroup_file_notify(&cgrp->events_file);
- TRACE_CGROUP_PATH(notify_frozen, cgrp, 0);
- desc++;
- }
}
+
+ if (cgroup_update_frozen_flag(cgrp, frozen))
+ desc++;
}
}
@@ -53,8 +67,6 @@ void cgroup_update_frozen(struct cgroup *cgrp)
{
bool frozen;
- lockdep_assert_held(&css_set_lock);
-
/*
* If the cgroup has to be frozen (CGRP_FREEZE bit set),
* and all tasks are frozen and/or stopped, let's consider
@@ -63,24 +75,9 @@ void cgroup_update_frozen(struct cgroup *cgrp)
frozen = test_bit(CGRP_FREEZE, &cgrp->flags) &&
cgrp->freezer.nr_frozen_tasks == __cgroup_task_count(cgrp);
- if (frozen) {
- /* Already there? */
- if (test_bit(CGRP_FROZEN, &cgrp->flags))
- return;
-
- set_bit(CGRP_FROZEN, &cgrp->flags);
- } else {
- /* Already there? */
- if (!test_bit(CGRP_FROZEN, &cgrp->flags))
- return;
-
- clear_bit(CGRP_FROZEN, &cgrp->flags);
- }
- cgroup_file_notify(&cgrp->events_file);
- TRACE_CGROUP_PATH(notify_frozen, cgrp, frozen);
-
- /* Update the state of ancestor cgroups. */
- cgroup_propagate_frozen(cgrp, frozen);
+ /* If flags is updated, update the state of ancestor cgroups. */
+ if (cgroup_update_frozen_flag(cgrp, frozen))
+ cgroup_propagate_frozen(cgrp, frozen);
}
/*
@@ -174,7 +171,7 @@ static void cgroup_freeze_task(struct task_struct *task, bool freeze)
/*
* Freeze or unfreeze all tasks in the given cgroup.
*/
-static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze)
+static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze, u64 ts_nsec)
{
struct css_task_iter it;
struct task_struct *task;
@@ -182,10 +179,16 @@ static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze)
lockdep_assert_held(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
- if (freeze)
+ write_seqcount_begin(&cgrp->freezer.freeze_seq);
+ if (freeze) {
set_bit(CGRP_FREEZE, &cgrp->flags);
- else
+ cgrp->freezer.freeze_start_nsec = ts_nsec;
+ } else {
clear_bit(CGRP_FREEZE, &cgrp->flags);
+ cgrp->freezer.frozen_nsec += (ts_nsec -
+ cgrp->freezer.freeze_start_nsec);
+ }
+ write_seqcount_end(&cgrp->freezer.freeze_seq);
spin_unlock_irq(&css_set_lock);
if (freeze)
@@ -260,8 +263,11 @@ void cgroup_freezer_migrate_task(struct task_struct *task,
void cgroup_freeze(struct cgroup *cgrp, bool freeze)
{
struct cgroup_subsys_state *css;
+ struct cgroup *parent;
struct cgroup *dsct;
bool applied = false;
+ u64 ts_nsec;
+ bool old_e;
lockdep_assert_held(&cgroup_mutex);
@@ -272,6 +278,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze)
return;
cgrp->freezer.freeze = freeze;
+ ts_nsec = ktime_get_ns();
/*
* Propagate changes downwards the cgroup tree.
@@ -282,28 +289,24 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze)
if (cgroup_is_dead(dsct))
continue;
- if (freeze) {
- dsct->freezer.e_freeze++;
- /*
- * Already frozen because of ancestor's settings?
- */
- if (dsct->freezer.e_freeze > 1)
- continue;
- } else {
- dsct->freezer.e_freeze--;
- /*
- * Still frozen because of ancestor's settings?
- */
- if (dsct->freezer.e_freeze > 0)
- continue;
-
- WARN_ON_ONCE(dsct->freezer.e_freeze < 0);
+ /*
+ * e_freeze is affected by parent's e_freeze and dst's freeze.
+ * If old e_freeze eq new e_freeze, no change, its children
+ * will not be affected. So do nothing and skip the subtree
+ */
+ old_e = dsct->freezer.e_freeze;
+ parent = cgroup_parent(dsct);
+ dsct->freezer.e_freeze = (dsct->freezer.freeze ||
+ parent->freezer.e_freeze);
+ if (dsct->freezer.e_freeze == old_e) {
+ css = css_rightmost_descendant(css);
+ continue;
}
/*
* Do change actual state: freeze or unfreeze.
*/
- cgroup_do_freeze(dsct, freeze);
+ cgroup_do_freeze(dsct, freeze, ts_nsec);
applied = true;
}
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
index 08236798d173..915b02f65980 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -22,6 +22,7 @@
#include <linux/freezer.h>
#include <linux/seq_file.h>
#include <linux/mutex.h>
+#include <linux/cpu.h>
/*
* A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
@@ -62,7 +63,7 @@ static struct freezer *parent_freezer(struct freezer *freezer)
return css_freezer(freezer->css.parent);
}
-bool cgroup_freezing(struct task_struct *task)
+bool cgroup1_freezing(struct task_struct *task)
{
bool ret;
@@ -99,24 +100,25 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css)
* @css: css being created
*
* We're committing to creation of @css. Mark it online and inherit
- * parent's freezing state while holding both parent's and our
- * freezer->lock.
+ * parent's freezing state while holding cpus read lock and freezer_mutex.
*/
static int freezer_css_online(struct cgroup_subsys_state *css)
{
struct freezer *freezer = css_freezer(css);
struct freezer *parent = parent_freezer(freezer);
+ cpus_read_lock();
mutex_lock(&freezer_mutex);
freezer->state |= CGROUP_FREEZER_ONLINE;
if (parent && (parent->state & CGROUP_FREEZING)) {
freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
- atomic_inc(&system_freezing_cnt);
+ static_branch_inc_cpuslocked(&freezer_active);
}
mutex_unlock(&freezer_mutex);
+ cpus_read_unlock();
return 0;
}
@@ -124,21 +126,23 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
* freezer_css_offline - initiate destruction of a freezer css
* @css: css being destroyed
*
- * @css is going away. Mark it dead and decrement system_freezing_count if
+ * @css is going away. Mark it dead and decrement freezer_active if
* it was holding one.
*/
static void freezer_css_offline(struct cgroup_subsys_state *css)
{
struct freezer *freezer = css_freezer(css);
+ cpus_read_lock();
mutex_lock(&freezer_mutex);
if (freezer->state & CGROUP_FREEZING)
- atomic_dec(&system_freezing_cnt);
+ static_branch_dec_cpuslocked(&freezer_active);
freezer->state = 0;
mutex_unlock(&freezer_mutex);
+ cpus_read_unlock();
}
static void freezer_css_free(struct cgroup_subsys_state *css)
@@ -178,12 +182,12 @@ static void freezer_attach(struct cgroup_taskset *tset)
if (!(freezer->state & CGROUP_FREEZING)) {
__thaw_task(task);
} else {
- freeze_task(task);
/* clear FROZEN and propagate upwards */
while (freezer && (freezer->state & CGROUP_FROZEN)) {
freezer->state &= ~CGROUP_FROZEN;
freezer = parent_freezer(freezer);
}
+ freeze_task(task);
}
}
@@ -271,16 +275,8 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
css_task_iter_start(css, 0, &it);
while ((task = css_task_iter_next(&it))) {
- if (freezing(task)) {
- /*
- * freezer_should_skip() indicates that the task
- * should be skipped when determining freezing
- * completion. Consider it frozen in addition to
- * the usual frozen condition.
- */
- if (!frozen(task) && !freezer_should_skip(task))
- goto out_iter_end;
- }
+ if (freezing(task) && !frozen(task))
+ goto out_iter_end;
}
freezer->state |= CGROUP_FROZEN;
@@ -357,7 +353,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
if (freeze) {
if (!(freezer->state & CGROUP_FREEZING))
- atomic_inc(&system_freezing_cnt);
+ static_branch_inc_cpuslocked(&freezer_active);
freezer->state |= state;
freeze_cgroup(freezer);
} else {
@@ -366,9 +362,9 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
freezer->state &= ~state;
if (!(freezer->state & CGROUP_FREEZING)) {
- if (was_freezing)
- atomic_dec(&system_freezing_cnt);
freezer->state &= ~CGROUP_FROZEN;
+ if (was_freezing)
+ static_branch_dec_cpuslocked(&freezer_active);
unfreeze_cgroup(freezer);
}
}
@@ -386,6 +382,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
{
struct cgroup_subsys_state *pos;
+ cpus_read_lock();
/*
* Update all its descendants in pre-order traversal. Each
* descendant will try to inherit its parent's FREEZING state as
@@ -414,6 +411,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
}
rcu_read_unlock();
mutex_unlock(&freezer_mutex);
+ cpus_read_unlock();
}
static ssize_t freezer_write(struct kernfs_open_file *of,
@@ -425,9 +423,11 @@ static ssize_t freezer_write(struct kernfs_open_file *of,
if (strcmp(buf, freezer_state_strs(0)) == 0)
freeze = false;
- else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0)
+ else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) {
+ pr_info_once("Freezing with imperfect legacy cgroup freezer. "
+ "See cgroup.freeze of cgroup v2\n");
freeze = true;
- else
+ } else
return -EINVAL;
freezer_change_state(css_freezer(of_css(of)), freeze);
diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
index fe3e8a0eb7ed..6a01d91ea4cb 100644
--- a/kernel/cgroup/misc.c
+++ b/kernel/cgroup/misc.c
@@ -14,7 +14,7 @@
#include <linux/misc_cgroup.h>
#define MAX_STR "max"
-#define MAX_NUM ULONG_MAX
+#define MAX_NUM U64_MAX
/* Miscellaneous res name, keep it in sync with enum misc_res_type */
static const char *const misc_res_name[] = {
@@ -24,6 +24,10 @@ static const char *const misc_res_name[] = {
/* AMD SEV-ES ASIDs resource */
"sev_es",
#endif
+#ifdef CONFIG_INTEL_TDX_HOST
+ /* Intel TDX HKIDs resource */
+ "tdx",
+#endif
};
/* Root misc cgroup */
@@ -37,7 +41,7 @@ static struct misc_cg root_cg;
* more than the actual capacity. We are using Limits resource distribution
* model of cgroup for miscellaneous controller.
*/
-static unsigned long misc_res_capacity[MISC_CG_RES_TYPES];
+static u64 misc_res_capacity[MISC_CG_RES_TYPES];
/**
* parent_misc() - Get the parent of the passed misc cgroup.
@@ -68,22 +72,6 @@ static inline bool valid_type(enum misc_res_type type)
}
/**
- * misc_cg_res_total_usage() - Get the current total usage of the resource.
- * @type: misc res type.
- *
- * Context: Any context.
- * Return: Current total usage of the resource.
- */
-unsigned long misc_cg_res_total_usage(enum misc_res_type type)
-{
- if (valid_type(type))
- return atomic_long_read(&root_cg.res[type].usage);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(misc_cg_res_total_usage);
-
-/**
* misc_cg_set_capacity() - Set the capacity of the misc cgroup res.
* @type: Type of the misc res.
* @capacity: Supported capacity of the misc res on the host.
@@ -95,7 +83,7 @@ EXPORT_SYMBOL_GPL(misc_cg_res_total_usage);
* * %0 - Successfully registered the capacity.
* * %-EINVAL - If @type is invalid.
*/
-int misc_cg_set_capacity(enum misc_res_type type, unsigned long capacity)
+int misc_cg_set_capacity(enum misc_res_type type, u64 capacity)
{
if (!valid_type(type))
return -EINVAL;
@@ -114,13 +102,37 @@ EXPORT_SYMBOL_GPL(misc_cg_set_capacity);
* Context: Any context.
*/
static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg,
- unsigned long amount)
+ u64 amount)
{
- WARN_ONCE(atomic_long_add_negative(-amount, &cg->res[type].usage),
+ WARN_ONCE(atomic64_add_negative(-amount, &cg->res[type].usage),
"misc cgroup resource %s became less than 0",
misc_res_name[type]);
}
+static void misc_cg_update_watermark(struct misc_res *res, u64 new_usage)
+{
+ u64 old;
+
+ while (true) {
+ old = atomic64_read(&res->watermark);
+ if (new_usage <= old)
+ break;
+ if (atomic64_cmpxchg(&res->watermark, old, new_usage) == old)
+ break;
+ }
+}
+
+static void misc_cg_event(enum misc_res_type type, struct misc_cg *cg)
+{
+ atomic64_inc(&cg->res[type].events_local);
+ cgroup_file_notify(&cg->events_local_file);
+
+ for (; parent_misc(cg); cg = parent_misc(cg)) {
+ atomic64_inc(&cg->res[type].events);
+ cgroup_file_notify(&cg->events_file);
+ }
+}
+
/**
* misc_cg_try_charge() - Try charging the misc cgroup.
* @type: Misc res type to charge.
@@ -137,13 +149,12 @@ static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg,
* * -EBUSY - If max limit will be crossed or total usage will be more than the
* capacity.
*/
-int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
- unsigned long amount)
+int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount)
{
struct misc_cg *i, *j;
int ret;
struct misc_res *res;
- int new_usage;
+ u64 new_usage;
if (!(valid_type(type) && cg && READ_ONCE(misc_res_capacity[type])))
return -EINVAL;
@@ -154,20 +165,18 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
for (i = cg; i; i = parent_misc(i)) {
res = &i->res[type];
- new_usage = atomic_long_add_return(amount, &res->usage);
+ new_usage = atomic64_add_return(amount, &res->usage);
if (new_usage > READ_ONCE(res->max) ||
new_usage > READ_ONCE(misc_res_capacity[type])) {
ret = -EBUSY;
goto err_charge;
}
+ misc_cg_update_watermark(res, new_usage);
}
return 0;
err_charge:
- for (j = i; j; j = parent_misc(j)) {
- atomic_long_inc(&j->res[type].events);
- cgroup_file_notify(&j->events_file);
- }
+ misc_cg_event(type, i);
for (j = cg; j != i; j = parent_misc(j))
misc_cg_cancel_charge(type, j, amount);
@@ -184,8 +193,7 @@ EXPORT_SYMBOL_GPL(misc_cg_try_charge);
*
* Context: Any context.
*/
-void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg,
- unsigned long amount)
+void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, u64 amount)
{
struct misc_cg *i;
@@ -209,7 +217,7 @@ static int misc_cg_max_show(struct seq_file *sf, void *v)
{
int i;
struct misc_cg *cg = css_misc(seq_css(sf));
- unsigned long max;
+ u64 max;
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
if (READ_ONCE(misc_res_capacity[i])) {
@@ -217,7 +225,7 @@ static int misc_cg_max_show(struct seq_file *sf, void *v)
if (max == MAX_NUM)
seq_printf(sf, "%s max\n", misc_res_name[i]);
else
- seq_printf(sf, "%s %lu\n", misc_res_name[i],
+ seq_printf(sf, "%s %llu\n", misc_res_name[i],
max);
}
}
@@ -241,13 +249,13 @@ static int misc_cg_max_show(struct seq_file *sf, void *v)
* Return:
* * >= 0 - Number of bytes processed in the input.
* * -EINVAL - If buf is not valid.
- * * -ERANGE - If number is bigger than the unsigned long capacity.
+ * * -ERANGE - If number is bigger than the u64 capacity.
*/
static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
struct misc_cg *cg;
- unsigned long max;
+ u64 max;
int ret = 0, i;
enum misc_res_type type = MISC_CG_RES_TYPES;
char *token;
@@ -271,7 +279,7 @@ static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf,
if (!strcmp(MAX_STR, buf)) {
max = MAX_NUM;
} else {
- ret = kstrtoul(buf, 0, &max);
+ ret = kstrtou64(buf, 0, &max);
if (ret)
return ret;
}
@@ -297,13 +305,36 @@ static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf,
static int misc_cg_current_show(struct seq_file *sf, void *v)
{
int i;
- unsigned long usage;
+ u64 usage;
struct misc_cg *cg = css_misc(seq_css(sf));
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
- usage = atomic_long_read(&cg->res[i].usage);
+ usage = atomic64_read(&cg->res[i].usage);
if (READ_ONCE(misc_res_capacity[i]) || usage)
- seq_printf(sf, "%s %lu\n", misc_res_name[i], usage);
+ seq_printf(sf, "%s %llu\n", misc_res_name[i], usage);
+ }
+
+ return 0;
+}
+
+/**
+ * misc_cg_peak_show() - Show the peak usage of the misc cgroup.
+ * @sf: Interface file
+ * @v: Arguments passed
+ *
+ * Context: Any context.
+ * Return: 0 to denote successful print.
+ */
+static int misc_cg_peak_show(struct seq_file *sf, void *v)
+{
+ int i;
+ u64 watermark;
+ struct misc_cg *cg = css_misc(seq_css(sf));
+
+ for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+ watermark = atomic64_read(&cg->res[i].watermark);
+ if (READ_ONCE(misc_res_capacity[i]) || watermark)
+ seq_printf(sf, "%s %llu\n", misc_res_name[i], watermark);
}
return 0;
@@ -322,30 +353,44 @@ static int misc_cg_current_show(struct seq_file *sf, void *v)
static int misc_cg_capacity_show(struct seq_file *sf, void *v)
{
int i;
- unsigned long cap;
+ u64 cap;
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
cap = READ_ONCE(misc_res_capacity[i]);
if (cap)
- seq_printf(sf, "%s %lu\n", misc_res_name[i], cap);
+ seq_printf(sf, "%s %llu\n", misc_res_name[i], cap);
}
return 0;
}
-static int misc_events_show(struct seq_file *sf, void *v)
+static int __misc_events_show(struct seq_file *sf, bool local)
{
struct misc_cg *cg = css_misc(seq_css(sf));
- unsigned long events, i;
+ u64 events;
+ int i;
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
- events = atomic_long_read(&cg->res[i].events);
+ if (local)
+ events = atomic64_read(&cg->res[i].events_local);
+ else
+ events = atomic64_read(&cg->res[i].events);
if (READ_ONCE(misc_res_capacity[i]) || events)
- seq_printf(sf, "%s.max %lu\n", misc_res_name[i], events);
+ seq_printf(sf, "%s.max %llu\n", misc_res_name[i], events);
}
return 0;
}
+static int misc_events_show(struct seq_file *sf, void *v)
+{
+ return __misc_events_show(sf, false);
+}
+
+static int misc_events_local_show(struct seq_file *sf, void *v)
+{
+ return __misc_events_show(sf, true);
+}
+
/* Misc cgroup interface files */
static struct cftype misc_cg_files[] = {
{
@@ -357,7 +402,10 @@ static struct cftype misc_cg_files[] = {
{
.name = "current",
.seq_show = misc_cg_current_show,
- .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "peak",
+ .seq_show = misc_cg_peak_show,
},
{
.name = "capacity",
@@ -370,6 +418,12 @@ static struct cftype misc_cg_files[] = {
.file_offset = offsetof(struct misc_cg, events_file),
.seq_show = misc_events_show,
},
+ {
+ .name = "events.local",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .file_offset = offsetof(struct misc_cg, events_local_file),
+ .seq_show = misc_events_local_show,
+ },
{}
};
@@ -398,7 +452,7 @@ misc_cg_alloc(struct cgroup_subsys_state *parent_css)
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
WRITE_ONCE(cg->res[i].max, MAX_NUM);
- atomic_long_set(&cg->res[i].usage, 0);
+ atomic64_set(&cg->res[i].usage, 0);
}
return &cg->css;
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index 0d5c29879a50..db9617556dd7 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -5,7 +5,7 @@
#include <linux/slab.h>
#include <linux/nsproxy.h>
#include <linux/proc_ns.h>
-
+#include <linux/nstree.h>
/* cgroup namespaces */
@@ -21,33 +21,31 @@ static void dec_cgroup_namespaces(struct ucounts *ucounts)
static struct cgroup_namespace *alloc_cgroup_ns(void)
{
- struct cgroup_namespace *new_ns;
+ struct cgroup_namespace *new_ns __free(kfree) = NULL;
int ret;
new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT);
if (!new_ns)
return ERR_PTR(-ENOMEM);
- ret = ns_alloc_inum(&new_ns->ns);
- if (ret) {
- kfree(new_ns);
+ ret = ns_common_init(new_ns);
+ if (ret)
return ERR_PTR(ret);
- }
- refcount_set(&new_ns->ns.count, 1);
- new_ns->ns.ops = &cgroupns_operations;
- return new_ns;
+ return no_free_ptr(new_ns);
}
void free_cgroup_ns(struct cgroup_namespace *ns)
{
+ ns_tree_remove(ns);
put_css_set(ns->root_cset);
dec_cgroup_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
- ns_free_inum(&ns->ns);
- kfree(ns);
+ ns_common_free(ns);
+ /* Concurrent nstree traversal depends on a grace period. */
+ kfree_rcu(ns, ns.ns_rcu);
}
EXPORT_SYMBOL(free_cgroup_ns);
-struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
+struct cgroup_namespace *copy_cgroup_ns(u64 flags,
struct user_namespace *user_ns,
struct cgroup_namespace *old_ns)
{
@@ -87,14 +85,10 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
new_ns->ucounts = ucounts;
new_ns->root_cset = cset;
+ ns_tree_add(new_ns);
return new_ns;
}
-static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
-{
- return container_of(ns, struct cgroup_namespace, ns);
-}
-
static int cgroupns_install(struct nsset *nsset, struct ns_common *ns)
{
struct nsproxy *nsproxy = nsset->nsproxy;
@@ -143,15 +137,8 @@ static struct user_namespace *cgroupns_owner(struct ns_common *ns)
const struct proc_ns_operations cgroupns_operations = {
.name = "cgroup",
- .type = CLONE_NEWCGROUP,
.get = cgroupns_get,
.put = cgroupns_put,
.install = cgroupns_install,
.owner = cgroupns_owner,
};
-
-static __init int cgroup_namespaces_init(void)
-{
- return 0;
-}
-subsys_initcall(cgroup_namespaces_init);
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 511af87f685e..8f61114c36dd 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -38,6 +38,14 @@
#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
#define PIDS_MAX_STR "max"
+enum pidcg_event {
+ /* Fork failed in subtree because this pids_cgroup limit was hit. */
+ PIDCG_MAX,
+ /* Fork failed in this pids_cgroup because ancestor limit was hit. */
+ PIDCG_FORKFAIL,
+ NR_PIDCG_EVENTS,
+};
+
struct pids_cgroup {
struct cgroup_subsys_state css;
@@ -47,12 +55,14 @@ struct pids_cgroup {
*/
atomic64_t counter;
atomic64_t limit;
+ int64_t watermark;
- /* Handle for "pids.events" */
+ /* Handles for pids.events[.local] */
struct cgroup_file events_file;
+ struct cgroup_file events_local_file;
- /* Number of times fork failed because limit was hit. */
- atomic64_t events_limit;
+ atomic64_t events[NR_PIDCG_EVENTS];
+ atomic64_t events_local[NR_PIDCG_EVENTS];
};
static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
@@ -74,9 +84,7 @@ pids_css_alloc(struct cgroup_subsys_state *parent)
if (!pids)
return ERR_PTR(-ENOMEM);
- atomic64_set(&pids->counter, 0);
atomic64_set(&pids->limit, PIDS_MAX);
- atomic64_set(&pids->events_limit, 0);
return &pids->css;
}
@@ -85,6 +93,16 @@ static void pids_css_free(struct cgroup_subsys_state *css)
kfree(css_pids(css));
}
+static void pids_update_watermark(struct pids_cgroup *p, int64_t nr_pids)
+{
+ /*
+ * This is racy, but we don't need perfectly accurate tallying of
+ * the watermark, and this lets us avoid extra atomic overhead.
+ */
+ if (nr_pids > READ_ONCE(p->watermark))
+ WRITE_ONCE(p->watermark, nr_pids);
+}
+
/**
* pids_cancel - uncharge the local pid count
* @pids: the pid cgroup state
@@ -128,20 +146,24 @@ static void pids_charge(struct pids_cgroup *pids, int num)
{
struct pids_cgroup *p;
- for (p = pids; parent_pids(p); p = parent_pids(p))
- atomic64_add(num, &p->counter);
+ for (p = pids; parent_pids(p); p = parent_pids(p)) {
+ int64_t new = atomic64_add_return(num, &p->counter);
+
+ pids_update_watermark(p, new);
+ }
}
/**
* pids_try_charge - hierarchically try to charge the pid count
* @pids: the pid cgroup state
* @num: the number of pids to charge
+ * @fail: storage of pid cgroup causing the fail
*
* This function follows the set limit. It will fail if the charge would cause
* the new value to exceed the hierarchical limit. Returns 0 if the charge
* succeeded, otherwise -EAGAIN.
*/
-static int pids_try_charge(struct pids_cgroup *pids, int num)
+static int pids_try_charge(struct pids_cgroup *pids, int num, struct pids_cgroup **fail)
{
struct pids_cgroup *p, *q;
@@ -154,8 +176,15 @@ static int pids_try_charge(struct pids_cgroup *pids, int num)
* p->limit is %PIDS_MAX then we know that this test will never
* fail.
*/
- if (new > limit)
+ if (new > limit) {
+ *fail = p;
goto revert;
+ }
+ /*
+ * Not technically accurate if we go over limit somewhere up
+ * the hierarchy, but that's tolerable for the watermark.
+ */
+ pids_update_watermark(p, new);
}
return 0;
@@ -211,44 +240,54 @@ static void pids_cancel_attach(struct cgroup_taskset *tset)
}
}
+static void pids_event(struct pids_cgroup *pids_forking,
+ struct pids_cgroup *pids_over_limit)
+{
+ struct pids_cgroup *p = pids_forking;
+
+ /* Only log the first time limit is hit. */
+ if (atomic64_inc_return(&p->events_local[PIDCG_FORKFAIL]) == 1) {
+ pr_info("cgroup: fork rejected by pids controller in ");
+ pr_cont_cgroup_path(p->css.cgroup);
+ pr_cont("\n");
+ }
+ if (!cgroup_subsys_on_dfl(pids_cgrp_subsys) ||
+ cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) {
+ cgroup_file_notify(&p->events_local_file);
+ return;
+ }
+
+ atomic64_inc(&pids_over_limit->events_local[PIDCG_MAX]);
+ cgroup_file_notify(&pids_over_limit->events_local_file);
+
+ for (p = pids_over_limit; parent_pids(p); p = parent_pids(p)) {
+ atomic64_inc(&p->events[PIDCG_MAX]);
+ cgroup_file_notify(&p->events_file);
+ }
+}
+
/*
* task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
* on cgroup_threadgroup_change_begin() held by the copy_process().
*/
static int pids_can_fork(struct task_struct *task, struct css_set *cset)
{
- struct cgroup_subsys_state *css;
- struct pids_cgroup *pids;
+ struct pids_cgroup *pids, *pids_over_limit;
int err;
- if (cset)
- css = cset->subsys[pids_cgrp_id];
- else
- css = task_css_check(current, pids_cgrp_id, true);
- pids = css_pids(css);
- err = pids_try_charge(pids, 1);
- if (err) {
- /* Only log the first time events_limit is incremented. */
- if (atomic64_inc_return(&pids->events_limit) == 1) {
- pr_info("cgroup: fork rejected by pids controller in ");
- pr_cont_cgroup_path(css->cgroup);
- pr_cont("\n");
- }
- cgroup_file_notify(&pids->events_file);
- }
+ pids = css_pids(cset->subsys[pids_cgrp_id]);
+ err = pids_try_charge(pids, 1, &pids_over_limit);
+ if (err)
+ pids_event(pids, pids_over_limit);
+
return err;
}
static void pids_cancel_fork(struct task_struct *task, struct css_set *cset)
{
- struct cgroup_subsys_state *css;
struct pids_cgroup *pids;
- if (cset)
- css = cset->subsys[pids_cgrp_id];
- else
- css = task_css_check(current, pids_cgrp_id, true);
- pids = css_pids(css);
+ pids = css_pids(cset->subsys[pids_cgrp_id]);
pids_uncharge(pids, 1);
}
@@ -311,11 +350,40 @@ static s64 pids_current_read(struct cgroup_subsys_state *css,
return atomic64_read(&pids->counter);
}
-static int pids_events_show(struct seq_file *sf, void *v)
+static s64 pids_peak_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct pids_cgroup *pids = css_pids(css);
+
+ return READ_ONCE(pids->watermark);
+}
+
+static int __pids_events_show(struct seq_file *sf, bool local)
{
struct pids_cgroup *pids = css_pids(seq_css(sf));
+ enum pidcg_event pe = PIDCG_MAX;
+ atomic64_t *events;
+
+ if (!cgroup_subsys_on_dfl(pids_cgrp_subsys) ||
+ cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) {
+ pe = PIDCG_FORKFAIL;
+ local = true;
+ }
+ events = local ? pids->events_local : pids->events;
- seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit));
+ seq_printf(sf, "max %lld\n", (s64)atomic64_read(&events[pe]));
+ return 0;
+}
+
+static int pids_events_show(struct seq_file *sf, void *v)
+{
+ __pids_events_show(sf, false);
+ return 0;
+}
+
+static int pids_events_local_show(struct seq_file *sf, void *v)
+{
+ __pids_events_show(sf, true);
return 0;
}
@@ -332,14 +400,52 @@ static struct cftype pids_files[] = {
.flags = CFTYPE_NOT_ON_ROOT,
},
{
+ .name = "peak",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_s64 = pids_peak_read,
+ },
+ {
.name = "events",
.seq_show = pids_events_show,
.file_offset = offsetof(struct pids_cgroup, events_file),
.flags = CFTYPE_NOT_ON_ROOT,
},
+ {
+ .name = "events.local",
+ .seq_show = pids_events_local_show,
+ .file_offset = offsetof(struct pids_cgroup, events_local_file),
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
{ } /* terminate */
};
+static struct cftype pids_files_legacy[] = {
+ {
+ .name = "max",
+ .write = pids_max_write,
+ .seq_show = pids_max_show,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "current",
+ .read_s64 = pids_current_read,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "peak",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_s64 = pids_peak_read,
+ },
+ {
+ .name = "events",
+ .seq_show = pids_events_show,
+ .file_offset = offsetof(struct pids_cgroup, events_file),
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ { } /* terminate */
+};
+
+
struct cgroup_subsys pids_cgrp_subsys = {
.css_alloc = pids_css_alloc,
.css_free = pids_css_free,
@@ -348,7 +454,7 @@ struct cgroup_subsys pids_cgrp_subsys = {
.can_fork = pids_can_fork,
.cancel_fork = pids_cancel_fork,
.release = pids_release,
- .legacy_cftypes = pids_files,
+ .legacy_cftypes = pids_files_legacy,
.dfl_cftypes = pids_files,
.threaded = true,
};
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
index 3135406608c7..ef5878fb2005 100644
--- a/kernel/cgroup/rdma.c
+++ b/kernel/cgroup/rdma.c
@@ -197,6 +197,7 @@ uncharge_cg_locked(struct rdma_cgroup *cg,
/**
* rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
+ * @cg: pointer to cg to uncharge and all parents in hierarchy
* @device: pointer to rdmacg device
* @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup
* stop uncharging
@@ -221,6 +222,7 @@ static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
/**
* rdmacg_uncharge - hierarchically uncharge rdma resource count
+ * @cg: pointer to cg to uncharge and all parents in hierarchy
* @device: pointer to rdmacg device
* @index: index of the resource to uncharge in cgroup in given resource pool
*/
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 9d331ba44870..a198e40c799b 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -3,48 +3,132 @@
#include <linux/sched/cputime.h>
-static DEFINE_SPINLOCK(cgroup_rstat_lock);
-static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+
+#include <trace/events/cgroup.h>
+
+static DEFINE_SPINLOCK(rstat_base_lock);
+static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list);
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
-static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
+/*
+ * Determines whether a given css can participate in rstat.
+ * css's that are cgroup::self use rstat for base stats.
+ * Other css's associated with a subsystem use rstat only when
+ * they define the ss->css_rstat_flush callback.
+ */
+static inline bool css_uses_rstat(struct cgroup_subsys_state *css)
+{
+ return css_is_self(css) || css->ss->css_rstat_flush != NULL;
+}
+
+static struct css_rstat_cpu *css_rstat_cpu(
+ struct cgroup_subsys_state *css, int cpu)
+{
+ return per_cpu_ptr(css->rstat_cpu, cpu);
+}
+
+static struct cgroup_rstat_base_cpu *cgroup_rstat_base_cpu(
+ struct cgroup *cgrp, int cpu)
{
- return per_cpu_ptr(cgrp->rstat_cpu, cpu);
+ return per_cpu_ptr(cgrp->rstat_base_cpu, cpu);
+}
+
+static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss)
+{
+ if (ss)
+ return &ss->rstat_ss_lock;
+
+ return &rstat_base_lock;
+}
+
+static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu)
+{
+ if (ss)
+ return per_cpu_ptr(ss->lhead, cpu);
+ return per_cpu_ptr(&rstat_backlog_list, cpu);
}
/**
- * cgroup_rstat_updated - keep track of updated rstat_cpu
- * @cgrp: target cgroup
+ * css_rstat_updated - keep track of updated rstat_cpu
+ * @css: target cgroup subsystem state
* @cpu: cpu on which rstat_cpu was updated
*
- * @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching
- * rstat_cpu->updated_children list. See the comment on top of
- * cgroup_rstat_cpu definition for details.
+ * Atomically inserts the css in the ss's llist for the given cpu. This is
+ * reentrant safe i.e. safe against softirq, hardirq and nmi. The ss's llist
+ * will be processed at the flush time to create the update tree.
+ *
+ * NOTE: if the user needs the guarantee that the updater either add itself in
+ * the lockless list or the concurrent flusher flushes its updated stats, a
+ * memory barrier is needed before the call to css_rstat_updated() i.e. a
+ * barrier after updating the per-cpu stats and before calling
+ * css_rstat_updated().
*/
-void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
+__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
{
- raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
- unsigned long flags;
+ struct llist_head *lhead;
+ struct css_rstat_cpu *rstatc;
+ struct css_rstat_cpu __percpu *rstatc_pcpu;
+ struct llist_node *self;
/*
- * Speculative already-on-list test. This may race leading to
- * temporary inaccuracies, which is fine.
+ * Since bpf programs can call this function, prevent access to
+ * uninitialized rstat pointers.
+ */
+ if (!css_uses_rstat(css))
+ return;
+
+ lockdep_assert_preemption_disabled();
+
+ /*
+ * For archs withnot nmi safe cmpxchg or percpu ops support, ignore
+ * the requests from nmi context.
+ */
+ if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) ||
+ !IS_ENABLED(CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS)) && in_nmi())
+ return;
+
+ rstatc = css_rstat_cpu(css, cpu);
+ /*
+ * If already on list return. This check is racy and smp_mb() is needed
+ * to pair it with the smp_mb() in css_process_update_tree() if the
+ * guarantee that the updated stats are visible to concurrent flusher is
+ * needed.
+ */
+ if (llist_on_list(&rstatc->lnode))
+ return;
+
+ /*
+ * This function can be renentered by irqs and nmis for the same cgroup
+ * and may try to insert the same per-cpu lnode into the llist. Note
+ * that llist_add() does not protect against such scenarios.
*
- * Because @parent's updated_children is terminated with @parent
- * instead of NULL, we can tell whether @cgrp is on the list by
- * testing the next pointer for NULL.
+ * To protect against such stacked contexts of irqs/nmis, we use the
+ * fact that lnode points to itself when not on a list and then use
+ * this_cpu_cmpxchg() to atomically set to NULL to select the winner
+ * which will call llist_add(). The losers can assume the insertion is
+ * successful and the winner will eventually add the per-cpu lnode to
+ * the llist.
*/
- if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
+ self = &rstatc->lnode;
+ rstatc_pcpu = css->rstat_cpu;
+ if (this_cpu_cmpxchg(rstatc_pcpu->lnode.next, self, NULL) != self)
return;
- raw_spin_lock_irqsave(cpu_lock, flags);
+ lhead = ss_lhead_cpu(css->ss, cpu);
+ llist_add(&rstatc->lnode, lhead);
+}
- /* put @cgrp and all ancestors on the corresponding updated lists */
+static void __css_process_update_tree(struct cgroup_subsys_state *css, int cpu)
+{
+ /* put @css and all ancestors on the corresponding updated lists */
while (true) {
- struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
- struct cgroup *parent = cgroup_parent(cgrp);
- struct cgroup_rstat_cpu *prstatc;
+ struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
+ struct cgroup_subsys_state *parent = css->parent;
+ struct css_rstat_cpu *prstatc;
/*
* Both additions and removals are bottom-up. If a cgroup
@@ -55,82 +139,169 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
/* Root has no parent to link it to, but mark it busy */
if (!parent) {
- rstatc->updated_next = cgrp;
+ rstatc->updated_next = css;
break;
}
- prstatc = cgroup_rstat_cpu(parent, cpu);
+ prstatc = css_rstat_cpu(parent, cpu);
rstatc->updated_next = prstatc->updated_children;
- prstatc->updated_children = cgrp;
+ prstatc->updated_children = css;
- cgrp = parent;
+ css = parent;
}
+}
+
+static void css_process_update_tree(struct cgroup_subsys *ss, int cpu)
+{
+ struct llist_head *lhead = ss_lhead_cpu(ss, cpu);
+ struct llist_node *lnode;
- raw_spin_unlock_irqrestore(cpu_lock, flags);
+ while ((lnode = llist_del_first_init(lhead))) {
+ struct css_rstat_cpu *rstatc;
+
+ /*
+ * smp_mb() is needed here (more specifically in between
+ * init_llist_node() and per-cpu stats flushing) if the
+ * guarantee is required by a rstat user where etiher the
+ * updater should add itself on the lockless list or the
+ * flusher flush the stats updated by the updater who have
+ * observed that they are already on the list. The
+ * corresponding barrier pair for this one should be before
+ * css_rstat_updated() by the user.
+ *
+ * For now, there aren't any such user, so not adding the
+ * barrier here but if such a use-case arise, please add
+ * smp_mb() here.
+ */
+
+ rstatc = container_of(lnode, struct css_rstat_cpu, lnode);
+ __css_process_update_tree(rstatc->owner, cpu);
+ }
}
/**
- * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
- * @pos: current position
- * @root: root of the tree to traversal
+ * css_rstat_push_children - push children css's into the given list
+ * @head: current head of the list (= subtree root)
+ * @child: first child of the root
* @cpu: target cpu
+ * Return: A new singly linked list of css's to be flushed
*
- * Walks the updated rstat_cpu tree on @cpu from @root. %NULL @pos starts
- * the traversal and %NULL return indicates the end. During traversal,
- * each returned cgroup is unlinked from the tree. Must be called with the
- * matching cgroup_rstat_cpu_lock held.
- *
- * The only ordering guarantee is that, for a parent and a child pair
- * covered by a given traversal, if a child is visited, its parent is
- * guaranteed to be visited afterwards.
+ * Iteratively traverse down the css_rstat_cpu updated tree level by
+ * level and push all the parents first before their next level children
+ * into a singly linked list via the rstat_flush_next pointer built from the
+ * tail backward like "pushing" css's into a stack. The root is pushed by
+ * the caller.
*/
-static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
- struct cgroup *root, int cpu)
+static struct cgroup_subsys_state *css_rstat_push_children(
+ struct cgroup_subsys_state *head,
+ struct cgroup_subsys_state *child, int cpu)
{
- struct cgroup_rstat_cpu *rstatc;
- struct cgroup *parent;
+ struct cgroup_subsys_state *cnext = child; /* Next head of child css level */
+ struct cgroup_subsys_state *ghead = NULL; /* Head of grandchild css level */
+ struct cgroup_subsys_state *parent, *grandchild;
+ struct css_rstat_cpu *crstatc;
- if (pos == root)
- return NULL;
+ child->rstat_flush_next = NULL;
/*
- * We're gonna walk down to the first leaf and visit/remove it. We
- * can pick whatever unvisited node as the starting point.
+ * The subsystem rstat lock must be held for the whole duration from
+ * here as the rstat_flush_next list is being constructed to when
+ * it is consumed later in css_rstat_flush().
*/
- if (!pos) {
- pos = root;
- /* return NULL if this subtree is not on-list */
- if (!cgroup_rstat_cpu(pos, cpu)->updated_next)
- return NULL;
- } else {
- pos = cgroup_parent(pos);
+ lockdep_assert_held(ss_rstat_lock(head->ss));
+
+ /*
+ * Notation: -> updated_next pointer
+ * => rstat_flush_next pointer
+ *
+ * Assuming the following sample updated_children lists:
+ * P: C1 -> C2 -> P
+ * C1: G11 -> G12 -> C1
+ * C2: G21 -> G22 -> C2
+ *
+ * After 1st iteration:
+ * head => C2 => C1 => NULL
+ * ghead => G21 => G11 => NULL
+ *
+ * After 2nd iteration:
+ * head => G12 => G11 => G22 => G21 => C2 => C1 => NULL
+ */
+next_level:
+ while (cnext) {
+ child = cnext;
+ cnext = child->rstat_flush_next;
+ parent = child->parent;
+
+ /* updated_next is parent cgroup terminated if !NULL */
+ while (child != parent) {
+ child->rstat_flush_next = head;
+ head = child;
+ crstatc = css_rstat_cpu(child, cpu);
+ grandchild = crstatc->updated_children;
+ if (grandchild != child) {
+ /* Push the grand child to the next level */
+ crstatc->updated_children = child;
+ grandchild->rstat_flush_next = ghead;
+ ghead = grandchild;
+ }
+ child = crstatc->updated_next;
+ crstatc->updated_next = NULL;
+ }
}
- /* walk down to the first leaf */
- while (true) {
- rstatc = cgroup_rstat_cpu(pos, cpu);
- if (rstatc->updated_children == pos)
- break;
- pos = rstatc->updated_children;
+ if (ghead) {
+ cnext = ghead;
+ ghead = NULL;
+ goto next_level;
}
+ return head;
+}
+
+/**
+ * css_rstat_updated_list - build a list of updated css's to be flushed
+ * @root: root of the css subtree to traverse
+ * @cpu: target cpu
+ * Return: A singly linked list of css's to be flushed
+ *
+ * Walks the updated rstat_cpu tree on @cpu from @root. During traversal,
+ * each returned css is unlinked from the updated tree.
+ *
+ * The only ordering guarantee is that, for a parent and a child pair
+ * covered by a given traversal, the child is before its parent in
+ * the list.
+ *
+ * Note that updated_children is self terminated and points to a list of
+ * child css's if not empty. Whereas updated_next is like a sibling link
+ * within the children list and terminated by the parent css. An exception
+ * here is the css root whose updated_next can be self terminated.
+ */
+static struct cgroup_subsys_state *css_rstat_updated_list(
+ struct cgroup_subsys_state *root, int cpu)
+{
+ struct css_rstat_cpu *rstatc = css_rstat_cpu(root, cpu);
+ struct cgroup_subsys_state *head = NULL, *parent, *child;
+
+ css_process_update_tree(root->ss, cpu);
+
+ /* Return NULL if this subtree is not on-list */
+ if (!rstatc->updated_next)
+ return NULL;
/*
- * Unlink @pos from the tree. As the updated_children list is
+ * Unlink @root from its parent. As the updated_children list is
* singly linked, we have to walk it to find the removal point.
- * However, due to the way we traverse, @pos will be the first
- * child in most cases. The only exception is @root.
*/
- parent = cgroup_parent(pos);
+ parent = root->parent;
if (parent) {
- struct cgroup_rstat_cpu *prstatc;
- struct cgroup **nextp;
+ struct css_rstat_cpu *prstatc;
+ struct cgroup_subsys_state **nextp;
- prstatc = cgroup_rstat_cpu(parent, cpu);
+ prstatc = css_rstat_cpu(parent, cpu);
nextp = &prstatc->updated_children;
- while (*nextp != pos) {
- struct cgroup_rstat_cpu *nrstatc;
+ while (*nextp != root) {
+ struct css_rstat_cpu *nrstatc;
- nrstatc = cgroup_rstat_cpu(*nextp, cpu);
+ nrstatc = css_rstat_cpu(*nextp, cpu);
WARN_ON_ONCE(*nextp == parent);
nextp = &nrstatc->updated_next;
}
@@ -138,157 +309,224 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
}
rstatc->updated_next = NULL;
- return pos;
-}
-/* see cgroup_rstat_flush() */
-static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
- __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
-{
- int cpu;
-
- lockdep_assert_held(&cgroup_rstat_lock);
+ /* Push @root to the list first before pushing the children */
+ head = root;
+ root->rstat_flush_next = NULL;
+ child = rstatc->updated_children;
+ rstatc->updated_children = root;
+ if (child != root)
+ head = css_rstat_push_children(head, child, cpu);
- for_each_possible_cpu(cpu) {
- raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
- cpu);
- struct cgroup *pos = NULL;
+ return head;
+}
- raw_spin_lock(cpu_lock);
- while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
- struct cgroup_subsys_state *css;
+/*
+ * A hook for bpf stat collectors to attach to and flush their stats.
+ * Together with providing bpf kfuncs for css_rstat_updated() and
+ * css_rstat_flush(), this enables a complete workflow where bpf progs that
+ * collect cgroup stats can integrate with rstat for efficient flushing.
+ *
+ * A static noinline declaration here could cause the compiler to optimize away
+ * the function. A global noinline declaration will keep the definition, but may
+ * optimize away the callsite. Therefore, __weak is needed to ensure that the
+ * call is still emitted, by telling the compiler that we don't know what the
+ * function might eventually be.
+ */
- cgroup_base_stat_flush(pos, cpu);
+__bpf_hook_start();
- rcu_read_lock();
- list_for_each_entry_rcu(css, &pos->rstat_css_list,
- rstat_css_node)
- css->ss->css_rstat_flush(css, cpu);
- rcu_read_unlock();
- }
- raw_spin_unlock(cpu_lock);
-
- /* if @may_sleep, play nice and yield if necessary */
- if (may_sleep && (need_resched() ||
- spin_needbreak(&cgroup_rstat_lock))) {
- spin_unlock_irq(&cgroup_rstat_lock);
- if (!cond_resched())
- cpu_relax();
- spin_lock_irq(&cgroup_rstat_lock);
- }
- }
+__weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
+ struct cgroup *parent, int cpu)
+{
}
-/**
- * cgroup_rstat_flush - flush stats in @cgrp's subtree
- * @cgrp: target cgroup
- *
- * Collect all per-cpu stats in @cgrp's subtree into the global counters
- * and propagate them upwards. After this function returns, all cgroups in
- * the subtree have up-to-date ->stat.
- *
- * This also gets all cgroups in the subtree including @cgrp off the
- * ->updated_children lists.
+__bpf_hook_end();
+
+/*
+ * Helper functions for locking.
*
- * This function may block.
+ * This makes it easier to diagnose locking issues and contention in
+ * production environments. The parameter @cpu_in_loop indicate lock
+ * was released and re-taken when collection data from the CPUs. The
+ * value -1 is used when obtaining the main lock else this is the CPU
+ * number processed last.
*/
-void cgroup_rstat_flush(struct cgroup *cgrp)
+static inline void __css_rstat_lock(struct cgroup_subsys_state *css,
+ int cpu_in_loop)
+ __acquires(ss_rstat_lock(css->ss))
{
- might_sleep();
-
- spin_lock_irq(&cgroup_rstat_lock);
- cgroup_rstat_flush_locked(cgrp, true);
- spin_unlock_irq(&cgroup_rstat_lock);
+ struct cgroup *cgrp = css->cgroup;
+ spinlock_t *lock;
+ bool contended;
+
+ lock = ss_rstat_lock(css->ss);
+ contended = !spin_trylock_irq(lock);
+ if (contended) {
+ trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
+ spin_lock_irq(lock);
+ }
+ trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
}
-/**
- * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush()
- * @cgrp: target cgroup
- *
- * This function can be called from any context.
- */
-void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
+static inline void __css_rstat_unlock(struct cgroup_subsys_state *css,
+ int cpu_in_loop)
+ __releases(ss_rstat_lock(css->ss))
{
- unsigned long flags;
+ struct cgroup *cgrp = css->cgroup;
+ spinlock_t *lock;
- spin_lock_irqsave(&cgroup_rstat_lock, flags);
- cgroup_rstat_flush_locked(cgrp, false);
- spin_unlock_irqrestore(&cgroup_rstat_lock, flags);
+ lock = ss_rstat_lock(css->ss);
+ trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
+ spin_unlock_irq(lock);
}
/**
- * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
- * @cgrp: target cgroup
+ * css_rstat_flush - flush stats in @css's rstat subtree
+ * @css: target cgroup subsystem state
*
- * Flush stats in @cgrp's subtree and prevent further flushes. Must be
- * paired with cgroup_rstat_flush_release().
+ * Collect all per-cpu stats in @css's subtree into the global counters
+ * and propagate them upwards. After this function returns, all rstat
+ * nodes in the subtree have up-to-date ->stat.
+ *
+ * This also gets all rstat nodes in the subtree including @css off the
+ * ->updated_children lists.
*
* This function may block.
*/
-void cgroup_rstat_flush_hold(struct cgroup *cgrp)
- __acquires(&cgroup_rstat_lock)
+__bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css)
{
- might_sleep();
- spin_lock_irq(&cgroup_rstat_lock);
- cgroup_rstat_flush_locked(cgrp, true);
-}
+ int cpu;
+ bool is_self = css_is_self(css);
-/**
- * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
- */
-void cgroup_rstat_flush_release(void)
- __releases(&cgroup_rstat_lock)
-{
- spin_unlock_irq(&cgroup_rstat_lock);
+ /*
+ * Since bpf programs can call this function, prevent access to
+ * uninitialized rstat pointers.
+ */
+ if (!css_uses_rstat(css))
+ return;
+
+ might_sleep();
+ for_each_possible_cpu(cpu) {
+ struct cgroup_subsys_state *pos;
+
+ /* Reacquire for each CPU to avoid disabling IRQs too long */
+ __css_rstat_lock(css, cpu);
+ pos = css_rstat_updated_list(css, cpu);
+ for (; pos; pos = pos->rstat_flush_next) {
+ if (is_self) {
+ cgroup_base_stat_flush(pos->cgroup, cpu);
+ bpf_rstat_flush(pos->cgroup,
+ cgroup_parent(pos->cgroup), cpu);
+ } else
+ pos->ss->css_rstat_flush(pos, cpu);
+ }
+ __css_rstat_unlock(css, cpu);
+ if (!cond_resched())
+ cpu_relax();
+ }
}
-int cgroup_rstat_init(struct cgroup *cgrp)
+int css_rstat_init(struct cgroup_subsys_state *css)
{
+ struct cgroup *cgrp = css->cgroup;
int cpu;
+ bool is_self = css_is_self(css);
+
+ if (is_self) {
+ /* the root cgrp has rstat_base_cpu preallocated */
+ if (!cgrp->rstat_base_cpu) {
+ cgrp->rstat_base_cpu = alloc_percpu(struct cgroup_rstat_base_cpu);
+ if (!cgrp->rstat_base_cpu)
+ return -ENOMEM;
+ }
+ } else if (css->ss->css_rstat_flush == NULL)
+ return 0;
+
+ /* the root cgrp's self css has rstat_cpu preallocated */
+ if (!css->rstat_cpu) {
+ css->rstat_cpu = alloc_percpu(struct css_rstat_cpu);
+ if (!css->rstat_cpu) {
+ if (is_self)
+ free_percpu(cgrp->rstat_base_cpu);
- /* the root cgrp has rstat_cpu preallocated */
- if (!cgrp->rstat_cpu) {
- cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
- if (!cgrp->rstat_cpu)
return -ENOMEM;
+ }
}
/* ->updated_children list is self terminated */
for_each_possible_cpu(cpu) {
- struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
+ struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
+
+ rstatc->owner = rstatc->updated_children = css;
+ init_llist_node(&rstatc->lnode);
+
+ if (is_self) {
+ struct cgroup_rstat_base_cpu *rstatbc;
- rstatc->updated_children = cgrp;
- u64_stats_init(&rstatc->bsync);
+ rstatbc = cgroup_rstat_base_cpu(cgrp, cpu);
+ u64_stats_init(&rstatbc->bsync);
+ }
}
return 0;
}
-void cgroup_rstat_exit(struct cgroup *cgrp)
+void css_rstat_exit(struct cgroup_subsys_state *css)
{
int cpu;
- cgroup_rstat_flush(cgrp);
+ if (!css_uses_rstat(css))
+ return;
+
+ if (!css->rstat_cpu)
+ return;
+
+ css_rstat_flush(css);
/* sanity check */
for_each_possible_cpu(cpu) {
- struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
+ struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
- if (WARN_ON_ONCE(rstatc->updated_children != cgrp) ||
+ if (WARN_ON_ONCE(rstatc->updated_children != css) ||
WARN_ON_ONCE(rstatc->updated_next))
return;
}
- free_percpu(cgrp->rstat_cpu);
- cgrp->rstat_cpu = NULL;
+ if (css_is_self(css)) {
+ struct cgroup *cgrp = css->cgroup;
+
+ free_percpu(cgrp->rstat_base_cpu);
+ cgrp->rstat_base_cpu = NULL;
+ }
+
+ free_percpu(css->rstat_cpu);
+ css->rstat_cpu = NULL;
}
-void __init cgroup_rstat_boot(void)
+/**
+ * ss_rstat_init - subsystem-specific rstat initialization
+ * @ss: target subsystem
+ *
+ * If @ss is NULL, the static locks associated with the base stats
+ * are initialized. If @ss is non-NULL, the subsystem-specific locks
+ * are initialized.
+ */
+int __init ss_rstat_init(struct cgroup_subsys *ss)
{
int cpu;
+ if (ss) {
+ ss->lhead = alloc_percpu(struct llist_head);
+ if (!ss->lhead)
+ return -ENOMEM;
+ }
+
+ spin_lock_init(ss_rstat_lock(ss));
for_each_possible_cpu(cpu)
- raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
+ init_llist_head(ss_lhead_cpu(ss, cpu));
+
+ return 0;
}
/*
@@ -301,6 +539,10 @@ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
dst_bstat->cputime.utime += src_bstat->cputime.utime;
dst_bstat->cputime.stime += src_bstat->cputime.stime;
dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
+#ifdef CONFIG_SCHED_CORE
+ dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
+#endif
+ dst_bstat->ntime += src_bstat->ntime;
}
static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
@@ -309,13 +551,18 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
dst_bstat->cputime.utime -= src_bstat->cputime.utime;
dst_bstat->cputime.stime -= src_bstat->cputime.stime;
dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
+#ifdef CONFIG_SCHED_CORE
+ dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
+#endif
+ dst_bstat->ntime -= src_bstat->ntime;
}
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
- struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
+ struct cgroup_rstat_base_cpu *rstatbc = cgroup_rstat_base_cpu(cgrp, cpu);
struct cgroup *parent = cgroup_parent(cgrp);
- struct cgroup_base_stat cur, delta;
+ struct cgroup_rstat_base_cpu *prstatbc;
+ struct cgroup_base_stat delta;
unsigned seq;
/* Root-level stats are sourced from system-wide CPU stats */
@@ -324,77 +571,90 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
/* fetch the current per-cpu values */
do {
- seq = __u64_stats_fetch_begin(&rstatc->bsync);
- cur.cputime = rstatc->bstat.cputime;
- } while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
+ seq = __u64_stats_fetch_begin(&rstatbc->bsync);
+ delta = rstatbc->bstat;
+ } while (__u64_stats_fetch_retry(&rstatbc->bsync, seq));
- /* propagate percpu delta to global */
- delta = cur;
- cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
+ /* propagate per-cpu delta to cgroup and per-cpu global statistics */
+ cgroup_base_stat_sub(&delta, &rstatbc->last_bstat);
cgroup_base_stat_add(&cgrp->bstat, &delta);
- cgroup_base_stat_add(&rstatc->last_bstat, &delta);
+ cgroup_base_stat_add(&rstatbc->last_bstat, &delta);
+ cgroup_base_stat_add(&rstatbc->subtree_bstat, &delta);
- /* propagate global delta to parent (unless that's root) */
+ /* propagate cgroup and per-cpu global delta to parent (unless that's root) */
if (cgroup_parent(parent)) {
delta = cgrp->bstat;
cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
cgroup_base_stat_add(&parent->bstat, &delta);
cgroup_base_stat_add(&cgrp->last_bstat, &delta);
+
+ delta = rstatbc->subtree_bstat;
+ prstatbc = cgroup_rstat_base_cpu(parent, cpu);
+ cgroup_base_stat_sub(&delta, &rstatbc->last_subtree_bstat);
+ cgroup_base_stat_add(&prstatbc->subtree_bstat, &delta);
+ cgroup_base_stat_add(&rstatbc->last_subtree_bstat, &delta);
}
}
-static struct cgroup_rstat_cpu *
+static struct cgroup_rstat_base_cpu *
cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
{
- struct cgroup_rstat_cpu *rstatc;
+ struct cgroup_rstat_base_cpu *rstatbc;
- rstatc = get_cpu_ptr(cgrp->rstat_cpu);
- *flags = u64_stats_update_begin_irqsave(&rstatc->bsync);
- return rstatc;
+ rstatbc = get_cpu_ptr(cgrp->rstat_base_cpu);
+ *flags = u64_stats_update_begin_irqsave(&rstatbc->bsync);
+ return rstatbc;
}
static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
- struct cgroup_rstat_cpu *rstatc,
+ struct cgroup_rstat_base_cpu *rstatbc,
unsigned long flags)
{
- u64_stats_update_end_irqrestore(&rstatc->bsync, flags);
- cgroup_rstat_updated(cgrp, smp_processor_id());
- put_cpu_ptr(rstatc);
+ u64_stats_update_end_irqrestore(&rstatbc->bsync, flags);
+ css_rstat_updated(&cgrp->self, smp_processor_id());
+ put_cpu_ptr(rstatbc);
}
void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
- struct cgroup_rstat_cpu *rstatc;
+ struct cgroup_rstat_base_cpu *rstatbc;
unsigned long flags;
- rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
- rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
- cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
+ rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
+ rstatbc->bstat.cputime.sum_exec_runtime += delta_exec;
+ cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags);
}
void __cgroup_account_cputime_field(struct cgroup *cgrp,
enum cpu_usage_stat index, u64 delta_exec)
{
- struct cgroup_rstat_cpu *rstatc;
+ struct cgroup_rstat_base_cpu *rstatbc;
unsigned long flags;
- rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
+ rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
switch (index) {
- case CPUTIME_USER:
case CPUTIME_NICE:
- rstatc->bstat.cputime.utime += delta_exec;
+ rstatbc->bstat.ntime += delta_exec;
+ fallthrough;
+ case CPUTIME_USER:
+ rstatbc->bstat.cputime.utime += delta_exec;
break;
case CPUTIME_SYSTEM:
case CPUTIME_IRQ:
case CPUTIME_SOFTIRQ:
- rstatc->bstat.cputime.stime += delta_exec;
+ rstatbc->bstat.cputime.stime += delta_exec;
+ break;
+#ifdef CONFIG_SCHED_CORE
+ case CPUTIME_FORCEIDLE:
+ rstatbc->bstat.forceidle_sum += delta_exec;
break;
+#endif
default:
break;
}
- cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
+ cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags);
}
/*
@@ -403,13 +663,12 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
* with how it is done by __cgroup_account_cputime_field for each bit of
* cpu time attributed to a cgroup.
*/
-static void root_cgroup_cputime(struct task_cputime *cputime)
+static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
{
+ struct task_cputime *cputime = &bstat->cputime;
int i;
- cputime->stime = 0;
- cputime->utime = 0;
- cputime->sum_exec_runtime = 0;
+ memset(bstat, 0, sizeof(*bstat));
for_each_possible_cpu(i) {
struct kernel_cpustat kcpustat;
u64 *cpustat = kcpustat.cpustat;
@@ -429,35 +688,72 @@ static void root_cgroup_cputime(struct task_cputime *cputime)
cputime->sum_exec_runtime += user;
cputime->sum_exec_runtime += sys;
- cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
+
+#ifdef CONFIG_SCHED_CORE
+ bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
+#endif
+ bstat->ntime += cpustat[CPUTIME_NICE];
}
}
+
+static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat *bstat)
+{
+#ifdef CONFIG_SCHED_CORE
+ u64 forceidle_time = bstat->forceidle_sum;
+
+ do_div(forceidle_time, NSEC_PER_USEC);
+ seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
+#endif
+}
+
void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- u64 usage, utime, stime;
- struct task_cputime cputime;
+ struct cgroup_base_stat bstat;
if (cgroup_parent(cgrp)) {
- cgroup_rstat_flush_hold(cgrp);
- usage = cgrp->bstat.cputime.sum_exec_runtime;
+ css_rstat_flush(&cgrp->self);
+ __css_rstat_lock(&cgrp->self, -1);
+ bstat = cgrp->bstat;
cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
- &utime, &stime);
- cgroup_rstat_flush_release();
+ &bstat.cputime.utime, &bstat.cputime.stime);
+ __css_rstat_unlock(&cgrp->self, -1);
} else {
- root_cgroup_cputime(&cputime);
- usage = cputime.sum_exec_runtime;
- utime = cputime.utime;
- stime = cputime.stime;
+ root_cgroup_cputime(&bstat);
}
- do_div(usage, NSEC_PER_USEC);
- do_div(utime, NSEC_PER_USEC);
- do_div(stime, NSEC_PER_USEC);
+ do_div(bstat.cputime.sum_exec_runtime, NSEC_PER_USEC);
+ do_div(bstat.cputime.utime, NSEC_PER_USEC);
+ do_div(bstat.cputime.stime, NSEC_PER_USEC);
+ do_div(bstat.ntime, NSEC_PER_USEC);
seq_printf(seq, "usage_usec %llu\n"
- "user_usec %llu\n"
- "system_usec %llu\n",
- usage, utime, stime);
+ "user_usec %llu\n"
+ "system_usec %llu\n"
+ "nice_usec %llu\n",
+ bstat.cputime.sum_exec_runtime,
+ bstat.cputime.utime,
+ bstat.cputime.stime,
+ bstat.ntime);
+
+ cgroup_force_idle_show(seq, &bstat);
+}
+
+/* Add bpf kfuncs for css_rstat_updated() and css_rstat_flush() */
+BTF_KFUNCS_START(bpf_rstat_kfunc_ids)
+BTF_ID_FLAGS(func, css_rstat_updated)
+BTF_ID_FLAGS(func, css_rstat_flush, KF_SLEEPABLE)
+BTF_KFUNCS_END(bpf_rstat_kfunc_ids)
+
+static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &bpf_rstat_kfunc_ids,
+};
+
+static int __init bpf_rstat_kfunc_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
+ &bpf_rstat_kfunc_set);
}
+late_initcall(bpf_rstat_kfunc_init);
diff --git a/kernel/compat.c b/kernel/compat.c
index 55551989d9da..fb50f29d9b36 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -152,7 +152,7 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len,
if (len & (sizeof(compat_ulong_t)-1))
return -EINVAL;
- if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
ret = sched_getaffinity(pid, mask);
diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config
deleted file mode 100644
index eb701b2ac72f..000000000000
--- a/kernel/configs/android-base.config
+++ /dev/null
@@ -1,160 +0,0 @@
-# KEEP ALPHABETICALLY SORTED
-# CONFIG_DEVMEM is not set
-# CONFIG_FHANDLE is not set
-# CONFIG_INET_LRO is not set
-# CONFIG_NFSD is not set
-# CONFIG_NFS_FS is not set
-# CONFIG_OABI_COMPAT is not set
-# CONFIG_SYSVIPC is not set
-# CONFIG_USELIB is not set
-CONFIG_ANDROID=y
-CONFIG_ANDROID_BINDER_IPC=y
-CONFIG_ANDROID_BINDER_DEVICES=binder,hwbinder,vndbinder
-CONFIG_ANDROID_LOW_MEMORY_KILLER=y
-CONFIG_ARMV8_DEPRECATED=y
-CONFIG_ASHMEM=y
-CONFIG_AUDIT=y
-CONFIG_BLK_DEV_INITRD=y
-CONFIG_CGROUPS=y
-CONFIG_CGROUP_BPF=y
-CONFIG_CGROUP_CPUACCT=y
-CONFIG_CGROUP_DEBUG=y
-CONFIG_CGROUP_FREEZER=y
-CONFIG_CGROUP_SCHED=y
-CONFIG_CP15_BARRIER_EMULATION=y
-CONFIG_DEFAULT_SECURITY_SELINUX=y
-CONFIG_EMBEDDED=y
-CONFIG_FB=y
-CONFIG_HARDENED_USERCOPY=y
-CONFIG_HIGH_RES_TIMERS=y
-CONFIG_IKCONFIG=y
-CONFIG_IKCONFIG_PROC=y
-CONFIG_INET6_AH=y
-CONFIG_INET6_ESP=y
-CONFIG_INET6_IPCOMP=y
-CONFIG_INET=y
-CONFIG_INET_DIAG_DESTROY=y
-CONFIG_INET_ESP=y
-CONFIG_INET_XFRM_MODE_TUNNEL=y
-CONFIG_IP6_NF_FILTER=y
-CONFIG_IP6_NF_IPTABLES=y
-CONFIG_IP6_NF_MANGLE=y
-CONFIG_IP6_NF_RAW=y
-CONFIG_IP6_NF_TARGET_REJECT=y
-CONFIG_IPV6=y
-CONFIG_IPV6_MIP6=y
-CONFIG_IPV6_MULTIPLE_TABLES=y
-CONFIG_IPV6_OPTIMISTIC_DAD=y
-CONFIG_IPV6_ROUTER_PREF=y
-CONFIG_IPV6_ROUTE_INFO=y
-CONFIG_IP_ADVANCED_ROUTER=y
-CONFIG_IP_MULTICAST=y
-CONFIG_IP_MULTIPLE_TABLES=y
-CONFIG_IP_NF_ARPFILTER=y
-CONFIG_IP_NF_ARPTABLES=y
-CONFIG_IP_NF_ARP_MANGLE=y
-CONFIG_IP_NF_FILTER=y
-CONFIG_IP_NF_IPTABLES=y
-CONFIG_IP_NF_MANGLE=y
-CONFIG_IP_NF_MATCH_AH=y
-CONFIG_IP_NF_MATCH_ECN=y
-CONFIG_IP_NF_MATCH_TTL=y
-CONFIG_IP_NF_NAT=y
-CONFIG_IP_NF_RAW=y
-CONFIG_IP_NF_SECURITY=y
-CONFIG_IP_NF_TARGET_MASQUERADE=y
-CONFIG_IP_NF_TARGET_NETMAP=y
-CONFIG_IP_NF_TARGET_REDIRECT=y
-CONFIG_IP_NF_TARGET_REJECT=y
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
-CONFIG_MODVERSIONS=y
-CONFIG_NET=y
-CONFIG_NETDEVICES=y
-CONFIG_NETFILTER=y
-CONFIG_NETFILTER_TPROXY=y
-CONFIG_NETFILTER_XT_MATCH_COMMENT=y
-CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y
-CONFIG_NETFILTER_XT_MATCH_CONNMARK=y
-CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
-CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y
-CONFIG_NETFILTER_XT_MATCH_HELPER=y
-CONFIG_NETFILTER_XT_MATCH_IPRANGE=y
-CONFIG_NETFILTER_XT_MATCH_LENGTH=y
-CONFIG_NETFILTER_XT_MATCH_LIMIT=y
-CONFIG_NETFILTER_XT_MATCH_MAC=y
-CONFIG_NETFILTER_XT_MATCH_MARK=y
-CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y
-CONFIG_NETFILTER_XT_MATCH_POLICY=y
-CONFIG_NETFILTER_XT_MATCH_QUOTA=y
-CONFIG_NETFILTER_XT_MATCH_SOCKET=y
-CONFIG_NETFILTER_XT_MATCH_STATE=y
-CONFIG_NETFILTER_XT_MATCH_STATISTIC=y
-CONFIG_NETFILTER_XT_MATCH_STRING=y
-CONFIG_NETFILTER_XT_MATCH_TIME=y
-CONFIG_NETFILTER_XT_MATCH_U32=y
-CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y
-CONFIG_NETFILTER_XT_TARGET_CONNMARK=y
-CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
-CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y
-CONFIG_NETFILTER_XT_TARGET_MARK=y
-CONFIG_NETFILTER_XT_TARGET_NFLOG=y
-CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y
-CONFIG_NETFILTER_XT_TARGET_SECMARK=y
-CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
-CONFIG_NETFILTER_XT_TARGET_TPROXY=y
-CONFIG_NETFILTER_XT_TARGET_TRACE=y
-CONFIG_NET_CLS_ACT=y
-CONFIG_NET_CLS_U32=y
-CONFIG_NET_EMATCH=y
-CONFIG_NET_EMATCH_U32=y
-CONFIG_NET_KEY=y
-CONFIG_NET_SCHED=y
-CONFIG_NET_SCH_HTB=y
-CONFIG_NF_CONNTRACK=y
-CONFIG_NF_CONNTRACK_AMANDA=y
-CONFIG_NF_CONNTRACK_EVENTS=y
-CONFIG_NF_CONNTRACK_FTP=y
-CONFIG_NF_CONNTRACK_H323=y
-CONFIG_NF_CONNTRACK_IPV4=y
-CONFIG_NF_CONNTRACK_IPV6=y
-CONFIG_NF_CONNTRACK_IRC=y
-CONFIG_NF_CONNTRACK_NETBIOS_NS=y
-CONFIG_NF_CONNTRACK_PPTP=y
-CONFIG_NF_CONNTRACK_SANE=y
-CONFIG_NF_CONNTRACK_SECMARK=y
-CONFIG_NF_CONNTRACK_TFTP=y
-CONFIG_NF_CT_NETLINK=y
-CONFIG_NF_CT_PROTO_DCCP=y
-CONFIG_NF_CT_PROTO_SCTP=y
-CONFIG_NF_CT_PROTO_UDPLITE=y
-CONFIG_NF_NAT=y
-CONFIG_NO_HZ=y
-CONFIG_PACKET=y
-CONFIG_PM_AUTOSLEEP=y
-CONFIG_PM_WAKELOCKS=y
-CONFIG_PPP=y
-CONFIG_PPP_BSDCOMP=y
-CONFIG_PPP_DEFLATE=y
-CONFIG_PPP_MPPE=y
-CONFIG_PREEMPT=y
-CONFIG_QUOTA=y
-CONFIG_RANDOMIZE_BASE=y
-CONFIG_RTC_CLASS=y
-CONFIG_RT_GROUP_SCHED=y
-CONFIG_SECCOMP=y
-CONFIG_SECURITY=y
-CONFIG_SECURITY_NETWORK=y
-CONFIG_SECURITY_SELINUX=y
-CONFIG_SETEND_EMULATION=y
-CONFIG_STAGING=y
-CONFIG_SWP_EMULATION=y
-CONFIG_SYNC=y
-CONFIG_TUN=y
-CONFIG_UNIX=y
-CONFIG_USB_GADGET=y
-CONFIG_USB_CONFIGFS=y
-CONFIG_USB_CONFIGFS_F_FS=y
-CONFIG_USB_CONFIGFS_F_MIDI=y
-CONFIG_USB_OTG_WAKELOCK=y
-CONFIG_XFRM_USER=y
diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config
deleted file mode 100644
index eb0029c9a6a6..000000000000
--- a/kernel/configs/android-recommended.config
+++ /dev/null
@@ -1,127 +0,0 @@
-# KEEP ALPHABETICALLY SORTED
-# CONFIG_AIO is not set
-# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
-# CONFIG_INPUT_MOUSE is not set
-# CONFIG_LEGACY_PTYS is not set
-# CONFIG_NF_CONNTRACK_SIP is not set
-# CONFIG_PM_WAKELOCKS_GC is not set
-# CONFIG_VT is not set
-CONFIG_ARM64_SW_TTBR0_PAN=y
-CONFIG_BACKLIGHT_LCD_SUPPORT=y
-CONFIG_BLK_DEV_DM=y
-CONFIG_BLK_DEV_LOOP=y
-CONFIG_BLK_DEV_RAM=y
-CONFIG_BLK_DEV_RAM_SIZE=8192
-CONFIG_STACKPROTECTOR_STRONG=y
-CONFIG_COMPACTION=y
-CONFIG_CPU_SW_DOMAIN_PAN=y
-CONFIG_DM_CRYPT=y
-CONFIG_DM_UEVENT=y
-CONFIG_DM_VERITY=y
-CONFIG_DM_VERITY_FEC=y
-CONFIG_DRAGONRISE_FF=y
-CONFIG_ENABLE_DEFAULT_TRACERS=y
-CONFIG_EXT4_FS=y
-CONFIG_EXT4_FS_SECURITY=y
-CONFIG_FUSE_FS=y
-CONFIG_GREENASIA_FF=y
-CONFIG_HIDRAW=y
-CONFIG_HID_A4TECH=y
-CONFIG_HID_ACRUX=y
-CONFIG_HID_ACRUX_FF=y
-CONFIG_HID_APPLE=y
-CONFIG_HID_BELKIN=y
-CONFIG_HID_CHERRY=y
-CONFIG_HID_CHICONY=y
-CONFIG_HID_CYPRESS=y
-CONFIG_HID_DRAGONRISE=y
-CONFIG_HID_ELECOM=y
-CONFIG_HID_EMS_FF=y
-CONFIG_HID_EZKEY=y
-CONFIG_HID_GREENASIA=y
-CONFIG_HID_GYRATION=y
-CONFIG_HID_HOLTEK=y
-CONFIG_HID_KENSINGTON=y
-CONFIG_HID_KEYTOUCH=y
-CONFIG_HID_KYE=y
-CONFIG_HID_LCPOWER=y
-CONFIG_HID_LOGITECH=y
-CONFIG_HID_LOGITECH_DJ=y
-CONFIG_HID_MAGICMOUSE=y
-CONFIG_HID_MICROSOFT=y
-CONFIG_HID_MONTEREY=y
-CONFIG_HID_MULTITOUCH=y
-CONFIG_HID_NTRIG=y
-CONFIG_HID_ORTEK=y
-CONFIG_HID_PANTHERLORD=y
-CONFIG_HID_PETALYNX=y
-CONFIG_HID_PICOLCD=y
-CONFIG_HID_PRIMAX=y
-CONFIG_HID_PRODIKEYS=y
-CONFIG_HID_ROCCAT=y
-CONFIG_HID_SAITEK=y
-CONFIG_HID_SAMSUNG=y
-CONFIG_HID_SMARTJOYPLUS=y
-CONFIG_HID_SONY=y
-CONFIG_HID_SPEEDLINK=y
-CONFIG_HID_SUNPLUS=y
-CONFIG_HID_THRUSTMASTER=y
-CONFIG_HID_TIVO=y
-CONFIG_HID_TOPSEED=y
-CONFIG_HID_TWINHAN=y
-CONFIG_HID_UCLOGIC=y
-CONFIG_HID_WACOM=y
-CONFIG_HID_WALTOP=y
-CONFIG_HID_WIIMOTE=y
-CONFIG_HID_ZEROPLUS=y
-CONFIG_HID_ZYDACRON=y
-CONFIG_INPUT_EVDEV=y
-CONFIG_INPUT_GPIO=y
-CONFIG_INPUT_JOYSTICK=y
-CONFIG_INPUT_MISC=y
-CONFIG_INPUT_TABLET=y
-CONFIG_INPUT_UINPUT=y
-CONFIG_JOYSTICK_XPAD=y
-CONFIG_JOYSTICK_XPAD_FF=y
-CONFIG_JOYSTICK_XPAD_LEDS=y
-CONFIG_KALLSYMS_ALL=y
-CONFIG_KSM=y
-CONFIG_LOGIG940_FF=y
-CONFIG_LOGIRUMBLEPAD2_FF=y
-CONFIG_LOGITECH_FF=y
-CONFIG_MD=y
-CONFIG_MEDIA_SUPPORT=y
-CONFIG_MSDOS_FS=y
-CONFIG_PANIC_TIMEOUT=5
-CONFIG_PANTHERLORD_FF=y
-CONFIG_PERF_EVENTS=y
-CONFIG_PM_DEBUG=y
-CONFIG_PM_RUNTIME=y
-CONFIG_PM_WAKELOCKS_LIMIT=0
-CONFIG_POWER_SUPPLY=y
-CONFIG_PSTORE=y
-CONFIG_PSTORE_CONSOLE=y
-CONFIG_PSTORE_RAM=y
-CONFIG_SCHEDSTATS=y
-CONFIG_SMARTJOYPLUS_FF=y
-CONFIG_SND=y
-CONFIG_SOUND=y
-CONFIG_STRICT_KERNEL_RWX=y
-CONFIG_SUSPEND_TIME=y
-CONFIG_TABLET_USB_ACECAD=y
-CONFIG_TABLET_USB_AIPTEK=y
-CONFIG_TABLET_USB_HANWANG=y
-CONFIG_TABLET_USB_KBTAB=y
-CONFIG_TASKSTATS=y
-CONFIG_TASK_DELAY_ACCT=y
-CONFIG_TASK_IO_ACCOUNTING=y
-CONFIG_TASK_XACCT=y
-CONFIG_TIMER_STATS=y
-CONFIG_TMPFS=y
-CONFIG_TMPFS_POSIX_ACL=y
-CONFIG_UHID=y
-CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
-CONFIG_USB_EHCI_HCD=y
-CONFIG_USB_HIDDEV=y
-CONFIG_USB_USBNET=y
-CONFIG_VFAT_FS=y
diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config
index e9ffb0cc1eec..9f6ab7dabf67 100644
--- a/kernel/configs/debug.config
+++ b/kernel/configs/debug.config
@@ -1,3 +1,5 @@
+# Help: Debugging for CI systems and finding regressions
+#
# The config is based on running daily CI for enterprise Linux distros to
# seek regressions on linux-next builds on different bare-metal and virtual
# platforms. It can be used for example,
@@ -17,6 +19,7 @@ CONFIG_SYMBOLIC_ERRNAME=y
# Compile-time checks and compiler options
#
CONFIG_DEBUG_INFO=y
+CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
CONFIG_DEBUG_SECTION_MISMATCH=y
CONFIG_FRAME_WARN=2048
CONFIG_SECTION_MISMATCH_WARN_ONLY=y
@@ -37,6 +40,12 @@ CONFIG_UBSAN_ENUM=y
CONFIG_UBSAN_SHIFT=y
CONFIG_UBSAN_UNREACHABLE=y
#
+# Networking Debugging
+#
+CONFIG_NET_DEV_REFCNT_TRACKER=y
+CONFIG_NET_NS_REFCNT_TRACKER=y
+CONFIG_DEBUG_NET=y
+#
# Memory Debugging
#
# CONFIG_DEBUG_PAGEALLOC is not set
@@ -64,7 +73,6 @@ CONFIG_DEBUG_VM=y
CONFIG_DEBUG_VM_PGFLAGS=y
CONFIG_DEBUG_VM_RB=y
CONFIG_DEBUG_VM_VMACACHE=y
-CONFIG_GENERIC_PTDUMP=y
CONFIG_KASAN=y
CONFIG_KASAN_GENERIC=y
CONFIG_KASAN_INLINE=y
@@ -75,7 +83,7 @@ CONFIG_SLUB_DEBUG_ON=y
#
# Debug Oops, Lockups and Hangs
#
-# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
+CONFIG_BOOTPARAM_HUNG_TASK_PANIC=0
# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
CONFIG_DEBUG_ATOMIC_SLEEP=y
CONFIG_DETECT_HUNG_TASK=y
@@ -94,6 +102,7 @@ CONFIG_BUG_ON_DATA_CORRUPTION=y
#
# RCU Debugging
#
+CONFIG_RCU_EXPERT=y
CONFIG_PROVE_RCU=y
CONFIG_PROVE_RCU_LIST=y
#
@@ -103,3 +112,8 @@ CONFIG_BRANCH_PROFILE_NONE=y
CONFIG_DYNAMIC_FTRACE=y
CONFIG_FTRACE=y
CONFIG_FUNCTION_TRACER=y
+#
+# Preemption
+#
+CONFIG_DEBUG_PREEMPT=y
+CONFIG_PREEMPT=y
diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config
new file mode 100644
index 000000000000..7c3924614e01
--- /dev/null
+++ b/kernel/configs/hardening.config
@@ -0,0 +1,113 @@
+# Help: Basic kernel hardening options
+#
+# These are considered the basic kernel hardening, self-protection, and
+# attack surface reduction options. They are expected to have low (or
+# no) performance impact on most workloads, and have a reasonable level
+# of legacy API removals.
+
+# Make sure reporting of various hardening actions is possible.
+CONFIG_BUG=y
+
+# Basic kernel memory permission enforcement.
+CONFIG_STRICT_KERNEL_RWX=y
+CONFIG_STRICT_MODULE_RWX=y
+CONFIG_VMAP_STACK=y
+
+# Kernel image and memory ASLR.
+CONFIG_RANDOMIZE_BASE=y
+CONFIG_RANDOMIZE_MEMORY=y
+
+# Randomize allocator freelists, harden metadata.
+CONFIG_SLAB_FREELIST_RANDOM=y
+CONFIG_SLAB_FREELIST_HARDENED=y
+CONFIG_SLAB_BUCKETS=y
+CONFIG_SHUFFLE_PAGE_ALLOCATOR=y
+CONFIG_RANDOM_KMALLOC_CACHES=y
+
+# Sanity check userspace page table mappings.
+CONFIG_PAGE_TABLE_CHECK=y
+CONFIG_PAGE_TABLE_CHECK_ENFORCED=y
+
+# Randomize kernel stack offset on syscall entry.
+CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT=y
+
+# Basic stack frame overflow protection.
+CONFIG_STACKPROTECTOR=y
+CONFIG_STACKPROTECTOR_STRONG=y
+
+# Basic buffer length bounds checking.
+CONFIG_HARDENED_USERCOPY=y
+CONFIG_FORTIFY_SOURCE=y
+
+# Basic array index bounds checking.
+CONFIG_UBSAN=y
+CONFIG_UBSAN_TRAP=y
+CONFIG_UBSAN_BOUNDS=y
+# CONFIG_UBSAN_SHIFT is not set
+# CONFIG_UBSAN_DIV_ZERO is not set
+# CONFIG_UBSAN_UNREACHABLE is not set
+# CONFIG_UBSAN_INTEGER_WRAP is not set
+# CONFIG_UBSAN_BOOL is not set
+# CONFIG_UBSAN_ENUM is not set
+# CONFIG_UBSAN_ALIGNMENT is not set
+
+# Sampling-based heap out-of-bounds and use-after-free detection.
+CONFIG_KFENCE=y
+
+# Linked list integrity checking.
+CONFIG_LIST_HARDENED=y
+
+# Initialize all heap variables to zero on allocation.
+CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y
+
+# Initialize all heap variables to zero on free to reduce stale data lifetime.
+CONFIG_INIT_ON_FREE_DEFAULT_ON=y
+
+# Initialize all stack variables to zero on function entry.
+CONFIG_INIT_STACK_ALL_ZERO=y
+
+# Wipe kernel stack after syscall completion to reduce stale data lifetime.
+CONFIG_KSTACK_ERASE=y
+
+# Wipe RAM at reboot via EFI. For more details, see:
+# https://trustedcomputinggroup.org/resource/pc-client-work-group-platform-reset-attack-mitigation-specification/
+# https://bugzilla.redhat.com/show_bug.cgi?id=1532058
+CONFIG_RESET_ATTACK_MITIGATION=y
+
+# Disable DMA between EFI hand-off and the kernel's IOMMU setup.
+CONFIG_EFI_DISABLE_PCI_DMA=y
+
+# Force IOMMU TLB invalidation so devices will never be able to access stale
+# data content.
+CONFIG_IOMMU_SUPPORT=y
+CONFIG_IOMMU_DEFAULT_DMA_STRICT=y
+
+# Do not allow direct physical memory access to non-device memory.
+CONFIG_STRICT_DEVMEM=y
+CONFIG_IO_STRICT_DEVMEM=y
+
+# Provide userspace with seccomp BPF API for syscall attack surface reduction.
+CONFIG_SECCOMP=y
+CONFIG_SECCOMP_FILTER=y
+
+# Provides some protections against SYN flooding.
+CONFIG_SYN_COOKIES=y
+
+# Enable Kernel Control Flow Integrity.
+CONFIG_CFI=y
+# CONFIG_CFI_PERMISSIVE is not set
+
+# Attack surface reduction: do not autoload TTY line disciplines.
+# CONFIG_LDISC_AUTOLOAD is not set
+
+# Dangerous; enabling this disables userspace brk ASLR.
+# CONFIG_COMPAT_BRK is not set
+
+# Dangerous; exposes kernel text image layout.
+# CONFIG_PROC_KCORE is not set
+
+# Dangerous; enabling this disables userspace VDSO ASLR.
+# CONFIG_COMPAT_VDSO is not set
+
+# Attack surface reduction: Use the modern PTY interface (devpts) only.
+# CONFIG_LEGACY_PTYS is not set
diff --git a/kernel/configs/kvm_guest.config b/kernel/configs/kvm_guest.config
index 208481d91090..d0877063d925 100644
--- a/kernel/configs/kvm_guest.config
+++ b/kernel/configs/kvm_guest.config
@@ -1,3 +1,4 @@
+# Help: Bootable as a KVM guest
CONFIG_NET=y
CONFIG_NET_CORE=y
CONFIG_NETDEVICES=y
diff --git a/kernel/configs/nopm.config b/kernel/configs/nopm.config
index 81ff07863576..ebfdc3d8aa9a 100644
--- a/kernel/configs/nopm.config
+++ b/kernel/configs/nopm.config
@@ -1,3 +1,5 @@
+# Help: Disable Power Management
+
CONFIG_PM=n
CONFIG_SUSPEND=n
CONFIG_HIBERNATION=n
diff --git a/kernel/configs/rust.config b/kernel/configs/rust.config
new file mode 100644
index 000000000000..2c6e001a7284
--- /dev/null
+++ b/kernel/configs/rust.config
@@ -0,0 +1,2 @@
+# Help: Enable Rust
+CONFIG_RUST=y
diff --git a/kernel/configs/tiny-base.config b/kernel/configs/tiny-base.config
index 2f0e6bf6db2c..ffb9dcafca26 100644
--- a/kernel/configs/tiny-base.config
+++ b/kernel/configs/tiny-base.config
@@ -1 +1 @@
-CONFIG_EMBEDDED=y
+CONFIG_EXPERT=y
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config
index 8a44b93da0f3..5dd0f0a34a73 100644
--- a/kernel/configs/tiny.config
+++ b/kernel/configs/tiny.config
@@ -1,11 +1,5 @@
-# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-# CONFIG_KERNEL_GZIP is not set
-# CONFIG_KERNEL_BZIP2 is not set
-# CONFIG_KERNEL_LZMA is not set
CONFIG_KERNEL_XZ=y
-# CONFIG_KERNEL_LZO is not set
-# CONFIG_KERNEL_LZ4 is not set
-# CONFIG_SLAB is not set
-# CONFIG_SLUB is not set
-CONFIG_SLOB=y
+CONFIG_SLUB=y
+CONFIG_SLUB_TINY=y
+CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=y
diff --git a/kernel/configs/x86_debug.config b/kernel/configs/x86_debug.config
new file mode 100644
index 000000000000..35f48671b8d5
--- /dev/null
+++ b/kernel/configs/x86_debug.config
@@ -0,0 +1,18 @@
+# Help: Debugging options for tip tree testing
+CONFIG_X86_DEBUG_FPU=y
+CONFIG_LOCK_STAT=y
+CONFIG_DEBUG_VM=y
+CONFIG_DEBUG_VM_VMACACHE=y
+CONFIG_DEBUG_VM_RB=y
+CONFIG_DEBUG_SLAB=y
+CONFIG_DEBUG_KMEMLEAK=y
+CONFIG_DEBUG_PAGEALLOC=y
+CONFIG_SLUB_DEBUG_ON=y
+CONFIG_DEBUG_OBJECTS=y
+CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT=1
+CONFIG_GCOV_KERNEL=y
+CONFIG_LOCKDEP=y
+CONFIG_PROVE_LOCKING=y
+CONFIG_SCHEDSTATS=y
+CONFIG_NOINSTR_VALIDATION=y
+CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
diff --git a/kernel/configs/xen.config b/kernel/configs/xen.config
index ff756221f112..1875a0a5047a 100644
--- a/kernel/configs/xen.config
+++ b/kernel/configs/xen.config
@@ -1,3 +1,5 @@
+# Help: Bootable as a Xen guest
+#
# global stuff - these enable us to allow some
# of the not so generic stuff below for xen
CONFIG_PARAVIRT=y
@@ -11,6 +13,8 @@ CONFIG_SCSI=y
CONFIG_FB=y
CONFIG_INPUT_MISC=y
CONFIG_MEMORY_HOTPLUG=y
+CONFIG_MEMORY_HOTREMOVE=y
+CONFIG_ZONE_DEVICE=y
CONFIG_TTY=y
# Technically not required but otherwise produces
# pretty useless systems starting from allnoconfig
@@ -34,7 +38,6 @@ CONFIG_INPUT_XEN_KBDDEV_FRONTEND=m
CONFIG_XEN_SCSI_FRONTEND=m
# others
CONFIG_XEN_BALLOON=y
-CONFIG_XEN_SCRUB_PAGES=y
CONFIG_XEN_DEV_EVTCHN=m
CONFIG_XEN_BLKDEV_FRONTEND=m
CONFIG_XEN_NETDEV_FRONTEND=m
@@ -46,3 +49,4 @@ CONFIG_XEN_GNTDEV=m
CONFIG_XEN_GRANT_DEV_ALLOC=m
CONFIG_SWIOTLB_XEN=y
CONFIG_XEN_PRIVCMD=m
+CONFIG_XEN_UNPOPULATED_ALLOC=y
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 36a98c48aedc..fb5be6e9b423 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -1,18 +1,20 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * Context tracking: Probe on high level context boundaries such as kernel
- * and userspace. This includes syscalls and exceptions entry/exit.
+ * Context tracking: Probe on high level context boundaries such as kernel,
+ * userspace, guest or idle.
*
* This is used by RCU to remove its dependency on the timer tick while a CPU
- * runs in userspace.
+ * runs in idle, userspace or guest mode.
*
- * Started by Frederic Weisbecker:
+ * User/guest tracking started by Frederic Weisbecker:
*
- * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com>
+ * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker
*
* Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton,
* Steven Rostedt, Peter Zijlstra for suggestions and improvements.
*
+ * RCU extended quiescent state bits imported from kernel/rcu/tree.c
+ * where the relevant authorship may be found.
*/
#include <linux/context_tracking.h>
@@ -21,16 +23,417 @@
#include <linux/hardirq.h>
#include <linux/export.h>
#include <linux/kprobes.h>
+#include <trace/events/rcu.h>
+
+
+DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
+#ifdef CONFIG_CONTEXT_TRACKING_IDLE
+ .nesting = 1,
+ .nmi_nesting = CT_NESTING_IRQ_NONIDLE,
+#endif
+ .state = ATOMIC_INIT(CT_RCU_WATCHING),
+};
+EXPORT_SYMBOL_GPL(context_tracking);
+
+#ifdef CONFIG_CONTEXT_TRACKING_IDLE
+#define TPS(x) tracepoint_string(x)
+
+/* Record the current task on exiting RCU-tasks (dyntick-idle entry). */
+static __always_inline void rcu_task_exit(void)
+{
+#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
+ WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id());
+#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
+}
+
+/* Record no current task on entering RCU-tasks (dyntick-idle exit). */
+static __always_inline void rcu_task_enter(void)
+{
+#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
+ WRITE_ONCE(current->rcu_tasks_idle_cpu, -1);
+#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
+}
+
+/* Turn on heavyweight RCU tasks trace readers on kernel exit. */
+static __always_inline void rcu_task_trace_heavyweight_enter(void)
+{
+#ifdef CONFIG_TASKS_TRACE_RCU
+ if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
+ current->trc_reader_special.b.need_mb = true;
+#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
+}
+
+/* Turn off heavyweight RCU tasks trace readers on kernel entry. */
+static __always_inline void rcu_task_trace_heavyweight_exit(void)
+{
+#ifdef CONFIG_TASKS_TRACE_RCU
+ if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
+ current->trc_reader_special.b.need_mb = false;
+#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
+}
+
+/*
+ * Record entry into an extended quiescent state. This is only to be
+ * called when not already in an extended quiescent state, that is,
+ * RCU is watching prior to the call to this function and is no longer
+ * watching upon return.
+ */
+static noinstr void ct_kernel_exit_state(int offset)
+{
+ /*
+ * CPUs seeing atomic_add_return() must see prior RCU read-side
+ * critical sections, and we also must force ordering with the
+ * next idle sojourn.
+ */
+ rcu_task_trace_heavyweight_enter(); // Before CT state update!
+ // RCU is still watching. Better not be in extended quiescent state!
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !rcu_is_watching_curr_cpu());
+ (void)ct_state_inc(offset);
+ // RCU is no longer watching.
+}
+
+/*
+ * Record exit from an extended quiescent state. This is only to be
+ * called from an extended quiescent state, that is, RCU is not watching
+ * prior to the call to this function and is watching upon return.
+ */
+static noinstr void ct_kernel_enter_state(int offset)
+{
+ int seq;
+
+ /*
+ * CPUs seeing atomic_add_return() must see prior idle sojourns,
+ * and we also must force ordering with the next RCU read-side
+ * critical section.
+ */
+ seq = ct_state_inc(offset);
+ // RCU is now watching. Better not be in an extended quiescent state!
+ rcu_task_trace_heavyweight_exit(); // After CT state update!
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & CT_RCU_WATCHING));
+}
+
+/*
+ * Enter an RCU extended quiescent state, which can be either the
+ * idle loop or adaptive-tickless usermode execution.
+ *
+ * We crowbar the ->nmi_nesting field to zero to allow for
+ * the possibility of usermode upcalls having messed up our count
+ * of interrupt nesting level during the prior busy period.
+ */
+static void noinstr ct_kernel_exit(bool user, int offset)
+{
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+
+ WARN_ON_ONCE(ct_nmi_nesting() != CT_NESTING_IRQ_NONIDLE);
+ WRITE_ONCE(ct->nmi_nesting, 0);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ ct_nesting() == 0);
+ if (ct_nesting() != 1) {
+ // RCU will still be watching, so just do accounting and leave.
+ ct->nesting--;
+ return;
+ }
+
+ instrumentation_begin();
+ lockdep_assert_irqs_disabled();
+ trace_rcu_watching(TPS("End"), ct_nesting(), 0, ct_rcu_watching());
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
+ rcu_preempt_deferred_qs(current);
+
+ // instrumentation for the noinstr ct_kernel_exit_state()
+ instrument_atomic_write(&ct->state, sizeof(ct->state));
+
+ instrumentation_end();
+ WRITE_ONCE(ct->nesting, 0); /* Avoid irq-access tearing. */
+ // RCU is watching here ...
+ ct_kernel_exit_state(offset);
+ // ... but is no longer watching here.
+ rcu_task_exit();
+}
+
+/*
+ * Exit an RCU extended quiescent state, which can be either the
+ * idle loop or adaptive-tickless usermode execution.
+ *
+ * We crowbar the ->nmi_nesting field to CT_NESTING_IRQ_NONIDLE to
+ * allow for the possibility of usermode upcalls messing up our count of
+ * interrupt nesting level during the busy period that is just now starting.
+ */
+static void noinstr ct_kernel_enter(bool user, int offset)
+{
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+ long oldval;
+
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled());
+ oldval = ct_nesting();
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
+ if (oldval) {
+ // RCU was already watching, so just do accounting and leave.
+ ct->nesting++;
+ return;
+ }
+ rcu_task_enter();
+ // RCU is not watching here ...
+ ct_kernel_enter_state(offset);
+ // ... but is watching here.
+ instrumentation_begin();
+
+ // instrumentation for the noinstr ct_kernel_enter_state()
+ instrument_atomic_write(&ct->state, sizeof(ct->state));
+
+ trace_rcu_watching(TPS("Start"), ct_nesting(), 1, ct_rcu_watching());
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
+ WRITE_ONCE(ct->nesting, 1);
+ WARN_ON_ONCE(ct_nmi_nesting());
+ WRITE_ONCE(ct->nmi_nesting, CT_NESTING_IRQ_NONIDLE);
+ instrumentation_end();
+}
+
+/**
+ * ct_nmi_exit - inform RCU of exit from NMI context
+ *
+ * If we are returning from the outermost NMI handler that interrupted an
+ * RCU-idle period, update ct->state and ct->nmi_nesting
+ * to let the RCU grace-period handling know that the CPU is back to
+ * being RCU-idle.
+ *
+ * If you add or remove a call to ct_nmi_exit(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
+ */
+void noinstr ct_nmi_exit(void)
+{
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+
+ instrumentation_begin();
+ /*
+ * Check for ->nmi_nesting underflow and bad CT state.
+ * (We are exiting an NMI handler, so RCU better be paying attention
+ * to us!)
+ */
+ WARN_ON_ONCE(ct_nmi_nesting() <= 0);
+ WARN_ON_ONCE(!rcu_is_watching_curr_cpu());
+
+ /*
+ * If the nesting level is not 1, the CPU wasn't RCU-idle, so
+ * leave it in non-RCU-idle state.
+ */
+ if (ct_nmi_nesting() != 1) {
+ trace_rcu_watching(TPS("--="), ct_nmi_nesting(), ct_nmi_nesting() - 2,
+ ct_rcu_watching());
+ WRITE_ONCE(ct->nmi_nesting, /* No store tearing. */
+ ct_nmi_nesting() - 2);
+ instrumentation_end();
+ return;
+ }
+
+ /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
+ trace_rcu_watching(TPS("Endirq"), ct_nmi_nesting(), 0, ct_rcu_watching());
+ WRITE_ONCE(ct->nmi_nesting, 0); /* Avoid store tearing. */
+
+ // instrumentation for the noinstr ct_kernel_exit_state()
+ instrument_atomic_write(&ct->state, sizeof(ct->state));
+ instrumentation_end();
+
+ // RCU is watching here ...
+ ct_kernel_exit_state(CT_RCU_WATCHING);
+ // ... but is no longer watching here.
+
+ if (!in_nmi())
+ rcu_task_exit();
+}
+
+/**
+ * ct_nmi_enter - inform RCU of entry to NMI context
+ *
+ * If the CPU was idle from RCU's viewpoint, update ct->state and
+ * ct->nmi_nesting to let the RCU grace-period handling know
+ * that the CPU is active. This implementation permits nested NMIs, as
+ * long as the nesting level does not overflow an int. (You will probably
+ * run out of stack space first.)
+ *
+ * If you add or remove a call to ct_nmi_enter(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
+ */
+void noinstr ct_nmi_enter(void)
+{
+ long incby = 2;
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+
+ /* Complain about underflow. */
+ WARN_ON_ONCE(ct_nmi_nesting() < 0);
+
+ /*
+ * If idle from RCU viewpoint, atomically increment CT state
+ * to mark non-idle and increment ->nmi_nesting by one.
+ * Otherwise, increment ->nmi_nesting by two. This means
+ * if ->nmi_nesting is equal to one, we are guaranteed
+ * to be in the outermost NMI handler that interrupted an RCU-idle
+ * period (observation due to Andy Lutomirski).
+ */
+ if (!rcu_is_watching_curr_cpu()) {
+
+ if (!in_nmi())
+ rcu_task_enter();
+
+ // RCU is not watching here ...
+ ct_kernel_enter_state(CT_RCU_WATCHING);
+ // ... but is watching here.
+
+ instrumentation_begin();
+ // instrumentation for the noinstr rcu_is_watching_curr_cpu()
+ instrument_atomic_read(&ct->state, sizeof(ct->state));
+ // instrumentation for the noinstr ct_kernel_enter_state()
+ instrument_atomic_write(&ct->state, sizeof(ct->state));
+
+ incby = 1;
+ } else if (!in_nmi()) {
+ instrumentation_begin();
+ rcu_irq_enter_check_tick();
+ } else {
+ instrumentation_begin();
+ }
+
+ trace_rcu_watching(incby == 1 ? TPS("Startirq") : TPS("++="),
+ ct_nmi_nesting(),
+ ct_nmi_nesting() + incby, ct_rcu_watching());
+ instrumentation_end();
+ WRITE_ONCE(ct->nmi_nesting, /* Prevent store tearing. */
+ ct_nmi_nesting() + incby);
+ barrier();
+}
+
+/**
+ * ct_idle_enter - inform RCU that current CPU is entering idle
+ *
+ * Enter idle mode, in other words, -leave- the mode in which RCU
+ * read-side critical sections can occur. (Though RCU read-side
+ * critical sections can occur in irq handlers in idle, a possibility
+ * handled by irq_enter() and irq_exit().)
+ *
+ * If you add or remove a call to ct_idle_enter(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
+ */
+void noinstr ct_idle_enter(void)
+{
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled());
+ ct_kernel_exit(false, CT_RCU_WATCHING + CT_STATE_IDLE);
+}
+EXPORT_SYMBOL_GPL(ct_idle_enter);
+
+/**
+ * ct_idle_exit - inform RCU that current CPU is leaving idle
+ *
+ * Exit idle mode, in other words, -enter- the mode in which RCU
+ * read-side critical sections can occur.
+ *
+ * If you add or remove a call to ct_idle_exit(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
+ */
+void noinstr ct_idle_exit(void)
+{
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ ct_kernel_enter(false, CT_RCU_WATCHING - CT_STATE_IDLE);
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(ct_idle_exit);
+
+/**
+ * ct_irq_enter - inform RCU that current CPU is entering irq away from idle
+ *
+ * Enter an interrupt handler, which might possibly result in exiting
+ * idle mode, in other words, entering the mode in which read-side critical
+ * sections can occur. The caller must have disabled interrupts.
+ *
+ * Note that the Linux kernel is fully capable of entering an interrupt
+ * handler that it never exits, for example when doing upcalls to user mode!
+ * This code assumes that the idle loop never does upcalls to user mode.
+ * If your architecture's idle loop does do upcalls to user mode (or does
+ * anything else that results in unbalanced calls to the irq_enter() and
+ * irq_exit() functions), RCU will give you what you deserve, good and hard.
+ * But very infrequently and irreproducibly.
+ *
+ * Use things like work queues to work around this limitation.
+ *
+ * You have been warned.
+ *
+ * If you add or remove a call to ct_irq_enter(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
+ */
+noinstr void ct_irq_enter(void)
+{
+ lockdep_assert_irqs_disabled();
+ ct_nmi_enter();
+}
+
+/**
+ * ct_irq_exit - inform RCU that current CPU is exiting irq towards idle
+ *
+ * Exit from an interrupt handler, which might possibly result in entering
+ * idle mode, in other words, leaving the mode in which read-side critical
+ * sections can occur. The caller must have disabled interrupts.
+ *
+ * This code assumes that the idle loop never does anything that might
+ * result in unbalanced calls to irq_enter() and irq_exit(). If your
+ * architecture's idle loop violates this assumption, RCU will give you what
+ * you deserve, good and hard. But very infrequently and irreproducibly.
+ *
+ * Use things like work queues to work around this limitation.
+ *
+ * You have been warned.
+ *
+ * If you add or remove a call to ct_irq_exit(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
+ */
+noinstr void ct_irq_exit(void)
+{
+ lockdep_assert_irqs_disabled();
+ ct_nmi_exit();
+}
+
+/*
+ * Wrapper for ct_irq_enter() where interrupts are enabled.
+ *
+ * If you add or remove a call to ct_irq_enter_irqson(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
+ */
+void ct_irq_enter_irqson(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ ct_irq_enter();
+ local_irq_restore(flags);
+}
+
+/*
+ * Wrapper for ct_irq_exit() where interrupts are enabled.
+ *
+ * If you add or remove a call to ct_irq_exit_irqson(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
+ */
+void ct_irq_exit_irqson(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ ct_irq_exit();
+ local_irq_restore(flags);
+}
+#else
+static __always_inline void ct_kernel_exit(bool user, int offset) { }
+static __always_inline void ct_kernel_enter(bool user, int offset) { }
+#endif /* #ifdef CONFIG_CONTEXT_TRACKING_IDLE */
+
+#ifdef CONFIG_CONTEXT_TRACKING_USER
#define CREATE_TRACE_POINTS
#include <trace/events/context_tracking.h>
-DEFINE_STATIC_KEY_FALSE(context_tracking_key);
+DEFINE_STATIC_KEY_FALSE_RO(context_tracking_key);
EXPORT_SYMBOL_GPL(context_tracking_key);
-DEFINE_PER_CPU(struct context_tracking, context_tracking);
-EXPORT_SYMBOL_GPL(context_tracking);
-
static noinstr bool context_tracking_recursion_enter(void)
{
int recursion;
@@ -51,66 +454,113 @@ static __always_inline void context_tracking_recursion_exit(void)
}
/**
- * context_tracking_enter - Inform the context tracking that the CPU is going
- * enter user or guest space mode.
+ * __ct_user_enter - Inform the context tracking that the CPU is going
+ * to enter user or guest space mode.
+ *
+ * @state: userspace context-tracking state to enter.
*
* This function must be called right before we switch from the kernel
* to user or guest space, when it's guaranteed the remaining kernel
* instructions to execute won't use any RCU read side critical section
* because this function sets RCU in extended quiescent state.
*/
-void noinstr __context_tracking_enter(enum ctx_state state)
+void noinstr __ct_user_enter(enum ctx_state state)
{
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+ lockdep_assert_irqs_disabled();
+
/* Kernel threads aren't supposed to go to userspace */
WARN_ON_ONCE(!current->mm);
if (!context_tracking_recursion_enter())
return;
- if ( __this_cpu_read(context_tracking.state) != state) {
- if (__this_cpu_read(context_tracking.active)) {
+ if (__ct_state() != state) {
+ if (ct->active) {
/*
* At this stage, only low level arch entry code remains and
* then we'll run in userspace. We can assume there won't be
* any RCU read-side critical section until the next call to
- * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
+ * user_exit() or ct_irq_enter(). Let's remove RCU's dependency
* on the tick.
*/
- if (state == CONTEXT_USER) {
+ if (state == CT_STATE_USER) {
instrumentation_begin();
trace_user_enter(0);
vtime_user_enter(current);
instrumentation_end();
}
- rcu_user_enter();
+ /*
+ * Other than generic entry implementation, we may be past the last
+ * rescheduling opportunity in the entry code. Trigger a self IPI
+ * that will fire and reschedule once we resume in user/guest mode.
+ */
+ rcu_irq_work_resched();
+
+ /*
+ * Enter RCU idle mode right before resuming userspace. No use of RCU
+ * is permitted between this call and rcu_eqs_exit(). This way the
+ * CPU doesn't need to maintain the tick for RCU maintenance purposes
+ * when the CPU runs in userspace.
+ */
+ ct_kernel_exit(true, CT_RCU_WATCHING + state);
+
+ /*
+ * Special case if we only track user <-> kernel transitions for tickless
+ * cputime accounting but we don't support RCU extended quiescent state.
+ * In this we case we don't care about any concurrency/ordering.
+ */
+ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE))
+ raw_atomic_set(&ct->state, state);
+ } else {
+ /*
+ * Even if context tracking is disabled on this CPU, because it's outside
+ * the full dynticks mask for example, we still have to keep track of the
+ * context transitions and states to prevent inconsistency on those of
+ * other CPUs.
+ * If a task triggers an exception in userspace, sleep on the exception
+ * handler and then migrate to another CPU, that new CPU must know where
+ * the exception returns by the time we call exception_exit().
+ * This information can only be provided by the previous CPU when it called
+ * exception_enter().
+ * OTOH we can spare the calls to vtime and RCU when context_tracking.active
+ * is false because we know that CPU is not tickless.
+ */
+ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) {
+ /* Tracking for vtime only, no concurrent RCU EQS accounting */
+ raw_atomic_set(&ct->state, state);
+ } else {
+ /*
+ * Tracking for vtime and RCU EQS. Make sure we don't race
+ * with NMIs. OTOH we don't care about ordering here since
+ * RCU only requires CT_RCU_WATCHING increments to be fully
+ * ordered.
+ */
+ raw_atomic_add(state, &ct->state);
+ }
}
- /*
- * Even if context tracking is disabled on this CPU, because it's outside
- * the full dynticks mask for example, we still have to keep track of the
- * context transitions and states to prevent inconsistency on those of
- * other CPUs.
- * If a task triggers an exception in userspace, sleep on the exception
- * handler and then migrate to another CPU, that new CPU must know where
- * the exception returns by the time we call exception_exit().
- * This information can only be provided by the previous CPU when it called
- * exception_enter().
- * OTOH we can spare the calls to vtime and RCU when context_tracking.active
- * is false because we know that CPU is not tickless.
- */
- __this_cpu_write(context_tracking.state, state);
}
context_tracking_recursion_exit();
}
-EXPORT_SYMBOL_GPL(__context_tracking_enter);
+EXPORT_SYMBOL_GPL(__ct_user_enter);
-void context_tracking_enter(enum ctx_state state)
+/*
+ * OBSOLETE:
+ * This function should be noinstr but the below local_irq_restore() is
+ * unsafe because it involves illegal RCU uses through tracing and lockdep.
+ * This is unlikely to be fixed as this function is obsolete. The preferred
+ * way is to call __context_tracking_enter() through user_enter_irqoff()
+ * or context_tracking_guest_enter(). It should be the arch entry code
+ * responsibility to call into context tracking with IRQs disabled.
+ */
+void ct_user_enter(enum ctx_state state)
{
unsigned long flags;
/*
* Some contexts may involve an exception occuring in an irq,
* leading to that nesting:
- * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
+ * ct_irq_enter() rcu_eqs_exit(true) rcu_eqs_enter(true) ct_irq_exit()
* This would mess up the dyntick_nesting count though. And rcu_irq_*()
* helpers are enough to protect RCU uses inside the exception. So
* just return immediately if we detect we are in an IRQ.
@@ -119,21 +569,34 @@ void context_tracking_enter(enum ctx_state state)
return;
local_irq_save(flags);
- __context_tracking_enter(state);
+ __ct_user_enter(state);
local_irq_restore(flags);
}
-NOKPROBE_SYMBOL(context_tracking_enter);
-EXPORT_SYMBOL_GPL(context_tracking_enter);
+NOKPROBE_SYMBOL(ct_user_enter);
+EXPORT_SYMBOL_GPL(ct_user_enter);
-void context_tracking_user_enter(void)
+/**
+ * user_enter_callable() - Unfortunate ASM callable version of user_enter() for
+ * archs that didn't manage to check the context tracking
+ * static key from low level code.
+ *
+ * This OBSOLETE function should be noinstr but it unsafely calls
+ * local_irq_restore(), involving illegal RCU uses through tracing and lockdep.
+ * This is unlikely to be fixed as this function is obsolete. The preferred
+ * way is to call user_enter_irqoff(). It should be the arch entry code
+ * responsibility to call into context tracking with IRQs disabled.
+ */
+void user_enter_callable(void)
{
user_enter();
}
-NOKPROBE_SYMBOL(context_tracking_user_enter);
+NOKPROBE_SYMBOL(user_enter_callable);
/**
- * context_tracking_exit - Inform the context tracking that the CPU is
- * exiting user or guest mode and entering the kernel.
+ * __ct_user_exit - Inform the context tracking that the CPU is
+ * exiting user or guest mode and entering the kernel.
+ *
+ * @state: userspace context-tracking state being exited from.
*
* This function must be called after we entered the kernel from user or
* guest space before any use of RCU read side critical section. This
@@ -143,32 +606,64 @@ NOKPROBE_SYMBOL(context_tracking_user_enter);
* This call supports re-entrancy. This way it can be called from any exception
* handler without needing to know if we came from userspace or not.
*/
-void noinstr __context_tracking_exit(enum ctx_state state)
+void noinstr __ct_user_exit(enum ctx_state state)
{
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+
if (!context_tracking_recursion_enter())
return;
- if (__this_cpu_read(context_tracking.state) == state) {
- if (__this_cpu_read(context_tracking.active)) {
+ if (__ct_state() == state) {
+ if (ct->active) {
/*
- * We are going to run code that may use RCU. Inform
- * RCU core about that (ie: we may need the tick again).
+ * Exit RCU idle mode while entering the kernel because it can
+ * run a RCU read side critical section anytime.
*/
- rcu_user_exit();
- if (state == CONTEXT_USER) {
+ ct_kernel_enter(true, CT_RCU_WATCHING - state);
+ if (state == CT_STATE_USER) {
instrumentation_begin();
vtime_user_exit(current);
trace_user_exit(0);
instrumentation_end();
}
+
+ /*
+ * Special case if we only track user <-> kernel transitions for tickless
+ * cputime accounting but we don't support RCU extended quiescent state.
+ * In this we case we don't care about any concurrency/ordering.
+ */
+ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE))
+ raw_atomic_set(&ct->state, CT_STATE_KERNEL);
+
+ } else {
+ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) {
+ /* Tracking for vtime only, no concurrent RCU EQS accounting */
+ raw_atomic_set(&ct->state, CT_STATE_KERNEL);
+ } else {
+ /*
+ * Tracking for vtime and RCU EQS. Make sure we don't race
+ * with NMIs. OTOH we don't care about ordering here since
+ * RCU only requires CT_RCU_WATCHING increments to be fully
+ * ordered.
+ */
+ raw_atomic_sub(state, &ct->state);
+ }
}
- __this_cpu_write(context_tracking.state, CONTEXT_KERNEL);
}
context_tracking_recursion_exit();
}
-EXPORT_SYMBOL_GPL(__context_tracking_exit);
+EXPORT_SYMBOL_GPL(__ct_user_exit);
-void context_tracking_exit(enum ctx_state state)
+/*
+ * OBSOLETE:
+ * This function should be noinstr but the below local_irq_save() is
+ * unsafe because it involves illegal RCU uses through tracing and lockdep.
+ * This is unlikely to be fixed as this function is obsolete. The preferred
+ * way is to call __context_tracking_exit() through user_exit_irqoff()
+ * or context_tracking_guest_exit(). It should be the arch entry code
+ * responsibility to call into context tracking with IRQs disabled.
+ */
+void ct_user_exit(enum ctx_state state)
{
unsigned long flags;
@@ -176,19 +671,30 @@ void context_tracking_exit(enum ctx_state state)
return;
local_irq_save(flags);
- __context_tracking_exit(state);
+ __ct_user_exit(state);
local_irq_restore(flags);
}
-NOKPROBE_SYMBOL(context_tracking_exit);
-EXPORT_SYMBOL_GPL(context_tracking_exit);
+NOKPROBE_SYMBOL(ct_user_exit);
+EXPORT_SYMBOL_GPL(ct_user_exit);
-void context_tracking_user_exit(void)
+/**
+ * user_exit_callable() - Unfortunate ASM callable version of user_exit() for
+ * archs that didn't manage to check the context tracking
+ * static key from low level code.
+ *
+ * This OBSOLETE function should be noinstr but it unsafely calls local_irq_save(),
+ * involving illegal RCU uses through tracing and lockdep. This is unlikely
+ * to be fixed as this function is obsolete. The preferred way is to call
+ * user_exit_irqoff(). It should be the arch entry code responsibility to
+ * call into context tracking with IRQs disabled.
+ */
+void user_exit_callable(void)
{
user_exit();
}
-NOKPROBE_SYMBOL(context_tracking_user_exit);
+NOKPROBE_SYMBOL(user_exit_callable);
-void __init context_tracking_cpu_set(int cpu)
+void __init ct_cpu_track_user(int cpu)
{
static __initdata bool initialized = false;
@@ -212,12 +718,14 @@ void __init context_tracking_cpu_set(int cpu)
initialized = true;
}
-#ifdef CONFIG_CONTEXT_TRACKING_FORCE
+#ifdef CONFIG_CONTEXT_TRACKING_USER_FORCE
void __init context_tracking_init(void)
{
int cpu;
for_each_possible_cpu(cpu)
- context_tracking_cpu_set(cpu);
+ ct_cpu_track_user(cpu);
}
#endif
+
+#endif /* #ifdef CONFIG_CONTEXT_TRACKING_USER */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 407a2568f35e..b674fdf96208 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -17,6 +17,7 @@
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/rcupdate.h>
+#include <linux/delay.h>
#include <linux/export.h>
#include <linux/bug.h>
#include <linux/kthread.h>
@@ -34,6 +35,9 @@
#include <linux/scs.h>
#include <linux/percpu-rwsem.h>
#include <linux/cpuset.h>
+#include <linux/random.h>
+#include <linux/cc_platform.h>
+#include <linux/parser.h>
#include <trace/events/power.h>
#define CREATE_TRACE_POINTS
@@ -51,12 +55,12 @@
* @rollback: Perform a rollback
* @single: Single callback invocation
* @bringup: Single callback bringup or teardown selector
- * @cpu: CPU number
* @node: Remote CPU node; for multi-instance, do a
* single entry callback for install/remove
* @last: For multi-instance rollback, remember how far we got
* @cb_state: The state for a single callback (install/uninstall)
* @result: Result of the operation
+ * @ap_sync_state: State for AP synchronization
* @done_up: Signal completion to the issuer of the task for cpu-up
* @done_down: Signal completion to the issuer of the task for cpu-down
*/
@@ -70,11 +74,11 @@ struct cpuhp_cpu_state {
bool rollback;
bool single;
bool bringup;
- int cpu;
struct hlist_node *node;
struct hlist_node *last;
enum cpuhp_state cb_state;
int result;
+ atomic_t ap_sync_state;
struct completion done_up;
struct completion done_down;
#endif
@@ -275,6 +279,182 @@ static bool cpuhp_is_atomic_state(enum cpuhp_state state)
return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE;
}
+/* Synchronization state management */
+enum cpuhp_sync_state {
+ SYNC_STATE_DEAD,
+ SYNC_STATE_KICKED,
+ SYNC_STATE_SHOULD_DIE,
+ SYNC_STATE_ALIVE,
+ SYNC_STATE_SHOULD_ONLINE,
+ SYNC_STATE_ONLINE,
+};
+
+#ifdef CONFIG_HOTPLUG_CORE_SYNC
+/**
+ * cpuhp_ap_update_sync_state - Update synchronization state during bringup/teardown
+ * @state: The synchronization state to set
+ *
+ * No synchronization point. Just update of the synchronization state, but implies
+ * a full barrier so that the AP changes are visible before the control CPU proceeds.
+ */
+static inline void cpuhp_ap_update_sync_state(enum cpuhp_sync_state state)
+{
+ atomic_t *st = this_cpu_ptr(&cpuhp_state.ap_sync_state);
+
+ (void)atomic_xchg(st, state);
+}
+
+void __weak arch_cpuhp_sync_state_poll(void) { cpu_relax(); }
+
+static bool cpuhp_wait_for_sync_state(unsigned int cpu, enum cpuhp_sync_state state,
+ enum cpuhp_sync_state next_state)
+{
+ atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu);
+ ktime_t now, end, start = ktime_get();
+ int sync;
+
+ end = start + 10ULL * NSEC_PER_SEC;
+
+ sync = atomic_read(st);
+ while (1) {
+ if (sync == state) {
+ if (!atomic_try_cmpxchg(st, &sync, next_state))
+ continue;
+ return true;
+ }
+
+ now = ktime_get();
+ if (now > end) {
+ /* Timeout. Leave the state unchanged */
+ return false;
+ } else if (now - start < NSEC_PER_MSEC) {
+ /* Poll for one millisecond */
+ arch_cpuhp_sync_state_poll();
+ } else {
+ usleep_range(USEC_PER_MSEC, 2 * USEC_PER_MSEC);
+ }
+ sync = atomic_read(st);
+ }
+ return true;
+}
+#else /* CONFIG_HOTPLUG_CORE_SYNC */
+static inline void cpuhp_ap_update_sync_state(enum cpuhp_sync_state state) { }
+#endif /* !CONFIG_HOTPLUG_CORE_SYNC */
+
+#ifdef CONFIG_HOTPLUG_CORE_SYNC_DEAD
+/**
+ * cpuhp_ap_report_dead - Update synchronization state to DEAD
+ *
+ * No synchronization point. Just update of the synchronization state.
+ */
+void cpuhp_ap_report_dead(void)
+{
+ cpuhp_ap_update_sync_state(SYNC_STATE_DEAD);
+}
+
+void __weak arch_cpuhp_cleanup_dead_cpu(unsigned int cpu) { }
+
+/*
+ * Late CPU shutdown synchronization point. Cannot use cpuhp_state::done_down
+ * because the AP cannot issue complete() at this stage.
+ */
+static void cpuhp_bp_sync_dead(unsigned int cpu)
+{
+ atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu);
+ int sync = atomic_read(st);
+
+ do {
+ /* CPU can have reported dead already. Don't overwrite that! */
+ if (sync == SYNC_STATE_DEAD)
+ break;
+ } while (!atomic_try_cmpxchg(st, &sync, SYNC_STATE_SHOULD_DIE));
+
+ if (cpuhp_wait_for_sync_state(cpu, SYNC_STATE_DEAD, SYNC_STATE_DEAD)) {
+ /* CPU reached dead state. Invoke the cleanup function */
+ arch_cpuhp_cleanup_dead_cpu(cpu);
+ return;
+ }
+
+ /* No further action possible. Emit message and give up. */
+ pr_err("CPU%u failed to report dead state\n", cpu);
+}
+#else /* CONFIG_HOTPLUG_CORE_SYNC_DEAD */
+static inline void cpuhp_bp_sync_dead(unsigned int cpu) { }
+#endif /* !CONFIG_HOTPLUG_CORE_SYNC_DEAD */
+
+#ifdef CONFIG_HOTPLUG_CORE_SYNC_FULL
+/**
+ * cpuhp_ap_sync_alive - Synchronize AP with the control CPU once it is alive
+ *
+ * Updates the AP synchronization state to SYNC_STATE_ALIVE and waits
+ * for the BP to release it.
+ */
+void cpuhp_ap_sync_alive(void)
+{
+ atomic_t *st = this_cpu_ptr(&cpuhp_state.ap_sync_state);
+
+ cpuhp_ap_update_sync_state(SYNC_STATE_ALIVE);
+
+ /* Wait for the control CPU to release it. */
+ while (atomic_read(st) != SYNC_STATE_SHOULD_ONLINE)
+ cpu_relax();
+}
+
+static bool cpuhp_can_boot_ap(unsigned int cpu)
+{
+ atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu);
+ int sync = atomic_read(st);
+
+again:
+ switch (sync) {
+ case SYNC_STATE_DEAD:
+ /* CPU is properly dead */
+ break;
+ case SYNC_STATE_KICKED:
+ /* CPU did not come up in previous attempt */
+ break;
+ case SYNC_STATE_ALIVE:
+ /* CPU is stuck cpuhp_ap_sync_alive(). */
+ break;
+ default:
+ /* CPU failed to report online or dead and is in limbo state. */
+ return false;
+ }
+
+ /* Prepare for booting */
+ if (!atomic_try_cmpxchg(st, &sync, SYNC_STATE_KICKED))
+ goto again;
+
+ return true;
+}
+
+void __weak arch_cpuhp_cleanup_kick_cpu(unsigned int cpu) { }
+
+/*
+ * Early CPU bringup synchronization point. Cannot use cpuhp_state::done_up
+ * because the AP cannot issue complete() so early in the bringup.
+ */
+static int cpuhp_bp_sync_alive(unsigned int cpu)
+{
+ int ret = 0;
+
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CORE_SYNC_FULL))
+ return 0;
+
+ if (!cpuhp_wait_for_sync_state(cpu, SYNC_STATE_ALIVE, SYNC_STATE_SHOULD_ONLINE)) {
+ pr_err("CPU%u failed to report alive state\n", cpu);
+ ret = -EIO;
+ }
+
+ /* Let the architecture cleanup the kick alive mechanics. */
+ arch_cpuhp_cleanup_kick_cpu(cpu);
+ return ret;
+}
+#else /* CONFIG_HOTPLUG_CORE_SYNC_FULL */
+static inline int cpuhp_bp_sync_alive(unsigned int cpu) { return 0; }
+static inline bool cpuhp_can_boot_ap(unsigned int cpu) { return true; }
+#endif /* !CONFIG_HOTPLUG_CORE_SYNC_FULL */
+
/* Serializes the updates to cpu_online_mask, cpu_present_mask */
static DEFINE_MUTEX(cpu_add_remove_lock);
bool cpuhp_tasks_frozen;
@@ -304,6 +484,8 @@ static int cpu_hotplug_disabled;
DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock);
+static bool cpu_hotplug_offline_disabled __ro_after_init;
+
void cpus_read_lock(void)
{
percpu_down_read(&cpu_hotplug_lock);
@@ -345,6 +527,7 @@ void lockdep_assert_cpus_held(void)
percpu_rwsem_assert_held(&cpu_hotplug_lock);
}
+EXPORT_SYMBOL_GPL(lockdep_assert_cpus_held);
#ifdef CONFIG_LOCKDEP
int lockdep_is_cpus_held(void)
@@ -363,6 +546,14 @@ static void lockdep_release_cpus_lock(void)
rwsem_release(&cpu_hotplug_lock.dep_map, _THIS_IP_);
}
+/* Declare CPU offlining not supported */
+void cpu_hotplug_disable_offlining(void)
+{
+ cpu_maps_update_begin();
+ cpu_hotplug_offline_disabled = true;
+ cpu_maps_update_done();
+}
+
/*
* Wait for currently running CPU hotplug operations to complete (if any) and
* disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects
@@ -412,7 +603,10 @@ static void lockdep_release_cpus_lock(void)
void __weak arch_smt_update(void) { }
#ifdef CONFIG_HOTPLUG_SMT
+
enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;
+static unsigned int cpu_smt_max_threads __ro_after_init;
+unsigned int cpu_smt_num_threads __read_mostly = UINT_MAX;
void __init cpu_smt_disable(bool force)
{
@@ -426,16 +620,33 @@ void __init cpu_smt_disable(bool force)
pr_info("SMT: disabled\n");
cpu_smt_control = CPU_SMT_DISABLED;
}
+ cpu_smt_num_threads = 1;
}
/*
* The decision whether SMT is supported can only be done after the full
* CPU identification. Called from architecture code.
*/
-void __init cpu_smt_check_topology(void)
+void __init cpu_smt_set_num_threads(unsigned int num_threads,
+ unsigned int max_threads)
{
- if (!topology_smt_supported())
+ WARN_ON(!num_threads || (num_threads > max_threads));
+
+ if (max_threads == 1)
cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
+
+ cpu_smt_max_threads = max_threads;
+
+ /*
+ * If SMT has been disabled via the kernel command line or SMT is
+ * not supported, set cpu_smt_num_threads to 1 for consistency.
+ * If enabled, take the architecture requested number of threads
+ * to bring up into account.
+ */
+ if (cpu_smt_control != CPU_SMT_ENABLED)
+ cpu_smt_num_threads = 1;
+ else if (num_threads < cpu_smt_num_threads)
+ cpu_smt_num_threads = num_threads;
}
static int __init smt_cmdline_disable(char *str)
@@ -445,9 +656,31 @@ static int __init smt_cmdline_disable(char *str)
}
early_param("nosmt", smt_cmdline_disable);
-static inline bool cpu_smt_allowed(unsigned int cpu)
+/*
+ * For Archicture supporting partial SMT states check if the thread is allowed.
+ * Otherwise this has already been checked through cpu_smt_max_threads when
+ * setting the SMT level.
+ */
+static inline bool cpu_smt_thread_allowed(unsigned int cpu)
+{
+#ifdef CONFIG_SMT_NUM_THREADS_DYNAMIC
+ return topology_smt_thread_allowed(cpu);
+#else
+ return true;
+#endif
+}
+
+static inline bool cpu_bootable(unsigned int cpu)
{
- if (cpu_smt_control == CPU_SMT_ENABLED)
+ if (cpu_smt_control == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu))
+ return true;
+
+ /* All CPUs are bootable if controls are not configured */
+ if (cpu_smt_control == CPU_SMT_NOT_IMPLEMENTED)
+ return true;
+
+ /* All CPUs are bootable if CPU is not SMT capable */
+ if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
return true;
if (topology_is_primary_thread(cpu))
@@ -462,19 +695,20 @@ static inline bool cpu_smt_allowed(unsigned int cpu)
return !cpumask_test_cpu(cpu, &cpus_booted_once_mask);
}
-/* Returns true if SMT is not supported of forcefully (irreversibly) disabled */
+/* Returns true if SMT is supported and not forcefully (irreversibly) disabled */
bool cpu_smt_possible(void)
{
return cpu_smt_control != CPU_SMT_FORCE_DISABLED &&
cpu_smt_control != CPU_SMT_NOT_SUPPORTED;
}
EXPORT_SYMBOL_GPL(cpu_smt_possible);
+
#else
-static inline bool cpu_smt_allowed(unsigned int cpu) { return true; }
+static inline bool cpu_bootable(unsigned int cpu) { return true; }
#endif
static inline enum cpuhp_state
-cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
+cpuhp_set_state(int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target)
{
enum cpuhp_state prev_state = st->state;
bool bringup = st->state < target;
@@ -485,14 +719,15 @@ cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
st->target = target;
st->single = false;
st->bringup = bringup;
- if (cpu_dying(st->cpu) != !bringup)
- set_cpu_dying(st->cpu, !bringup);
+ if (cpu_dying(cpu) != !bringup)
+ set_cpu_dying(cpu, !bringup);
return prev_state;
}
static inline void
-cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
+cpuhp_reset_state(int cpu, struct cpuhp_cpu_state *st,
+ enum cpuhp_state prev_state)
{
bool bringup = !st->bringup;
@@ -519,8 +754,8 @@ cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
}
st->bringup = bringup;
- if (cpu_dying(st->cpu) != !bringup)
- set_cpu_dying(st->cpu, !bringup);
+ if (cpu_dying(cpu) != !bringup)
+ set_cpu_dying(cpu, !bringup);
}
/* Regular hotplug invocation of the AP hotplug thread */
@@ -540,22 +775,23 @@ static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st)
wait_for_ap_thread(st, st->bringup);
}
-static int cpuhp_kick_ap(struct cpuhp_cpu_state *st, enum cpuhp_state target)
+static int cpuhp_kick_ap(int cpu, struct cpuhp_cpu_state *st,
+ enum cpuhp_state target)
{
enum cpuhp_state prev_state;
int ret;
- prev_state = cpuhp_set_state(st, target);
+ prev_state = cpuhp_set_state(cpu, st, target);
__cpuhp_kick_ap(st);
if ((ret = st->result)) {
- cpuhp_reset_state(st, prev_state);
+ cpuhp_reset_state(cpu, st, prev_state);
__cpuhp_kick_ap(st);
}
return ret;
}
-static int bringup_wait_for_ap(unsigned int cpu)
+static int bringup_wait_for_ap_online(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
@@ -571,43 +807,99 @@ static int bringup_wait_for_ap(unsigned int cpu)
* SMT soft disabling on X86 requires to bring the CPU out of the
* BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The
* CPU marked itself as booted_once in notify_cpu_starting() so the
- * cpu_smt_allowed() check will now return false if this is not the
+ * cpu_bootable() check will now return false if this is not the
* primary sibling.
*/
- if (!cpu_smt_allowed(cpu))
+ if (!cpu_bootable(cpu))
return -ECANCELED;
+ return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_SPLIT_STARTUP
+static int cpuhp_kick_ap_alive(unsigned int cpu)
+{
+ if (!cpuhp_can_boot_ap(cpu))
+ return -EAGAIN;
+
+ return arch_cpuhp_kick_ap_alive(cpu, idle_thread_get(cpu));
+}
+
+static int cpuhp_bringup_ap(unsigned int cpu)
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ int ret;
+
+ /*
+ * Some architectures have to walk the irq descriptors to
+ * setup the vector space for the cpu which comes online.
+ * Prevent irq alloc/free across the bringup.
+ */
+ irq_lock_sparse();
+
+ ret = cpuhp_bp_sync_alive(cpu);
+ if (ret)
+ goto out_unlock;
+
+ ret = bringup_wait_for_ap_online(cpu);
+ if (ret)
+ goto out_unlock;
+
+ irq_unlock_sparse();
if (st->target <= CPUHP_AP_ONLINE_IDLE)
return 0;
- return cpuhp_kick_ap(st, st->target);
-}
+ return cpuhp_kick_ap(cpu, st, st->target);
+out_unlock:
+ irq_unlock_sparse();
+ return ret;
+}
+#else
static int bringup_cpu(unsigned int cpu)
{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
struct task_struct *idle = idle_thread_get(cpu);
int ret;
- /*
- * Reset stale stack state from the last time this CPU was online.
- */
- scs_task_reset(idle);
- kasan_unpoison_task_stack(idle);
+ if (!cpuhp_can_boot_ap(cpu))
+ return -EAGAIN;
/*
* Some architectures have to walk the irq descriptors to
* setup the vector space for the cpu which comes online.
- * Prevent irq alloc/free across the bringup.
+ *
+ * Prevent irq alloc/free across the bringup by acquiring the
+ * sparse irq lock. Hold it until the upcoming CPU completes the
+ * startup in cpuhp_online_idle() which allows to avoid
+ * intermediate synchronization points in the architecture code.
*/
irq_lock_sparse();
- /* Arch-specific enabling code. */
ret = __cpu_up(cpu, idle);
- irq_unlock_sparse();
if (ret)
- return ret;
- return bringup_wait_for_ap(cpu);
+ goto out_unlock;
+
+ ret = cpuhp_bp_sync_alive(cpu);
+ if (ret)
+ goto out_unlock;
+
+ ret = bringup_wait_for_ap_online(cpu);
+ if (ret)
+ goto out_unlock;
+
+ irq_unlock_sparse();
+
+ if (st->target <= CPUHP_AP_ONLINE_IDLE)
+ return 0;
+
+ return cpuhp_kick_ap(cpu, st, st->target);
+
+out_unlock:
+ irq_unlock_sparse();
+ return ret;
}
+#endif
static int finish_cpu(unsigned int cpu)
{
@@ -615,12 +907,13 @@ static int finish_cpu(unsigned int cpu)
struct mm_struct *mm = idle->active_mm;
/*
- * idle_task_exit() will have switched to &init_mm, now
- * clean up any remaining active_mm state.
+ * sched_force_init_mm() ensured the use of &init_mm,
+ * drop that refcount now that the CPU has stopped.
*/
- if (mm != &init_mm)
- idle->active_mm = &init_mm;
- mmdrop(mm);
+ WARN_ON(mm != &init_mm);
+ idle->active_mm = NULL;
+ mmdrop_lazy_tlb(mm);
+
return 0;
}
@@ -660,21 +953,51 @@ static bool cpuhp_next_state(bool bringup,
return true;
}
-static int cpuhp_invoke_callback_range(bool bringup,
- unsigned int cpu,
- struct cpuhp_cpu_state *st,
- enum cpuhp_state target)
+static int __cpuhp_invoke_callback_range(bool bringup,
+ unsigned int cpu,
+ struct cpuhp_cpu_state *st,
+ enum cpuhp_state target,
+ bool nofail)
{
enum cpuhp_state state;
- int err = 0;
+ int ret = 0;
while (cpuhp_next_state(bringup, &state, st, target)) {
+ int err;
+
err = cpuhp_invoke_callback(cpu, state, bringup, NULL, NULL);
- if (err)
+ if (!err)
+ continue;
+
+ if (nofail) {
+ pr_warn("CPU %u %s state %s (%d) failed (%d)\n",
+ cpu, bringup ? "UP" : "DOWN",
+ cpuhp_get_step(st->state)->name,
+ st->state, err);
+ ret = -1;
+ } else {
+ ret = err;
break;
+ }
}
- return err;
+ return ret;
+}
+
+static inline int cpuhp_invoke_callback_range(bool bringup,
+ unsigned int cpu,
+ struct cpuhp_cpu_state *st,
+ enum cpuhp_state target)
+{
+ return __cpuhp_invoke_callback_range(bringup, cpu, st, target, false);
+}
+
+static inline void cpuhp_invoke_callback_range_nofail(bool bringup,
+ unsigned int cpu,
+ struct cpuhp_cpu_state *st,
+ enum cpuhp_state target)
+{
+ __cpuhp_invoke_callback_range(bringup, cpu, st, target, true);
}
static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st)
@@ -703,7 +1026,7 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
ret, cpu, cpuhp_get_step(st->state)->name,
st->state);
- cpuhp_reset_state(st, prev_state);
+ cpuhp_reset_state(cpu, st, prev_state);
if (can_rollback_cpu(st))
WARN_ON(cpuhp_invoke_callback_range(false, cpu, st,
prev_state));
@@ -714,15 +1037,6 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
/*
* The cpu hotplug threads manage the bringup and teardown of the cpus
*/
-static void cpuhp_create(unsigned int cpu)
-{
- struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
-
- init_completion(&st->done_up);
- init_completion(&st->done_down);
- st->cpu = cpu;
-}
-
static int cpuhp_should_run(unsigned int cpu)
{
struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
@@ -874,7 +1188,7 @@ static int cpuhp_kick_ap_work(unsigned int cpu)
cpuhp_lock_release(true);
trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work);
- ret = cpuhp_kick_ap(st, st->target);
+ ret = cpuhp_kick_ap(cpu, st, st->target);
trace_cpuhp_exit(cpu, st->state, prev_state, ret);
return ret;
@@ -882,63 +1196,29 @@ static int cpuhp_kick_ap_work(unsigned int cpu)
static struct smp_hotplug_thread cpuhp_threads = {
.store = &cpuhp_state.thread,
- .create = &cpuhp_create,
.thread_should_run = cpuhp_should_run,
.thread_fn = cpuhp_thread_fun,
.thread_comm = "cpuhp/%u",
.selfparking = true,
};
-void __init cpuhp_threads_init(void)
+static __init void cpuhp_init_state(void)
{
- BUG_ON(smpboot_register_percpu_thread(&cpuhp_threads));
- kthread_unpark(this_cpu_read(cpuhp_state.thread));
+ struct cpuhp_cpu_state *st;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ st = per_cpu_ptr(&cpuhp_state, cpu);
+ init_completion(&st->done_up);
+ init_completion(&st->done_down);
+ }
}
-/*
- *
- * Serialize hotplug trainwrecks outside of the cpu_hotplug_lock
- * protected region.
- *
- * The operation is still serialized against concurrent CPU hotplug via
- * cpu_add_remove_lock, i.e. CPU map protection. But it is _not_
- * serialized against other hotplug related activity like adding or
- * removing of state callbacks and state instances, which invoke either the
- * startup or the teardown callback of the affected state.
- *
- * This is required for subsystems which are unfixable vs. CPU hotplug and
- * evade lock inversion problems by scheduling work which has to be
- * completed _before_ cpu_up()/_cpu_down() returns.
- *
- * Don't even think about adding anything to this for any new code or even
- * drivers. It's only purpose is to keep existing lock order trainwrecks
- * working.
- *
- * For cpu_down() there might be valid reasons to finish cleanups which are
- * not required to be done under cpu_hotplug_lock, but that's a different
- * story and would be not invoked via this.
- */
-static void cpu_up_down_serialize_trainwrecks(bool tasks_frozen)
+void __init cpuhp_threads_init(void)
{
- /*
- * cpusets delegate hotplug operations to a worker to "solve" the
- * lock order problems. Wait for the worker, but only if tasks are
- * _not_ frozen (suspend, hibernate) as that would wait forever.
- *
- * The wait is required because otherwise the hotplug operation
- * returns with inconsistent state, which could even be observed in
- * user space when a new CPU is brought up. The CPU plug uevent
- * would be delivered and user space reacting on it would fail to
- * move tasks to the newly plugged CPU up to the point where the
- * work has finished because up to that point the newly plugged CPU
- * is not assignable in cpusets/cgroups. On unplug that's not
- * necessarily a visible issue, but it is still inconsistent state,
- * which is the real problem which needs to be "fixed". This can't
- * prevent the transient state between scheduling the work and
- * returning from waiting for it.
- */
- if (!tasks_frozen)
- cpuset_wait_for_hotplug();
+ cpuhp_init_state();
+ BUG_ON(smpboot_register_percpu_thread(&cpuhp_threads));
+ kthread_unpark(this_cpu_read(cpuhp_state.thread));
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -993,7 +1273,6 @@ static int take_cpu_down(void *_param)
struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE);
int err, cpu = smp_processor_id();
- int ret;
/* Ensure this CPU doesn't handle any more interrupts. */
err = __cpu_disable();
@@ -1006,18 +1285,11 @@ static int take_cpu_down(void *_param)
*/
WARN_ON(st->state != (CPUHP_TEARDOWN_CPU - 1));
- /* Invoke the former CPU_DYING callbacks */
- ret = cpuhp_invoke_callback_range(false, cpu, st, target);
-
/*
- * DYING must not fail!
+ * Invoke the former CPU_DYING callbacks. DYING must not fail!
*/
- WARN_ON_ONCE(ret);
+ cpuhp_invoke_callback_range_nofail(false, cpu, st, target);
- /* Give up timekeeping duties */
- tick_handover_do_timer();
- /* Remove CPU from timer broadcasting */
- tick_offline_cpu(cpu);
/* Park the stopper thread */
stop_machine_park(cpu);
return 0;
@@ -1037,9 +1309,6 @@ static int takedown_cpu(unsigned int cpu)
*/
irq_lock_sparse();
- /*
- * So now all preempt/rcu users must observe !cpu_active().
- */
err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu));
if (err) {
/* CPU refused to die */
@@ -1067,8 +1336,17 @@ static int takedown_cpu(unsigned int cpu)
/* This actually kills the CPU. */
__cpu_die(cpu);
- tick_cleanup_dead_cpu(cpu);
+ cpuhp_bp_sync_dead(cpu);
+
+ lockdep_cleanup_dead_cpu(cpu, idle_thread_get(cpu));
+
+ /*
+ * Callbacks must be re-integrated right away to the RCU state machine.
+ * Otherwise an RCU callback could block a further teardown function
+ * waiting for its completion.
+ */
rcutree_migrate_callbacks(cpu);
+
return 0;
}
@@ -1084,10 +1362,11 @@ void cpuhp_report_idle_dead(void)
struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
BUG_ON(st->state != CPUHP_AP_OFFLINE);
- rcu_report_dead(smp_processor_id());
+ tick_assert_timekeeping_handover();
+ rcutree_report_cpu_dead();
st->state = CPUHP_AP_IDLE_DEAD;
/*
- * We cannot call complete after rcu_report_dead() so we delegate it
+ * We cannot call complete after rcutree_report_cpu_dead() so we delegate it
* to an online cpu.
*/
smp_call_function_single(cpumask_first(cpu_online_mask),
@@ -1106,7 +1385,7 @@ static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
ret, cpu, cpuhp_get_step(st->state)->name,
st->state);
- cpuhp_reset_state(st, prev_state);
+ cpuhp_reset_state(cpu, st, prev_state);
if (st->state < prev_state)
WARN_ON(cpuhp_invoke_callback_range(true, cpu, st,
@@ -1133,7 +1412,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
cpuhp_tasks_frozen = tasks_frozen;
- prev_state = cpuhp_set_state(st, target);
+ prev_state = cpuhp_set_state(cpu, st, target);
/*
* If the current CPU state is in the range of the AP hotplug thread,
* then we need to kick the thread.
@@ -1164,7 +1443,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
ret = cpuhp_down_callbacks(cpu, st, target);
if (ret && st->state < prev_state) {
if (st->state == CPUHP_TEARDOWN_CPU) {
- cpuhp_reset_state(st, prev_state);
+ cpuhp_reset_state(cpu, st, prev_state);
__cpuhp_kick_ap(st);
} else {
WARN(1, "DEAD callback error for CPU%d", cpu);
@@ -1173,21 +1452,46 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
out:
cpus_write_unlock();
- /*
- * Do post unplug cleanup. This is still protected against
- * concurrent CPU hotplug via cpu_add_remove_lock.
- */
- lockup_detector_cleanup();
arch_smt_update();
- cpu_up_down_serialize_trainwrecks(tasks_frozen);
return ret;
}
+struct cpu_down_work {
+ unsigned int cpu;
+ enum cpuhp_state target;
+};
+
+static long __cpu_down_maps_locked(void *arg)
+{
+ struct cpu_down_work *work = arg;
+
+ return _cpu_down(work->cpu, 0, work->target);
+}
+
static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
{
+ struct cpu_down_work work = { .cpu = cpu, .target = target, };
+
+ /*
+ * If the platform does not support hotplug, report it explicitly to
+ * differentiate it from a transient offlining failure.
+ */
+ if (cpu_hotplug_offline_disabled)
+ return -EOPNOTSUPP;
if (cpu_hotplug_disabled)
return -EBUSY;
- return _cpu_down(cpu, 0, target);
+
+ /*
+ * Ensure that the control task does not run on the to be offlined
+ * CPU to prevent a deadlock against cfs_b->period_timer.
+ * Also keep at least one housekeeping cpu onlined to avoid generating
+ * an empty sched_domain span.
+ */
+ for_each_cpu_and(cpu, cpu_online_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)) {
+ if (cpu != work.cpu)
+ return work_on_cpu(cpu, __cpu_down_maps_locked, &work);
+ }
+ return -EBUSY;
}
static int cpu_down(unsigned int cpu, enum cpuhp_state target)
@@ -1284,16 +1588,14 @@ void notify_cpu_starting(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
- int ret;
- rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */
+ rcutree_report_cpu_starting(cpu); /* Enables RCU usage on this CPU. */
cpumask_set_cpu(cpu, &cpus_booted_once_mask);
- ret = cpuhp_invoke_callback_range(true, cpu, st, target);
/*
* STARTING must not fail!
*/
- WARN_ON_ONCE(ret);
+ cpuhp_invoke_callback_range_nofail(true, cpu, st, target);
}
/*
@@ -1309,8 +1611,10 @@ void cpuhp_online_idle(enum cpuhp_state state)
if (state != CPUHP_AP_ONLINE_IDLE)
return;
+ cpuhp_ap_update_sync_state(SYNC_STATE_ONLINE);
+
/*
- * Unpart the stopper thread before we start the idle loop (and start
+ * Unpark the stopper thread before we start the idle loop (and start
* scheduling); this ensures the stopper task is always available.
*/
stop_machine_unpark(smp_processor_id());
@@ -1347,11 +1651,17 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
ret = PTR_ERR(idle);
goto out;
}
+
+ /*
+ * Reset stale stack state from the last time this CPU was online.
+ */
+ scs_task_reset(idle);
+ kasan_unpoison_task_stack(idle);
}
cpuhp_tasks_frozen = tasks_frozen;
- cpuhp_set_state(st, target);
+ cpuhp_set_state(cpu, st, target);
/*
* If the current CPU state is in the range of the AP hotplug thread,
* then we need to kick the thread once more.
@@ -1376,7 +1686,6 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
out:
cpus_write_unlock();
arch_smt_update();
- cpu_up_down_serialize_trainwrecks(tasks_frozen);
return ret;
}
@@ -1387,9 +1696,6 @@ static int cpu_up(unsigned int cpu, enum cpuhp_state target)
if (!cpu_possible(cpu)) {
pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n",
cpu);
-#if defined(CONFIG_IA64)
- pr_err("please check additional_cpus= boot parameter\n");
-#endif
return -EINVAL;
}
@@ -1403,7 +1709,7 @@ static int cpu_up(unsigned int cpu, enum cpuhp_state target)
err = -EBUSY;
goto out;
}
- if (!cpu_smt_allowed(cpu)) {
+ if (!cpu_bootable(cpu)) {
err = -EPERM;
goto out;
}
@@ -1466,18 +1772,125 @@ int bringup_hibernate_cpu(unsigned int sleep_cpu)
return 0;
}
-void bringup_nonboot_cpus(unsigned int setup_max_cpus)
+static void __init cpuhp_bringup_mask(const struct cpumask *mask, unsigned int ncpus,
+ enum cpuhp_state target)
{
unsigned int cpu;
- for_each_present_cpu(cpu) {
- if (num_online_cpus() >= setup_max_cpus)
+ for_each_cpu(cpu, mask) {
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+
+ if (cpu_up(cpu, target) && can_rollback_cpu(st)) {
+ /*
+ * If this failed then cpu_up() might have only
+ * rolled back to CPUHP_BP_KICK_AP for the final
+ * online. Clean it up. NOOP if already rolled back.
+ */
+ WARN_ON(cpuhp_invoke_callback_range(false, cpu, st, CPUHP_OFFLINE));
+ }
+
+ if (!--ncpus)
break;
- if (!cpu_online(cpu))
- cpu_up(cpu, CPUHP_ONLINE);
}
}
+#ifdef CONFIG_HOTPLUG_PARALLEL
+static bool __cpuhp_parallel_bringup __ro_after_init = true;
+
+static int __init parallel_bringup_parse_param(char *arg)
+{
+ return kstrtobool(arg, &__cpuhp_parallel_bringup);
+}
+early_param("cpuhp.parallel", parallel_bringup_parse_param);
+
+#ifdef CONFIG_HOTPLUG_SMT
+static inline bool cpuhp_smt_aware(void)
+{
+ return cpu_smt_max_threads > 1;
+}
+
+static inline const struct cpumask *cpuhp_get_primary_thread_mask(void)
+{
+ return cpu_primary_thread_mask;
+}
+#else
+static inline bool cpuhp_smt_aware(void)
+{
+ return false;
+}
+static inline const struct cpumask *cpuhp_get_primary_thread_mask(void)
+{
+ return cpu_none_mask;
+}
+#endif
+
+bool __weak arch_cpuhp_init_parallel_bringup(void)
+{
+ return true;
+}
+
+/*
+ * On architectures which have enabled parallel bringup this invokes all BP
+ * prepare states for each of the to be onlined APs first. The last state
+ * sends the startup IPI to the APs. The APs proceed through the low level
+ * bringup code in parallel and then wait for the control CPU to release
+ * them one by one for the final onlining procedure.
+ *
+ * This avoids waiting for each AP to respond to the startup IPI in
+ * CPUHP_BRINGUP_CPU.
+ */
+static bool __init cpuhp_bringup_cpus_parallel(unsigned int ncpus)
+{
+ const struct cpumask *mask = cpu_present_mask;
+
+ if (__cpuhp_parallel_bringup)
+ __cpuhp_parallel_bringup = arch_cpuhp_init_parallel_bringup();
+ if (!__cpuhp_parallel_bringup)
+ return false;
+
+ if (cpuhp_smt_aware()) {
+ const struct cpumask *pmask = cpuhp_get_primary_thread_mask();
+ static struct cpumask tmp_mask __initdata;
+
+ /*
+ * X86 requires to prevent that SMT siblings stopped while
+ * the primary thread does a microcode update for various
+ * reasons. Bring the primary threads up first.
+ */
+ cpumask_and(&tmp_mask, mask, pmask);
+ cpuhp_bringup_mask(&tmp_mask, ncpus, CPUHP_BP_KICK_AP);
+ cpuhp_bringup_mask(&tmp_mask, ncpus, CPUHP_ONLINE);
+ /* Account for the online CPUs */
+ ncpus -= num_online_cpus();
+ if (!ncpus)
+ return true;
+ /* Create the mask for secondary CPUs */
+ cpumask_andnot(&tmp_mask, mask, pmask);
+ mask = &tmp_mask;
+ }
+
+ /* Bring the not-yet started CPUs up */
+ cpuhp_bringup_mask(mask, ncpus, CPUHP_BP_KICK_AP);
+ cpuhp_bringup_mask(mask, ncpus, CPUHP_ONLINE);
+ return true;
+}
+#else
+static inline bool cpuhp_bringup_cpus_parallel(unsigned int ncpus) { return false; }
+#endif /* CONFIG_HOTPLUG_PARALLEL */
+
+void __init bringup_nonboot_cpus(unsigned int max_cpus)
+{
+ if (!max_cpus)
+ return;
+
+ /* Try parallel bringup optimization if enabled */
+ if (cpuhp_bringup_cpus_parallel(max_cpus))
+ return;
+
+ /* Full per CPU serialized bringup */
+ cpuhp_bringup_mask(cpu_present_mask, max_cpus, CPUHP_ONLINE);
+}
+
#ifdef CONFIG_PM_SLEEP_SMP
static cpumask_var_t frozen_cpus;
@@ -1488,8 +1901,8 @@ int freeze_secondary_cpus(int primary)
cpu_maps_update_begin();
if (primary == -1) {
primary = cpumask_first(cpu_online_mask);
- if (!housekeeping_cpu(primary, HK_FLAG_TIMER))
- primary = housekeeping_any_cpu(HK_FLAG_TIMER);
+ if (!housekeeping_cpu(primary, HK_TYPE_TIMER))
+ primary = housekeeping_any_cpu(HK_TYPE_TIMER);
} else {
if (!cpu_online(primary))
primary = cpumask_first(cpu_online_mask);
@@ -1502,8 +1915,8 @@ int freeze_secondary_cpus(int primary)
cpumask_clear(frozen_cpus);
pr_info("Disabling non-boot CPUs ...\n");
- for_each_online_cpu(cpu) {
- if (cpu == primary)
+ for (cpu = nr_cpu_ids - 1; cpu >= 0; cpu--) {
+ if (!cpu_online(cpu) || cpu == primary)
continue;
if (pm_wakeup_pending()) {
@@ -1654,10 +2067,10 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.teardown.single = NULL,
.cant_stop = true,
},
- [CPUHP_PERF_PREPARE] = {
- .name = "perf:prepare",
- .startup.single = perf_event_init_cpu,
- .teardown.single = perf_event_exit_cpu,
+ [CPUHP_RANDOM_PREPARE] = {
+ .name = "random:prepare",
+ .startup.single = random_prepare_cpu,
+ .teardown.single = NULL,
},
[CPUHP_WORKQUEUE_PREP] = {
.name = "workqueue:prepare",
@@ -1667,7 +2080,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
[CPUHP_HRTIMERS_PREPARE] = {
.name = "hrtimers:prepare",
.startup.single = hrtimers_prepare_cpu,
- .teardown.single = hrtimers_dead_cpu,
+ .teardown.single = NULL,
},
[CPUHP_SMPCFD_PREPARE] = {
.name = "smpcfd:prepare",
@@ -1679,11 +2092,6 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.startup.single = relay_prepare_cpu,
.teardown.single = NULL,
},
- [CPUHP_SLAB_PREPARE] = {
- .name = "slab:prepare",
- .startup.single = slab_prepare_cpu,
- .teardown.single = slab_dead_cpu,
- },
[CPUHP_RCUTREE_PREP] = {
.name = "RCU/tree:prepare",
.startup.single = rcutree_prepare_cpu,
@@ -1699,13 +2107,38 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.startup.single = timers_prepare_cpu,
.teardown.single = timers_dead_cpu,
},
- /* Kicks the plugged cpu into life */
+
+#ifdef CONFIG_HOTPLUG_SPLIT_STARTUP
+ /*
+ * Kicks the AP alive. AP will wait in cpuhp_ap_sync_alive() until
+ * the next step will release it.
+ */
+ [CPUHP_BP_KICK_AP] = {
+ .name = "cpu:kick_ap",
+ .startup.single = cpuhp_kick_ap_alive,
+ },
+
+ /*
+ * Waits for the AP to reach cpuhp_ap_sync_alive() and then
+ * releases it for the complete bringup.
+ */
+ [CPUHP_BRINGUP_CPU] = {
+ .name = "cpu:bringup",
+ .startup.single = cpuhp_bringup_ap,
+ .teardown.single = finish_cpu,
+ .cant_stop = true,
+ },
+#else
+ /*
+ * All-in-one CPU bringup state which includes the kick alive.
+ */
[CPUHP_BRINGUP_CPU] = {
.name = "cpu:bringup",
.startup.single = bringup_cpu,
.teardown.single = finish_cpu,
.cant_stop = true,
},
+#endif
/* Final state before CPU kills itself */
[CPUHP_AP_IDLE_DEAD] = {
.name = "idle:dead",
@@ -1734,6 +2167,16 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.startup.single = NULL,
.teardown.single = smpcfd_dying_cpu,
},
+ [CPUHP_AP_HRTIMERS_DYING] = {
+ .name = "hrtimers:dying",
+ .startup.single = hrtimers_cpu_starting,
+ .teardown.single = hrtimers_cpu_dying,
+ },
+ [CPUHP_AP_TICK_DYING] = {
+ .name = "tick:dying",
+ .startup.single = NULL,
+ .teardown.single = tick_cpu_dying,
+ },
/* Entry state on starting. Interrupts enabled from here on. Transient
* state for synchronsization */
[CPUHP_AP_ONLINE] = {
@@ -1782,6 +2225,11 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.startup.single = workqueue_online_cpu,
.teardown.single = workqueue_offline_cpu,
},
+ [CPUHP_AP_RANDOM_ONLINE] = {
+ .name = "random:online",
+ .startup.single = random_online_cpu,
+ .teardown.single = NULL,
+ },
[CPUHP_AP_RCUTREE_ONLINE] = {
.name = "RCU/tree:online",
.startup.single = rcutree_online_cpu,
@@ -2017,7 +2465,7 @@ EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance);
* The caller needs to hold cpus read locked while calling this function.
* Return:
* On success:
- * Positive state number if @state is CPUHP_AP_ONLINE_DYN;
+ * Positive state number if @state is CPUHP_AP_ONLINE_DYN or CPUHP_BP_PREPARE_DYN;
* 0 for all other states
* On failure: proper (negative) error code
*/
@@ -2040,7 +2488,7 @@ int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state,
ret = cpuhp_store_callbacks(state, name, startup, teardown,
multi_instance);
- dynstate = state == CPUHP_AP_ONLINE_DYN;
+ dynstate = state == CPUHP_AP_ONLINE_DYN || state == CPUHP_BP_PREPARE_DYN;
if (ret > 0 && dynstate) {
state = ret;
ret = 0;
@@ -2071,8 +2519,8 @@ int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state,
out:
mutex_unlock(&cpuhp_state_mutex);
/*
- * If the requested state is CPUHP_AP_ONLINE_DYN, return the
- * dynamically allocated state in case of success.
+ * If the requested state is CPUHP_AP_ONLINE_DYN or CPUHP_BP_PREPARE_DYN,
+ * return the dynamically allocated state in case of success.
*/
if (!ret && dynstate)
return state;
@@ -2217,6 +2665,12 @@ int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
for_each_online_cpu(cpu) {
if (topology_is_primary_thread(cpu))
continue;
+ /*
+ * Disable can be called with CPU_SMT_ENABLED when changing
+ * from a higher to lower number of SMT threads per core.
+ */
+ if (ctrlval == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu))
+ continue;
ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
if (ret)
break;
@@ -2241,6 +2695,14 @@ int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
return ret;
}
+/* Check if the core a CPU belongs to is online */
+#if !defined(topology_is_core_online)
+static inline bool topology_is_core_online(unsigned int cpu)
+{
+ return true;
+}
+#endif
+
int cpuhp_smt_enable(void)
{
int cpu, ret = 0;
@@ -2251,6 +2713,8 @@ int cpuhp_smt_enable(void)
/* Skip online CPUs and CPUs on offline nodes */
if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
continue;
+ if (!cpu_smt_thread_allowed(cpu) || !topology_is_core_online(cpu))
+ continue;
ret = _cpu_up(cpu, 0, CPUHP_ONLINE);
if (ret)
break;
@@ -2304,8 +2768,10 @@ static ssize_t target_store(struct device *dev, struct device_attribute *attr,
if (st->state < target)
ret = cpu_up(dev->id, target);
- else
+ else if (st->state > target)
ret = cpu_down(dev->id, target);
+ else if (WARN_ON(st->target != target))
+ st->target = target;
out:
unlock_device_hotplug();
return ret ? ret : count;
@@ -2390,7 +2856,6 @@ static struct attribute *cpuhp_cpu_attrs[] = {
static const struct attribute_group cpuhp_cpu_attr_group = {
.attrs = cpuhp_cpu_attrs,
.name = "hotplug",
- NULL
};
static ssize_t states_show(struct device *dev,
@@ -2422,25 +2887,23 @@ static struct attribute *cpuhp_cpu_root_attrs[] = {
static const struct attribute_group cpuhp_cpu_root_attr_group = {
.attrs = cpuhp_cpu_root_attrs,
.name = "hotplug",
- NULL
};
#ifdef CONFIG_HOTPLUG_SMT
+static bool cpu_smt_num_threads_valid(unsigned int threads)
+{
+ if (IS_ENABLED(CONFIG_SMT_NUM_THREADS_DYNAMIC))
+ return threads >= 1 && threads <= cpu_smt_max_threads;
+ return threads == 1 || threads == cpu_smt_max_threads;
+}
+
static ssize_t
__store_smt_control(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
- int ctrlval, ret;
-
- if (sysfs_streq(buf, "on"))
- ctrlval = CPU_SMT_ENABLED;
- else if (sysfs_streq(buf, "off"))
- ctrlval = CPU_SMT_DISABLED;
- else if (sysfs_streq(buf, "forceoff"))
- ctrlval = CPU_SMT_FORCE_DISABLED;
- else
- return -EINVAL;
+ int ctrlval, ret, num_threads, orig_threads;
+ bool force_off;
if (cpu_smt_control == CPU_SMT_FORCE_DISABLED)
return -EPERM;
@@ -2448,21 +2911,39 @@ __store_smt_control(struct device *dev, struct device_attribute *attr,
if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
return -ENODEV;
+ if (sysfs_streq(buf, "on")) {
+ ctrlval = CPU_SMT_ENABLED;
+ num_threads = cpu_smt_max_threads;
+ } else if (sysfs_streq(buf, "off")) {
+ ctrlval = CPU_SMT_DISABLED;
+ num_threads = 1;
+ } else if (sysfs_streq(buf, "forceoff")) {
+ ctrlval = CPU_SMT_FORCE_DISABLED;
+ num_threads = 1;
+ } else if (kstrtoint(buf, 10, &num_threads) == 0) {
+ if (num_threads == 1)
+ ctrlval = CPU_SMT_DISABLED;
+ else if (cpu_smt_num_threads_valid(num_threads))
+ ctrlval = CPU_SMT_ENABLED;
+ else
+ return -EINVAL;
+ } else {
+ return -EINVAL;
+ }
+
ret = lock_device_hotplug_sysfs();
if (ret)
return ret;
- if (ctrlval != cpu_smt_control) {
- switch (ctrlval) {
- case CPU_SMT_ENABLED:
- ret = cpuhp_smt_enable();
- break;
- case CPU_SMT_DISABLED:
- case CPU_SMT_FORCE_DISABLED:
- ret = cpuhp_smt_disable(ctrlval);
- break;
- }
- }
+ orig_threads = cpu_smt_num_threads;
+ cpu_smt_num_threads = num_threads;
+
+ force_off = ctrlval != cpu_smt_control && ctrlval == CPU_SMT_FORCE_DISABLED;
+
+ if (num_threads > orig_threads)
+ ret = cpuhp_smt_enable();
+ else if (num_threads < orig_threads || force_off)
+ ret = cpuhp_smt_disable(ctrlval);
unlock_device_hotplug();
return ret ? ret : count;
@@ -2490,7 +2971,18 @@ static ssize_t control_show(struct device *dev,
{
const char *state = smt_states[cpu_smt_control];
- return snprintf(buf, PAGE_SIZE - 2, "%s\n", state);
+#ifdef CONFIG_HOTPLUG_SMT
+ /*
+ * If SMT is enabled but not all threads are enabled then show the
+ * number of threads. If all threads are enabled show "on". Otherwise
+ * show the state name.
+ */
+ if (cpu_smt_control == CPU_SMT_ENABLED &&
+ cpu_smt_num_threads != cpu_smt_max_threads)
+ return sysfs_emit(buf, "%d\n", cpu_smt_num_threads);
+#endif
+
+ return sysfs_emit(buf, "%s\n", state);
}
static ssize_t control_store(struct device *dev, struct device_attribute *attr,
@@ -2503,7 +2995,7 @@ static DEVICE_ATTR_RW(control);
static ssize_t active_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- return snprintf(buf, PAGE_SIZE - 2, "%d\n", sched_smt_active());
+ return sysfs_emit(buf, "%d\n", sched_smt_active());
}
static DEVICE_ATTR_RO(active);
@@ -2516,27 +3008,37 @@ static struct attribute *cpuhp_smt_attrs[] = {
static const struct attribute_group cpuhp_smt_attr_group = {
.attrs = cpuhp_smt_attrs,
.name = "smt",
- NULL
};
static int __init cpu_smt_sysfs_init(void)
{
- return sysfs_create_group(&cpu_subsys.dev_root->kobj,
- &cpuhp_smt_attr_group);
+ struct device *dev_root;
+ int ret = -ENODEV;
+
+ dev_root = bus_get_dev_root(&cpu_subsys);
+ if (dev_root) {
+ ret = sysfs_create_group(&dev_root->kobj, &cpuhp_smt_attr_group);
+ put_device(dev_root);
+ }
+ return ret;
}
static int __init cpuhp_sysfs_init(void)
{
+ struct device *dev_root;
int cpu, ret;
ret = cpu_smt_sysfs_init();
if (ret)
return ret;
- ret = sysfs_create_group(&cpu_subsys.dev_root->kobj,
- &cpuhp_cpu_root_attr_group);
- if (ret)
- return ret;
+ dev_root = bus_get_dev_root(&cpu_subsys);
+ if (dev_root) {
+ ret = sysfs_create_group(&dev_root->kobj, &cpuhp_cpu_root_attr_group);
+ put_device(dev_root);
+ if (ret)
+ return ret;
+ }
for_each_possible_cpu(cpu) {
struct device *dev = get_cpu_device(cpu);
@@ -2581,16 +3083,22 @@ const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
EXPORT_SYMBOL(cpu_all_bits);
#ifdef CONFIG_INIT_ALL_POSSIBLE
-struct cpumask __cpu_possible_mask __read_mostly
+struct cpumask __cpu_possible_mask __ro_after_init
= {CPU_BITS_ALL};
+unsigned int __num_possible_cpus __ro_after_init = NR_CPUS;
#else
-struct cpumask __cpu_possible_mask __read_mostly;
+struct cpumask __cpu_possible_mask __ro_after_init;
+unsigned int __num_possible_cpus __ro_after_init;
#endif
EXPORT_SYMBOL(__cpu_possible_mask);
+EXPORT_SYMBOL(__num_possible_cpus);
struct cpumask __cpu_online_mask __read_mostly;
EXPORT_SYMBOL(__cpu_online_mask);
+struct cpumask __cpu_enabled_mask __read_mostly;
+EXPORT_SYMBOL(__cpu_enabled_mask);
+
struct cpumask __cpu_present_mask __read_mostly;
EXPORT_SYMBOL(__cpu_present_mask);
@@ -2611,11 +3119,7 @@ void init_cpu_present(const struct cpumask *src)
void init_cpu_possible(const struct cpumask *src)
{
cpumask_copy(&__cpu_possible_mask, src);
-}
-
-void init_cpu_online(const struct cpumask *src)
-{
- cpumask_copy(&__cpu_online_mask, src);
+ __num_possible_cpus = cpumask_weight(&__cpu_possible_mask);
}
void set_cpu_online(unsigned int cpu, bool online)
@@ -2640,6 +3144,21 @@ void set_cpu_online(unsigned int cpu, bool online)
}
/*
+ * This should be marked __init, but there is a boatload of call sites
+ * which need to be fixed up to do so. Sigh...
+ */
+void set_cpu_possible(unsigned int cpu, bool possible)
+{
+ if (possible) {
+ if (!cpumask_test_and_set_cpu(cpu, &__cpu_possible_mask))
+ __num_possible_cpus++;
+ } else {
+ if (cpumask_test_and_clear_cpu(cpu, &__cpu_possible_mask))
+ __num_possible_cpus--;
+ }
+}
+
+/*
* Activate the first processor.
*/
void __init boot_cpu_init(void)
@@ -2664,13 +3183,46 @@ void __init boot_cpu_hotplug_init(void)
{
#ifdef CONFIG_SMP
cpumask_set_cpu(smp_processor_id(), &cpus_booted_once_mask);
+ atomic_set(this_cpu_ptr(&cpuhp_state.ap_sync_state), SYNC_STATE_ONLINE);
#endif
this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
+ this_cpu_write(cpuhp_state.target, CPUHP_ONLINE);
}
+#ifdef CONFIG_CPU_MITIGATIONS
/*
- * These are used for a global "mitigations=" cmdline option for toggling
- * optional CPU mitigations.
+ * All except the cross-thread attack vector are mitigated by default.
+ * Cross-thread mitigation often requires disabling SMT which is expensive
+ * so cross-thread mitigations are only partially enabled by default.
+ *
+ * Guest-to-Host and Guest-to-Guest vectors are only needed if KVM support is
+ * present.
+ */
+static bool attack_vectors[NR_CPU_ATTACK_VECTORS] __ro_after_init = {
+ [CPU_MITIGATE_USER_KERNEL] = true,
+ [CPU_MITIGATE_USER_USER] = true,
+ [CPU_MITIGATE_GUEST_HOST] = IS_ENABLED(CONFIG_KVM),
+ [CPU_MITIGATE_GUEST_GUEST] = IS_ENABLED(CONFIG_KVM),
+};
+
+bool cpu_attack_vector_mitigated(enum cpu_attack_vectors v)
+{
+ if (v < NR_CPU_ATTACK_VECTORS)
+ return attack_vectors[v];
+
+ WARN_ONCE(1, "Invalid attack vector %d\n", v);
+ return false;
+}
+
+/*
+ * There are 3 global options, 'off', 'auto', 'auto,nosmt'. These may optionally
+ * be combined with attack-vector disables which follow them.
+ *
+ * Examples:
+ * mitigations=auto,no_user_kernel,no_user_user,no_cross_thread
+ * mitigations=auto,nosmt,no_guest_host,no_guest_guest
+ *
+ * mitigations=off is equivalent to disabling all attack vectors.
*/
enum cpu_mitigations {
CPU_MITIGATIONS_OFF,
@@ -2678,24 +3230,99 @@ enum cpu_mitigations {
CPU_MITIGATIONS_AUTO_NOSMT,
};
-static enum cpu_mitigations cpu_mitigations __ro_after_init =
- CPU_MITIGATIONS_AUTO;
+enum {
+ NO_USER_KERNEL,
+ NO_USER_USER,
+ NO_GUEST_HOST,
+ NO_GUEST_GUEST,
+ NO_CROSS_THREAD,
+ NR_VECTOR_PARAMS,
+};
+
+enum smt_mitigations smt_mitigations __ro_after_init = SMT_MITIGATIONS_AUTO;
+static enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO;
+
+static const match_table_t global_mitigations = {
+ { CPU_MITIGATIONS_AUTO_NOSMT, "auto,nosmt"},
+ { CPU_MITIGATIONS_AUTO, "auto"},
+ { CPU_MITIGATIONS_OFF, "off"},
+};
+
+static const match_table_t vector_mitigations = {
+ { NO_USER_KERNEL, "no_user_kernel"},
+ { NO_USER_USER, "no_user_user"},
+ { NO_GUEST_HOST, "no_guest_host"},
+ { NO_GUEST_GUEST, "no_guest_guest"},
+ { NO_CROSS_THREAD, "no_cross_thread"},
+ { NR_VECTOR_PARAMS, NULL},
+};
+
+static int __init mitigations_parse_global_opt(char *arg)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(global_mitigations); i++) {
+ const char *pattern = global_mitigations[i].pattern;
+
+ if (!strncmp(arg, pattern, strlen(pattern))) {
+ cpu_mitigations = global_mitigations[i].token;
+ return strlen(pattern);
+ }
+ }
+
+ return 0;
+}
static int __init mitigations_parse_cmdline(char *arg)
{
- if (!strcmp(arg, "off"))
- cpu_mitigations = CPU_MITIGATIONS_OFF;
- else if (!strcmp(arg, "auto"))
- cpu_mitigations = CPU_MITIGATIONS_AUTO;
- else if (!strcmp(arg, "auto,nosmt"))
- cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT;
- else
- pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n",
- arg);
+ char *s, *p;
+ int len;
+
+ len = mitigations_parse_global_opt(arg);
+
+ if (cpu_mitigations_off()) {
+ memset(attack_vectors, 0, sizeof(attack_vectors));
+ smt_mitigations = SMT_MITIGATIONS_OFF;
+ } else if (cpu_mitigations_auto_nosmt()) {
+ smt_mitigations = SMT_MITIGATIONS_ON;
+ }
+
+ p = arg + len;
+
+ if (!*p)
+ return 0;
+
+ /* Attack vector controls may come after the ',' */
+ if (*p++ != ',' || !IS_ENABLED(CONFIG_ARCH_HAS_CPU_ATTACK_VECTORS)) {
+ pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n", arg);
+ return 0;
+ }
+
+ while ((s = strsep(&p, ",")) != NULL) {
+ switch (match_token(s, vector_mitigations, NULL)) {
+ case NO_USER_KERNEL:
+ attack_vectors[CPU_MITIGATE_USER_KERNEL] = false;
+ break;
+ case NO_USER_USER:
+ attack_vectors[CPU_MITIGATE_USER_USER] = false;
+ break;
+ case NO_GUEST_HOST:
+ attack_vectors[CPU_MITIGATE_GUEST_HOST] = false;
+ break;
+ case NO_GUEST_GUEST:
+ attack_vectors[CPU_MITIGATE_GUEST_GUEST] = false;
+ break;
+ case NO_CROSS_THREAD:
+ smt_mitigations = SMT_MITIGATIONS_OFF;
+ break;
+ default:
+ pr_crit("Unsupported mitigations options %s\n", s);
+ return 0;
+ }
+ }
return 0;
}
-early_param("mitigations", mitigations_parse_cmdline);
/* mitigations=off */
bool cpu_mitigations_off(void)
@@ -2710,3 +3337,11 @@ bool cpu_mitigations_auto_nosmt(void)
return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT;
}
EXPORT_SYMBOL_GPL(cpu_mitigations_auto_nosmt);
+#else
+static int __init mitigations_parse_cmdline(char *arg)
+{
+ pr_crit("Kernel compiled without mitigations, ignoring 'mitigations'; system may still be vulnerable\n");
+ return 0;
+}
+#endif
+early_param("mitigations", mitigations_parse_cmdline);
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index 246efc74e3f3..7481fbb947d3 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -30,16 +30,9 @@ static int cpu_pm_notify(enum cpu_pm_event event)
{
int ret;
- /*
- * This introduces a RCU read critical section, which could be
- * disfunctional in cpu idle. Copy RCU_NONIDLE code to let RCU know
- * this.
- */
- rcu_irq_enter_irqson();
rcu_read_lock();
ret = raw_notifier_call_chain(&cpu_pm_notifier.chain, event, NULL);
rcu_read_unlock();
- rcu_irq_exit_irqson();
return notifier_to_errno(ret);
}
@@ -49,11 +42,9 @@ static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event ev
unsigned long flags;
int ret;
- rcu_irq_enter_irqson();
raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags);
ret = raw_notifier_call_chain_robust(&cpu_pm_notifier.chain, event_up, event_down, NULL);
raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags);
- rcu_irq_exit_irqson();
return notifier_to_errno(ret);
}
@@ -182,7 +173,7 @@ int cpu_cluster_pm_exit(void)
EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit);
#ifdef CONFIG_PM
-static int cpu_pm_suspend(void)
+static int cpu_pm_suspend(void *data)
{
int ret;
@@ -194,20 +185,24 @@ static int cpu_pm_suspend(void)
return ret;
}
-static void cpu_pm_resume(void)
+static void cpu_pm_resume(void *data)
{
cpu_cluster_pm_exit();
cpu_pm_exit();
}
-static struct syscore_ops cpu_pm_syscore_ops = {
+static const struct syscore_ops cpu_pm_syscore_ops = {
.suspend = cpu_pm_suspend,
.resume = cpu_pm_resume,
};
+static struct syscore cpu_pm_syscore = {
+ .ops = &cpu_pm_syscore_ops,
+};
+
static int cpu_pm_init(void)
{
- register_syscore_ops(&cpu_pm_syscore_ops);
+ register_syscore(&cpu_pm_syscore);
return 0;
}
core_initcall(cpu_pm_init);
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 256cf6db573c..99dac1aa972a 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -4,490 +4,690 @@
* Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/buildid.h>
-#include <linux/crash_core.h>
#include <linux/init.h>
#include <linux/utsname.h>
#include <linux/vmalloc.h>
+#include <linux/sizes.h>
+#include <linux/kexec.h>
+#include <linux/memory.h>
+#include <linux/mm.h>
+#include <linux/cpuhotplug.h>
+#include <linux/memblock.h>
+#include <linux/kmemleak.h>
+#include <linux/crash_core.h>
+#include <linux/reboot.h>
+#include <linux/btf.h>
+#include <linux/objtool.h>
+#include <linux/delay.h>
+#include <linux/panic.h>
#include <asm/page.h>
#include <asm/sections.h>
#include <crypto/sha1.h>
-/* vmcoreinfo stuff */
-unsigned char *vmcoreinfo_data;
-size_t vmcoreinfo_size;
-u32 *vmcoreinfo_note;
+#include "kallsyms_internal.h"
+#include "kexec_internal.h"
-/* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */
-static unsigned char *vmcoreinfo_data_safecopy;
+/* Per cpu memory for storing cpu states in case of system crash. */
+note_buf_t __percpu *crash_notes;
-/*
- * parsing the "crashkernel" commandline
- *
- * this code is intended to be called from architecture specific code
+/* time to wait for possible DMA to finish before starting the kdump kernel
+ * when a CMA reservation is used
*/
+#define CMA_DMA_TIMEOUT_SEC 10
+#ifdef CONFIG_CRASH_DUMP
-/*
- * This function parses command lines in the format
- *
- * crashkernel=ramsize-range:size[,...][@offset]
- *
- * The function returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_mem(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
+int kimage_crash_copy_vmcoreinfo(struct kimage *image)
{
- char *cur = cmdline, *tmp;
+ struct page *vmcoreinfo_page;
+ void *safecopy;
- /* for each entry of the comma-separated list */
- do {
- unsigned long long start, end = ULLONG_MAX, size;
+ if (!IS_ENABLED(CONFIG_CRASH_DUMP))
+ return 0;
+ if (image->type != KEXEC_TYPE_CRASH)
+ return 0;
- /* get the start of the range */
- start = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("crashkernel: Memory value expected\n");
- return -EINVAL;
- }
- cur = tmp;
- if (*cur != '-') {
- pr_warn("crashkernel: '-' expected\n");
- return -EINVAL;
- }
- cur++;
-
- /* if no ':' is here, than we read the end */
- if (*cur != ':') {
- end = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("crashkernel: Memory value expected\n");
- return -EINVAL;
- }
- cur = tmp;
- if (end <= start) {
- pr_warn("crashkernel: end <= start\n");
- return -EINVAL;
- }
- }
+ /*
+ * For kdump, allocate one vmcoreinfo safe copy from the
+ * crash memory. as we have arch_kexec_protect_crashkres()
+ * after kexec syscall, we naturally protect it from write
+ * (even read) access under kernel direct mapping. But on
+ * the other hand, we still need to operate it when crash
+ * happens to generate vmcoreinfo note, hereby we rely on
+ * vmap for this purpose.
+ */
+ vmcoreinfo_page = kimage_alloc_control_pages(image, 0);
+ if (!vmcoreinfo_page) {
+ pr_warn("Could not allocate vmcoreinfo buffer\n");
+ return -ENOMEM;
+ }
+ safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL);
+ if (!safecopy) {
+ pr_warn("Could not vmap vmcoreinfo buffer\n");
+ return -ENOMEM;
+ }
- if (*cur != ':') {
- pr_warn("crashkernel: ':' expected\n");
- return -EINVAL;
- }
- cur++;
+ image->vmcoreinfo_data_copy = safecopy;
+ crash_update_vmcoreinfo_safecopy(safecopy);
+
+ return 0;
+}
- size = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("Memory value expected\n");
- return -EINVAL;
- }
- cur = tmp;
- if (size >= system_ram) {
- pr_warn("crashkernel: invalid size\n");
- return -EINVAL;
- }
- /* match ? */
- if (system_ram >= start && system_ram < end) {
- *crash_size = size;
- break;
- }
- } while (*cur++ == ',');
-
- if (*crash_size > 0) {
- while (*cur && *cur != ' ' && *cur != '@')
- cur++;
- if (*cur == '@') {
- cur++;
- *crash_base = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("Memory value expected after '@'\n");
- return -EINVAL;
- }
- }
- } else
- pr_info("crashkernel size resulted in zero bytes\n");
+int kexec_should_crash(struct task_struct *p)
+{
+ /*
+ * If crash_kexec_post_notifiers is enabled, don't run
+ * crash_kexec() here yet, which must be run after panic
+ * notifiers in panic().
+ */
+ if (crash_kexec_post_notifiers)
+ return 0;
+ /*
+ * There are 4 panic() calls in make_task_dead() path, each of which
+ * corresponds to each of these 4 conditions.
+ */
+ if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
+ return 1;
return 0;
}
+int kexec_crash_loaded(void)
+{
+ return !!kexec_crash_image;
+}
+EXPORT_SYMBOL_GPL(kexec_crash_loaded);
+
+static void crash_cma_clear_pending_dma(void)
+{
+ if (!crashk_cma_cnt)
+ return;
+
+ mdelay(CMA_DMA_TIMEOUT_SEC * 1000);
+}
+
/*
- * That function parses "simple" (old) crashkernel command lines like
- *
- * crashkernel=size[@offset]
- *
- * It returns 0 on success and -EINVAL on failure.
+ * No panic_cpu check version of crash_kexec(). This function is called
+ * only when panic_cpu holds the current CPU number; this is the only CPU
+ * which processes crash_kexec routines.
*/
-static int __init parse_crashkernel_simple(char *cmdline,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
+void __noclone __crash_kexec(struct pt_regs *regs)
{
- char *cur = cmdline;
-
- *crash_size = memparse(cmdline, &cur);
- if (cmdline == cur) {
- pr_warn("crashkernel: memory value expected\n");
- return -EINVAL;
+ /* Take the kexec_lock here to prevent sys_kexec_load
+ * running on one cpu from replacing the crash kernel
+ * we are using after a panic on a different cpu.
+ *
+ * If the crash kernel was not located in a fixed area
+ * of memory the xchg(&kexec_crash_image) would be
+ * sufficient. But since I reuse the memory...
+ */
+ if (kexec_trylock()) {
+ if (kexec_crash_image) {
+ struct pt_regs fixed_regs;
+
+ crash_setup_regs(&fixed_regs, regs);
+ crash_save_vmcoreinfo();
+ machine_crash_shutdown(&fixed_regs);
+ crash_cma_clear_pending_dma();
+ machine_kexec(kexec_crash_image);
+ }
+ kexec_unlock();
}
+}
+STACK_FRAME_NON_STANDARD(__crash_kexec);
- if (*cur == '@')
- *crash_base = memparse(cur+1, &cur);
- else if (*cur != ' ' && *cur != '\0') {
- pr_warn("crashkernel: unrecognized char: %c\n", *cur);
- return -EINVAL;
+__bpf_kfunc void crash_kexec(struct pt_regs *regs)
+{
+ if (panic_try_start()) {
+ /* This is the 1st CPU which comes here, so go ahead. */
+ __crash_kexec(regs);
+
+ /*
+ * Reset panic_cpu to allow another panic()/crash_kexec()
+ * call.
+ */
+ panic_reset();
}
+}
- return 0;
+static inline resource_size_t crash_resource_size(const struct resource *res)
+{
+ return !res->end ? 0 : resource_size(res);
}
-#define SUFFIX_HIGH 0
-#define SUFFIX_LOW 1
-#define SUFFIX_NULL 2
-static __initdata char *suffix_tbl[] = {
- [SUFFIX_HIGH] = ",high",
- [SUFFIX_LOW] = ",low",
- [SUFFIX_NULL] = NULL,
-};
-/*
- * That function parses "suffix" crashkernel command lines like
- *
- * crashkernel=size,[high|low]
- *
- * It returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_suffix(char *cmdline,
- unsigned long long *crash_size,
- const char *suffix)
+
+
+int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
+ void **addr, unsigned long *sz)
{
- char *cur = cmdline;
+ Elf64_Ehdr *ehdr;
+ Elf64_Phdr *phdr;
+ unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz;
+ unsigned char *buf;
+ unsigned int cpu, i;
+ unsigned long long notes_addr;
+ unsigned long mstart, mend;
+
+ /* extra phdr for vmcoreinfo ELF note */
+ nr_phdr = nr_cpus + 1;
+ nr_phdr += mem->nr_ranges;
+
+ /*
+ * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping
+ * area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64).
+ * I think this is required by tools like gdb. So same physical
+ * memory will be mapped in two ELF headers. One will contain kernel
+ * text virtual addresses and other will have __va(physical) addresses.
+ */
+
+ nr_phdr++;
+ elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr);
+ elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN);
+
+ buf = vzalloc(elf_sz);
+ if (!buf)
+ return -ENOMEM;
- *crash_size = memparse(cmdline, &cur);
- if (cmdline == cur) {
- pr_warn("crashkernel: memory value expected\n");
- return -EINVAL;
+ ehdr = (Elf64_Ehdr *)buf;
+ phdr = (Elf64_Phdr *)(ehdr + 1);
+ memcpy(ehdr->e_ident, ELFMAG, SELFMAG);
+ ehdr->e_ident[EI_CLASS] = ELFCLASS64;
+ ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
+ ehdr->e_ident[EI_VERSION] = EV_CURRENT;
+ ehdr->e_ident[EI_OSABI] = ELF_OSABI;
+ memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD);
+ ehdr->e_type = ET_CORE;
+ ehdr->e_machine = ELF_ARCH;
+ ehdr->e_version = EV_CURRENT;
+ ehdr->e_phoff = sizeof(Elf64_Ehdr);
+ ehdr->e_ehsize = sizeof(Elf64_Ehdr);
+ ehdr->e_phentsize = sizeof(Elf64_Phdr);
+
+ /* Prepare one phdr of type PT_NOTE for each possible CPU */
+ for_each_possible_cpu(cpu) {
+ phdr->p_type = PT_NOTE;
+ notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu));
+ phdr->p_offset = phdr->p_paddr = notes_addr;
+ phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t);
+ (ehdr->e_phnum)++;
+ phdr++;
}
- /* check with suffix */
- if (strncmp(cur, suffix, strlen(suffix))) {
- pr_warn("crashkernel: unrecognized char: %c\n", *cur);
- return -EINVAL;
+ /* Prepare one PT_NOTE header for vmcoreinfo */
+ phdr->p_type = PT_NOTE;
+ phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
+ phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE;
+ (ehdr->e_phnum)++;
+ phdr++;
+
+ /* Prepare PT_LOAD type program header for kernel text region */
+ if (need_kernel_map) {
+ phdr->p_type = PT_LOAD;
+ phdr->p_flags = PF_R|PF_W|PF_X;
+ phdr->p_vaddr = (unsigned long) _text;
+ phdr->p_filesz = phdr->p_memsz = _end - _text;
+ phdr->p_offset = phdr->p_paddr = __pa_symbol(_text);
+ ehdr->e_phnum++;
+ phdr++;
}
- cur += strlen(suffix);
- if (*cur != ' ' && *cur != '\0') {
- pr_warn("crashkernel: unrecognized char: %c\n", *cur);
- return -EINVAL;
+
+ /* Go through all the ranges in mem->ranges[] and prepare phdr */
+ for (i = 0; i < mem->nr_ranges; i++) {
+ mstart = mem->ranges[i].start;
+ mend = mem->ranges[i].end;
+
+ phdr->p_type = PT_LOAD;
+ phdr->p_flags = PF_R|PF_W|PF_X;
+ phdr->p_offset = mstart;
+
+ phdr->p_paddr = mstart;
+ phdr->p_vaddr = (unsigned long) __va(mstart);
+ phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
+ phdr->p_align = 0;
+ ehdr->e_phnum++;
+#ifdef CONFIG_KEXEC_FILE
+ kexec_dprintk("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
+ phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
+ ehdr->e_phnum, phdr->p_offset);
+#endif
+ phdr++;
}
+ *addr = buf;
+ *sz = elf_sz;
return 0;
}
-static __init char *get_last_crashkernel(char *cmdline,
- const char *name,
- const char *suffix)
+/**
+ * crash_exclude_mem_range - exclude a mem range for existing ranges
+ * @mem: mem->range contains an array of ranges sorted in ascending order
+ * @mstart: the start of to-be-excluded range
+ * @mend: the start of to-be-excluded range
+ *
+ * If you are unsure if a range split will happen, to avoid function call
+ * failure because of -ENOMEM, always make sure
+ * mem->max_nr_ranges == mem->nr_ranges + 1
+ * before calling the function each time.
+ *
+ * returns 0 if a memory range is excluded successfully
+ * return -ENOMEM if mem->ranges doesn't have space to hold split ranges
+ */
+int crash_exclude_mem_range(struct crash_mem *mem,
+ unsigned long long mstart, unsigned long long mend)
{
- char *p = cmdline, *ck_cmdline = NULL;
-
- /* find crashkernel and use the last one if there are more */
- p = strstr(p, name);
- while (p) {
- char *end_p = strchr(p, ' ');
- char *q;
-
- if (!end_p)
- end_p = p + strlen(p);
-
- if (!suffix) {
- int i;
-
- /* skip the one with any known suffix */
- for (i = 0; suffix_tbl[i]; i++) {
- q = end_p - strlen(suffix_tbl[i]);
- if (!strncmp(q, suffix_tbl[i],
- strlen(suffix_tbl[i])))
- goto next;
- }
- ck_cmdline = p;
- } else {
- q = end_p - strlen(suffix);
- if (!strncmp(q, suffix, strlen(suffix)))
- ck_cmdline = p;
- }
-next:
- p = strstr(p+1, name);
+ int i;
+ unsigned long long start, end, p_start, p_end;
+
+ for (i = 0; i < mem->nr_ranges; i++) {
+ start = mem->ranges[i].start;
+ end = mem->ranges[i].end;
+ p_start = mstart;
+ p_end = mend;
+
+ if (p_start > end)
+ continue;
+
+ /*
+ * Because the memory ranges in mem->ranges are stored in
+ * ascending order, when we detect `p_end < start`, we can
+ * immediately exit the for loop, as the subsequent memory
+ * ranges will definitely be outside the range we are looking
+ * for.
+ */
+ if (p_end < start)
+ break;
+
+ /* Truncate any area outside of range */
+ if (p_start < start)
+ p_start = start;
+ if (p_end > end)
+ p_end = end;
+
+ /* Found completely overlapping range */
+ if (p_start == start && p_end == end) {
+ memmove(&mem->ranges[i], &mem->ranges[i + 1],
+ (mem->nr_ranges - (i + 1)) * sizeof(mem->ranges[i]));
+ i--;
+ mem->nr_ranges--;
+ } else if (p_start > start && p_end < end) {
+ /* Split original range */
+ if (mem->nr_ranges >= mem->max_nr_ranges)
+ return -ENOMEM;
+
+ memmove(&mem->ranges[i + 2], &mem->ranges[i + 1],
+ (mem->nr_ranges - (i + 1)) * sizeof(mem->ranges[i]));
+
+ mem->ranges[i].end = p_start - 1;
+ mem->ranges[i + 1].start = p_end + 1;
+ mem->ranges[i + 1].end = end;
+
+ i++;
+ mem->nr_ranges++;
+ } else if (p_start != start)
+ mem->ranges[i].end = p_start - 1;
+ else
+ mem->ranges[i].start = p_end + 1;
}
- if (!ck_cmdline)
- return NULL;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(crash_exclude_mem_range);
- return ck_cmdline;
+ssize_t crash_get_memory_size(void)
+{
+ ssize_t size = 0;
+
+ if (!kexec_trylock())
+ return -EBUSY;
+
+ size += crash_resource_size(&crashk_res);
+ size += crash_resource_size(&crashk_low_res);
+
+ kexec_unlock();
+ return size;
}
-static int __init __parse_crashkernel(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base,
- const char *name,
- const char *suffix)
+static int __crash_shrink_memory(struct resource *old_res,
+ unsigned long new_size)
{
- char *first_colon, *first_space;
- char *ck_cmdline;
+ struct resource *ram_res;
+
+ ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
+ if (!ram_res)
+ return -ENOMEM;
+
+ ram_res->start = old_res->start + new_size;
+ ram_res->end = old_res->end;
+ ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
+ ram_res->name = "System RAM";
+
+ if (!new_size) {
+ release_resource(old_res);
+ old_res->start = 0;
+ old_res->end = 0;
+ } else {
+ old_res->end = ram_res->start - 1;
+ }
+
+ crash_free_reserved_phys_range(ram_res->start, ram_res->end);
+ insert_resource(&iomem_resource, ram_res);
+
+ return 0;
+}
- BUG_ON(!crash_size || !crash_base);
- *crash_size = 0;
- *crash_base = 0;
+int crash_shrink_memory(unsigned long new_size)
+{
+ int ret = 0;
+ unsigned long old_size, low_size;
- ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
+ if (!kexec_trylock())
+ return -EBUSY;
- if (!ck_cmdline)
- return -EINVAL;
+ if (kexec_crash_image) {
+ ret = -ENOENT;
+ goto unlock;
+ }
- ck_cmdline += strlen(name);
+ low_size = crash_resource_size(&crashk_low_res);
+ old_size = crash_resource_size(&crashk_res) + low_size;
+ new_size = roundup(new_size, KEXEC_CRASH_MEM_ALIGN);
+ if (new_size >= old_size) {
+ ret = (new_size == old_size) ? 0 : -EINVAL;
+ goto unlock;
+ }
- if (suffix)
- return parse_crashkernel_suffix(ck_cmdline, crash_size,
- suffix);
/*
- * if the commandline contains a ':', then that's the extended
- * syntax -- if not, it must be the classic syntax
+ * (low_size > new_size) implies that low_size is greater than zero.
+ * This also means that if low_size is zero, the else branch is taken.
+ *
+ * If low_size is greater than 0, (low_size > new_size) indicates that
+ * crashk_low_res also needs to be shrunken. Otherwise, only crashk_res
+ * needs to be shrunken.
*/
- first_colon = strchr(ck_cmdline, ':');
- first_space = strchr(ck_cmdline, ' ');
- if (first_colon && (!first_space || first_colon < first_space))
- return parse_crashkernel_mem(ck_cmdline, system_ram,
- crash_size, crash_base);
+ if (low_size > new_size) {
+ ret = __crash_shrink_memory(&crashk_res, 0);
+ if (ret)
+ goto unlock;
+
+ ret = __crash_shrink_memory(&crashk_low_res, new_size);
+ } else {
+ ret = __crash_shrink_memory(&crashk_res, new_size - low_size);
+ }
- return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
-}
+ /* Swap crashk_res and crashk_low_res if needed */
+ if (!crashk_res.end && crashk_low_res.end) {
+ crashk_res.start = crashk_low_res.start;
+ crashk_res.end = crashk_low_res.end;
+ release_resource(&crashk_low_res);
+ crashk_low_res.start = 0;
+ crashk_low_res.end = 0;
+ insert_resource(&iomem_resource, &crashk_res);
+ }
-/*
- * That function is the entry point for command line parsing and should be
- * called from the arch-specific code.
- */
-int __init parse_crashkernel(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=", NULL);
+unlock:
+ kexec_unlock();
+ return ret;
}
-int __init parse_crashkernel_high(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
+void crash_save_cpu(struct pt_regs *regs, int cpu)
{
- return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
+ struct elf_prstatus prstatus;
+ u32 *buf;
+
+ if ((cpu < 0) || (cpu >= nr_cpu_ids))
+ return;
+
+ /* Using ELF notes here is opportunistic.
+ * I need a well defined structure format
+ * for the data I pass, and I need tags
+ * on the data to indicate what information I have
+ * squirrelled away. ELF notes happen to provide
+ * all of that, so there is no need to invent something new.
+ */
+ buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
+ if (!buf)
+ return;
+ memset(&prstatus, 0, sizeof(prstatus));
+ prstatus.common.pr_pid = current->pid;
+ elf_core_copy_regs(&prstatus.pr_reg, regs);
+ buf = append_elf_note(buf, NN_PRSTATUS, NT_PRSTATUS,
+ &prstatus, sizeof(prstatus));
+ final_note(buf);
}
-int __init parse_crashkernel_low(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
+
+
+static int __init crash_notes_memory_init(void)
{
- return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=", suffix_tbl[SUFFIX_LOW]);
+ /* Allocate memory for saving cpu registers. */
+ size_t size, align;
+
+ /*
+ * crash_notes could be allocated across 2 vmalloc pages when percpu
+ * is vmalloc based . vmalloc doesn't guarantee 2 continuous vmalloc
+ * pages are also on 2 continuous physical pages. In this case the
+ * 2nd part of crash_notes in 2nd page could be lost since only the
+ * starting address and size of crash_notes are exported through sysfs.
+ * Here round up the size of crash_notes to the nearest power of two
+ * and pass it to __alloc_percpu as align value. This can make sure
+ * crash_notes is allocated inside one physical page.
+ */
+ size = sizeof(note_buf_t);
+ align = min(roundup_pow_of_two(sizeof(note_buf_t)), PAGE_SIZE);
+
+ /*
+ * Break compile if size is bigger than PAGE_SIZE since crash_notes
+ * definitely will be in 2 pages with that.
+ */
+ BUILD_BUG_ON(size > PAGE_SIZE);
+
+ crash_notes = __alloc_percpu(size, align);
+ if (!crash_notes) {
+ pr_warn("Memory allocation for saving cpu register states failed\n");
+ return -ENOMEM;
+ }
+ return 0;
}
+subsys_initcall(crash_notes_memory_init);
+
+#endif /*CONFIG_CRASH_DUMP*/
+
+#ifdef CONFIG_CRASH_HOTPLUG
+#undef pr_fmt
+#define pr_fmt(fmt) "crash hp: " fmt
/*
- * Add a dummy early_param handler to mark crashkernel= as a known command line
- * parameter and suppress incorrect warnings in init/main.c.
+ * Different than kexec/kdump loading/unloading/jumping/shrinking which
+ * usually rarely happen, there will be many crash hotplug events notified
+ * during one short period, e.g one memory board is hot added and memory
+ * regions are online. So mutex lock __crash_hotplug_lock is used to
+ * serialize the crash hotplug handling specifically.
*/
-static int __init parse_crashkernel_dummy(char *arg)
-{
- return 0;
-}
-early_param("crashkernel", parse_crashkernel_dummy);
+static DEFINE_MUTEX(__crash_hotplug_lock);
+#define crash_hotplug_lock() mutex_lock(&__crash_hotplug_lock)
+#define crash_hotplug_unlock() mutex_unlock(&__crash_hotplug_lock)
-Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
- void *data, size_t data_len)
+/*
+ * This routine utilized when the crash_hotplug sysfs node is read.
+ * It reflects the kernel's ability/permission to update the kdump
+ * image directly.
+ */
+int crash_check_hotplug_support(void)
{
- struct elf_note *note = (struct elf_note *)buf;
-
- note->n_namesz = strlen(name) + 1;
- note->n_descsz = data_len;
- note->n_type = type;
- buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf_Word));
- memcpy(buf, name, note->n_namesz);
- buf += DIV_ROUND_UP(note->n_namesz, sizeof(Elf_Word));
- memcpy(buf, data, data_len);
- buf += DIV_ROUND_UP(data_len, sizeof(Elf_Word));
-
- return buf;
-}
+ int rc = 0;
+
+ crash_hotplug_lock();
+ /* Obtain lock while reading crash information */
+ if (!kexec_trylock()) {
+ if (!kexec_in_progress)
+ pr_info("kexec_trylock() failed, kdump image may be inaccurate\n");
+ crash_hotplug_unlock();
+ return 0;
+ }
+ if (kexec_crash_image) {
+ rc = kexec_crash_image->hotplug_support;
+ }
+ /* Release lock now that update complete */
+ kexec_unlock();
+ crash_hotplug_unlock();
-void final_note(Elf_Word *buf)
-{
- memset(buf, 0, sizeof(struct elf_note));
+ return rc;
}
-static void update_vmcoreinfo_note(void)
+/*
+ * To accurately reflect hot un/plug changes of CPU and Memory resources
+ * (including onling and offlining of those resources), the relevant
+ * kexec segments must be updated with latest CPU and Memory resources.
+ *
+ * Architectures must ensure two things for all segments that need
+ * updating during hotplug events:
+ *
+ * 1. Segments must be large enough to accommodate a growing number of
+ * resources.
+ * 2. Exclude the segments from SHA verification.
+ *
+ * For example, on most architectures, the elfcorehdr (which is passed
+ * to the crash kernel via the elfcorehdr= parameter) must include the
+ * new list of CPUs and memory. To make changes to the elfcorehdr, it
+ * should be large enough to permit a growing number of CPU and Memory
+ * resources. One can estimate the elfcorehdr memory size based on
+ * NR_CPUS_DEFAULT and CRASH_MAX_MEMORY_RANGES. The elfcorehdr is
+ * excluded from SHA verification by default if the architecture
+ * supports crash hotplug.
+ */
+static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu, void *arg)
{
- u32 *buf = vmcoreinfo_note;
-
- if (!vmcoreinfo_size)
+ struct kimage *image;
+
+ crash_hotplug_lock();
+ /* Obtain lock while changing crash information */
+ if (!kexec_trylock()) {
+ if (!kexec_in_progress)
+ pr_info("kexec_trylock() failed, kdump image may be inaccurate\n");
+ crash_hotplug_unlock();
return;
- buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
- vmcoreinfo_size);
- final_note(buf);
-}
+ }
-void crash_update_vmcoreinfo_safecopy(void *ptr)
-{
- if (ptr)
- memcpy(ptr, vmcoreinfo_data, vmcoreinfo_size);
+ /* Check kdump is not loaded */
+ if (!kexec_crash_image)
+ goto out;
- vmcoreinfo_data_safecopy = ptr;
-}
+ image = kexec_crash_image;
-void crash_save_vmcoreinfo(void)
-{
- if (!vmcoreinfo_note)
- return;
+ /* Check that kexec segments update is permitted */
+ if (!image->hotplug_support)
+ goto out;
- /* Use the safe copy to generate vmcoreinfo note if have */
- if (vmcoreinfo_data_safecopy)
- vmcoreinfo_data = vmcoreinfo_data_safecopy;
+ if (hp_action == KEXEC_CRASH_HP_ADD_CPU ||
+ hp_action == KEXEC_CRASH_HP_REMOVE_CPU)
+ pr_debug("hp_action %u, cpu %u\n", hp_action, cpu);
+ else
+ pr_debug("hp_action %u\n", hp_action);
- vmcoreinfo_append_str("CRASHTIME=%lld\n", ktime_get_real_seconds());
- update_vmcoreinfo_note();
-}
+ /*
+ * The elfcorehdr_index is set to -1 when the struct kimage
+ * is allocated. Find the segment containing the elfcorehdr,
+ * if not already found.
+ */
+ if (image->elfcorehdr_index < 0) {
+ unsigned long mem;
+ unsigned char *ptr;
+ unsigned int n;
+
+ for (n = 0; n < image->nr_segments; n++) {
+ mem = image->segment[n].mem;
+ ptr = kmap_local_page(pfn_to_page(mem >> PAGE_SHIFT));
+ if (ptr) {
+ /* The segment containing elfcorehdr */
+ if (memcmp(ptr, ELFMAG, SELFMAG) == 0)
+ image->elfcorehdr_index = (int)n;
+ kunmap_local(ptr);
+ }
+ }
+ }
-void vmcoreinfo_append_str(const char *fmt, ...)
-{
- va_list args;
- char buf[0x50];
- size_t r;
+ if (image->elfcorehdr_index < 0) {
+ pr_err("unable to locate elfcorehdr segment");
+ goto out;
+ }
- va_start(args, fmt);
- r = vscnprintf(buf, sizeof(buf), fmt, args);
- va_end(args);
+ /* Needed in order for the segments to be updated */
+ arch_kexec_unprotect_crashkres();
- r = min(r, (size_t)VMCOREINFO_BYTES - vmcoreinfo_size);
+ /* Differentiate between normal load and hotplug update */
+ image->hp_action = hp_action;
- memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
+ /* Now invoke arch-specific update handler */
+ arch_crash_handle_hotplug_event(image, arg);
- vmcoreinfo_size += r;
-}
+ /* No longer handling a hotplug event */
+ image->hp_action = KEXEC_CRASH_HP_NONE;
+ image->elfcorehdr_updated = true;
-/*
- * provide an empty default implementation here -- architecture
- * code may override this
- */
-void __weak arch_crash_save_vmcoreinfo(void)
-{}
+ /* Change back to read-only */
+ arch_kexec_protect_crashkres();
-phys_addr_t __weak paddr_vmcoreinfo_note(void)
-{
- return __pa(vmcoreinfo_note);
+ /* Errors in the callback is not a reason to rollback state */
+out:
+ /* Release lock now that update complete */
+ kexec_unlock();
+ crash_hotplug_unlock();
}
-EXPORT_SYMBOL(paddr_vmcoreinfo_note);
-static int __init crash_save_vmcoreinfo_init(void)
+static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *arg)
{
- vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
- if (!vmcoreinfo_data) {
- pr_warn("Memory allocation for vmcoreinfo_data failed\n");
- return -ENOMEM;
+ switch (val) {
+ case MEM_ONLINE:
+ crash_handle_hotplug_event(KEXEC_CRASH_HP_ADD_MEMORY,
+ KEXEC_CRASH_HP_INVALID_CPU, arg);
+ break;
+
+ case MEM_OFFLINE:
+ crash_handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_MEMORY,
+ KEXEC_CRASH_HP_INVALID_CPU, arg);
+ break;
}
+ return NOTIFY_OK;
+}
- vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE,
- GFP_KERNEL | __GFP_ZERO);
- if (!vmcoreinfo_note) {
- free_page((unsigned long)vmcoreinfo_data);
- vmcoreinfo_data = NULL;
- pr_warn("Memory allocation for vmcoreinfo_note failed\n");
- return -ENOMEM;
- }
+static struct notifier_block crash_memhp_nb = {
+ .notifier_call = crash_memhp_notifier,
+ .priority = 0
+};
- VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
- VMCOREINFO_BUILD_ID();
- VMCOREINFO_PAGESIZE(PAGE_SIZE);
+static int crash_cpuhp_online(unsigned int cpu)
+{
+ crash_handle_hotplug_event(KEXEC_CRASH_HP_ADD_CPU, cpu, NULL);
+ return 0;
+}
- VMCOREINFO_SYMBOL(init_uts_ns);
- VMCOREINFO_OFFSET(uts_namespace, name);
- VMCOREINFO_SYMBOL(node_online_map);
-#ifdef CONFIG_MMU
- VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir);
-#endif
- VMCOREINFO_SYMBOL(_stext);
- VMCOREINFO_SYMBOL(vmap_area_list);
+static int crash_cpuhp_offline(unsigned int cpu)
+{
+ crash_handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_CPU, cpu, NULL);
+ return 0;
+}
-#ifndef CONFIG_NUMA
- VMCOREINFO_SYMBOL(mem_map);
- VMCOREINFO_SYMBOL(contig_page_data);
-#endif
-#ifdef CONFIG_SPARSEMEM
- VMCOREINFO_SYMBOL_ARRAY(mem_section);
- VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
- VMCOREINFO_STRUCT_SIZE(mem_section);
- VMCOREINFO_OFFSET(mem_section, section_mem_map);
- VMCOREINFO_NUMBER(SECTION_SIZE_BITS);
- VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS);
-#endif
- VMCOREINFO_STRUCT_SIZE(page);
- VMCOREINFO_STRUCT_SIZE(pglist_data);
- VMCOREINFO_STRUCT_SIZE(zone);
- VMCOREINFO_STRUCT_SIZE(free_area);
- VMCOREINFO_STRUCT_SIZE(list_head);
- VMCOREINFO_SIZE(nodemask_t);
- VMCOREINFO_OFFSET(page, flags);
- VMCOREINFO_OFFSET(page, _refcount);
- VMCOREINFO_OFFSET(page, mapping);
- VMCOREINFO_OFFSET(page, lru);
- VMCOREINFO_OFFSET(page, _mapcount);
- VMCOREINFO_OFFSET(page, private);
- VMCOREINFO_OFFSET(page, compound_dtor);
- VMCOREINFO_OFFSET(page, compound_order);
- VMCOREINFO_OFFSET(page, compound_head);
- VMCOREINFO_OFFSET(pglist_data, node_zones);
- VMCOREINFO_OFFSET(pglist_data, nr_zones);
-#ifdef CONFIG_FLATMEM
- VMCOREINFO_OFFSET(pglist_data, node_mem_map);
-#endif
- VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
- VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
- VMCOREINFO_OFFSET(pglist_data, node_id);
- VMCOREINFO_OFFSET(zone, free_area);
- VMCOREINFO_OFFSET(zone, vm_stat);
- VMCOREINFO_OFFSET(zone, spanned_pages);
- VMCOREINFO_OFFSET(free_area, free_list);
- VMCOREINFO_OFFSET(list_head, next);
- VMCOREINFO_OFFSET(list_head, prev);
- VMCOREINFO_OFFSET(vmap_area, va_start);
- VMCOREINFO_OFFSET(vmap_area, list);
- VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
- log_buf_vmcoreinfo_setup();
- VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
- VMCOREINFO_NUMBER(NR_FREE_PAGES);
- VMCOREINFO_NUMBER(PG_lru);
- VMCOREINFO_NUMBER(PG_private);
- VMCOREINFO_NUMBER(PG_swapcache);
- VMCOREINFO_NUMBER(PG_swapbacked);
- VMCOREINFO_NUMBER(PG_slab);
-#ifdef CONFIG_MEMORY_FAILURE
- VMCOREINFO_NUMBER(PG_hwpoison);
-#endif
- VMCOREINFO_NUMBER(PG_head_mask);
-#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy)
- VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
-#ifdef CONFIG_HUGETLB_PAGE
- VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
-#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline)
- VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE);
-#endif
+static int __init crash_hotplug_init(void)
+{
+ int result = 0;
- arch_crash_save_vmcoreinfo();
- update_vmcoreinfo_note();
+ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG))
+ register_memory_notifier(&crash_memhp_nb);
- return 0;
+ if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) {
+ result = cpuhp_setup_state_nocalls(CPUHP_BP_PREPARE_DYN,
+ "crash/cpuhp", crash_cpuhp_online, crash_cpuhp_offline);
+ }
+
+ return result;
}
-subsys_initcall(crash_save_vmcoreinfo_init);
+subsys_initcall(crash_hotplug_init);
+#endif
diff --git a/kernel/crash_core_test.c b/kernel/crash_core_test.c
new file mode 100644
index 000000000000..8aadf6801530
--- /dev/null
+++ b/kernel/crash_core_test.c
@@ -0,0 +1,343 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <kunit/test.h>
+#include <linux/crash_core.h> // For struct crash_mem and struct range if defined there
+
+// Helper to create and initialize crash_mem
+static struct crash_mem *create_crash_mem(struct kunit *test, unsigned int max_ranges,
+ unsigned int nr_initial_ranges,
+ const struct range *initial_ranges)
+{
+ struct crash_mem *mem;
+ size_t alloc_size;
+
+ // Check if max_ranges can even hold initial_ranges
+ if (max_ranges < nr_initial_ranges) {
+ kunit_err(test, "max_ranges (%u) < nr_initial_ranges (%u)\n",
+ max_ranges, nr_initial_ranges);
+ return NULL;
+ }
+
+ alloc_size = sizeof(struct crash_mem) + (size_t)max_ranges * sizeof(struct range);
+ mem = kunit_kzalloc(test, alloc_size, GFP_KERNEL);
+ if (!mem) {
+ kunit_err(test, "Failed to allocate crash_mem\n");
+ return NULL;
+ }
+
+ mem->max_nr_ranges = max_ranges;
+ mem->nr_ranges = nr_initial_ranges;
+ if (initial_ranges && nr_initial_ranges > 0) {
+ memcpy(mem->ranges, initial_ranges,
+ nr_initial_ranges * sizeof(struct range));
+ }
+
+ return mem;
+}
+
+// Helper to compare ranges for assertions
+static void assert_ranges_equal(struct kunit *test,
+ const struct range *actual_ranges,
+ unsigned int actual_nr_ranges,
+ const struct range *expected_ranges,
+ unsigned int expected_nr_ranges,
+ const char *case_name)
+{
+ unsigned int i;
+
+ KUNIT_ASSERT_EQ_MSG(test, expected_nr_ranges, actual_nr_ranges,
+ "%s: Number of ranges mismatch.", case_name);
+
+ for (i = 0; i < expected_nr_ranges; i++) {
+ KUNIT_ASSERT_EQ_MSG(test, expected_ranges[i].start, actual_ranges[i].start,
+ "%s: Range %u start mismatch.", case_name, i);
+ KUNIT_ASSERT_EQ_MSG(test, expected_ranges[i].end, actual_ranges[i].end,
+ "%s: Range %u end mismatch.", case_name, i);
+ }
+}
+
+// Structure for test parameters
+struct exclude_test_param {
+ const char *description;
+ unsigned long long exclude_start;
+ unsigned long long exclude_end;
+ unsigned int initial_max_ranges;
+ const struct range *initial_ranges;
+ unsigned int initial_nr_ranges;
+ const struct range *expected_ranges;
+ unsigned int expected_nr_ranges;
+ int expected_ret;
+};
+
+static void run_exclude_test_case(struct kunit *test, const struct exclude_test_param *params)
+{
+ struct crash_mem *mem;
+ int ret;
+
+ kunit_info(test, "%s", params->description);
+
+ mem = create_crash_mem(test, params->initial_max_ranges,
+ params->initial_nr_ranges, params->initial_ranges);
+ if (!mem)
+ return; // Error already logged by create_crash_mem or kunit_kzalloc
+
+ ret = crash_exclude_mem_range(mem, params->exclude_start, params->exclude_end);
+
+ KUNIT_ASSERT_EQ_MSG(test, params->expected_ret, ret,
+ "%s: Return value mismatch.", params->description);
+
+ if (params->expected_ret == 0) {
+ assert_ranges_equal(test, mem->ranges, mem->nr_ranges,
+ params->expected_ranges, params->expected_nr_ranges,
+ params->description);
+ } else {
+ // If an error is expected, nr_ranges might still be relevant to check
+ // depending on the exact point of failure. For ENOMEM on split,
+ // nr_ranges shouldn't have changed.
+ KUNIT_ASSERT_EQ_MSG(test, params->initial_nr_ranges,
+ mem->nr_ranges,
+ "%s: Number of ranges mismatch on error.",
+ params->description);
+ }
+}
+
+/*
+ * Test Strategy 1: One to-be-excluded range A and one existing range B.
+ *
+ * Exhaust all possibilities of the position of A regarding B.
+ */
+
+static const struct range single_range_b = { .start = 100, .end = 199 };
+
+static const struct exclude_test_param exclude_single_range_test_data[] = {
+ {
+ .description = "1.1: A is left of B, no overlap",
+ .exclude_start = 10, .exclude_end = 50,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = &single_range_b, .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.2: A's right boundary touches B's left boundary",
+ .exclude_start = 10, .exclude_end = 99,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = &single_range_b, .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.3: A overlaps B's left part",
+ .exclude_start = 50, .exclude_end = 149,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = (const struct range[]){{ .start = 150, .end = 199 }},
+ .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.4: A is completely inside B",
+ .exclude_start = 120, .exclude_end = 179,
+ .initial_max_ranges = 2, // Needs space for split
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = (const struct range[]){
+ { .start = 100, .end = 119 },
+ { .start = 180, .end = 199 }
+ },
+ .expected_nr_ranges = 2,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.5: A overlaps B's right part",
+ .exclude_start = 150, .exclude_end = 249,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = (const struct range[]){{ .start = 100, .end = 149 }},
+ .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.6: A's left boundary touches B's right boundary",
+ .exclude_start = 200, .exclude_end = 250,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = &single_range_b, .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.7: A is right of B, no overlap",
+ .exclude_start = 250, .exclude_end = 300,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = &single_range_b, .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.8: A completely covers B and extends beyond",
+ .exclude_start = 50, .exclude_end = 250,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = NULL, .expected_nr_ranges = 0,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.9: A covers B and extends to the left",
+ .exclude_start = 50, .exclude_end = 199, // A ends exactly where B ends
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = NULL, .expected_nr_ranges = 0,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.10: A covers B and extends to the right",
+ .exclude_start = 100, .exclude_end = 250, // A starts exactly where B starts
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = NULL, .expected_nr_ranges = 0,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.11: A is identical to B",
+ .exclude_start = 100, .exclude_end = 199,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = NULL, .expected_nr_ranges = 0,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.12: A is a point, left of B, no overlap",
+ .exclude_start = 10, .exclude_end = 10,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = &single_range_b, .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.13: A is a point, at start of B",
+ .exclude_start = 100, .exclude_end = 100,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = (const struct range[]){{ .start = 101, .end = 199 }},
+ .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.14: A is a point, in middle of B (causes split)",
+ .exclude_start = 150, .exclude_end = 150,
+ .initial_max_ranges = 2, // Needs space for split
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = (const struct range[]){
+ { .start = 100, .end = 149 },
+ { .start = 151, .end = 199 }
+ },
+ .expected_nr_ranges = 2,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.15: A is a point, at end of B",
+ .exclude_start = 199, .exclude_end = 199,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = (const struct range[]){{ .start = 100, .end = 198 }},
+ .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.16: A is a point, right of B, no overlap",
+ .exclude_start = 250, .exclude_end = 250,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = &single_range_b, .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ // ENOMEM case for single range split
+ {
+ .description = "1.17: A completely inside B (split), no space (ENOMEM)",
+ .exclude_start = 120, .exclude_end = 179,
+ .initial_max_ranges = 1, // Not enough for split
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = NULL, // Not checked on error by assert_ranges_equal for content
+ .expected_nr_ranges = 1, // Should remain unchanged
+ .expected_ret = -ENOMEM,
+ },
+};
+
+
+static void exclude_single_range_test(struct kunit *test)
+{
+ size_t i;
+
+ for (i = 0; i < ARRAY_SIZE(exclude_single_range_test_data); i++) {
+ kunit_log(KERN_INFO, test, "Running: %s", exclude_single_range_test_data[i].description);
+ run_exclude_test_case(test, &exclude_single_range_test_data[i]);
+ // KUnit will stop on first KUNIT_ASSERT failure within run_exclude_test_case
+ }
+}
+
+/*
+ * Test Strategy 2: Regression test.
+ */
+
+static const struct exclude_test_param exclude_range_regression_test_data[] = {
+ // Test data from commit a2e9a95d2190
+ {
+ .description = "2.1: exclude low 1M",
+ .exclude_start = 0, .exclude_end = (1 << 20) - 1,
+ .initial_max_ranges = 3,
+ .initial_ranges = (const struct range[]){
+ { .start = 0, .end = 0x3efff },
+ { .start = 0x3f000, .end = 0x3ffff },
+ { .start = 0x40000, .end = 0x9ffff }
+ },
+ .initial_nr_ranges = 3,
+ .expected_nr_ranges = 0,
+ .expected_ret = 0,
+ },
+ // Test data from https://lore.kernel.org/all/ZXrY7QbXAlxydsSC@MiWiFi-R3L-srv/T/#u
+ {
+ .description = "2.2: when range out of bound",
+ .exclude_start = 100, .exclude_end = 200,
+ .initial_max_ranges = 3,
+ .initial_ranges = (const struct range[]){
+ { .start = 1, .end = 299 },
+ { .start = 401, .end = 1000 },
+ { .start = 1001, .end = 2000 }
+ },
+ .initial_nr_ranges = 3,
+ .expected_ranges = NULL, // Not checked on error by assert_ranges_equal for content
+ .expected_nr_ranges = 3, // Should remain unchanged
+ .expected_ret = -ENOMEM
+ },
+
+};
+
+
+static void exclude_range_regression_test(struct kunit *test)
+{
+ size_t i;
+
+ for (i = 0; i < ARRAY_SIZE(exclude_range_regression_test_data); i++) {
+ kunit_log(KERN_INFO, test, "Running: %s", exclude_range_regression_test_data[i].description);
+ run_exclude_test_case(test, &exclude_range_regression_test_data[i]);
+ // KUnit will stop on first KUNIT_ASSERT failure within run_exclude_test_case
+ }
+}
+
+/*
+ * KUnit Test Suite
+ */
+static struct kunit_case crash_exclude_mem_range_test_cases[] = {
+ KUNIT_CASE(exclude_single_range_test),
+ KUNIT_CASE(exclude_range_regression_test),
+ {}
+};
+
+static struct kunit_suite crash_exclude_mem_range_suite = {
+ .name = "crash_exclude_mem_range_tests",
+ .test_cases = crash_exclude_mem_range_test_cases,
+ // .init and .exit can be NULL if not needed globally for the suite
+};
+
+kunit_test_suite(crash_exclude_mem_range_suite);
+
+MODULE_DESCRIPTION("crash dump KUnit test suite");
+MODULE_LICENSE("GPL");
diff --git a/kernel/crash_dump_dm_crypt.c b/kernel/crash_dump_dm_crypt.c
new file mode 100644
index 000000000000..401423ba477d
--- /dev/null
+++ b/kernel/crash_dump_dm_crypt.c
@@ -0,0 +1,464 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/key.h>
+#include <linux/keyctl.h>
+#include <keys/user-type.h>
+#include <linux/crash_dump.h>
+#include <linux/cc_platform.h>
+#include <linux/configfs.h>
+#include <linux/module.h>
+
+#define KEY_NUM_MAX 128 /* maximum dm crypt keys */
+#define KEY_SIZE_MAX 256 /* maximum dm crypt key size */
+#define KEY_DESC_MAX_LEN 128 /* maximum dm crypt key description size */
+
+static unsigned int key_count;
+
+struct dm_crypt_key {
+ unsigned int key_size;
+ char key_desc[KEY_DESC_MAX_LEN];
+ u8 data[KEY_SIZE_MAX];
+};
+
+static struct keys_header {
+ unsigned int total_keys;
+ struct dm_crypt_key keys[] __counted_by(total_keys);
+} *keys_header;
+
+static size_t get_keys_header_size(size_t total_keys)
+{
+ return struct_size(keys_header, keys, total_keys);
+}
+
+unsigned long long dm_crypt_keys_addr;
+EXPORT_SYMBOL_GPL(dm_crypt_keys_addr);
+
+static int __init setup_dmcryptkeys(char *arg)
+{
+ char *end;
+
+ if (!arg)
+ return -EINVAL;
+ dm_crypt_keys_addr = memparse(arg, &end);
+ if (end > arg)
+ return 0;
+
+ dm_crypt_keys_addr = 0;
+ return -EINVAL;
+}
+
+early_param("dmcryptkeys", setup_dmcryptkeys);
+
+/*
+ * Architectures may override this function to read dm crypt keys
+ */
+ssize_t __weak dm_crypt_keys_read(char *buf, size_t count, u64 *ppos)
+{
+ struct kvec kvec = { .iov_base = buf, .iov_len = count };
+ struct iov_iter iter;
+
+ iov_iter_kvec(&iter, READ, &kvec, 1, count);
+ return read_from_oldmem(&iter, count, ppos, cc_platform_has(CC_ATTR_MEM_ENCRYPT));
+}
+
+static int add_key_to_keyring(struct dm_crypt_key *dm_key,
+ key_ref_t keyring_ref)
+{
+ key_ref_t key_ref;
+ int r;
+
+ /* create or update the requested key and add it to the target keyring */
+ key_ref = key_create_or_update(keyring_ref, "user", dm_key->key_desc,
+ dm_key->data, dm_key->key_size,
+ KEY_USR_ALL, KEY_ALLOC_IN_QUOTA);
+
+ if (!IS_ERR(key_ref)) {
+ r = key_ref_to_ptr(key_ref)->serial;
+ key_ref_put(key_ref);
+ kexec_dprintk("Success adding key %s", dm_key->key_desc);
+ } else {
+ r = PTR_ERR(key_ref);
+ kexec_dprintk("Error when adding key");
+ }
+
+ key_ref_put(keyring_ref);
+ return r;
+}
+
+static void get_keys_from_kdump_reserved_memory(void)
+{
+ struct keys_header *keys_header_loaded;
+
+ arch_kexec_unprotect_crashkres();
+
+ keys_header_loaded = kmap_local_page(pfn_to_page(
+ kexec_crash_image->dm_crypt_keys_addr >> PAGE_SHIFT));
+
+ memcpy(keys_header, keys_header_loaded, get_keys_header_size(key_count));
+ kunmap_local(keys_header_loaded);
+ arch_kexec_protect_crashkres();
+}
+
+static int restore_dm_crypt_keys_to_thread_keyring(void)
+{
+ struct dm_crypt_key *key;
+ size_t keys_header_size;
+ key_ref_t keyring_ref;
+ u64 addr;
+
+ /* find the target keyring (which must be writable) */
+ keyring_ref =
+ lookup_user_key(KEY_SPEC_USER_KEYRING, 0x01, KEY_NEED_WRITE);
+ if (IS_ERR(keyring_ref)) {
+ kexec_dprintk("Failed to get the user keyring\n");
+ return PTR_ERR(keyring_ref);
+ }
+
+ addr = dm_crypt_keys_addr;
+ dm_crypt_keys_read((char *)&key_count, sizeof(key_count), &addr);
+ if (key_count < 0 || key_count > KEY_NUM_MAX) {
+ kexec_dprintk("Failed to read the number of dm-crypt keys\n");
+ return -1;
+ }
+
+ kexec_dprintk("There are %u keys\n", key_count);
+ addr = dm_crypt_keys_addr;
+
+ keys_header_size = get_keys_header_size(key_count);
+ keys_header = kzalloc(keys_header_size, GFP_KERNEL);
+ if (!keys_header)
+ return -ENOMEM;
+
+ dm_crypt_keys_read((char *)keys_header, keys_header_size, &addr);
+
+ for (int i = 0; i < keys_header->total_keys; i++) {
+ key = &keys_header->keys[i];
+ kexec_dprintk("Get key (size=%u)\n", key->key_size);
+ add_key_to_keyring(key, keyring_ref);
+ }
+
+ return 0;
+}
+
+static int read_key_from_user_keying(struct dm_crypt_key *dm_key)
+{
+ const struct user_key_payload *ukp;
+ struct key *key;
+
+ kexec_dprintk("Requesting logon key %s", dm_key->key_desc);
+ key = request_key(&key_type_logon, dm_key->key_desc, NULL);
+
+ if (IS_ERR(key)) {
+ pr_warn("No such logon key %s\n", dm_key->key_desc);
+ return PTR_ERR(key);
+ }
+
+ ukp = user_key_payload_locked(key);
+ if (!ukp)
+ return -EKEYREVOKED;
+
+ if (ukp->datalen > KEY_SIZE_MAX) {
+ pr_err("Key size %u exceeds maximum (%u)\n", ukp->datalen, KEY_SIZE_MAX);
+ return -EINVAL;
+ }
+
+ memcpy(dm_key->data, ukp->data, ukp->datalen);
+ dm_key->key_size = ukp->datalen;
+ kexec_dprintk("Get dm crypt key (size=%u) %s: %8ph\n", dm_key->key_size,
+ dm_key->key_desc, dm_key->data);
+ return 0;
+}
+
+struct config_key {
+ struct config_item item;
+ const char *description;
+};
+
+static inline struct config_key *to_config_key(struct config_item *item)
+{
+ return container_of(item, struct config_key, item);
+}
+
+static ssize_t config_key_description_show(struct config_item *item, char *page)
+{
+ return sprintf(page, "%s\n", to_config_key(item)->description);
+}
+
+static ssize_t config_key_description_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ struct config_key *config_key = to_config_key(item);
+ size_t len;
+ int ret;
+
+ ret = -EINVAL;
+ len = strcspn(page, "\n");
+
+ if (len > KEY_DESC_MAX_LEN) {
+ pr_err("The key description shouldn't exceed %u characters", KEY_DESC_MAX_LEN);
+ return ret;
+ }
+
+ if (!len)
+ return ret;
+
+ kfree(config_key->description);
+ ret = -ENOMEM;
+ config_key->description = kmemdup_nul(page, len, GFP_KERNEL);
+ if (!config_key->description)
+ return ret;
+
+ return count;
+}
+
+CONFIGFS_ATTR(config_key_, description);
+
+static struct configfs_attribute *config_key_attrs[] = {
+ &config_key_attr_description,
+ NULL,
+};
+
+static void config_key_release(struct config_item *item)
+{
+ kfree(to_config_key(item));
+ key_count--;
+}
+
+static struct configfs_item_operations config_key_item_ops = {
+ .release = config_key_release,
+};
+
+static const struct config_item_type config_key_type = {
+ .ct_item_ops = &config_key_item_ops,
+ .ct_attrs = config_key_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct config_item *config_keys_make_item(struct config_group *group,
+ const char *name)
+{
+ struct config_key *config_key;
+
+ if (key_count > KEY_NUM_MAX) {
+ pr_err("Only %u keys at maximum to be created\n", KEY_NUM_MAX);
+ return ERR_PTR(-EINVAL);
+ }
+
+ config_key = kzalloc(sizeof(struct config_key), GFP_KERNEL);
+ if (!config_key)
+ return ERR_PTR(-ENOMEM);
+
+ config_item_init_type_name(&config_key->item, name, &config_key_type);
+
+ key_count++;
+
+ return &config_key->item;
+}
+
+static ssize_t config_keys_count_show(struct config_item *item, char *page)
+{
+ return sprintf(page, "%d\n", key_count);
+}
+
+CONFIGFS_ATTR_RO(config_keys_, count);
+
+static bool is_dm_key_reused;
+
+static ssize_t config_keys_reuse_show(struct config_item *item, char *page)
+{
+ return sprintf(page, "%d\n", is_dm_key_reused);
+}
+
+static ssize_t config_keys_reuse_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ if (!kexec_crash_image || !kexec_crash_image->dm_crypt_keys_addr) {
+ kexec_dprintk(
+ "dm-crypt keys haven't be saved to crash-reserved memory\n");
+ return -EINVAL;
+ }
+
+ if (kstrtobool(page, &is_dm_key_reused))
+ return -EINVAL;
+
+ if (is_dm_key_reused)
+ get_keys_from_kdump_reserved_memory();
+
+ return count;
+}
+
+CONFIGFS_ATTR(config_keys_, reuse);
+
+static struct configfs_attribute *config_keys_attrs[] = {
+ &config_keys_attr_count,
+ &config_keys_attr_reuse,
+ NULL,
+};
+
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations config_keys_group_ops = {
+ .make_item = config_keys_make_item,
+};
+
+static const struct config_item_type config_keys_type = {
+ .ct_group_ops = &config_keys_group_ops,
+ .ct_attrs = config_keys_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static bool restore;
+
+static ssize_t config_keys_restore_show(struct config_item *item, char *page)
+{
+ return sprintf(page, "%d\n", restore);
+}
+
+static ssize_t config_keys_restore_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ if (!restore)
+ restore_dm_crypt_keys_to_thread_keyring();
+
+ if (kstrtobool(page, &restore))
+ return -EINVAL;
+
+ return count;
+}
+
+CONFIGFS_ATTR(config_keys_, restore);
+
+static struct configfs_attribute *kdump_config_keys_attrs[] = {
+ &config_keys_attr_restore,
+ NULL,
+};
+
+static const struct config_item_type kdump_config_keys_type = {
+ .ct_attrs = kdump_config_keys_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct configfs_subsystem config_keys_subsys = {
+ .su_group = {
+ .cg_item = {
+ .ci_namebuf = "crash_dm_crypt_keys",
+ .ci_type = &config_keys_type,
+ },
+ },
+};
+
+static int build_keys_header(void)
+{
+ struct config_item *item = NULL;
+ struct config_key *key;
+ int i, r;
+
+ if (keys_header != NULL)
+ kvfree(keys_header);
+
+ keys_header = kzalloc(get_keys_header_size(key_count), GFP_KERNEL);
+ if (!keys_header)
+ return -ENOMEM;
+
+ keys_header->total_keys = key_count;
+
+ i = 0;
+ list_for_each_entry(item, &config_keys_subsys.su_group.cg_children,
+ ci_entry) {
+ if (item->ci_type != &config_key_type)
+ continue;
+
+ key = to_config_key(item);
+
+ if (!key->description) {
+ pr_warn("No key description for key %s\n", item->ci_name);
+ return -EINVAL;
+ }
+
+ strscpy(keys_header->keys[i].key_desc, key->description,
+ KEY_DESC_MAX_LEN);
+ r = read_key_from_user_keying(&keys_header->keys[i]);
+ if (r != 0) {
+ kexec_dprintk("Failed to read key %s\n",
+ keys_header->keys[i].key_desc);
+ return r;
+ }
+ i++;
+ kexec_dprintk("Found key: %s\n", item->ci_name);
+ }
+
+ return 0;
+}
+
+int crash_load_dm_crypt_keys(struct kimage *image)
+{
+ struct kexec_buf kbuf = {
+ .image = image,
+ .buf_min = 0,
+ .buf_max = ULONG_MAX,
+ .top_down = false,
+ .random = true,
+ };
+ int r;
+
+
+ if (key_count <= 0) {
+ kexec_dprintk("No dm-crypt keys\n");
+ return -ENOENT;
+ }
+
+ if (!is_dm_key_reused) {
+ image->dm_crypt_keys_addr = 0;
+ r = build_keys_header();
+ if (r)
+ return r;
+ }
+
+ kbuf.buffer = keys_header;
+ kbuf.bufsz = get_keys_header_size(key_count);
+
+ kbuf.memsz = kbuf.bufsz;
+ kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
+ kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
+ r = kexec_add_buffer(&kbuf);
+ if (r) {
+ kvfree((void *)kbuf.buffer);
+ return r;
+ }
+ image->dm_crypt_keys_addr = kbuf.mem;
+ image->dm_crypt_keys_sz = kbuf.bufsz;
+ kexec_dprintk(
+ "Loaded dm crypt keys to kexec_buffer bufsz=0x%lx memsz=0x%lx\n",
+ kbuf.bufsz, kbuf.memsz);
+
+ return r;
+}
+
+static int __init configfs_dmcrypt_keys_init(void)
+{
+ int ret;
+
+ if (is_kdump_kernel()) {
+ config_keys_subsys.su_group.cg_item.ci_type =
+ &kdump_config_keys_type;
+ }
+
+ config_group_init(&config_keys_subsys.su_group);
+ mutex_init(&config_keys_subsys.su_mutex);
+ ret = configfs_register_subsystem(&config_keys_subsys);
+ if (ret) {
+ pr_err("Error %d while registering subsystem %s\n", ret,
+ config_keys_subsys.su_group.cg_item.ci_namebuf);
+ goto out_unregister;
+ }
+
+ return 0;
+
+out_unregister:
+ configfs_unregister_subsystem(&config_keys_subsys);
+
+ return ret;
+}
+
+module_init(configfs_dmcrypt_keys_init);
diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c
new file mode 100644
index 000000000000..62e60e0223cf
--- /dev/null
+++ b/kernel/crash_reserve.c
@@ -0,0 +1,540 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * crash.c - kernel crash support code.
+ * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
+ */
+
+#include <linux/buildid.h>
+#include <linux/init.h>
+#include <linux/utsname.h>
+#include <linux/vmalloc.h>
+#include <linux/sizes.h>
+#include <linux/kexec.h>
+#include <linux/memory.h>
+#include <linux/cpuhotplug.h>
+#include <linux/memblock.h>
+#include <linux/kmemleak.h>
+#include <linux/cma.h>
+#include <linux/crash_reserve.h>
+
+#include <asm/page.h>
+#include <asm/sections.h>
+
+#include <crypto/sha1.h>
+
+#include "kallsyms_internal.h"
+#include "kexec_internal.h"
+
+/* Location of the reserved area for the crash kernel */
+struct resource crashk_res = {
+ .name = "Crash kernel",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
+ .desc = IORES_DESC_CRASH_KERNEL
+};
+struct resource crashk_low_res = {
+ .name = "Crash kernel",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
+ .desc = IORES_DESC_CRASH_KERNEL
+};
+
+/*
+ * parsing the "crashkernel" commandline
+ *
+ * this code is intended to be called from architecture specific code
+ */
+
+
+/*
+ * This function parses command lines in the format
+ *
+ * crashkernel=ramsize-range:size[,...][@offset]
+ *
+ * The function returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_mem(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ char *cur = cmdline, *tmp;
+ unsigned long long total_mem = system_ram;
+
+ /*
+ * Firmware sometimes reserves some memory regions for its own use,
+ * so the system memory size is less than the actual physical memory
+ * size. Work around this by rounding up the total size to 128M,
+ * which is enough for most test cases.
+ */
+ total_mem = roundup(total_mem, SZ_128M);
+
+ /* for each entry of the comma-separated list */
+ do {
+ unsigned long long start, end = ULLONG_MAX, size;
+
+ /* get the start of the range */
+ start = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("crashkernel: Memory value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (*cur != '-') {
+ pr_warn("crashkernel: '-' expected\n");
+ return -EINVAL;
+ }
+ cur++;
+
+ /* if no ':' is here, than we read the end */
+ if (*cur != ':') {
+ end = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("crashkernel: Memory value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (end <= start) {
+ pr_warn("crashkernel: end <= start\n");
+ return -EINVAL;
+ }
+ }
+
+ if (*cur != ':') {
+ pr_warn("crashkernel: ':' expected\n");
+ return -EINVAL;
+ }
+ cur++;
+
+ size = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("crashkernel: Memory value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (size >= total_mem) {
+ pr_warn("crashkernel: invalid size\n");
+ return -EINVAL;
+ }
+
+ /* match ? */
+ if (total_mem >= start && total_mem < end) {
+ *crash_size = size;
+ break;
+ }
+ } while (*cur++ == ',');
+
+ if (*crash_size > 0) {
+ while (*cur && *cur != ' ' && *cur != '@')
+ cur++;
+ if (*cur == '@') {
+ cur++;
+ *crash_base = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("crashkernel: Memory value expected after '@'\n");
+ return -EINVAL;
+ }
+ }
+ } else
+ pr_info("crashkernel size resulted in zero bytes\n");
+
+ return 0;
+}
+
+/*
+ * That function parses "simple" (old) crashkernel command lines like
+ *
+ * crashkernel=size[@offset]
+ *
+ * It returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_simple(char *cmdline,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ char *cur = cmdline;
+
+ *crash_size = memparse(cmdline, &cur);
+ if (cmdline == cur) {
+ pr_warn("crashkernel: memory value expected\n");
+ return -EINVAL;
+ }
+
+ if (*cur == '@')
+ *crash_base = memparse(cur+1, &cur);
+ else if (*cur != ' ' && *cur != '\0') {
+ pr_warn("crashkernel: unrecognized char: %c\n", *cur);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+#define SUFFIX_HIGH 0
+#define SUFFIX_LOW 1
+#define SUFFIX_CMA 2
+#define SUFFIX_NULL 3
+static __initdata char *suffix_tbl[] = {
+ [SUFFIX_HIGH] = ",high",
+ [SUFFIX_LOW] = ",low",
+ [SUFFIX_CMA] = ",cma",
+ [SUFFIX_NULL] = NULL,
+};
+
+/*
+ * That function parses "suffix" crashkernel command lines like
+ *
+ * crashkernel=size,[high|low|cma]
+ *
+ * It returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_suffix(char *cmdline,
+ unsigned long long *crash_size,
+ const char *suffix)
+{
+ char *cur = cmdline;
+
+ *crash_size = memparse(cmdline, &cur);
+ if (cmdline == cur) {
+ pr_warn("crashkernel: memory value expected\n");
+ return -EINVAL;
+ }
+
+ /* check with suffix */
+ if (strncmp(cur, suffix, strlen(suffix))) {
+ pr_warn("crashkernel: unrecognized char: %c\n", *cur);
+ return -EINVAL;
+ }
+ cur += strlen(suffix);
+ if (*cur != ' ' && *cur != '\0') {
+ pr_warn("crashkernel: unrecognized char: %c\n", *cur);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static __init char *get_last_crashkernel(char *cmdline,
+ const char *name,
+ const char *suffix)
+{
+ char *p = cmdline, *ck_cmdline = NULL;
+
+ /* find crashkernel and use the last one if there are more */
+ p = strstr(p, name);
+ while (p) {
+ char *end_p = strchr(p, ' ');
+ char *q;
+
+ if (!end_p)
+ end_p = p + strlen(p);
+
+ if (!suffix) {
+ int i;
+
+ /* skip the one with any known suffix */
+ for (i = 0; suffix_tbl[i]; i++) {
+ q = end_p - strlen(suffix_tbl[i]);
+ if (!strncmp(q, suffix_tbl[i],
+ strlen(suffix_tbl[i])))
+ goto next;
+ }
+ ck_cmdline = p;
+ } else {
+ q = end_p - strlen(suffix);
+ if (!strncmp(q, suffix, strlen(suffix)))
+ ck_cmdline = p;
+ }
+next:
+ p = strstr(p+1, name);
+ }
+
+ return ck_cmdline;
+}
+
+static int __init __parse_crashkernel(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base,
+ const char *suffix)
+{
+ char *first_colon, *first_space;
+ char *ck_cmdline;
+ char *name = "crashkernel=";
+
+ BUG_ON(!crash_size || !crash_base);
+ *crash_size = 0;
+ *crash_base = 0;
+
+ ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
+ if (!ck_cmdline)
+ return -ENOENT;
+
+ ck_cmdline += strlen(name);
+
+ if (suffix)
+ return parse_crashkernel_suffix(ck_cmdline, crash_size,
+ suffix);
+ /*
+ * if the commandline contains a ':', then that's the extended
+ * syntax -- if not, it must be the classic syntax
+ */
+ first_colon = strchr(ck_cmdline, ':');
+ first_space = strchr(ck_cmdline, ' ');
+ if (first_colon && (!first_space || first_colon < first_space))
+ return parse_crashkernel_mem(ck_cmdline, system_ram,
+ crash_size, crash_base);
+
+ return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
+}
+
+/*
+ * That function is the entry point for command line parsing and should be
+ * called from the arch-specific code.
+ *
+ * If crashkernel=,high|low is supported on architecture, non-NULL values
+ * should be passed to parameters 'low_size' and 'high'.
+ */
+int __init parse_crashkernel(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base,
+ unsigned long long *low_size,
+ unsigned long long *cma_size,
+ bool *high)
+{
+ int ret;
+ unsigned long long __always_unused cma_base;
+
+ /* crashkernel=X[@offset] */
+ ret = __parse_crashkernel(cmdline, system_ram, crash_size,
+ crash_base, NULL);
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+ /*
+ * If non-NULL 'high' passed in and no normal crashkernel
+ * setting detected, try parsing crashkernel=,high|low.
+ */
+ if (high && ret == -ENOENT) {
+ ret = __parse_crashkernel(cmdline, 0, crash_size,
+ crash_base, suffix_tbl[SUFFIX_HIGH]);
+ if (ret || !*crash_size)
+ return -EINVAL;
+
+ /*
+ * crashkernel=Y,low can be specified or not, but invalid value
+ * is not allowed.
+ */
+ ret = __parse_crashkernel(cmdline, 0, low_size,
+ crash_base, suffix_tbl[SUFFIX_LOW]);
+ if (ret == -ENOENT) {
+ *low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE;
+ ret = 0;
+ } else if (ret) {
+ return ret;
+ }
+
+ *high = true;
+ }
+
+ /*
+ * optional CMA reservation
+ * cma_base is ignored
+ */
+ if (cma_size)
+ __parse_crashkernel(cmdline, 0, cma_size,
+ &cma_base, suffix_tbl[SUFFIX_CMA]);
+#endif
+ if (!*crash_size)
+ ret = -EINVAL;
+
+ if (*crash_size >= system_ram)
+ ret = -EINVAL;
+
+ return ret;
+}
+
+/*
+ * Add a dummy early_param handler to mark crashkernel= as a known command line
+ * parameter and suppress incorrect warnings in init/main.c.
+ */
+static int __init parse_crashkernel_dummy(char *arg)
+{
+ return 0;
+}
+early_param("crashkernel", parse_crashkernel_dummy);
+
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+static int __init reserve_crashkernel_low(unsigned long long low_size)
+{
+#ifdef CONFIG_64BIT
+ unsigned long long low_base;
+
+ low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX);
+ if (!low_base) {
+ pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size);
+ return -ENOMEM;
+ }
+
+ pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n",
+ low_base, low_base + low_size, low_size >> 20);
+
+ crashk_low_res.start = low_base;
+ crashk_low_res.end = low_base + low_size - 1;
+#ifdef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY
+ insert_resource(&iomem_resource, &crashk_low_res);
+#endif
+#endif
+ return 0;
+}
+
+void __init reserve_crashkernel_generic(unsigned long long crash_size,
+ unsigned long long crash_base,
+ unsigned long long crash_low_size,
+ bool high)
+{
+ unsigned long long search_end = CRASH_ADDR_LOW_MAX, search_base = 0;
+ bool fixed_base = false;
+
+ /* User specifies base address explicitly. */
+ if (crash_base) {
+ fixed_base = true;
+ search_base = crash_base;
+ search_end = crash_base + crash_size;
+ } else if (high) {
+ search_base = CRASH_ADDR_LOW_MAX;
+ search_end = CRASH_ADDR_HIGH_MAX;
+ }
+
+retry:
+ crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
+ search_base, search_end);
+ if (!crash_base) {
+ /*
+ * For crashkernel=size[KMG]@offset[KMG], print out failure
+ * message if can't reserve the specified region.
+ */
+ if (fixed_base) {
+ pr_warn("crashkernel reservation failed - memory is in use.\n");
+ return;
+ }
+
+ /*
+ * For crashkernel=size[KMG], if the first attempt was for
+ * low memory, fall back to high memory, the minimum required
+ * low memory will be reserved later.
+ */
+ if (!high && search_end == CRASH_ADDR_LOW_MAX) {
+ search_end = CRASH_ADDR_HIGH_MAX;
+ search_base = CRASH_ADDR_LOW_MAX;
+ crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE;
+ goto retry;
+ }
+
+ /*
+ * For crashkernel=size[KMG],high, if the first attempt was
+ * for high memory, fall back to low memory.
+ */
+ if (high && search_end == CRASH_ADDR_HIGH_MAX) {
+ search_end = CRASH_ADDR_LOW_MAX;
+ search_base = 0;
+ if (search_end != CRASH_ADDR_HIGH_MAX)
+ goto retry;
+ }
+ pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
+ crash_size);
+ return;
+ }
+
+ if ((crash_base >= CRASH_ADDR_LOW_MAX) &&
+ crash_low_size && reserve_crashkernel_low(crash_low_size)) {
+ memblock_phys_free(crash_base, crash_size);
+ return;
+ }
+
+ pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
+ crash_base, crash_base + crash_size, crash_size >> 20);
+
+ /*
+ * The crashkernel memory will be removed from the kernel linear
+ * map. Inform kmemleak so that it won't try to access it.
+ */
+ kmemleak_ignore_phys(crash_base);
+ if (crashk_low_res.end)
+ kmemleak_ignore_phys(crashk_low_res.start);
+
+ crashk_res.start = crash_base;
+ crashk_res.end = crash_base + crash_size - 1;
+#ifdef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY
+ insert_resource(&iomem_resource, &crashk_res);
+#endif
+}
+
+struct range crashk_cma_ranges[CRASHKERNEL_CMA_RANGES_MAX];
+#ifdef CRASHKERNEL_CMA
+int crashk_cma_cnt;
+void __init reserve_crashkernel_cma(unsigned long long cma_size)
+{
+ unsigned long long request_size = roundup(cma_size, PAGE_SIZE);
+ unsigned long long reserved_size = 0;
+
+ if (!cma_size)
+ return;
+
+ while (cma_size > reserved_size &&
+ crashk_cma_cnt < CRASHKERNEL_CMA_RANGES_MAX) {
+
+ struct cma *res;
+
+ if (cma_declare_contiguous(0, request_size, 0, 0, 0, false,
+ "crashkernel", &res)) {
+ /* reservation failed, try half-sized blocks */
+ if (request_size <= PAGE_SIZE)
+ break;
+
+ request_size = roundup(request_size / 2, PAGE_SIZE);
+ continue;
+ }
+
+ crashk_cma_ranges[crashk_cma_cnt].start = cma_get_base(res);
+ crashk_cma_ranges[crashk_cma_cnt].end =
+ crashk_cma_ranges[crashk_cma_cnt].start +
+ cma_get_size(res) - 1;
+ ++crashk_cma_cnt;
+ reserved_size += request_size;
+ }
+
+ if (cma_size > reserved_size)
+ pr_warn("crashkernel CMA reservation failed: %lld MB requested, %lld MB reserved in %d ranges\n",
+ cma_size >> 20, reserved_size >> 20, crashk_cma_cnt);
+ else
+ pr_info("crashkernel CMA reserved: %lld MB in %d ranges\n",
+ reserved_size >> 20, crashk_cma_cnt);
+}
+
+#else /* CRASHKERNEL_CMA */
+void __init reserve_crashkernel_cma(unsigned long long cma_size)
+{
+ if (cma_size)
+ pr_warn("crashkernel CMA reservation not supported\n");
+}
+#endif
+
+#ifndef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY
+static __init int insert_crashkernel_resources(void)
+{
+ if (!arch_add_crash_res_to_iomem())
+ return 0;
+
+ if (crashk_res.start < crashk_res.end)
+ insert_resource(&iomem_resource, &crashk_res);
+
+ if (crashk_low_res.start < crashk_low_res.end)
+ insert_resource(&iomem_resource, &crashk_low_res);
+
+ return 0;
+}
+early_initcall(insert_crashkernel_resources);
+#endif
+#endif
diff --git a/kernel/cred.c b/kernel/cred.c
index 473d17c431f3..a6f686b30da1 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -4,6 +4,9 @@
* Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
+
+#define pr_fmt(fmt) "CRED: " fmt
+
#include <linux/export.h>
#include <linux/cred.h>
#include <linux/slab.h>
@@ -32,62 +35,6 @@ do { \
static struct kmem_cache *cred_jar;
-/* init to 2 - one for init_task, one to ensure it is never freed */
-static struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
-
-/*
- * The initial credentials for the initial task
- */
-struct cred init_cred = {
- .usage = ATOMIC_INIT(4),
-#ifdef CONFIG_DEBUG_CREDENTIALS
- .subscribers = ATOMIC_INIT(2),
- .magic = CRED_MAGIC,
-#endif
- .uid = GLOBAL_ROOT_UID,
- .gid = GLOBAL_ROOT_GID,
- .suid = GLOBAL_ROOT_UID,
- .sgid = GLOBAL_ROOT_GID,
- .euid = GLOBAL_ROOT_UID,
- .egid = GLOBAL_ROOT_GID,
- .fsuid = GLOBAL_ROOT_UID,
- .fsgid = GLOBAL_ROOT_GID,
- .securebits = SECUREBITS_DEFAULT,
- .cap_inheritable = CAP_EMPTY_SET,
- .cap_permitted = CAP_FULL_SET,
- .cap_effective = CAP_FULL_SET,
- .cap_bset = CAP_FULL_SET,
- .user = INIT_USER,
- .user_ns = &init_user_ns,
- .group_info = &init_groups,
- .ucounts = &init_ucounts,
-};
-
-static inline void set_cred_subscribers(struct cred *cred, int n)
-{
-#ifdef CONFIG_DEBUG_CREDENTIALS
- atomic_set(&cred->subscribers, n);
-#endif
-}
-
-static inline int read_cred_subscribers(const struct cred *cred)
-{
-#ifdef CONFIG_DEBUG_CREDENTIALS
- return atomic_read(&cred->subscribers);
-#else
- return 0;
-#endif
-}
-
-static inline void alter_cred_subscribers(const struct cred *_cred, int n)
-{
-#ifdef CONFIG_DEBUG_CREDENTIALS
- struct cred *cred = (struct cred *) _cred;
-
- atomic_add(n, &cred->subscribers);
-#endif
-}
-
/*
* The RCU callback to actually dispose of a set of credentials
*/
@@ -97,20 +44,9 @@ static void put_cred_rcu(struct rcu_head *rcu)
kdebug("put_cred_rcu(%p)", cred);
-#ifdef CONFIG_DEBUG_CREDENTIALS
- if (cred->magic != CRED_MAGIC_DEAD ||
- atomic_read(&cred->usage) != 0 ||
- read_cred_subscribers(cred) != 0)
- panic("CRED: put_cred_rcu() sees %p with"
- " mag %x, put %p, usage %d, subscr %d\n",
- cred, cred->magic, cred->put_addr,
- atomic_read(&cred->usage),
- read_cred_subscribers(cred));
-#else
- if (atomic_read(&cred->usage) != 0)
- panic("CRED: put_cred_rcu() sees %p with usage %d\n",
- cred, atomic_read(&cred->usage));
-#endif
+ if (atomic_long_read(&cred->usage) != 0)
+ panic("CRED: put_cred_rcu() sees %p with usage %ld\n",
+ cred, atomic_long_read(&cred->usage));
security_cred_free(cred);
key_put(cred->session_keyring);
@@ -134,16 +70,10 @@ static void put_cred_rcu(struct rcu_head *rcu)
*/
void __put_cred(struct cred *cred)
{
- kdebug("__put_cred(%p{%d,%d})", cred,
- atomic_read(&cred->usage),
- read_cred_subscribers(cred));
-
- BUG_ON(atomic_read(&cred->usage) != 0);
-#ifdef CONFIG_DEBUG_CREDENTIALS
- BUG_ON(read_cred_subscribers(cred) != 0);
- cred->magic = CRED_MAGIC_DEAD;
- cred->put_addr = __builtin_return_address(0);
-#endif
+ kdebug("__put_cred(%p{%ld})", cred,
+ atomic_long_read(&cred->usage));
+
+ BUG_ON(atomic_long_read(&cred->usage) != 0);
BUG_ON(cred == current->cred);
BUG_ON(cred == current->real_cred);
@@ -159,23 +89,23 @@ EXPORT_SYMBOL(__put_cred);
*/
void exit_creds(struct task_struct *tsk)
{
- struct cred *cred;
+ struct cred *real_cred, *cred;
- kdebug("exit_creds(%u,%p,%p,{%d,%d})", tsk->pid, tsk->real_cred, tsk->cred,
- atomic_read(&tsk->cred->usage),
- read_cred_subscribers(tsk->cred));
+ kdebug("exit_creds(%u,%p,%p,{%ld})", tsk->pid, tsk->real_cred, tsk->cred,
+ atomic_long_read(&tsk->cred->usage));
- cred = (struct cred *) tsk->real_cred;
+ real_cred = (struct cred *) tsk->real_cred;
tsk->real_cred = NULL;
- validate_creds(cred);
- alter_cred_subscribers(cred, -1);
- put_cred(cred);
cred = (struct cred *) tsk->cred;
tsk->cred = NULL;
- validate_creds(cred);
- alter_cred_subscribers(cred, -1);
- put_cred(cred);
+
+ if (real_cred == cred) {
+ put_cred_many(cred, 2);
+ } else {
+ put_cred(real_cred);
+ put_cred(cred);
+ }
#ifdef CONFIG_KEYS_REQUEST_CACHE
key_put(tsk->cached_requested_key);
@@ -221,10 +151,7 @@ struct cred *cred_alloc_blank(void)
if (!new)
return NULL;
- atomic_set(&new->usage, 1);
-#ifdef CONFIG_DEBUG_CREDENTIALS
- new->magic = CRED_MAGIC;
-#endif
+ atomic_long_set(&new->usage, 1);
if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0)
goto error;
@@ -255,8 +182,6 @@ struct cred *prepare_creds(void)
const struct cred *old;
struct cred *new;
- validate_process_creds();
-
new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
if (!new)
return NULL;
@@ -267,8 +192,7 @@ struct cred *prepare_creds(void)
memcpy(new, old, sizeof(struct cred));
new->non_rcu = 0;
- atomic_set(&new->usage, 1);
- set_cred_subscribers(new, 0);
+ atomic_long_set(&new->usage, 1);
get_group_info(new->group_info);
get_uid(new->user);
get_user_ns(new->user_ns);
@@ -291,7 +215,6 @@ struct cred *prepare_creds(void)
if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
goto error;
- validate_creds(new);
return new;
error:
@@ -337,7 +260,7 @@ struct cred *prepare_exec_creds(void)
* The new process gets the current process's subjective credentials as its
* objective and subjective credentials
*/
-int copy_creds(struct task_struct *p, unsigned long clone_flags)
+int copy_creds(struct task_struct *p, u64 clone_flags)
{
struct cred *new;
int ret;
@@ -352,13 +275,11 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
#endif
clone_flags & CLONE_THREAD
) {
- p->real_cred = get_cred(p->cred);
- get_cred(p->cred);
- alter_cred_subscribers(p->cred, 2);
- kdebug("share_creds(%p{%d,%d})",
- p->cred, atomic_read(&p->cred->usage),
- read_cred_subscribers(p->cred));
+ p->real_cred = get_cred_many(p->cred, 2);
+ kdebug("share_creds(%p{%ld})",
+ p->cred, atomic_long_read(&p->cred->usage));
inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
+ get_cred_namespaces(p);
return 0;
}
@@ -396,8 +317,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
p->cred = p->real_cred = get_cred(new);
inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
- alter_cred_subscribers(new, 2);
- validate_creds(new);
+ get_cred_namespaces(p);
+
return 0;
error_put:
@@ -449,17 +370,11 @@ int commit_creds(struct cred *new)
struct task_struct *task = current;
const struct cred *old = task->real_cred;
- kdebug("commit_creds(%p{%d,%d})", new,
- atomic_read(&new->usage),
- read_cred_subscribers(new));
+ kdebug("commit_creds(%p{%ld})", new,
+ atomic_long_read(&new->usage));
BUG_ON(task->cred != old);
-#ifdef CONFIG_DEBUG_CREDENTIALS
- BUG_ON(read_cred_subscribers(old) < 2);
- validate_creds(old);
- validate_creds(new);
-#endif
- BUG_ON(atomic_read(&new->usage) < 1);
+ BUG_ON(atomic_long_read(&new->usage) < 1);
get_cred(new); /* we will require a ref for the subj creds too */
@@ -494,14 +409,15 @@ int commit_creds(struct cred *new)
* RLIMIT_NPROC limits on user->processes have already been checked
* in set_user().
*/
- alter_cred_subscribers(new, 2);
if (new->user != old->user || new->user_ns != old->user_ns)
inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1);
+
rcu_assign_pointer(task->real_cred, new);
rcu_assign_pointer(task->cred, new);
if (new->user != old->user || new->user_ns != old->user_ns)
dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1);
- alter_cred_subscribers(old, -2);
+ if (new->user_ns != old->user_ns)
+ switch_cred_namespaces(old, new);
/* send notifications */
if (!uid_eq(new->uid, old->uid) ||
@@ -517,8 +433,7 @@ int commit_creds(struct cred *new)
proc_id_connector(task, PROC_EVENT_GID);
/* release the old obj and subj refs both */
- put_cred(old);
- put_cred(old);
+ put_cred_many(old, 2);
return 0;
}
EXPORT_SYMBOL(commit_creds);
@@ -532,84 +447,15 @@ EXPORT_SYMBOL(commit_creds);
*/
void abort_creds(struct cred *new)
{
- kdebug("abort_creds(%p{%d,%d})", new,
- atomic_read(&new->usage),
- read_cred_subscribers(new));
+ kdebug("abort_creds(%p{%ld})", new,
+ atomic_long_read(&new->usage));
-#ifdef CONFIG_DEBUG_CREDENTIALS
- BUG_ON(read_cred_subscribers(new) != 0);
-#endif
- BUG_ON(atomic_read(&new->usage) < 1);
+ BUG_ON(atomic_long_read(&new->usage) < 1);
put_cred(new);
}
EXPORT_SYMBOL(abort_creds);
/**
- * override_creds - Override the current process's subjective credentials
- * @new: The credentials to be assigned
- *
- * Install a set of temporary override subjective credentials on the current
- * process, returning the old set for later reversion.
- */
-const struct cred *override_creds(const struct cred *new)
-{
- const struct cred *old = current->cred;
-
- kdebug("override_creds(%p{%d,%d})", new,
- atomic_read(&new->usage),
- read_cred_subscribers(new));
-
- validate_creds(old);
- validate_creds(new);
-
- /*
- * NOTE! This uses 'get_new_cred()' rather than 'get_cred()'.
- *
- * That means that we do not clear the 'non_rcu' flag, since
- * we are only installing the cred into the thread-synchronous
- * '->cred' pointer, not the '->real_cred' pointer that is
- * visible to other threads under RCU.
- *
- * Also note that we did validate_creds() manually, not depending
- * on the validation in 'get_cred()'.
- */
- get_new_cred((struct cred *)new);
- alter_cred_subscribers(new, 1);
- rcu_assign_pointer(current->cred, new);
- alter_cred_subscribers(old, -1);
-
- kdebug("override_creds() = %p{%d,%d}", old,
- atomic_read(&old->usage),
- read_cred_subscribers(old));
- return old;
-}
-EXPORT_SYMBOL(override_creds);
-
-/**
- * revert_creds - Revert a temporary subjective credentials override
- * @old: The credentials to be restored
- *
- * Revert a temporary set of override subjective credentials to an old set,
- * discarding the override set.
- */
-void revert_creds(const struct cred *old)
-{
- const struct cred *override = current->cred;
-
- kdebug("revert_creds(%p{%d,%d})", old,
- atomic_read(&old->usage),
- read_cred_subscribers(old));
-
- validate_creds(old);
- validate_creds(override);
- alter_cred_subscribers(old, 1);
- rcu_assign_pointer(current->cred, old);
- alter_cred_subscribers(override, -1);
- put_cred(override);
-}
-EXPORT_SYMBOL(revert_creds);
-
-/**
* cred_fscmp - Compare two credentials with respect to filesystem access.
* @a: The first credential
* @b: The second credential
@@ -665,21 +511,16 @@ EXPORT_SYMBOL(cred_fscmp);
int set_cred_ucounts(struct cred *new)
{
- struct task_struct *task = current;
- const struct cred *old = task->real_cred;
struct ucounts *new_ucounts, *old_ucounts = new->ucounts;
- if (new->user == old->user && new->user_ns == old->user_ns)
- return 0;
-
/*
* This optimization is needed because alloc_ucounts() uses locks
* for table lookups.
*/
- if (old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->euid))
+ if (old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->uid))
return 0;
- if (!(new_ucounts = alloc_ucounts(new->user_ns, new->euid)))
+ if (!(new_ucounts = alloc_ucounts(new->user_ns, new->uid)))
return -EAGAIN;
new->ucounts = new_ucounts;
@@ -694,8 +535,8 @@ int set_cred_ucounts(struct cred *new)
void __init cred_init(void)
{
/* allocate a slab in which we can store credentials */
- cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
+ cred_jar = KMEM_CACHE(cred,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
}
/**
@@ -706,9 +547,9 @@ void __init cred_init(void)
* override a task's own credentials so that work can be done on behalf of that
* task that requires a different subjective context.
*
- * @daemon is used to provide a base for the security record, but can be NULL.
- * If @daemon is supplied, then the security data will be derived from that;
- * otherwise they'll be set to 0 and no groups, full capabilities and no keys.
+ * @daemon is used to provide a base cred, with the security data derived from
+ * that; if this is "&init_task", they'll be set to 0, no groups, full
+ * capabilities, and no keys.
*
* The caller may change these controls afterwards if desired.
*
@@ -719,23 +560,20 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
const struct cred *old;
struct cred *new;
+ if (WARN_ON_ONCE(!daemon))
+ return NULL;
+
new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
if (!new)
return NULL;
kdebug("prepare_kernel_cred() alloc %p", new);
- if (daemon)
- old = get_task_cred(daemon);
- else
- old = get_cred(&init_cred);
-
- validate_creds(old);
+ old = get_task_cred(daemon);
*new = *old;
new->non_rcu = 0;
- atomic_set(&new->usage, 1);
- set_cred_subscribers(new, 0);
+ atomic_long_set(&new->usage, 1);
get_uid(new->user);
get_user_ns(new->user_ns);
get_group_info(new->group_info);
@@ -759,7 +597,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
goto error;
put_cred(old);
- validate_creds(new);
return new;
error:
@@ -824,109 +661,3 @@ int set_create_files_as(struct cred *new, struct inode *inode)
return security_kernel_create_files_as(new, inode);
}
EXPORT_SYMBOL(set_create_files_as);
-
-#ifdef CONFIG_DEBUG_CREDENTIALS
-
-bool creds_are_invalid(const struct cred *cred)
-{
- if (cred->magic != CRED_MAGIC)
- return true;
- return false;
-}
-EXPORT_SYMBOL(creds_are_invalid);
-
-/*
- * dump invalid credentials
- */
-static void dump_invalid_creds(const struct cred *cred, const char *label,
- const struct task_struct *tsk)
-{
- printk(KERN_ERR "CRED: %s credentials: %p %s%s%s\n",
- label, cred,
- cred == &init_cred ? "[init]" : "",
- cred == tsk->real_cred ? "[real]" : "",
- cred == tsk->cred ? "[eff]" : "");
- printk(KERN_ERR "CRED: ->magic=%x, put_addr=%p\n",
- cred->magic, cred->put_addr);
- printk(KERN_ERR "CRED: ->usage=%d, subscr=%d\n",
- atomic_read(&cred->usage),
- read_cred_subscribers(cred));
- printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n",
- from_kuid_munged(&init_user_ns, cred->uid),
- from_kuid_munged(&init_user_ns, cred->euid),
- from_kuid_munged(&init_user_ns, cred->suid),
- from_kuid_munged(&init_user_ns, cred->fsuid));
- printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n",
- from_kgid_munged(&init_user_ns, cred->gid),
- from_kgid_munged(&init_user_ns, cred->egid),
- from_kgid_munged(&init_user_ns, cred->sgid),
- from_kgid_munged(&init_user_ns, cred->fsgid));
-#ifdef CONFIG_SECURITY
- printk(KERN_ERR "CRED: ->security is %p\n", cred->security);
- if ((unsigned long) cred->security >= PAGE_SIZE &&
- (((unsigned long) cred->security & 0xffffff00) !=
- (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)))
- printk(KERN_ERR "CRED: ->security {%x, %x}\n",
- ((u32*)cred->security)[0],
- ((u32*)cred->security)[1]);
-#endif
-}
-
-/*
- * report use of invalid credentials
- */
-void __invalid_creds(const struct cred *cred, const char *file, unsigned line)
-{
- printk(KERN_ERR "CRED: Invalid credentials\n");
- printk(KERN_ERR "CRED: At %s:%u\n", file, line);
- dump_invalid_creds(cred, "Specified", current);
- BUG();
-}
-EXPORT_SYMBOL(__invalid_creds);
-
-/*
- * check the credentials on a process
- */
-void __validate_process_creds(struct task_struct *tsk,
- const char *file, unsigned line)
-{
- if (tsk->cred == tsk->real_cred) {
- if (unlikely(read_cred_subscribers(tsk->cred) < 2 ||
- creds_are_invalid(tsk->cred)))
- goto invalid_creds;
- } else {
- if (unlikely(read_cred_subscribers(tsk->real_cred) < 1 ||
- read_cred_subscribers(tsk->cred) < 1 ||
- creds_are_invalid(tsk->real_cred) ||
- creds_are_invalid(tsk->cred)))
- goto invalid_creds;
- }
- return;
-
-invalid_creds:
- printk(KERN_ERR "CRED: Invalid process credentials\n");
- printk(KERN_ERR "CRED: At %s:%u\n", file, line);
-
- dump_invalid_creds(tsk->real_cred, "Real", tsk);
- if (tsk->cred != tsk->real_cred)
- dump_invalid_creds(tsk->cred, "Effective", tsk);
- else
- printk(KERN_ERR "CRED: Effective creds == Real creds\n");
- BUG();
-}
-EXPORT_SYMBOL(__validate_process_creds);
-
-/*
- * check creds for do_exit()
- */
-void validate_creds_for_do_exit(struct task_struct *tsk)
-{
- kdebug("validate_creds_for_do_exit(%p,%p{%d,%d})",
- tsk->real_cred, tsk->cred,
- atomic_read(&tsk->cred->usage),
- read_cred_subscribers(tsk->cred));
-
- __validate_process_creds(tsk, __FILE__, __LINE__);
-}
-
-#endif /* CONFIG_DEBUG_CREDENTIALS */
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index da06a5553835..0b9495187fba 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -50,9 +50,9 @@
#include <linux/pid.h>
#include <linux/smp.h>
#include <linux/mm.h>
-#include <linux/vmacache.h>
#include <linux/rcupdate.h>
#include <linux/irq.h>
+#include <linux/security.h>
#include <asm/cacheflush.h>
#include <asm/byteorder.h>
@@ -282,17 +282,6 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
if (!CACHE_FLUSH_IS_SAFE)
return;
- if (current->mm) {
- int i;
-
- for (i = 0; i < VMACACHE_SIZE; i++) {
- if (!current->vmacache.vmas[i])
- continue;
- flush_cache_range(current->vmacache.vmas[i],
- addr, addr + BREAK_INSTR_SIZE);
- }
- }
-
/* Force flush instruction cache if it was outside the mm */
flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
}
@@ -752,6 +741,29 @@ cpu_master_loop:
continue;
kgdb_connected = 0;
} else {
+ /*
+ * This is a brutal way to interfere with the debugger
+ * and prevent gdb being used to poke at kernel memory.
+ * This could cause trouble if lockdown is applied when
+ * there is already an active gdb session. For now the
+ * answer is simply "don't do that". Typically lockdown
+ * *will* be applied before the debug core gets started
+ * so only developers using kgdb for fairly advanced
+ * early kernel debug can be biten by this. Hopefully
+ * they are sophisticated enough to take care of
+ * themselves, especially with help from the lockdown
+ * message printed on the console!
+ */
+ if (security_locked_down(LOCKDOWN_DBG_WRITE_KERNEL)) {
+ if (IS_ENABLED(CONFIG_KGDB_KDB)) {
+ /* Switch back to kdb if possible... */
+ dbg_kdb_mode = 1;
+ continue;
+ } else {
+ /* ... otherwise just bail */
+ break;
+ }
+ }
error = gdb_serial_stub(ks);
}
@@ -825,10 +837,6 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
{
struct kgdb_state kgdb_var;
struct kgdb_state *ks = &kgdb_var;
- int ret = 0;
-
- if (arch_kgdb_ops.enable_nmi)
- arch_kgdb_ops.enable_nmi(0);
/*
* Avoid entering the debugger if we were triggered due to an oops
* but panic_timeout indicates the system should automatically
@@ -846,15 +854,11 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
ks->linux_regs = regs;
if (kgdb_reenter_check(ks))
- goto out; /* Ouch, double exception ! */
+ return 0; /* Ouch, double exception ! */
if (kgdb_info[ks->cpu].enter_kgdb != 0)
- goto out;
+ return 0;
- ret = kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
-out:
- if (arch_kgdb_ops.enable_nmi)
- arch_kgdb_ops.enable_nmi(1);
- return ret;
+ return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
}
NOKPROBE_SYMBOL(kgdb_handle_exception);
@@ -956,7 +960,7 @@ static int __init opt_kgdb_con(char *str)
early_param("kgdbcon", opt_kgdb_con);
#ifdef CONFIG_MAGIC_SYSRQ
-static void sysrq_handle_dbg(int key)
+static void sysrq_handle_dbg(u8 key)
{
if (!dbg_io_ops) {
pr_crit("ERROR: No KGDB I/O module available\n");
@@ -994,6 +998,9 @@ void kgdb_panic(const char *msg)
if (panic_timeout)
return;
+ debug_locks_off();
+ console_flush_on_panic(CONSOLE_FLUSH_PENDING);
+
if (dbg_kdb_mode)
kdb_printf("PANIC: %s\n", msg);
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 9d34d2364b5a..22fe969c5d2e 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -30,10 +30,11 @@
#include <linux/kgdb.h>
#include <linux/kdb.h>
#include <linux/serial_core.h>
+#include <linux/string.h>
#include <linux/reboot.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include "debug_core.h"
#define KGDB_MAX_THREAD_QUERY 17
@@ -547,7 +548,7 @@ static void gdb_cmd_setregs(struct kgdb_state *ks)
error_packet(remcom_out_buffer, -EINVAL);
} else {
gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs);
- strcpy(remcom_out_buffer, "OK");
+ strscpy(remcom_out_buffer, "OK");
}
}
@@ -577,7 +578,7 @@ static void gdb_cmd_memwrite(struct kgdb_state *ks)
if (err)
error_packet(remcom_out_buffer, err);
else
- strcpy(remcom_out_buffer, "OK");
+ strscpy(remcom_out_buffer, "OK");
}
#if DBG_MAX_REG_NUM > 0
@@ -630,7 +631,7 @@ static void gdb_cmd_reg_set(struct kgdb_state *ks)
i = i / 2;
kgdb_hex2mem(ptr, (char *)gdb_regs, i);
dbg_set_reg(regnum, gdb_regs, ks->linux_regs);
- strcpy(remcom_out_buffer, "OK");
+ strscpy(remcom_out_buffer, "OK");
}
#endif /* DBG_MAX_REG_NUM > 0 */
@@ -642,7 +643,7 @@ static void gdb_cmd_binwrite(struct kgdb_state *ks)
if (err)
error_packet(remcom_out_buffer, err);
else
- strcpy(remcom_out_buffer, "OK");
+ strscpy(remcom_out_buffer, "OK");
}
/* Handle the 'D' or 'k', detach or kill packets */
@@ -656,7 +657,7 @@ static void gdb_cmd_detachkill(struct kgdb_state *ks)
if (error < 0) {
error_packet(remcom_out_buffer, error);
} else {
- strcpy(remcom_out_buffer, "OK");
+ strscpy(remcom_out_buffer, "OK");
kgdb_connected = 0;
}
put_packet(remcom_out_buffer);
@@ -676,7 +677,7 @@ static int gdb_cmd_reboot(struct kgdb_state *ks)
/* For now, only honor R0 */
if (strcmp(remcom_in_buffer, "R0") == 0) {
printk(KERN_CRIT "Executing emergency reboot\n");
- strcpy(remcom_out_buffer, "OK");
+ strscpy(remcom_out_buffer, "OK");
put_packet(remcom_out_buffer);
/*
@@ -739,7 +740,7 @@ static void gdb_cmd_query(struct kgdb_state *ks)
case 'C':
/* Current thread id */
- strcpy(remcom_out_buffer, "QC");
+ strscpy(remcom_out_buffer, "QC");
ks->threadid = shadow_pid(current->pid);
int_to_threadref(thref, ks->threadid);
pack_threadid(remcom_out_buffer + 2, thref);
@@ -773,7 +774,7 @@ static void gdb_cmd_query(struct kgdb_state *ks)
int len = strlen(remcom_in_buffer + 6);
if ((len % 2) != 0) {
- strcpy(remcom_out_buffer, "E01");
+ strscpy(remcom_out_buffer, "E01");
break;
}
kgdb_hex2mem(remcom_in_buffer + 6,
@@ -785,14 +786,14 @@ static void gdb_cmd_query(struct kgdb_state *ks)
kdb_parse(remcom_out_buffer);
kdb_common_deinit_state();
- strcpy(remcom_out_buffer, "OK");
+ strscpy(remcom_out_buffer, "OK");
}
break;
#endif
#ifdef CONFIG_HAVE_ARCH_KGDB_QXFER_PKT
case 'S':
if (!strncmp(remcom_in_buffer, "qSupported:", 11))
- strcpy(remcom_out_buffer, kgdb_arch_gdb_stub_feature);
+ strscpy(remcom_out_buffer, kgdb_arch_gdb_stub_feature);
break;
case 'X':
if (!strncmp(remcom_in_buffer, "qXfer:", 6))
@@ -822,7 +823,7 @@ static void gdb_cmd_task(struct kgdb_state *ks)
}
kgdb_usethread = thread;
ks->kgdb_usethreadid = ks->threadid;
- strcpy(remcom_out_buffer, "OK");
+ strscpy(remcom_out_buffer, "OK");
break;
case 'c':
ptr = &remcom_in_buffer[2];
@@ -837,7 +838,7 @@ static void gdb_cmd_task(struct kgdb_state *ks)
}
kgdb_contthread = thread;
}
- strcpy(remcom_out_buffer, "OK");
+ strscpy(remcom_out_buffer, "OK");
break;
}
}
@@ -851,7 +852,7 @@ static void gdb_cmd_thread(struct kgdb_state *ks)
kgdb_hex2long(&ptr, &ks->threadid);
thread = getthread(ks->linux_regs, ks->threadid);
if (thread)
- strcpy(remcom_out_buffer, "OK");
+ strscpy(remcom_out_buffer, "OK");
else
error_packet(remcom_out_buffer, -EINVAL);
}
@@ -913,7 +914,7 @@ static void gdb_cmd_break(struct kgdb_state *ks)
(int) length, *bpt_type - '0');
if (error == 0)
- strcpy(remcom_out_buffer, "OK");
+ strscpy(remcom_out_buffer, "OK");
else
error_packet(remcom_out_buffer, error);
}
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 372025cf1ca3..c0c2072f5452 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -460,13 +460,15 @@ static int kdb_bc(int argc, const char **argv)
break;
case KDBCMD_BE:
+ if (bp->bp_enabled)
+ break;
+
bp->bp_enabled = 1;
kdb_printf("Breakpoint %d at "
- kdb_bfd_vma_fmt " enabled",
+ kdb_bfd_vma_fmt " enabled\n",
i, bp->bp_addr);
- kdb_printf("\n");
break;
case KDBCMD_BD:
if (!bp->bp_enabled)
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 10b454554ab0..137ba73f56fc 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -144,7 +144,7 @@ kdb_bt(int argc, const char **argv)
kdb_ps_suppressed();
/* Run the active tasks first */
for_each_online_cpu(cpu) {
- p = kdb_curr_task(cpu);
+ p = curr_task(cpu);
if (kdb_bt1(p, mask, btaprompt))
return 0;
}
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 6735ac36b718..61c1690058ed 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -9,7 +9,6 @@
* Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
*/
-#include <linux/module.h>
#include <linux/types.h>
#include <linux/ctype.h>
#include <linux/kernel.h>
@@ -132,6 +131,7 @@ char kdb_getchar(void)
int escape_delay = 0;
get_char_func *f, *f_prev = NULL;
int key;
+ static bool last_char_was_cr;
for (f = &kdb_poll_funcs[0]; ; ++f) {
if (*f == NULL) {
@@ -151,6 +151,18 @@ char kdb_getchar(void)
}
/*
+ * The caller expects that newlines are either CR or LF. However
+ * some terminals send _both_ CR and LF. Avoid having to handle
+ * this in the caller by stripping the LF if we saw a CR right
+ * before.
+ */
+ if (last_char_was_cr && key == '\n') {
+ last_char_was_cr = false;
+ continue;
+ }
+ last_char_was_cr = (key == '\r');
+
+ /*
* When the first character is received (or we get a change
* input source) we set ourselves up to handle an escape
* sequences (just in case).
@@ -172,6 +184,33 @@ char kdb_getchar(void)
unreachable();
}
+/**
+ * kdb_position_cursor() - Place cursor in the correct horizontal position
+ * @prompt: Nil-terminated string containing the prompt string
+ * @buffer: Nil-terminated string containing the entire command line
+ * @cp: Cursor position, pointer the character in buffer where the cursor
+ * should be positioned.
+ *
+ * The cursor is positioned by sending a carriage-return and then printing
+ * the content of the line until we reach the correct cursor position.
+ *
+ * There is some additional fine detail here.
+ *
+ * Firstly, even though kdb_printf() will correctly format zero-width fields
+ * we want the second call to kdb_printf() to be conditional. That keeps things
+ * a little cleaner when LOGGING=1.
+ *
+ * Secondly, we can't combine everything into one call to kdb_printf() since
+ * that renders into a fixed length buffer and the combined print could result
+ * in unwanted truncation.
+ */
+static void kdb_position_cursor(char *prompt, char *buffer, char *cp)
+{
+ kdb_printf("\r%s", prompt);
+ if (cp > buffer)
+ kdb_printf("%.*s", (int)(cp - buffer), buffer);
+}
+
/*
* kdb_read
*
@@ -208,8 +247,7 @@ static char *kdb_read(char *buffer, size_t bufsize)
int count;
int i;
int diag, dtab_count;
- int key, buf_size, ret;
-
+ int key, ret;
diag = kdbgetintenv("DTABCOUNT", &dtab_count);
if (diag)
@@ -231,21 +269,15 @@ poll_again:
switch (key) {
case 8: /* backspace */
if (cp > buffer) {
- if (cp < lastchar) {
- memcpy(tmpbuffer, cp, lastchar - cp);
- memcpy(cp-1, tmpbuffer, lastchar - cp);
- }
- *(--lastchar) = '\0';
- --cp;
- kdb_printf("\b%s \r", cp);
- tmp = *cp;
- *cp = '\0';
- kdb_printf(kdb_prompt_str);
- kdb_printf("%s", buffer);
- *cp = tmp;
+ memmove(cp-1, cp, lastchar - cp + 1);
+ lastchar--;
+ cp--;
+ kdb_printf("\b%s ", cp);
+ kdb_position_cursor(kdb_prompt_str, buffer, cp);
}
break;
- case 13: /* enter */
+ case 10: /* linefeed */
+ case 13: /* carriage return */
*lastchar++ = '\n';
*lastchar++ = '\0';
if (!KDB_STATE(KGDB_TRANS)) {
@@ -256,22 +288,16 @@ poll_again:
return buffer;
case 4: /* Del */
if (cp < lastchar) {
- memcpy(tmpbuffer, cp+1, lastchar - cp - 1);
- memcpy(cp, tmpbuffer, lastchar - cp - 1);
- *(--lastchar) = '\0';
- kdb_printf("%s \r", cp);
- tmp = *cp;
- *cp = '\0';
- kdb_printf(kdb_prompt_str);
- kdb_printf("%s", buffer);
- *cp = tmp;
+ memmove(cp, cp+1, lastchar - cp);
+ lastchar--;
+ kdb_printf("%s ", cp);
+ kdb_position_cursor(kdb_prompt_str, buffer, cp);
}
break;
case 1: /* Home */
if (cp > buffer) {
- kdb_printf("\r");
- kdb_printf(kdb_prompt_str);
cp = buffer;
+ kdb_position_cursor(kdb_prompt_str, buffer, cp);
}
break;
case 5: /* End */
@@ -287,11 +313,10 @@ poll_again:
}
break;
case 14: /* Down */
- memset(tmpbuffer, ' ',
- strlen(kdb_prompt_str) + (lastchar-buffer));
- *(tmpbuffer+strlen(kdb_prompt_str) +
- (lastchar-buffer)) = '\0';
- kdb_printf("\r%s\r", tmpbuffer);
+ case 16: /* Up */
+ kdb_printf("\r%*c\r",
+ (int)(strlen(kdb_prompt_str) + (lastchar - buffer)),
+ ' ');
*lastchar = (char)key;
*(lastchar+1) = '\0';
return lastchar;
@@ -301,33 +326,19 @@ poll_again:
++cp;
}
break;
- case 16: /* Up */
- memset(tmpbuffer, ' ',
- strlen(kdb_prompt_str) + (lastchar-buffer));
- *(tmpbuffer+strlen(kdb_prompt_str) +
- (lastchar-buffer)) = '\0';
- kdb_printf("\r%s\r", tmpbuffer);
- *lastchar = (char)key;
- *(lastchar+1) = '\0';
- return lastchar;
case 9: /* Tab */
if (tab < 2)
++tab;
- p_tmp = buffer;
- while (*p_tmp == ' ')
- p_tmp++;
- if (p_tmp > cp)
- break;
- memcpy(tmpbuffer, p_tmp, cp-p_tmp);
- *(tmpbuffer + (cp-p_tmp)) = '\0';
- p_tmp = strrchr(tmpbuffer, ' ');
- if (p_tmp)
- ++p_tmp;
- else
- p_tmp = tmpbuffer;
- len = strlen(p_tmp);
- buf_size = sizeof(tmpbuffer) - (p_tmp - tmpbuffer);
- count = kallsyms_symbol_complete(p_tmp, buf_size);
+
+ tmp = *cp;
+ *cp = '\0';
+ p_tmp = strrchr(buffer, ' ');
+ p_tmp = (p_tmp ? p_tmp + 1 : buffer);
+ strscpy(tmpbuffer, p_tmp);
+ *cp = tmp;
+
+ len = strlen(tmpbuffer);
+ count = kallsyms_symbol_complete(tmpbuffer, sizeof(tmpbuffer));
if (tab == 2 && count > 0) {
kdb_printf("\n%d symbols are found.", count);
if (count > dtab_count) {
@@ -339,46 +350,51 @@ poll_again:
}
kdb_printf("\n");
for (i = 0; i < count; i++) {
- ret = kallsyms_symbol_next(p_tmp, i, buf_size);
+ ret = kallsyms_symbol_next(tmpbuffer, i, sizeof(tmpbuffer));
if (WARN_ON(!ret))
break;
if (ret != -E2BIG)
- kdb_printf("%s ", p_tmp);
+ kdb_printf("%s ", tmpbuffer);
else
- kdb_printf("%s... ", p_tmp);
- *(p_tmp + len) = '\0';
+ kdb_printf("%s... ", tmpbuffer);
+ tmpbuffer[len] = '\0';
}
if (i >= dtab_count)
kdb_printf("...");
kdb_printf("\n");
- kdb_printf(kdb_prompt_str);
+ kdb_printf("%s", kdb_prompt_str);
kdb_printf("%s", buffer);
+ if (cp != lastchar)
+ kdb_position_cursor(kdb_prompt_str, buffer, cp);
} else if (tab != 2 && count > 0) {
- len_tmp = strlen(p_tmp);
- strncpy(p_tmp+len_tmp, cp, lastchar-cp+1);
- len_tmp = strlen(p_tmp);
- strncpy(cp, p_tmp+len, len_tmp-len + 1);
- len = len_tmp - len;
- kdb_printf("%s", cp);
- cp += len;
- lastchar += len;
+ /* How many new characters do we want from tmpbuffer? */
+ len_tmp = strlen(tmpbuffer) - len;
+ if (lastchar + len_tmp >= bufend)
+ len_tmp = bufend - lastchar;
+
+ if (len_tmp) {
+ /* + 1 ensures the '\0' is memmove'd */
+ memmove(cp+len_tmp, cp, (lastchar-cp) + 1);
+ memcpy(cp, tmpbuffer+len, len_tmp);
+ kdb_printf("%s", cp);
+ cp += len_tmp;
+ lastchar += len_tmp;
+ if (cp != lastchar)
+ kdb_position_cursor(kdb_prompt_str,
+ buffer, cp);
+ }
}
kdb_nextline = 1; /* reset output line number */
break;
default:
if (key >= 32 && lastchar < bufend) {
if (cp < lastchar) {
- memcpy(tmpbuffer, cp, lastchar - cp);
- memcpy(cp+1, tmpbuffer, lastchar - cp);
- *++lastchar = '\0';
+ memmove(cp+1, cp, lastchar - cp + 1);
+ lastchar++;
*cp = key;
- kdb_printf("%s\r", cp);
+ kdb_printf("%s", cp);
++cp;
- tmp = *cp;
- *cp = '\0';
- kdb_printf(kdb_prompt_str);
- kdb_printf("%s", buffer);
- *cp = tmp;
+ kdb_position_cursor(kdb_prompt_str, buffer, cp);
} else {
*++lastchar = '\0';
*cp++ = key;
@@ -436,8 +452,8 @@ poll_again:
char *kdb_getstr(char *buffer, size_t bufsize, const char *prompt)
{
if (prompt && kdb_prompt_str != prompt)
- strscpy(kdb_prompt_str, prompt, CMD_BUFLEN);
- kdb_printf(kdb_prompt_str);
+ strscpy(kdb_prompt_str, prompt);
+ kdb_printf("%s", kdb_prompt_str);
kdb_nextline = 1; /* Prompt and input resets line number */
return kdb_read(buffer, bufsize);
}
@@ -546,6 +562,7 @@ static void kdb_msg_write(const char *msg, int msg_len)
{
struct console *c;
const char *cp;
+ int cookie;
int len;
if (msg_len == 0)
@@ -559,25 +576,57 @@ static void kdb_msg_write(const char *msg, int msg_len)
cp++;
}
- for_each_console(c) {
- if (!(c->flags & CON_ENABLED))
+ /*
+ * The console_srcu_read_lock() only provides safe console list
+ * traversal. The use of the ->write() callback relies on all other
+ * CPUs being stopped at the moment and console drivers being able to
+ * handle reentrance when @oops_in_progress is set.
+ *
+ * There is no guarantee that every console driver can handle
+ * reentrance in this way; the developer deploying the debugger
+ * is responsible for ensuring that the console drivers they
+ * have selected handle reentrance appropriately.
+ */
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
+ short flags = console_srcu_read_flags(c);
+
+ if (!console_is_usable(c, flags, true))
continue;
if (c == dbg_io_ops->cons)
continue;
- /*
- * Set oops_in_progress to encourage the console drivers to
- * disregard their internal spin locks: in the current calling
- * context the risk of deadlock is a bigger problem than risks
- * due to re-entering the console driver. We operate directly on
- * oops_in_progress rather than using bust_spinlocks() because
- * the calls bust_spinlocks() makes on exit are not appropriate
- * for this calling context.
- */
- ++oops_in_progress;
- c->write(c, msg, msg_len);
- --oops_in_progress;
+
+ if (flags & CON_NBCON) {
+ struct nbcon_write_context wctxt = { };
+
+ /*
+ * Do not continue if the console is NBCON and the context
+ * can't be acquired.
+ */
+ if (!nbcon_kdb_try_acquire(c, &wctxt))
+ continue;
+
+ nbcon_write_context_set_buf(&wctxt, (char *)msg, msg_len);
+
+ c->write_atomic(c, &wctxt);
+ nbcon_kdb_release(&wctxt);
+ } else {
+ /*
+ * Set oops_in_progress to encourage the console drivers to
+ * disregard their internal spin locks: in the current calling
+ * context the risk of deadlock is a bigger problem than risks
+ * due to re-entering the console driver. We operate directly on
+ * oops_in_progress rather than using bust_spinlocks() because
+ * the calls bust_spinlocks() makes on exit are not appropriate
+ * for this calling context.
+ */
+ ++oops_in_progress;
+ c->write(c, msg, msg_len);
+ --oops_in_progress;
+ }
touch_nmi_watchdog();
}
+ console_srcu_read_unlock(cookie);
}
int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
@@ -682,8 +731,8 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
* it, depending on the results of the search.
*/
cp++; /* to byte after the newline */
- replaced_byte = *cp; /* remember what/where it was */
- cphold = cp;
+ replaced_byte = *cp; /* remember what it was */
+ cphold = cp; /* remember where it was */
*cp = '\0'; /* end the string for our search */
/*
@@ -700,8 +749,9 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
* Shift the buffer left.
*/
*cphold = replaced_byte;
- strcpy(kdb_buffer, cphold);
- len = strlen(kdb_buffer);
+ len = strlen(cphold);
+ /* Use memmove() because the buffers overlap */
+ memmove(kdb_buffer, cphold, len + 1);
next_avail = kdb_buffer + len;
size_avail = sizeof(kdb_buffer) - len;
goto kdb_print_out;
@@ -840,8 +890,9 @@ kdb_printit:
*/
if (kdb_grepping_flag && !suspend_grep) {
*cphold = replaced_byte;
- strcpy(kdb_buffer, cphold);
- len = strlen(kdb_buffer);
+ len = strlen(cphold);
+ /* Use memmove() because the buffers overlap */
+ memmove(kdb_buffer, cphold, len + 1);
next_avail = kdb_buffer + len;
size_avail = sizeof(kdb_buffer) - len;
}
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
index f877a0a0d7cf..386d30e530b7 100644
--- a/kernel/debug/kdb/kdb_keyboard.c
+++ b/kernel/debug/kdb/kdb_keyboard.c
@@ -11,9 +11,10 @@
#include <linux/kdb.h>
#include <linux/keyboard.h>
#include <linux/ctype.h>
-#include <linux/module.h>
#include <linux/io.h>
+#include "kdb_private.h"
+
/* Keyboard Controller Registers on normal PCs. */
#define KBD_STATUS_REG 0x64 /* Status register (R) */
@@ -24,6 +25,8 @@
#define KBD_STAT_OBF 0x01 /* Keyboard output buffer full */
#define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */
+#define CTRL(c) ((c) - 64)
+
static int kbd_exists;
static int kbd_last_ret;
@@ -122,29 +125,26 @@ int kdb_get_kbd_char(void)
return 8;
}
- /* Special Key */
+ /* Translate special keys to equivalent CTRL control characters */
switch (scancode) {
case 0xF: /* Tab */
- return 9;
+ return CTRL('I');
case 0x53: /* Del */
- return 4;
+ return CTRL('D');
case 0x47: /* Home */
- return 1;
+ return CTRL('A');
case 0x4F: /* End */
- return 5;
+ return CTRL('E');
case 0x4B: /* Left */
- return 2;
+ return CTRL('B');
case 0x48: /* Up */
- return 16;
+ return CTRL('P');
case 0x50: /* Down */
- return 14;
+ return CTRL('N');
case 0x4D: /* Right */
- return 6;
+ return CTRL('F');
}
- if (scancode == 0xe0)
- return -1;
-
/*
* For Japanese 86/106 keyboards
* See comment in drivers/char/pc_keyb.c.
@@ -171,6 +171,19 @@ int kdb_get_kbd_char(void)
switch (KTYP(keychar)) {
case KT_LETTER:
case KT_LATIN:
+ switch (keychar) {
+ /* non-printable supported control characters */
+ case CTRL('A'): /* Home */
+ case CTRL('B'): /* Left */
+ case CTRL('D'): /* Del */
+ case CTRL('E'): /* End */
+ case CTRL('F'): /* Right */
+ case CTRL('I'): /* Tab */
+ case CTRL('N'): /* Down */
+ case CTRL('P'): /* Up */
+ return keychar;
+ }
+
if (isprint(keychar))
break; /* printable characters */
fallthrough;
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 0852a537dad4..dddf2b5aad57 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -25,8 +25,6 @@
#include <linux/smp.h>
#include <linux/utsname.h>
#include <linux/vmalloc.h>
-#include <linux/atomic.h>
-#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/mm.h>
#include <linux/init.h>
@@ -45,6 +43,7 @@
#include <linux/proc_fs.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
+#include <linux/security.h>
#include "kdb_private.h"
#undef MODULE_PARAM_PREFIX
@@ -105,7 +104,7 @@ static kdbmsg_t kdbmsgs[] = {
KDBMSG(NOENVVALUE, "Environment variable should have value"),
KDBMSG(NOTIMP, "Command not implemented"),
KDBMSG(ENVFULL, "Environment full"),
- KDBMSG(ENVBUFFULL, "Environment buffer full"),
+ KDBMSG(KMALLOCFAILED, "Failed to allocate memory"),
KDBMSG(TOOMANYBPT, "Too many breakpoints defined"),
#ifdef CONFIG_CPU_XSCALE
KDBMSG(TOOMANYDBREGS, "More breakpoints than ibcr registers defined"),
@@ -130,13 +129,9 @@ static const int __nkdb_err = ARRAY_SIZE(kdbmsgs);
/*
- * Initial environment. This is all kept static and local to
- * this file. We don't want to rely on the memory allocation
- * mechanisms in the kernel, so we use a very limited allocate-only
- * heap for new and altered environment variables. The entire
- * environment is limited to a fixed number of entries (add more
- * to __env[] if required) and a fixed amount of heap (add more to
- * KDB_ENVBUFSIZE if required).
+ * Initial environment. This is all kept static and local to this file.
+ * The entire environment is limited to a fixed number of entries
+ * (add more to __env[] if required)
*/
static char *__env[31] = {
@@ -155,21 +150,63 @@ static char *__env[31] = {
static const int __nenv = ARRAY_SIZE(__env);
-struct task_struct *kdb_curr_task(int cpu)
+/*
+ * Update the permissions flags (kdb_cmd_enabled) to match the
+ * current lockdown state.
+ *
+ * Within this function the calls to security_locked_down() are "lazy". We
+ * avoid calling them if the current value of kdb_cmd_enabled already excludes
+ * flags that might be subject to lockdown. Additionally we deliberately check
+ * the lockdown flags independently (even though read lockdown implies write
+ * lockdown) since that results in both simpler code and clearer messages to
+ * the user on first-time debugger entry.
+ *
+ * The permission masks during a read+write lockdown permits the following
+ * flags: INSPECT, SIGNAL, REBOOT (and ALWAYS_SAFE).
+ *
+ * The INSPECT commands are not blocked during lockdown because they are
+ * not arbitrary memory reads. INSPECT covers the backtrace family (sometimes
+ * forcing them to have no arguments) and lsmod. These commands do expose
+ * some kernel state but do not allow the developer seated at the console to
+ * choose what state is reported. SIGNAL and REBOOT should not be controversial,
+ * given these are allowed for root during lockdown already.
+ */
+static void kdb_check_for_lockdown(void)
{
- struct task_struct *p = curr_task(cpu);
-#ifdef _TIF_MCA_INIT
- if ((task_thread_info(p)->flags & _TIF_MCA_INIT) && KDB_TSK(cpu))
- p = krp->p;
-#endif
- return p;
+ const int write_flags = KDB_ENABLE_MEM_WRITE |
+ KDB_ENABLE_REG_WRITE |
+ KDB_ENABLE_FLOW_CTRL;
+ const int read_flags = KDB_ENABLE_MEM_READ |
+ KDB_ENABLE_REG_READ;
+
+ bool need_to_lockdown_write = false;
+ bool need_to_lockdown_read = false;
+
+ if (kdb_cmd_enabled & (KDB_ENABLE_ALL | write_flags))
+ need_to_lockdown_write =
+ security_locked_down(LOCKDOWN_DBG_WRITE_KERNEL);
+
+ if (kdb_cmd_enabled & (KDB_ENABLE_ALL | read_flags))
+ need_to_lockdown_read =
+ security_locked_down(LOCKDOWN_DBG_READ_KERNEL);
+
+ /* De-compose KDB_ENABLE_ALL if required */
+ if (need_to_lockdown_write || need_to_lockdown_read)
+ if (kdb_cmd_enabled & KDB_ENABLE_ALL)
+ kdb_cmd_enabled = KDB_ENABLE_MASK & ~KDB_ENABLE_ALL;
+
+ if (need_to_lockdown_write)
+ kdb_cmd_enabled &= ~write_flags;
+
+ if (need_to_lockdown_read)
+ kdb_cmd_enabled &= ~read_flags;
}
/*
- * Check whether the flags of the current command and the permissions
- * of the kdb console has allow a command to be run.
+ * Check whether the flags of the current command, the permissions of the kdb
+ * console and the lockdown state allow a command to be run.
*/
-static inline bool kdb_check_flags(kdb_cmdflags_t flags, int permissions,
+static bool kdb_check_flags(kdb_cmdflags_t flags, int permissions,
bool no_args)
{
/* permissions comes from userspace so needs massaging slightly */
@@ -217,36 +254,6 @@ char *kdbgetenv(const char *match)
}
/*
- * kdballocenv - This function is used to allocate bytes for
- * environment entries.
- * Parameters:
- * match A character string representing a numeric value
- * Outputs:
- * *value the unsigned long representation of the env variable 'match'
- * Returns:
- * Zero on success, a kdb diagnostic on failure.
- * Remarks:
- * We use a static environment buffer (envbuffer) to hold the values
- * of dynamically generated environment variables (see kdb_set). Buffer
- * space once allocated is never free'd, so over time, the amount of space
- * (currently 512 bytes) will be exhausted if env variables are changed
- * frequently.
- */
-static char *kdballocenv(size_t bytes)
-{
-#define KDB_ENVBUFSIZE 512
- static char envbuffer[KDB_ENVBUFSIZE];
- static int envbufsize;
- char *ep = NULL;
-
- if ((KDB_ENVBUFSIZE - envbufsize) >= bytes) {
- ep = &envbuffer[envbufsize];
- envbufsize += bytes;
- }
- return ep;
-}
-
-/*
* kdbgetulenv - This function will return the value of an unsigned
* long-valued environment variable.
* Parameters:
@@ -265,8 +272,8 @@ static int kdbgetulenv(const char *match, unsigned long *value)
return KDB_NOTENV;
if (strlen(ep) == 0)
return KDB_NOENVVALUE;
-
- *value = simple_strtoul(ep, NULL, 0);
+ if (kstrtoul(ep, 0, value))
+ return KDB_BADINT;
return 0;
}
@@ -307,9 +314,9 @@ static int kdb_setenv(const char *var, const char *val)
varlen = strlen(var);
vallen = strlen(val);
- ep = kdballocenv(varlen + vallen + 2);
- if (ep == (char *)0)
- return KDB_ENVBUFFULL;
+ ep = kmalloc(varlen + vallen + 2, GFP_KDB);
+ if (!ep)
+ return KDB_KMALLOCFAILED;
sprintf(ep, "%s=%s", var, val);
@@ -318,6 +325,7 @@ static int kdb_setenv(const char *var, const char *val)
&& ((strncmp(__env[i], var, varlen) == 0)
&& ((__env[i][varlen] == '\0')
|| (__env[i][varlen] == '=')))) {
+ kfree_const(__env[i]);
__env[i] = ep;
return 0;
}
@@ -361,42 +369,15 @@ static void kdb_printenv(void)
*/
int kdbgetularg(const char *arg, unsigned long *value)
{
- char *endp;
- unsigned long val;
-
- val = simple_strtoul(arg, &endp, 0);
-
- if (endp == arg) {
- /*
- * Also try base 16, for us folks too lazy to type the
- * leading 0x...
- */
- val = simple_strtoul(arg, &endp, 16);
- if (endp == arg)
- return KDB_BADINT;
- }
-
- *value = val;
-
+ if (kstrtoul(arg, 0, value))
+ return KDB_BADINT;
return 0;
}
int kdbgetu64arg(const char *arg, u64 *value)
{
- char *endp;
- u64 val;
-
- val = simple_strtoull(arg, &endp, 0);
-
- if (endp == arg) {
-
- val = simple_strtoull(arg, &endp, 16);
- if (endp == arg)
- return KDB_BADINT;
- }
-
- *value = val;
-
+ if (kstrtou64(arg, 0, value))
+ return KDB_BADINT;
return 0;
}
@@ -432,10 +413,10 @@ int kdb_set(int argc, const char **argv)
*/
if (strcmp(argv[1], "KDBDEBUG") == 0) {
unsigned int debugflags;
- char *cp;
+ int ret;
- debugflags = simple_strtoul(argv[2], &cp, 0);
- if (cp == argv[2] || debugflags & ~KDB_DEBUG_FLAG_MASK) {
+ ret = kstrtouint(argv[2], 0, &debugflags);
+ if (ret || debugflags & ~KDB_DEBUG_FLAG_MASK) {
kdb_printf("kdb: illegal debug flags '%s'\n",
argv[2]);
return 0;
@@ -740,20 +721,12 @@ static int kdb_defcmd(int argc, const char **argv)
mp->name = kdb_strdup(argv[1], GFP_KDB);
if (!mp->name)
goto fail_name;
- mp->usage = kdb_strdup(argv[2], GFP_KDB);
+ mp->usage = kdb_strdup_dequote(argv[2], GFP_KDB);
if (!mp->usage)
goto fail_usage;
- mp->help = kdb_strdup(argv[3], GFP_KDB);
+ mp->help = kdb_strdup_dequote(argv[3], GFP_KDB);
if (!mp->help)
goto fail_help;
- if (mp->usage[0] == '"') {
- strcpy(mp->usage, argv[2]+1);
- mp->usage[strlen(mp->usage)-1] = '\0';
- }
- if (mp->help[0] == '"') {
- strcpy(mp->help, argv[3]+1);
- mp->help[strlen(mp->help)-1] = '\0';
- }
INIT_LIST_HEAD(&kdb_macro->statements);
defcmd_in_progress = true;
@@ -879,7 +852,7 @@ static void parse_grep(const char *str)
kdb_printf("search string too long\n");
return;
}
- strcpy(kdb_grep_string, cp);
+ memcpy(kdb_grep_string, cp, len + 1);
kdb_grepping_flag++;
return;
}
@@ -1177,9 +1150,12 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
char *cmdbuf;
int diag;
struct task_struct *kdb_current =
- kdb_curr_task(raw_smp_processor_id());
+ curr_task(raw_smp_processor_id());
KDB_DEBUG_STATE("kdb_local 1", reason);
+
+ kdb_check_for_lockdown();
+
kdb_go_count = 0;
if (reason == KDB_REASON_DEBUG) {
/* special case below */
@@ -1294,8 +1270,6 @@ do_full_getstr:
/* PROMPT can only be set if we have MEM_READ permission. */
snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"),
raw_smp_processor_id());
- if (defcmd_in_progress)
- strncat(kdb_prompt_str, "[defcmd]", CMD_BUFLEN);
/*
* Fetch command from keyboard
@@ -1577,10 +1551,10 @@ static int kdb_md(int argc, const char **argv)
if (!argv[0][3])
valid = 1;
else if (argv[0][3] == 'c' && argv[0][4]) {
- char *p;
- repeat = simple_strtoul(argv[0] + 4, &p, 10);
+ if (kstrtouint(argv[0] + 4, 10, &repeat))
+ return KDB_BADINT;
mdcount = ((repeat * bytesperword) + 15) / 16;
- valid = !*p;
+ valid = 1;
}
last_repeat = repeat;
} else if (strcmp(argv[0], "md") == 0)
@@ -2004,54 +1978,6 @@ static int kdb_ef(int argc, const char **argv)
return 0;
}
-#if defined(CONFIG_MODULES)
-/*
- * kdb_lsmod - This function implements the 'lsmod' command. Lists
- * currently loaded kernel modules.
- * Mostly taken from userland lsmod.
- */
-static int kdb_lsmod(int argc, const char **argv)
-{
- struct module *mod;
-
- if (argc != 0)
- return KDB_ARGCOUNT;
-
- kdb_printf("Module Size modstruct Used by\n");
- list_for_each_entry(mod, kdb_modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
-
- kdb_printf("%-20s%8u 0x%px ", mod->name,
- mod->core_layout.size, (void *)mod);
-#ifdef CONFIG_MODULE_UNLOAD
- kdb_printf("%4d ", module_refcount(mod));
-#endif
- if (mod->state == MODULE_STATE_GOING)
- kdb_printf(" (Unloading)");
- else if (mod->state == MODULE_STATE_COMING)
- kdb_printf(" (Loading)");
- else
- kdb_printf(" (Live)");
- kdb_printf(" 0x%px", mod->core_layout.base);
-
-#ifdef CONFIG_MODULE_UNLOAD
- {
- struct module_use *use;
- kdb_printf(" [ ");
- list_for_each_entry(use, &mod->source_list,
- source_list)
- kdb_printf("%s ", use->target->name);
- kdb_printf("]\n");
- }
-#endif
- }
-
- return 0;
-}
-
-#endif /* CONFIG_MODULES */
-
/*
* kdb_env - This function implements the 'env' command. Display the
* current environment variables.
@@ -2089,15 +2015,10 @@ static int kdb_dmesg(int argc, const char **argv)
if (argc > 2)
return KDB_ARGCOUNT;
if (argc) {
- char *cp;
- lines = simple_strtol(argv[1], &cp, 0);
- if (*cp)
+ if (kstrtoint(argv[1], 0, &lines))
lines = 0;
- if (argc > 1) {
- adjust = simple_strtoul(argv[2], &cp, 0);
- if (*cp || adjust < 0)
- adjust = 0;
- }
+ if (argc > 1 && (kstrtoint(argv[2], 0, &adjust) || adjust < 0))
+ adjust = 0;
}
/* disable LOGGING if set */
@@ -2157,32 +2078,6 @@ static int kdb_dmesg(int argc, const char **argv)
return 0;
}
#endif /* CONFIG_PRINTK */
-
-/* Make sure we balance enable/disable calls, must disable first. */
-static atomic_t kdb_nmi_disabled;
-
-static int kdb_disable_nmi(int argc, const char *argv[])
-{
- if (atomic_read(&kdb_nmi_disabled))
- return 0;
- atomic_set(&kdb_nmi_disabled, 1);
- arch_kgdb_ops.enable_nmi(0);
- return 0;
-}
-
-static int kdb_param_enable_nmi(const char *val, const struct kernel_param *kp)
-{
- if (!atomic_add_unless(&kdb_nmi_disabled, -1, 0))
- return -EINVAL;
- arch_kgdb_ops.enable_nmi(1);
- return 0;
-}
-
-static const struct kernel_param_ops kdb_param_ops_enable_nmi = {
- .set = kdb_param_enable_nmi,
-};
-module_param_cb(enable_nmi, &kdb_param_ops_enable_nmi, NULL, 0600);
-
/*
* kdb_cpu - This function implements the 'cpu' command.
* cpu [<cpunum>]
@@ -2274,7 +2169,7 @@ void kdb_ps_suppressed(void)
unsigned long cpu;
const struct task_struct *p, *g;
for_each_online_cpu(cpu) {
- p = kdb_curr_task(cpu);
+ p = curr_task(cpu);
if (kdb_task_state(p, "-"))
++idle;
}
@@ -2310,7 +2205,7 @@ void kdb_ps1(const struct task_struct *p)
kdb_task_has_cpu(p), kdb_process_cpu(p),
kdb_task_state_char(p),
(void *)(&p->thread),
- p == kdb_curr_task(raw_smp_processor_id()) ? '*' : ' ',
+ p == curr_task(raw_smp_processor_id()) ? '*' : ' ',
p->comm);
if (kdb_task_has_cpu(p)) {
if (!KDB_TSK(cpu)) {
@@ -2346,7 +2241,7 @@ static int kdb_ps(int argc, const char **argv)
for_each_online_cpu(cpu) {
if (KDB_FLAG(CMD_INTERRUPT))
return 0;
- p = kdb_curr_task(cpu);
+ p = curr_task(cpu);
if (kdb_task_state(p, mask))
kdb_ps1(p);
}
@@ -2434,14 +2329,12 @@ static int kdb_help(int argc, const char **argv)
static int kdb_kill(int argc, const char **argv)
{
long sig, pid;
- char *endp;
struct task_struct *p;
if (argc != 2)
return KDB_ARGCOUNT;
- sig = simple_strtol(argv[1], &endp, 0);
- if (*endp)
+ if (kstrtol(argv[1], 0, &sig))
return KDB_BADINT;
if ((sig >= 0) || !valid_signal(-sig)) {
kdb_printf("Invalid signal parameter.<-signal>\n");
@@ -2449,8 +2342,7 @@ static int kdb_kill(int argc, const char **argv)
}
sig = -sig;
- pid = simple_strtol(argv[2], &endp, 0);
- if (*endp)
+ if (kstrtol(argv[2], 0, &pid))
return KDB_BADINT;
if (pid <= 0) {
kdb_printf("Process ID must be large than 0.\n");
@@ -2513,7 +2405,7 @@ static int kdb_summary(int argc, const char **argv)
if (val.uptime > (24*60*60)) {
int days = val.uptime / (24*60*60);
val.uptime %= (24*60*60);
- kdb_printf("%d day%s ", days, days == 1 ? "" : "s");
+ kdb_printf("%d day%s ", days, str_plural(days));
}
kdb_printf("%02ld:%02ld\n", val.uptime/(60*60), (val.uptime/60)%60);
@@ -2877,20 +2769,10 @@ static kdbtab_t maintab[] = {
},
};
-static kdbtab_t nmicmd = {
- .name = "disable_nmi",
- .func = kdb_disable_nmi,
- .usage = "",
- .help = "Disable NMI entry to KDB",
- .flags = KDB_ENABLE_ALWAYS_SAFE,
-};
-
/* Initialize the kdb command table. */
static void __init kdb_inittab(void)
{
kdb_register_table(maintab, ARRAY_SIZE(maintab));
- if (arch_kgdb_ops.enable_nmi)
- kdb_register_table(&nmicmd, 1);
}
/* Execute any commands defined in kdb_cmds. */
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 0d2f9feea0a4..a2fc7d2bc9fc 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -110,6 +110,7 @@ extern int kdbgetaddrarg(int, const char **, int*, unsigned long *,
extern int kdbgetsymval(const char *, kdb_symtab_t *);
extern int kdbnearsym(unsigned long, kdb_symtab_t *);
extern char *kdb_strdup(const char *str, gfp_t type);
+extern char *kdb_strdup_dequote(const char *str, gfp_t type);
extern void kdb_symbol_print(unsigned long, const kdb_symtab_t *, unsigned int);
/* Routine for debugging the debugger state. */
@@ -194,7 +195,6 @@ extern char kdb_task_state_char (const struct task_struct *);
extern bool kdb_task_state(const struct task_struct *p, const char *mask);
extern void kdb_ps_suppressed(void);
extern void kdb_ps1(const struct task_struct *p);
-extern void kdb_send_sig(struct task_struct *p, int sig);
extern char kdb_getchar(void);
extern char *kdb_getstr(char *, size_t, const char *);
extern void kdb_gdb_state_pass(char *buf);
@@ -211,8 +211,6 @@ extern void kdb_gdb_state_pass(char *buf);
#define KDB_TSK(cpu) kgdb_info[cpu].task
#define KDB_TSKREGS(cpu) kgdb_info[cpu].debuggerinfo
-extern struct task_struct *kdb_curr_task(int);
-
#define kdb_task_has_cpu(p) (task_curr(p))
#define GFP_KDB (in_dbg_master() ? GFP_ATOMIC : GFP_KERNEL)
@@ -226,10 +224,6 @@ extern void kdb_kbd_cleanup_state(void);
#define kdb_kbd_cleanup_state()
#endif /* ! CONFIG_KDB_KEYBOARD */
-#ifdef CONFIG_MODULES
-extern struct list_head *kdb_modules;
-#endif /* CONFIG_MODULES */
-
extern char kdb_prompt_str[];
#define KDB_WORD_SIZE ((int)sizeof(unsigned long))
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index df2bface866e..56f7b906e7cc 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -17,13 +17,13 @@
#include <linux/stddef.h>
#include <linux/vmalloc.h>
#include <linux/ptrace.h>
-#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/hardirq.h>
#include <linux/delay.h>
#include <linux/uaccess.h>
#include <linux/kdb.h>
#include <linux/slab.h>
+#include <linux/string.h>
#include <linux/ctype.h>
#include "kdb_private.h"
@@ -247,11 +247,41 @@ void kdb_symbol_print(unsigned long addr, const kdb_symtab_t *symtab_p,
*/
char *kdb_strdup(const char *str, gfp_t type)
{
- int n = strlen(str)+1;
+ size_t n = strlen(str) + 1;
char *s = kmalloc(n, type);
if (!s)
return NULL;
- return strcpy(s, str);
+ memcpy(s, str, n);
+ return s;
+}
+
+/*
+ * kdb_strdup_dequote - same as kdb_strdup(), but trims surrounding quotes from
+ * the input string if present.
+ * Remarks:
+ * Quotes are only removed if there is both a leading and a trailing quote.
+ */
+char *kdb_strdup_dequote(const char *str, gfp_t type)
+{
+ size_t len = strlen(str);
+ char *s;
+
+ if (str[0] == '"' && len > 1 && str[len - 1] == '"') {
+ /* trim both leading and trailing quotes */
+ str++;
+ len -= 2;
+ }
+
+ len++; /* add space for NUL terminator */
+
+ s = kmalloc(len, type);
+ if (!s)
+ return NULL;
+
+ memcpy(s, str, len - 1);
+ s[len - 1] = '\0';
+
+ return s;
}
/*
@@ -291,7 +321,7 @@ int kdb_getarea_size(void *res, unsigned long addr, size_t size)
*/
int kdb_putarea_size(unsigned long addr, void *res, size_t size)
{
- int ret = copy_from_kernel_nofault((char *)addr, (char *)res, size);
+ int ret = copy_to_kernel_nofault((char *)addr, (char *)res, size);
if (ret) {
if (!KDB_STATE(SUPPRESS)) {
kdb_func_printf("Bad address 0x%lx\n", addr);
@@ -306,7 +336,7 @@ int kdb_putarea_size(unsigned long addr, void *res, size_t size)
/*
* kdb_getphys - Read data from a physical address. Validate the
- * address is in range, use kmap_atomic() to get data
+ * address is in range, use kmap_local_page() to get data
* similar to kdb_getarea() - but for phys addresses
* Inputs:
* res Pointer to the word to receive the result
@@ -325,9 +355,9 @@ static int kdb_getphys(void *res, unsigned long addr, size_t size)
if (!pfn_valid(pfn))
return 1;
page = pfn_to_page(pfn);
- vaddr = kmap_atomic(page);
+ vaddr = kmap_local_page(page);
memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size);
- kunmap_atomic(vaddr);
+ kunmap_local(vaddr);
return 0;
}
@@ -537,21 +567,3 @@ bool kdb_task_state(const struct task_struct *p, const char *mask)
return strchr(mask, state);
}
-
-/* Maintain a small stack of kdb_flags to allow recursion without disturbing
- * the global kdb state.
- */
-
-static int kdb_flags_stack[4], kdb_flags_index;
-
-void kdb_save_flags(void)
-{
- BUG_ON(kdb_flags_index >= ARRAY_SIZE(kdb_flags_stack));
- kdb_flags_stack[kdb_flags_index++] = kdb_flags;
-}
-
-void kdb_restore_flags(void)
-{
- BUG_ON(kdb_flags_index <= 0);
- kdb_flags = kdb_flags_stack[--kdb_flags_index];
-}
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index c5e8cea9e05f..30e7912ebb0d 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -14,6 +14,15 @@
#include <linux/delayacct.h>
#include <linux/module.h>
+#define UPDATE_DELAY(type) \
+do { \
+ d->type##_delay_max = tsk->delays->type##_delay_max; \
+ d->type##_delay_min = tsk->delays->type##_delay_min; \
+ tmp = d->type##_delay_total + tsk->delays->type##_delay; \
+ d->type##_delay_total = (tmp < d->type##_delay_total) ? 0 : tmp; \
+ d->type##_count += tsk->delays->type##_count; \
+} while (0)
+
DEFINE_STATIC_KEY_FALSE(delayacct_key);
int delayacct_on __read_mostly; /* Delay accounting turned on/off */
struct kmem_cache *delayacct_cache;
@@ -44,7 +53,7 @@ void delayacct_init(void)
}
#ifdef CONFIG_PROC_SYSCTL
-int sysctl_delayacct(struct ctl_table *table, int write, void *buffer,
+static int sysctl_delayacct(const struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
int state = delayacct_on;
@@ -63,6 +72,25 @@ int sysctl_delayacct(struct ctl_table *table, int write, void *buffer,
set_delayacct(state);
return err;
}
+
+static const struct ctl_table kern_delayacct_table[] = {
+ {
+ .procname = "task_delayacct",
+ .data = NULL,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_delayacct,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+};
+
+static __init int kernel_delayacct_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_delayacct_table);
+ return 0;
+}
+late_initcall(kernel_delayacct_sysctls_init);
#endif
void __delayacct_tsk_init(struct task_struct *tsk)
@@ -74,9 +102,9 @@ void __delayacct_tsk_init(struct task_struct *tsk)
/*
* Finish delay accounting for a statistic using its timestamps (@start),
- * accumalator (@total) and @count
+ * accumulator (@total) and @count
*/
-static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count)
+static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count, u64 *max, u64 *min)
{
s64 ns = local_clock() - *start;
unsigned long flags;
@@ -85,6 +113,10 @@ static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *cou
raw_spin_lock_irqsave(lock, flags);
*total += ns;
(*count)++;
+ if (ns > *max)
+ *max = ns;
+ if (*min == 0 || ns < *min)
+ *min = ns;
raw_spin_unlock_irqrestore(lock, flags);
}
}
@@ -103,7 +135,9 @@ void __delayacct_blkio_end(struct task_struct *p)
delayacct_end(&p->delays->lock,
&p->delays->blkio_start,
&p->delays->blkio_delay,
- &p->delays->blkio_count);
+ &p->delays->blkio_count,
+ &p->delays->blkio_delay_max,
+ &p->delays->blkio_delay_min);
}
int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
@@ -134,10 +168,12 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
d->cpu_count += t1;
+ d->cpu_delay_max = tsk->sched_info.max_run_delay;
+ d->cpu_delay_min = tsk->sched_info.min_run_delay;
tmp = (s64)d->cpu_delay_total + t2;
d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;
-
tmp = (s64)d->cpu_run_virtual_total + t3;
+
d->cpu_run_virtual_total =
(tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp;
@@ -145,23 +181,14 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
return 0;
/* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
-
raw_spin_lock_irqsave(&tsk->delays->lock, flags);
- tmp = d->blkio_delay_total + tsk->delays->blkio_delay;
- d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
- tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
- d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
- tmp = d->freepages_delay_total + tsk->delays->freepages_delay;
- d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp;
- tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay;
- d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp;
- tmp = d->compact_delay_total + tsk->delays->compact_delay;
- d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp;
- d->blkio_count += tsk->delays->blkio_count;
- d->swapin_count += tsk->delays->swapin_count;
- d->freepages_count += tsk->delays->freepages_count;
- d->thrashing_count += tsk->delays->thrashing_count;
- d->compact_count += tsk->delays->compact_count;
+ UPDATE_DELAY(blkio);
+ UPDATE_DELAY(swapin);
+ UPDATE_DELAY(freepages);
+ UPDATE_DELAY(thrashing);
+ UPDATE_DELAY(compact);
+ UPDATE_DELAY(wpcopy);
+ UPDATE_DELAY(irq);
raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
return 0;
@@ -188,20 +215,33 @@ void __delayacct_freepages_end(void)
delayacct_end(&current->delays->lock,
&current->delays->freepages_start,
&current->delays->freepages_delay,
- &current->delays->freepages_count);
+ &current->delays->freepages_count,
+ &current->delays->freepages_delay_max,
+ &current->delays->freepages_delay_min);
}
-void __delayacct_thrashing_start(void)
+void __delayacct_thrashing_start(bool *in_thrashing)
{
+ *in_thrashing = !!current->in_thrashing;
+ if (*in_thrashing)
+ return;
+
+ current->in_thrashing = 1;
current->delays->thrashing_start = local_clock();
}
-void __delayacct_thrashing_end(void)
+void __delayacct_thrashing_end(bool *in_thrashing)
{
+ if (*in_thrashing)
+ return;
+
+ current->in_thrashing = 0;
delayacct_end(&current->delays->lock,
&current->delays->thrashing_start,
&current->delays->thrashing_delay,
- &current->delays->thrashing_count);
+ &current->delays->thrashing_count,
+ &current->delays->thrashing_delay_max,
+ &current->delays->thrashing_delay_min);
}
void __delayacct_swapin_start(void)
@@ -214,7 +254,9 @@ void __delayacct_swapin_end(void)
delayacct_end(&current->delays->lock,
&current->delays->swapin_start,
&current->delays->swapin_delay,
- &current->delays->swapin_count);
+ &current->delays->swapin_count,
+ &current->delays->swapin_delay_max,
+ &current->delays->swapin_delay_min);
}
void __delayacct_compact_start(void)
@@ -227,5 +269,37 @@ void __delayacct_compact_end(void)
delayacct_end(&current->delays->lock,
&current->delays->compact_start,
&current->delays->compact_delay,
- &current->delays->compact_count);
+ &current->delays->compact_count,
+ &current->delays->compact_delay_max,
+ &current->delays->compact_delay_min);
}
+
+void __delayacct_wpcopy_start(void)
+{
+ current->delays->wpcopy_start = local_clock();
+}
+
+void __delayacct_wpcopy_end(void)
+{
+ delayacct_end(&current->delays->lock,
+ &current->delays->wpcopy_start,
+ &current->delays->wpcopy_delay,
+ &current->delays->wpcopy_count,
+ &current->delays->wpcopy_delay_max,
+ &current->delays->wpcopy_delay_min);
+}
+
+void __delayacct_irq(struct task_struct *task, u32 delta)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&task->delays->lock, flags);
+ task->delays->irq_delay += delta;
+ task->delays->irq_count++;
+ if (delta > task->delays->irq_delay_max)
+ task->delays->irq_delay_max = delta;
+ if (delta && (!task->delays->irq_delay_min || delta < task->delays->irq_delay_min))
+ task->delays->irq_delay_min = delta;
+ raw_spin_unlock_irqrestore(&task->delays->lock, flags);
+}
+
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 1b02179758cb..31cfdb6b4bc3 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -8,8 +8,7 @@ config HAS_DMA
depends on !NO_DMA
default y
-config DMA_OPS
- depends on HAS_DMA
+config DMA_OPS_HELPERS
bool
#
@@ -24,6 +23,9 @@ config DMA_OPS_BYPASS
config ARCH_HAS_DMA_MAP_DIRECT
bool
+config NEED_SG_DMA_FLAGS
+ bool
+
config NEED_SG_DMA_LENGTH
bool
@@ -39,7 +41,7 @@ config ARCH_HAS_DMA_SET_MASK
#
# Select this option if the architecture needs special handling for
# DMA_ATTR_WRITE_COMBINE. Normally the "uncached" mapping should be what
-# people thing of when saying write combine, so very few platforms should
+# people think of when saying write combine, so very few platforms should
# need to enable this.
#
config ARCH_HAS_DMA_WRITE_COMBINE
@@ -76,10 +78,39 @@ config ARCH_HAS_DMA_PREP_COHERENT
config ARCH_HAS_FORCE_DMA_UNENCRYPTED
bool
+#
+# Select this option if the architecture assumes DMA devices are coherent
+# by default.
+#
+config ARCH_DMA_DEFAULT_COHERENT
+ bool
+
config SWIOTLB
bool
select NEED_DMA_MAP_STATE
+config SWIOTLB_DYNAMIC
+ bool "Dynamic allocation of DMA bounce buffers"
+ default n
+ depends on SWIOTLB
+ help
+ This enables dynamic resizing of the software IO TLB. The kernel
+ starts with one memory pool at boot and it will allocate additional
+ pools as needed. To reduce run-time kernel memory requirements, you
+ may have to specify a smaller size of the initial pool using
+ "swiotlb=" on the kernel command line.
+
+ If unsure, say N.
+
+config DMA_BOUNCE_UNALIGNED_KMALLOC
+ bool
+ depends on SWIOTLB
+
+config DMA_NEED_SYNC
+ def_bool ARCH_HAS_SYNC_DMA_FOR_DEVICE || ARCH_HAS_SYNC_DMA_FOR_CPU || \
+ ARCH_HAS_SYNC_DMA_FOR_CPU_ALL || DMA_API_DEBUG || \
+ ARCH_HAS_DMA_OPS || SWIOTLB
+
config DMA_RESTRICTED_POOL
bool "DMA Restricted Pool"
depends on OF && OF_RESERVED_MEM && SWIOTLB
@@ -108,17 +139,23 @@ config DMA_COHERENT_POOL
config DMA_GLOBAL_POOL
select DMA_DECLARE_COHERENT
+ depends on !ARCH_HAS_DMA_SET_UNCACHED
+ depends on !DMA_DIRECT_REMAP
bool
-config DMA_REMAP
+config DMA_DIRECT_REMAP
bool
- depends on MMU
+ select DMA_COHERENT_POOL
select DMA_NONCOHERENT_MMAP
-config DMA_DIRECT_REMAP
+#
+# Fallback to arch code for DMA allocations. This should eventually go away.
+#
+config ARCH_HAS_DMA_ALLOC
+ depends on !ARCH_HAS_DMA_SET_UNCACHED
+ depends on !DMA_DIRECT_REMAP
+ depends on !DMA_GLOBAL_POOL
bool
- select DMA_REMAP
- select DMA_COHERENT_POOL
config DMA_CMA
bool "DMA Contiguous Memory Allocator"
@@ -136,15 +173,16 @@ config DMA_CMA
if DMA_CMA
-config DMA_PERNUMA_CMA
- bool "Enable separate DMA Contiguous Memory Area for each NUMA Node"
- default NUMA && ARM64
+config DMA_NUMA_CMA
+ bool "Enable separate DMA Contiguous Memory Area for NUMA Node"
+ depends on NUMA
help
- Enable this option to get pernuma CMA areas so that devices like
- ARM64 SMMU can get local memory by DMA coherent APIs.
+ Enable this option to get numa CMA areas so that NUMA devices
+ can get local memory by DMA coherent APIs.
You can set the size of pernuma CMA by specifying "cma_pernuma=size"
- on the kernel's command line.
+ or set the node id and its size of CMA by specifying "numa_cma=
+ <node>:size[,<node>:size]" on the kernel's command line.
comment "Default contiguous memory area size:"
@@ -222,23 +260,6 @@ config DMA_API_DEBUG
If unsure, say N.
-config DMA_API_DEBUG_SG
- bool "Debug DMA scatter-gather usage"
- default y
- depends on DMA_API_DEBUG
- help
- Perform extra checking that callers of dma_map_sg() have respected the
- appropriate segment length/boundary limits for the given device when
- preparing DMA scatterlists.
-
- This is particularly likely to have been overlooked in cases where the
- dma_map_sg() API is used for general bulk mapping of pages rather than
- preparing literal scatter-gather descriptors, where there is a risk of
- unexpected behaviour from DMA API implementations if the scatterlist
- is technically out-of-spec.
-
- If unsure, say N.
-
config DMA_MAP_BENCHMARK
bool "Enable benchmarking of streaming DMA mapping"
depends on DEBUG_FS
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile
index 0dd65ec1d234..6977033444a3 100644
--- a/kernel/dma/Makefile
+++ b/kernel/dma/Makefile
@@ -1,12 +1,12 @@
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_HAS_DMA) += mapping.o direct.o
-obj-$(CONFIG_DMA_OPS) += ops_helpers.o
-obj-$(CONFIG_DMA_OPS) += dummy.o
+obj-$(CONFIG_DMA_OPS_HELPERS) += ops_helpers.o
+obj-$(CONFIG_ARCH_HAS_DMA_OPS) += dummy.o
obj-$(CONFIG_DMA_CMA) += contiguous.o
obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o
obj-$(CONFIG_DMA_API_DEBUG) += debug.o
obj-$(CONFIG_SWIOTLB) += swiotlb.o
obj-$(CONFIG_DMA_COHERENT_POOL) += pool.o
-obj-$(CONFIG_DMA_REMAP) += remap.o
+obj-$(CONFIG_MMU) += remap.o
obj-$(CONFIG_DMA_MAP_BENCHMARK) += map_benchmark.o
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
index 375fb3c9538d..77c8d9487a9a 100644
--- a/kernel/dma/coherent.c
+++ b/kernel/dma/coherent.c
@@ -74,7 +74,7 @@ out_unmap_membase:
return ERR_PTR(-ENOMEM);
}
-static void dma_release_coherent_memory(struct dma_coherent_mem *mem)
+static void _dma_release_coherent_memory(struct dma_coherent_mem *mem)
{
if (!mem)
return;
@@ -126,10 +126,18 @@ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
ret = dma_assign_coherent_memory(dev, mem);
if (ret)
- dma_release_coherent_memory(mem);
+ _dma_release_coherent_memory(mem);
return ret;
}
+void dma_release_coherent_memory(struct device *dev)
+{
+ if (dev) {
+ _dma_release_coherent_memory(dev->dma_mem);
+ dev->dma_mem = NULL;
+ }
+}
+
static void *__dma_alloc_from_coherent(struct device *dev,
struct dma_coherent_mem *mem,
ssize_t size, dma_addr_t *dma_handle)
@@ -322,21 +330,28 @@ int dma_init_global_coherent(phys_addr_t phys_addr, size_t size)
#include <linux/of_reserved_mem.h>
#ifdef CONFIG_DMA_GLOBAL_POOL
-static struct reserved_mem *dma_reserved_default_memory __initdata;
+static phys_addr_t dma_reserved_default_memory_base __initdata;
+static phys_addr_t dma_reserved_default_memory_size __initdata;
#endif
static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev)
{
- if (!rmem->priv) {
- struct dma_coherent_mem *mem;
+ struct dma_coherent_mem *mem = rmem->priv;
+ if (!mem) {
mem = dma_init_coherent_memory(rmem->base, rmem->base,
rmem->size, true);
if (IS_ERR(mem))
return PTR_ERR(mem);
rmem->priv = mem;
}
- dma_assign_coherent_memory(dev, rmem->priv);
+
+ /* Warn if the device potentially can't use the reserved memory */
+ if (mem->device_base + rmem->size - 1 >
+ min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit))
+ dev_warn(dev, "reserved memory is beyond device's set DMA address range\n");
+
+ dma_assign_coherent_memory(dev, mem);
return 0;
}
@@ -368,9 +383,10 @@ static int __init rmem_dma_setup(struct reserved_mem *rmem)
#ifdef CONFIG_DMA_GLOBAL_POOL
if (of_get_flat_dt_prop(node, "linux,dma-default", NULL)) {
- WARN(dma_reserved_default_memory,
+ WARN(dma_reserved_default_memory_size,
"Reserved memory: region for default DMA coherent area is redefined\n");
- dma_reserved_default_memory = rmem;
+ dma_reserved_default_memory_base = rmem->base;
+ dma_reserved_default_memory_size = rmem->size;
}
#endif
@@ -383,10 +399,10 @@ static int __init rmem_dma_setup(struct reserved_mem *rmem)
#ifdef CONFIG_DMA_GLOBAL_POOL
static int __init dma_init_reserved_memory(void)
{
- if (!dma_reserved_default_memory)
+ if (!dma_reserved_default_memory_size)
return -ENOMEM;
- return dma_init_global_coherent(dma_reserved_default_memory->base,
- dma_reserved_default_memory->size);
+ return dma_init_global_coherent(dma_reserved_default_memory_base,
+ dma_reserved_default_memory_size);
}
core_initcall(dma_init_reserved_memory);
#endif /* CONFIG_DMA_GLOBAL_POOL */
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 3d63d91cba5c..d8fd6f779f79 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -37,19 +37,15 @@
#define pr_fmt(fmt) "cma: " fmt
-#ifdef CONFIG_CMA_DEBUG
-#ifndef DEBUG
-# define DEBUG
-#endif
-#endif
-
#include <asm/page.h>
#include <linux/memblock.h>
#include <linux/err.h>
#include <linux/sizes.h>
+#include <linux/dma-buf/heaps/cma.h>
#include <linux/dma-map-ops.h>
#include <linux/cma.h>
+#include <linux/nospec.h>
#ifdef CONFIG_CMA_SIZE_MBYTES
#define CMA_SIZE_MBYTES CONFIG_CMA_SIZE_MBYTES
@@ -69,8 +65,7 @@ struct cma *dma_contiguous_default_area;
* Users, who want to set the size of global CMA area for their system
* should use cma= kernel parameter.
*/
-static const phys_addr_t size_bytes __initconst =
- (phys_addr_t)CMA_SIZE_MBYTES * SZ_1M;
+#define size_bytes ((phys_addr_t)CMA_SIZE_MBYTES * SZ_1M)
static phys_addr_t size_cmdline __initdata = -1;
static phys_addr_t base_cmdline __initdata;
static phys_addr_t limit_cmdline __initdata;
@@ -96,11 +91,44 @@ static int __init early_cma(char *p)
}
early_param("cma", early_cma);
-#ifdef CONFIG_DMA_PERNUMA_CMA
+#ifdef CONFIG_DMA_NUMA_CMA
+static struct cma *dma_contiguous_numa_area[MAX_NUMNODES];
+static phys_addr_t numa_cma_size[MAX_NUMNODES] __initdata;
static struct cma *dma_contiguous_pernuma_area[MAX_NUMNODES];
static phys_addr_t pernuma_size_bytes __initdata;
+static int __init early_numa_cma(char *p)
+{
+ int nid, count = 0;
+ unsigned long tmp;
+ char *s = p;
+
+ while (*s) {
+ if (sscanf(s, "%lu%n", &tmp, &count) != 1)
+ break;
+
+ if (s[count] == ':') {
+ if (tmp >= MAX_NUMNODES)
+ break;
+ nid = array_index_nospec(tmp, MAX_NUMNODES);
+
+ s += count + 1;
+ tmp = memparse(s, &s);
+ numa_cma_size[nid] = tmp;
+
+ if (*s == ',')
+ s++;
+ else
+ break;
+ } else
+ break;
+ }
+
+ return 0;
+}
+early_param("numa_cma", early_numa_cma);
+
static int __init early_cma_pernuma(char *p)
{
pernuma_size_bytes = memparse(p, &p);
@@ -127,32 +155,49 @@ static inline __maybe_unused phys_addr_t cma_early_percent_memory(void)
#endif
-#ifdef CONFIG_DMA_PERNUMA_CMA
-void __init dma_pernuma_cma_reserve(void)
+#ifdef CONFIG_DMA_NUMA_CMA
+static void __init dma_numa_cma_reserve(void)
{
int nid;
- if (!pernuma_size_bytes)
- return;
-
- for_each_online_node(nid) {
+ for_each_node(nid) {
int ret;
char name[CMA_MAX_NAME];
- struct cma **cma = &dma_contiguous_pernuma_area[nid];
-
- snprintf(name, sizeof(name), "pernuma%d", nid);
- ret = cma_declare_contiguous_nid(0, pernuma_size_bytes, 0, 0,
- 0, false, name, cma, nid);
- if (ret) {
- pr_warn("%s: reservation failed: err %d, node %d", __func__,
- ret, nid);
+ struct cma **cma;
+
+ if (!node_online(nid)) {
+ if (pernuma_size_bytes || numa_cma_size[nid])
+ pr_warn("invalid node %d specified\n", nid);
continue;
}
- pr_debug("%s: reserved %llu MiB on node %d\n", __func__,
- (unsigned long long)pernuma_size_bytes / SZ_1M, nid);
+ if (pernuma_size_bytes) {
+
+ cma = &dma_contiguous_pernuma_area[nid];
+ snprintf(name, sizeof(name), "pernuma%d", nid);
+ ret = cma_declare_contiguous_nid(0, pernuma_size_bytes, 0, 0,
+ 0, false, name, cma, nid);
+ if (ret)
+ pr_warn("%s: reservation failed: err %d, node %d", __func__,
+ ret, nid);
+ }
+
+ if (numa_cma_size[nid]) {
+
+ cma = &dma_contiguous_numa_area[nid];
+ snprintf(name, sizeof(name), "numa%d", nid);
+ ret = cma_declare_contiguous_nid(0, numa_cma_size[nid], 0, 0, 0, false,
+ name, cma, nid);
+ if (ret)
+ pr_warn("%s: reservation failed: err %d, node %d", __func__,
+ ret, nid);
+ }
}
}
+#else
+static inline void __init dma_numa_cma_reserve(void)
+{
+}
#endif
/**
@@ -171,12 +216,17 @@ void __init dma_contiguous_reserve(phys_addr_t limit)
phys_addr_t selected_limit = limit;
bool fixed = false;
+ dma_numa_cma_reserve();
+
pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit);
if (size_cmdline != -1) {
selected_size = size_cmdline;
selected_base = base_cmdline;
- selected_limit = min_not_zero(limit_cmdline, limit);
+
+ /* Hornor the user setup dma address limit */
+ selected_limit = limit_cmdline ?: limit;
+
if (base_cmdline + size_cmdline == limit_cmdline)
fixed = true;
} else {
@@ -192,6 +242,8 @@ void __init dma_contiguous_reserve(phys_addr_t limit)
}
if (selected_size && !dma_contiguous_default_area) {
+ int ret;
+
pr_debug("%s: reserving %ld MiB for global area\n", __func__,
(unsigned long)selected_size / SZ_1M);
@@ -199,6 +251,10 @@ void __init dma_contiguous_reserve(phys_addr_t limit)
selected_limit,
&dma_contiguous_default_area,
fixed);
+
+ ret = dma_heap_cma_register_heap(dma_contiguous_default_area);
+ if (ret)
+ pr_warn("Couldn't register default CMA heap.");
}
}
@@ -303,7 +359,7 @@ static struct page *cma_alloc_aligned(struct cma *cma, size_t size, gfp_t gfp)
*/
struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
{
-#ifdef CONFIG_DMA_PERNUMA_CMA
+#ifdef CONFIG_DMA_NUMA_CMA
int nid = dev_to_node(dev);
#endif
@@ -315,7 +371,7 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
if (size <= PAGE_SIZE)
return NULL;
-#ifdef CONFIG_DMA_PERNUMA_CMA
+#ifdef CONFIG_DMA_NUMA_CMA
if (nid != NUMA_NO_NODE && !(gfp & (GFP_DMA | GFP_DMA32))) {
struct cma *cma = dma_contiguous_pernuma_area[nid];
struct page *page;
@@ -325,6 +381,13 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
if (page)
return page;
}
+
+ cma = dma_contiguous_numa_area[nid];
+ if (cma) {
+ page = cma_alloc_aligned(cma, size, gfp);
+ if (page)
+ return page;
+ }
}
#endif
if (!dma_contiguous_default_area)
@@ -356,10 +419,13 @@ void dma_free_contiguous(struct device *dev, struct page *page, size_t size)
/*
* otherwise, page is from either per-numa cma or default cma
*/
-#ifdef CONFIG_DMA_PERNUMA_CMA
+#ifdef CONFIG_DMA_NUMA_CMA
if (cma_release(dma_contiguous_pernuma_area[page_to_nid(page)],
page, count))
return;
+ if (cma_release(dma_contiguous_numa_area[page_to_nid(page)],
+ page, count))
+ return;
#endif
if (cma_release(dma_contiguous_default_area, page, count))
return;
@@ -399,8 +465,6 @@ static const struct reserved_mem_ops rmem_cma_ops = {
static int __init rmem_cma_setup(struct reserved_mem *rmem)
{
- phys_addr_t align = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order);
- phys_addr_t mask = align - 1;
unsigned long node = rmem->fdt_node;
bool default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL);
struct cma *cma;
@@ -416,7 +480,7 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem)
of_get_flat_dt_prop(node, "no-map", NULL))
return -EINVAL;
- if ((rmem->base & mask) || (rmem->size & mask)) {
+ if (!IS_ALIGNED(rmem->base | rmem->size, CMA_MIN_ALIGNMENT_BYTES)) {
pr_err("Reserved memory: incorrect alignment of CMA region\n");
return -EINVAL;
}
@@ -426,8 +490,6 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem)
pr_err("Reserved memory: unable to setup CMA region\n");
return err;
}
- /* Architecture specific contiguous memory fixup. */
- dma_contiguous_early_fixup(rmem->base, rmem->size);
if (default_cma)
dma_contiguous_default_area = cma;
@@ -438,6 +500,10 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem)
pr_info("Reserved memory: created CMA memory pool at %pa, size %ld MiB\n",
&rmem->base, (unsigned long)rmem->size / SZ_1M);
+ err = dma_heap_cma_register_heap(cma);
+ if (err)
+ pr_warn("Couldn't register CMA heap.");
+
return 0;
}
RESERVEDMEM_OF_DECLARE(cma, "shared-dma-pool", rmem_cma_setup);
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 7a14ca29c377..138ede653de4 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -23,6 +23,7 @@
#include <linux/ctype.h>
#include <linux/list.h>
#include <linux/slab.h>
+#include <linux/swiotlb.h>
#include <asm/sections.h>
#include "debug.h"
@@ -38,7 +39,8 @@ enum {
dma_debug_single,
dma_debug_sg,
dma_debug_coherent,
- dma_debug_resource,
+ dma_debug_noncoherent,
+ dma_debug_phy,
};
enum map_err_types {
@@ -53,15 +55,16 @@ enum map_err_types {
* struct dma_debug_entry - track a dma_map* or dma_alloc_coherent mapping
* @list: node on pre-allocated free_entries list
* @dev: 'dev' argument to dma_map_{page|single|sg} or dma_alloc_coherent
+ * @dev_addr: dma address
* @size: length of the mapping
* @type: single, page, sg, coherent
* @direction: enum dma_data_direction
* @sg_call_ents: 'nents' from dma_map_sg
* @sg_mapped_ents: 'mapped_ents' from dma_map_sg
- * @pfn: page frame of the start address
- * @offset: offset of mapping relative to pfn
+ * @paddr: physical start address of the mapping
* @map_err_type: track whether dma_mapping_error() was checked
- * @stacktrace: support backtraces when a violation is detected
+ * @stack_len: number of backtrace entries in @stack_entries
+ * @stack_entries: stack of backtrace history
*/
struct dma_debug_entry {
struct list_head list;
@@ -72,8 +75,7 @@ struct dma_debug_entry {
int direction;
int sg_call_ents;
int sg_mapped_ents;
- unsigned long pfn;
- size_t offset;
+ phys_addr_t paddr;
enum map_err_types map_err_type;
#ifdef CONFIG_STACKTRACE
unsigned int stack_len;
@@ -138,9 +140,10 @@ static const char *const maperr2str[] = {
static const char *type2name[] = {
[dma_debug_single] = "single",
- [dma_debug_sg] = "scather-gather",
+ [dma_debug_sg] = "scatter-gather",
[dma_debug_coherent] = "coherent",
- [dma_debug_resource] = "resource",
+ [dma_debug_noncoherent] = "noncoherent",
+ [dma_debug_phy] = "phy",
};
static const char *dir2name[] = {
@@ -350,11 +353,10 @@ static struct dma_debug_entry *bucket_find_contain(struct hash_bucket **bucket,
unsigned long *flags)
{
- unsigned int max_range = dma_get_max_seg_size(ref->dev);
struct dma_debug_entry *entry, index = *ref;
- unsigned int range = 0;
+ int limit = min(HASH_SIZE, (index.dev_addr >> HASH_FN_SHIFT) + 1);
- while (range <= max_range) {
+ for (int i = 0; i < limit; i++) {
entry = __hash_bucket_find(*bucket, ref, containing_match);
if (entry)
@@ -364,7 +366,6 @@ static struct dma_debug_entry *bucket_find_contain(struct hash_bucket **bucket,
* Nothing found, go back a hash bucket
*/
put_hash_bucket(*bucket, *flags);
- range += (1 << HASH_FN_SHIFT);
index.dev_addr -= (1 << HASH_FN_SHIFT);
*bucket = get_hash_bucket(&index, flags);
}
@@ -389,45 +390,6 @@ static void hash_bucket_del(struct dma_debug_entry *entry)
list_del(&entry->list);
}
-static unsigned long long phys_addr(struct dma_debug_entry *entry)
-{
- if (entry->type == dma_debug_resource)
- return __pfn_to_phys(entry->pfn) + entry->offset;
-
- return page_to_phys(pfn_to_page(entry->pfn)) + entry->offset;
-}
-
-/*
- * Dump mapping entries for debugging purposes
- */
-void debug_dma_dump_mappings(struct device *dev)
-{
- int idx;
-
- for (idx = 0; idx < HASH_SIZE; idx++) {
- struct hash_bucket *bucket = &dma_entry_hash[idx];
- struct dma_debug_entry *entry;
- unsigned long flags;
-
- spin_lock_irqsave(&bucket->lock, flags);
-
- list_for_each_entry(entry, &bucket->list, list) {
- if (!dev || dev == entry->dev) {
- dev_info(entry->dev,
- "%s idx %d P=%Lx N=%lx D=%Lx L=%Lx %s %s\n",
- type2name[entry->type], idx,
- phys_addr(entry), entry->pfn,
- entry->dev_addr, entry->size,
- dir2name[entry->direction],
- maperr2str[entry->map_err_type]);
- }
- }
-
- spin_unlock_irqrestore(&bucket->lock, flags);
- cond_resched();
- }
-}
-
/*
* For each mapping (initial cacheline in the case of
* dma_alloc_coherent/dma_map_page, initial cacheline in each page of a
@@ -447,8 +409,11 @@ void debug_dma_dump_mappings(struct device *dev)
* dma_active_cacheline entry to track per event. dma_map_sg(), on the
* other hand, consumes a single dma_debug_entry, but inserts 'nents'
* entries into the tree.
+ *
+ * Use __GFP_NOWARN because the printk from an OOM, to netconsole, could end
+ * up right back in the DMA debugging code, leading to a deadlock.
*/
-static RADIX_TREE(dma_active_cacheline, GFP_NOWAIT);
+static RADIX_TREE(dma_active_cacheline, GFP_ATOMIC | __GFP_NOWARN);
static DEFINE_SPINLOCK(radix_lock);
#define ACTIVE_CACHELINE_MAX_OVERLAP ((1 << RADIX_TREE_MAX_TAGS) - 1)
#define CACHELINE_PER_PAGE_SHIFT (PAGE_SHIFT - L1_CACHE_SHIFT)
@@ -456,8 +421,8 @@ static DEFINE_SPINLOCK(radix_lock);
static phys_addr_t to_cacheline_number(struct dma_debug_entry *entry)
{
- return (entry->pfn << CACHELINE_PER_PAGE_SHIFT) +
- (entry->offset >> L1_CACHE_SHIFT);
+ return ((entry->paddr >> PAGE_SHIFT) << CACHELINE_PER_PAGE_SHIFT) +
+ (offset_in_page(entry->paddr) >> L1_CACHE_SHIFT);
}
static int active_cacheline_read_overlap(phys_addr_t cln)
@@ -549,6 +514,70 @@ static void active_cacheline_remove(struct dma_debug_entry *entry)
}
/*
+ * Dump mappings entries on kernel space for debugging purposes
+ */
+void debug_dma_dump_mappings(struct device *dev)
+{
+ int idx;
+ phys_addr_t cln;
+
+ for (idx = 0; idx < HASH_SIZE; idx++) {
+ struct hash_bucket *bucket = &dma_entry_hash[idx];
+ struct dma_debug_entry *entry;
+ unsigned long flags;
+
+ spin_lock_irqsave(&bucket->lock, flags);
+ list_for_each_entry(entry, &bucket->list, list) {
+ if (!dev || dev == entry->dev) {
+ cln = to_cacheline_number(entry);
+ dev_info(entry->dev,
+ "%s idx %d P=%pa D=%llx L=%llx cln=%pa %s %s\n",
+ type2name[entry->type], idx,
+ &entry->paddr, entry->dev_addr,
+ entry->size, &cln,
+ dir2name[entry->direction],
+ maperr2str[entry->map_err_type]);
+ }
+ }
+ spin_unlock_irqrestore(&bucket->lock, flags);
+
+ cond_resched();
+ }
+}
+
+/*
+ * Dump mappings entries on user space via debugfs
+ */
+static int dump_show(struct seq_file *seq, void *v)
+{
+ int idx;
+ phys_addr_t cln;
+
+ for (idx = 0; idx < HASH_SIZE; idx++) {
+ struct hash_bucket *bucket = &dma_entry_hash[idx];
+ struct dma_debug_entry *entry;
+ unsigned long flags;
+
+ spin_lock_irqsave(&bucket->lock, flags);
+ list_for_each_entry(entry, &bucket->list, list) {
+ cln = to_cacheline_number(entry);
+ seq_printf(seq,
+ "%s %s %s idx %d P=%pa D=%llx L=%llx cln=%pa %s %s\n",
+ dev_driver_string(entry->dev),
+ dev_name(entry->dev),
+ type2name[entry->type], idx,
+ &entry->paddr, entry->dev_addr,
+ entry->size, &cln,
+ dir2name[entry->direction],
+ maperr2str[entry->map_err_type]);
+ }
+ spin_unlock_irqrestore(&bucket->lock, flags);
+ }
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(dump);
+
+/*
* Wrapper function for adding an entry to the hash.
* This function takes care of locking itself.
*/
@@ -564,9 +593,11 @@ static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs)
rc = active_cacheline_insert(entry);
if (rc == -ENOMEM) {
- pr_err("cacheline tracking ENOMEM, dma-debug disabled\n");
+ pr_err_once("cacheline tracking ENOMEM, dma-debug disabled\n");
global_disable = true;
- } else if (rc == -EEXIST && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
+ } else if (rc == -EEXIST && !(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
+ !(IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) &&
+ is_swiotlb_active(entry->dev))) {
err_printk(entry->dev, entry,
"cacheline tracking EEXIST, overlapping mappings aren't supported\n");
}
@@ -605,15 +636,19 @@ static struct dma_debug_entry *__dma_entry_alloc(void)
return entry;
}
-static void __dma_entry_alloc_check_leak(void)
+/*
+ * This should be called outside of free_entries_lock scope to avoid potential
+ * deadlocks with serial consoles that use DMA.
+ */
+static void __dma_entry_alloc_check_leak(u32 nr_entries)
{
- u32 tmp = nr_total_entries % nr_prealloc_entries;
+ u32 tmp = nr_entries % nr_prealloc_entries;
/* Shout each time we tick over some multiple of the initial pool */
if (tmp < DMA_DEBUG_DYNAMIC_ENTRIES) {
pr_info("dma_debug_entry pool grown to %u (%u00%%)\n",
- nr_total_entries,
- (nr_total_entries / nr_prealloc_entries));
+ nr_entries,
+ (nr_entries / nr_prealloc_entries));
}
}
@@ -624,8 +659,10 @@ static void __dma_entry_alloc_check_leak(void)
*/
static struct dma_debug_entry *dma_entry_alloc(void)
{
+ bool alloc_check_leak = false;
struct dma_debug_entry *entry;
unsigned long flags;
+ u32 nr_entries;
spin_lock_irqsave(&free_entries_lock, flags);
if (num_free_entries == 0) {
@@ -635,13 +672,17 @@ static struct dma_debug_entry *dma_entry_alloc(void)
pr_err("debugging out of memory - disabling\n");
return NULL;
}
- __dma_entry_alloc_check_leak();
+ alloc_check_leak = true;
+ nr_entries = nr_total_entries;
}
entry = __dma_entry_alloc();
spin_unlock_irqrestore(&free_entries_lock, flags);
+ if (alloc_check_leak)
+ __dma_entry_alloc_check_leak(nr_entries);
+
#ifdef CONFIG_STACKTRACE
entry->stack_len = stack_trace_save(entry->stack_entries,
ARRAY_SIZE(entry->stack_entries),
@@ -766,33 +807,6 @@ static const struct file_operations filter_fops = {
.llseek = default_llseek,
};
-static int dump_show(struct seq_file *seq, void *v)
-{
- int idx;
-
- for (idx = 0; idx < HASH_SIZE; idx++) {
- struct hash_bucket *bucket = &dma_entry_hash[idx];
- struct dma_debug_entry *entry;
- unsigned long flags;
-
- spin_lock_irqsave(&bucket->lock, flags);
- list_for_each_entry(entry, &bucket->list, list) {
- seq_printf(seq,
- "%s %s %s idx %d P=%llx N=%lx D=%llx L=%llx %s %s\n",
- dev_name(entry->dev),
- dev_driver_string(entry->dev),
- type2name[entry->type], idx,
- phys_addr(entry), entry->pfn,
- entry->dev_addr, entry->size,
- dir2name[entry->direction],
- maperr2str[entry->map_err_type]);
- }
- spin_unlock_irqrestore(&bucket->lock, flags);
- }
- return 0;
-}
-DEFINE_SHOW_ATTRIBUTE(dump);
-
static int __init dma_debug_fs_init(void)
{
struct dentry *dentry = debugfs_create_dir("dma-api", NULL);
@@ -861,7 +875,7 @@ static int dma_debug_device_change(struct notifier_block *nb, unsigned long acti
return 0;
}
-void dma_debug_add_bus(struct bus_type *bus)
+void dma_debug_add_bus(const struct bus_type *bus)
{
struct notifier_block *nb;
@@ -927,7 +941,7 @@ static __init int dma_debug_cmdline(char *str)
global_disable = true;
}
- return 0;
+ return 1;
}
static __init int dma_debug_entries_cmdline(char *str)
@@ -936,7 +950,7 @@ static __init int dma_debug_entries_cmdline(char *str)
return -EINVAL;
if (!get_option(&str, &nr_prealloc_entries))
nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES;
- return 0;
+ return 1;
}
__setup("dma_debug=", dma_debug_cmdline);
@@ -984,16 +998,17 @@ static void check_unmap(struct dma_debug_entry *ref)
"[mapped as %s] [unmapped as %s]\n",
ref->dev_addr, ref->size,
type2name[entry->type], type2name[ref->type]);
- } else if ((entry->type == dma_debug_coherent) &&
- (phys_addr(ref) != phys_addr(entry))) {
+ } else if ((entry->type == dma_debug_coherent ||
+ entry->type == dma_debug_noncoherent) &&
+ ref->paddr != entry->paddr) {
err_printk(ref->dev, entry, "device driver frees "
"DMA memory with different CPU address "
"[device address=0x%016llx] [size=%llu bytes] "
- "[cpu alloc address=0x%016llx] "
- "[cpu free address=0x%016llx]",
+ "[cpu alloc address=0x%pa] "
+ "[cpu free address=0x%pa]",
ref->dev_addr, ref->size,
- phys_addr(entry),
- phys_addr(ref));
+ &entry->paddr,
+ &ref->paddr);
}
if (ref->sg_call_ents && ref->type == dma_debug_sg &&
@@ -1033,22 +1048,25 @@ static void check_unmap(struct dma_debug_entry *ref)
}
hash_bucket_del(entry);
- dma_entry_free(entry);
-
put_hash_bucket(bucket, flags);
+
+ /*
+ * Free the entry outside of bucket_lock to avoid ABBA deadlocks
+ * between that and radix_lock.
+ */
+ dma_entry_free(entry);
}
-static void check_for_stack(struct device *dev,
- struct page *page, size_t offset)
+static void check_for_stack(struct device *dev, phys_addr_t phys)
{
void *addr;
struct vm_struct *stack_vm_area = task_stack_vm_area(current);
if (!stack_vm_area) {
/* Stack is direct-mapped. */
- if (PageHighMem(page))
+ if (PhysHighMem(phys))
return;
- addr = page_address(page) + offset;
+ addr = phys_to_virt(phys);
if (object_is_on_stack(addr))
err_printk(dev, NULL, "device driver maps memory from stack [addr=%p]\n", addr);
} else {
@@ -1056,10 +1074,12 @@ static void check_for_stack(struct device *dev,
int i;
for (i = 0; i < stack_vm_area->nr_pages; i++) {
- if (page != stack_vm_area->pages[i])
+ if (__phys_to_pfn(phys) !=
+ page_to_pfn(stack_vm_area->pages[i]))
continue;
- addr = (u8 *)current->stack + i * PAGE_SIZE + offset;
+ addr = (u8 *)current->stack + i * PAGE_SIZE +
+ (phys % PAGE_SIZE);
err_printk(dev, NULL, "device driver maps memory from stack [probable addr=%p]\n", addr);
break;
}
@@ -1150,7 +1170,6 @@ out:
static void check_sg_segment(struct device *dev, struct scatterlist *sg)
{
-#ifdef CONFIG_DMA_API_DEBUG_SG
unsigned int max_seg = dma_get_max_seg_size(dev);
u64 start, end, boundary = dma_get_seg_boundary(dev);
@@ -1171,7 +1190,6 @@ static void check_sg_segment(struct device *dev, struct scatterlist *sg)
if ((start ^ end) & ~boundary)
err_printk(dev, NULL, "mapping sg segment across boundary [start=0x%016llx] [end=0x%016llx] [boundary=0x%016llx]\n",
start, end, boundary);
-#endif
}
void debug_dma_map_single(struct device *dev, const void *addr,
@@ -1190,9 +1208,8 @@ void debug_dma_map_single(struct device *dev, const void *addr,
}
EXPORT_SYMBOL(debug_dma_map_single);
-void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
- size_t size, int direction, dma_addr_t dma_addr,
- unsigned long attrs)
+void debug_dma_map_phys(struct device *dev, phys_addr_t phys, size_t size,
+ int direction, dma_addr_t dma_addr, unsigned long attrs)
{
struct dma_debug_entry *entry;
@@ -1207,20 +1224,18 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
return;
entry->dev = dev;
- entry->type = dma_debug_single;
- entry->pfn = page_to_pfn(page);
- entry->offset = offset;
+ entry->type = dma_debug_phy;
+ entry->paddr = phys;
entry->dev_addr = dma_addr;
entry->size = size;
entry->direction = direction;
entry->map_err_type = MAP_ERR_NOT_CHECKED;
- check_for_stack(dev, page, offset);
-
- if (!PageHighMem(page)) {
- void *addr = page_address(page) + offset;
+ if (!(attrs & DMA_ATTR_MMIO)) {
+ check_for_stack(dev, phys);
- check_for_illegal_area(dev, addr, size);
+ if (!PhysHighMem(phys))
+ check_for_illegal_area(dev, phys_to_virt(phys), size);
}
add_dma_entry(entry, attrs);
@@ -1264,13 +1279,13 @@ void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
}
EXPORT_SYMBOL(debug_dma_mapping_error);
-void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
+void debug_dma_unmap_phys(struct device *dev, dma_addr_t dma_addr,
size_t size, int direction)
{
struct dma_debug_entry ref = {
- .type = dma_debug_single,
+ .type = dma_debug_phy,
.dev = dev,
- .dev_addr = addr,
+ .dev_addr = dma_addr,
.size = size,
.direction = direction,
};
@@ -1292,7 +1307,7 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
return;
for_each_sg(sg, s, nents, i) {
- check_for_stack(dev, sg_page(s), s->offset);
+ check_for_stack(dev, sg_phys(s));
if (!PageHighMem(sg_page(s)))
check_for_illegal_area(dev, sg_virt(s), s->length);
}
@@ -1304,8 +1319,7 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
entry->type = dma_debug_sg;
entry->dev = dev;
- entry->pfn = page_to_pfn(sg_page(s));
- entry->offset = s->offset;
+ entry->paddr = sg_phys(s);
entry->size = sg_dma_len(s);
entry->dev_addr = sg_dma_address(s);
entry->direction = direction;
@@ -1351,8 +1365,7 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
struct dma_debug_entry ref = {
.type = dma_debug_sg,
.dev = dev,
- .pfn = page_to_pfn(sg_page(s)),
- .offset = s->offset,
+ .paddr = sg_phys(s),
.dev_addr = sg_dma_address(s),
.size = sg_dma_len(s),
.direction = dir,
@@ -1369,6 +1382,18 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
}
}
+static phys_addr_t virt_to_paddr(void *virt)
+{
+ struct page *page;
+
+ if (is_vmalloc_addr(virt))
+ page = vmalloc_to_page(virt);
+ else
+ page = virt_to_page(virt);
+
+ return page_to_phys(page) + offset_in_page(virt);
+}
+
void debug_dma_alloc_coherent(struct device *dev, size_t size,
dma_addr_t dma_addr, void *virt,
unsigned long attrs)
@@ -1391,27 +1416,21 @@ void debug_dma_alloc_coherent(struct device *dev, size_t size,
entry->type = dma_debug_coherent;
entry->dev = dev;
- entry->offset = offset_in_page(virt);
+ entry->paddr = virt_to_paddr(virt);
entry->size = size;
entry->dev_addr = dma_addr;
entry->direction = DMA_BIDIRECTIONAL;
- if (is_vmalloc_addr(virt))
- entry->pfn = vmalloc_to_pfn(virt);
- else
- entry->pfn = page_to_pfn(virt_to_page(virt));
-
add_dma_entry(entry, attrs);
}
void debug_dma_free_coherent(struct device *dev, size_t size,
- void *virt, dma_addr_t addr)
+ void *virt, dma_addr_t dma_addr)
{
struct dma_debug_entry ref = {
.type = dma_debug_coherent,
.dev = dev,
- .offset = offset_in_page(virt),
- .dev_addr = addr,
+ .dev_addr = dma_addr,
.size = size,
.direction = DMA_BIDIRECTIONAL,
};
@@ -1420,52 +1439,7 @@ void debug_dma_free_coherent(struct device *dev, size_t size,
if (!is_vmalloc_addr(virt) && !virt_addr_valid(virt))
return;
- if (is_vmalloc_addr(virt))
- ref.pfn = vmalloc_to_pfn(virt);
- else
- ref.pfn = page_to_pfn(virt_to_page(virt));
-
- if (unlikely(dma_debug_disabled()))
- return;
-
- check_unmap(&ref);
-}
-
-void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size,
- int direction, dma_addr_t dma_addr,
- unsigned long attrs)
-{
- struct dma_debug_entry *entry;
-
- if (unlikely(dma_debug_disabled()))
- return;
-
- entry = dma_entry_alloc();
- if (!entry)
- return;
-
- entry->type = dma_debug_resource;
- entry->dev = dev;
- entry->pfn = PHYS_PFN(addr);
- entry->offset = offset_in_page(addr);
- entry->size = size;
- entry->dev_addr = dma_addr;
- entry->direction = direction;
- entry->map_err_type = MAP_ERR_NOT_CHECKED;
-
- add_dma_entry(entry, attrs);
-}
-
-void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr,
- size_t size, int direction)
-{
- struct dma_debug_entry ref = {
- .type = dma_debug_resource,
- .dev = dev,
- .dev_addr = dma_addr,
- .size = size,
- .direction = direction,
- };
+ ref.paddr = virt_to_paddr(virt);
if (unlikely(dma_debug_disabled()))
return;
@@ -1524,8 +1498,7 @@ void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
struct dma_debug_entry ref = {
.type = dma_debug_sg,
.dev = dev,
- .pfn = page_to_pfn(sg_page(s)),
- .offset = s->offset,
+ .paddr = sg_phys(s),
.dev_addr = sg_dma_address(s),
.size = sg_dma_len(s),
.direction = direction,
@@ -1556,8 +1529,7 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
struct dma_debug_entry ref = {
.type = dma_debug_sg,
.dev = dev,
- .pfn = page_to_pfn(sg_page(s)),
- .offset = s->offset,
+ .paddr = sg_phys(sg),
.dev_addr = sg_dma_address(s),
.size = sg_dma_len(s),
.direction = direction,
@@ -1573,6 +1545,49 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
}
}
+void debug_dma_alloc_pages(struct device *dev, struct page *page,
+ size_t size, int direction,
+ dma_addr_t dma_addr,
+ unsigned long attrs)
+{
+ struct dma_debug_entry *entry;
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ entry = dma_entry_alloc();
+ if (!entry)
+ return;
+
+ entry->type = dma_debug_noncoherent;
+ entry->dev = dev;
+ entry->paddr = page_to_phys(page);
+ entry->size = size;
+ entry->dev_addr = dma_addr;
+ entry->direction = direction;
+
+ add_dma_entry(entry, attrs);
+}
+
+void debug_dma_free_pages(struct device *dev, struct page *page,
+ size_t size, int direction,
+ dma_addr_t dma_addr)
+{
+ struct dma_debug_entry ref = {
+ .type = dma_debug_noncoherent,
+ .dev = dev,
+ .paddr = page_to_phys(page),
+ .dev_addr = dma_addr,
+ .size = size,
+ .direction = direction,
+ };
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ check_unmap(&ref);
+}
+
static int __init dma_debug_driver_setup(char *str)
{
int i;
diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h
index f525197d3cae..da7be0bddcf6 100644
--- a/kernel/dma/debug.h
+++ b/kernel/dma/debug.h
@@ -9,12 +9,11 @@
#define _KERNEL_DMA_DEBUG_H
#ifdef CONFIG_DMA_API_DEBUG
-extern void debug_dma_map_page(struct device *dev, struct page *page,
- size_t offset, size_t size,
- int direction, dma_addr_t dma_addr,
+extern void debug_dma_map_phys(struct device *dev, phys_addr_t phys,
+ size_t size, int direction, dma_addr_t dma_addr,
unsigned long attrs);
-extern void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
+extern void debug_dma_unmap_phys(struct device *dev, dma_addr_t addr,
size_t size, int direction);
extern void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
@@ -31,14 +30,6 @@ extern void debug_dma_alloc_coherent(struct device *dev, size_t size,
extern void debug_dma_free_coherent(struct device *dev, size_t size,
void *virt, dma_addr_t addr);
-extern void debug_dma_map_resource(struct device *dev, phys_addr_t addr,
- size_t size, int direction,
- dma_addr_t dma_addr,
- unsigned long attrs);
-
-extern void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr,
- size_t size, int direction);
-
extern void debug_dma_sync_single_for_cpu(struct device *dev,
dma_addr_t dma_handle, size_t size,
int direction);
@@ -54,15 +45,21 @@ extern void debug_dma_sync_sg_for_cpu(struct device *dev,
extern void debug_dma_sync_sg_for_device(struct device *dev,
struct scatterlist *sg,
int nelems, int direction);
+extern void debug_dma_alloc_pages(struct device *dev, struct page *page,
+ size_t size, int direction,
+ dma_addr_t dma_addr,
+ unsigned long attrs);
+extern void debug_dma_free_pages(struct device *dev, struct page *page,
+ size_t size, int direction,
+ dma_addr_t dma_addr);
#else /* CONFIG_DMA_API_DEBUG */
-static inline void debug_dma_map_page(struct device *dev, struct page *page,
- size_t offset, size_t size,
- int direction, dma_addr_t dma_addr,
- unsigned long attrs)
+static inline void debug_dma_map_phys(struct device *dev, phys_addr_t phys,
+ size_t size, int direction,
+ dma_addr_t dma_addr, unsigned long attrs)
{
}
-static inline void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
+static inline void debug_dma_unmap_phys(struct device *dev, dma_addr_t addr,
size_t size, int direction)
{
}
@@ -90,19 +87,6 @@ static inline void debug_dma_free_coherent(struct device *dev, size_t size,
{
}
-static inline void debug_dma_map_resource(struct device *dev, phys_addr_t addr,
- size_t size, int direction,
- dma_addr_t dma_addr,
- unsigned long attrs)
-{
-}
-
-static inline void debug_dma_unmap_resource(struct device *dev,
- dma_addr_t dma_addr, size_t size,
- int direction)
-{
-}
-
static inline void debug_dma_sync_single_for_cpu(struct device *dev,
dma_addr_t dma_handle,
size_t size, int direction)
@@ -126,5 +110,18 @@ static inline void debug_dma_sync_sg_for_device(struct device *dev,
int nelems, int direction)
{
}
+
+static inline void debug_dma_alloc_pages(struct device *dev, struct page *page,
+ size_t size, int direction,
+ dma_addr_t dma_addr,
+ unsigned long attrs)
+{
+}
+
+static inline void debug_dma_free_pages(struct device *dev, struct page *page,
+ size_t size, int direction,
+ dma_addr_t dma_addr)
+{
+}
#endif /* CONFIG_DMA_API_DEBUG */
#endif /* _KERNEL_DMA_DEBUG_H */
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 50f48e9e4598..50c3fe2a1d55 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -13,6 +13,7 @@
#include <linux/vmalloc.h>
#include <linux/set_memory.h>
#include <linux/slab.h>
+#include <linux/pci-p2pdma.h>
#include "direct.h"
/*
@@ -20,7 +21,7 @@
* it for entirely different regions. In that case the arch code needs to
* override the variable below for dma-direct to work properly.
*/
-unsigned int zone_dma_bits __ro_after_init = 24;
+u64 zone_dma_limit __ro_after_init = DMA_BIT_MASK(24);
static inline dma_addr_t phys_to_dma_direct(struct device *dev,
phys_addr_t phys)
@@ -44,10 +45,11 @@ u64 dma_direct_get_required_mask(struct device *dev)
return (1ULL << (fls64(max_dma) - 1)) * 2 - 1;
}
-static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
- u64 *phys_limit)
+static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 *phys_limit)
{
- u64 dma_limit = min_not_zero(dma_mask, dev->bus_dma_limit);
+ u64 dma_limit = min_not_zero(
+ dev->coherent_dma_mask,
+ dev->bus_dma_limit);
/*
* Optimistically try the zone that the physical address mask falls
@@ -58,14 +60,14 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
* zones.
*/
*phys_limit = dma_to_phys(dev, dma_limit);
- if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits))
+ if (*phys_limit <= zone_dma_limit)
return GFP_DMA;
if (*phys_limit <= DMA_BIT_MASK(32))
return GFP_DMA32;
return 0;
}
-static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
+bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
{
dma_addr_t dma_addr = phys_to_dma_direct(dev, phys);
@@ -79,7 +81,7 @@ static int dma_set_decrypted(struct device *dev, void *vaddr, size_t size)
{
if (!force_dma_unencrypted(dev))
return 0;
- return set_memory_decrypted((unsigned long)vaddr, 1 << get_order(size));
+ return set_memory_decrypted((unsigned long)vaddr, PFN_UP(size));
}
static int dma_set_encrypted(struct device *dev, void *vaddr, size_t size)
@@ -88,7 +90,7 @@ static int dma_set_encrypted(struct device *dev, void *vaddr, size_t size)
if (!force_dma_unencrypted(dev))
return 0;
- ret = set_memory_encrypted((unsigned long)vaddr, 1 << get_order(size));
+ ret = set_memory_encrypted((unsigned long)vaddr, PFN_UP(size));
if (ret)
pr_warn_ratelimited("leaking DMA memory that can't be re-encrypted\n");
return ret;
@@ -115,10 +117,10 @@ static struct page *dma_direct_alloc_swiotlb(struct device *dev, size_t size)
}
static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
- gfp_t gfp)
+ gfp_t gfp, bool allow_highmem)
{
int node = dev_to_node(dev);
- struct page *page = NULL;
+ struct page *page;
u64 phys_limit;
WARN_ON_ONCE(!PAGE_ALIGNED(size));
@@ -126,31 +128,28 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
if (is_swiotlb_for_alloc(dev))
return dma_direct_alloc_swiotlb(dev, size);
- gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
- &phys_limit);
+ gfp |= dma_direct_optimal_gfp_mask(dev, &phys_limit);
page = dma_alloc_contiguous(dev, size, gfp);
- if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
+ if (page) {
+ if (dma_coherent_ok(dev, page_to_phys(page), size) &&
+ (allow_highmem || !PageHighMem(page)))
+ return page;
+
dma_free_contiguous(dev, page, size);
- page = NULL;
}
-again:
- if (!page)
- page = alloc_pages_node(node, gfp, get_order(size));
- if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
- dma_free_contiguous(dev, page, size);
- page = NULL;
+
+ while ((page = alloc_pages_node(node, gfp, get_order(size)))
+ && !dma_coherent_ok(dev, page_to_phys(page), size)) {
+ __free_pages(page, get_order(size));
if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
phys_limit < DMA_BIT_MASK(64) &&
- !(gfp & (GFP_DMA32 | GFP_DMA))) {
+ !(gfp & (GFP_DMA32 | GFP_DMA)))
gfp |= GFP_DMA32;
- goto again;
- }
-
- if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA)) {
+ else if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA))
gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
- goto again;
- }
+ else
+ return NULL;
}
return page;
@@ -169,14 +168,13 @@ static void *dma_direct_alloc_from_pool(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp)
{
struct page *page;
- u64 phys_mask;
+ u64 phys_limit;
void *ret;
if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_DMA_COHERENT_POOL)))
return NULL;
- gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
- &phys_mask);
+ gfp |= dma_direct_optimal_gfp_mask(dev, &phys_limit);
page = dma_alloc_from_pool(dev, size, &ret, gfp, dma_coherent_ok);
if (!page)
return NULL;
@@ -189,7 +187,7 @@ static void *dma_direct_alloc_no_mapping(struct device *dev, size_t size,
{
struct page *page;
- page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
+ page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO, true);
if (!page)
return NULL;
@@ -218,13 +216,7 @@ void *dma_direct_alloc(struct device *dev, size_t size,
return dma_direct_alloc_no_mapping(dev, size, dma_handle, gfp);
if (!dev_is_dma_coherent(dev)) {
- /*
- * Fallback to the arch handler if it exists. This should
- * eventually go away.
- */
- if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
- !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
- !IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) &&
+ if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_ALLOC) &&
!is_swiotlb_for_alloc(dev))
return arch_dma_alloc(dev, size, dma_handle, gfp,
attrs);
@@ -238,62 +230,59 @@ void *dma_direct_alloc(struct device *dev, size_t size,
dma_handle);
/*
- * Otherwise remap if the architecture is asking for it. But
- * given that remapping memory is a blocking operation we'll
- * instead have to dip into the atomic pools.
+ * Otherwise we require the architecture to either be able to
+ * mark arbitrary parts of the kernel direct mapping uncached,
+ * or remapped it uncached.
*/
+ set_uncached = IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED);
remap = IS_ENABLED(CONFIG_DMA_DIRECT_REMAP);
- if (remap) {
- if (dma_direct_use_pool(dev, gfp))
- return dma_direct_alloc_from_pool(dev, size,
- dma_handle, gfp);
- } else {
- if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED))
- return NULL;
- set_uncached = true;
+ if (!set_uncached && !remap) {
+ pr_warn_once("coherent DMA allocations not supported on this platform.\n");
+ return NULL;
}
}
/*
- * Decrypting memory may block, so allocate the memory from the atomic
- * pools if we can't block.
+ * Remapping or decrypting memory may block, allocate the memory from
+ * the atomic pools instead if we aren't allowed block.
*/
- if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp))
+ if ((remap || force_dma_unencrypted(dev)) &&
+ dma_direct_use_pool(dev, gfp))
return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
/* we always manually zero the memory once we are done */
- page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
+ page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO, true);
if (!page)
return NULL;
+
+ /*
+ * dma_alloc_contiguous can return highmem pages depending on a
+ * combination the cma= arguments and per-arch setup. These need to be
+ * remapped to return a kernel virtual address.
+ */
if (PageHighMem(page)) {
- /*
- * Depending on the cma= arguments and per-arch setup,
- * dma_alloc_contiguous could return highmem pages.
- * Without remapping there is no way to return them here, so
- * log an error and fail.
- */
- if (!IS_ENABLED(CONFIG_DMA_REMAP)) {
- dev_info(dev, "Rejecting highmem page from CMA.\n");
- goto out_free_pages;
- }
remap = true;
set_uncached = false;
}
if (remap) {
+ pgprot_t prot = dma_pgprot(dev, PAGE_KERNEL, attrs);
+
+ if (force_dma_unencrypted(dev))
+ prot = pgprot_decrypted(prot);
+
/* remove any dirty cache lines on the kernel alias */
arch_dma_prep_coherent(page, size);
/* create a coherent mapping */
- ret = dma_common_contiguous_remap(page, size,
- dma_pgprot(dev, PAGE_KERNEL, attrs),
+ ret = dma_common_contiguous_remap(page, size, prot,
__builtin_return_address(0));
if (!ret)
goto out_free_pages;
} else {
ret = page_address(page);
if (dma_set_decrypted(dev, ret, size))
- goto out_free_pages;
+ goto out_leak_pages;
}
memset(ret, 0, size);
@@ -314,6 +303,8 @@ out_encrypt_pages:
out_free_pages:
__dma_direct_free_pages(dev, page, size);
return NULL;
+out_leak_pages:
+ return NULL;
}
void dma_direct_free(struct device *dev, size_t size,
@@ -328,9 +319,7 @@ void dma_direct_free(struct device *dev, size_t size,
return;
}
- if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
- !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
- !IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) &&
+ if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_ALLOC) &&
!dev_is_dma_coherent(dev) &&
!is_swiotlb_for_alloc(dev)) {
arch_dma_free(dev, size, cpu_addr, dma_addr, attrs);
@@ -349,12 +338,12 @@ void dma_direct_free(struct device *dev, size_t size,
dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size)))
return;
- if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) {
+ if (is_vmalloc_addr(cpu_addr)) {
vunmap(cpu_addr);
} else {
if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED))
arch_dma_clear_uncached(cpu_addr, size);
- if (dma_set_encrypted(dev, cpu_addr, 1 << page_order))
+ if (dma_set_encrypted(dev, cpu_addr, size))
return;
}
@@ -370,28 +359,17 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp))
return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
- page = __dma_direct_alloc_pages(dev, size, gfp);
+ page = __dma_direct_alloc_pages(dev, size, gfp, false);
if (!page)
return NULL;
- if (PageHighMem(page)) {
- /*
- * Depending on the cma= arguments and per-arch setup
- * dma_alloc_contiguous could return highmem pages.
- * Without remapping there is no way to return them here,
- * so log an error and fail.
- */
- dev_info(dev, "Rejecting highmem page from CMA.\n");
- goto out_free_pages;
- }
ret = page_address(page);
if (dma_set_decrypted(dev, ret, size))
- goto out_free_pages;
+ goto out_leak_pages;
memset(ret, 0, size);
*dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
return page;
-out_free_pages:
- __dma_direct_free_pages(dev, page, size);
+out_leak_pages:
return NULL;
}
@@ -399,7 +377,6 @@ void dma_direct_free_pages(struct device *dev, size_t size,
struct page *page, dma_addr_t dma_addr,
enum dma_data_direction dir)
{
- unsigned int page_order = get_order(size);
void *vaddr = page_address(page);
/* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */
@@ -407,7 +384,7 @@ void dma_direct_free_pages(struct device *dev, size_t size,
dma_free_from_pool(dev, vaddr, size))
return;
- if (dma_set_encrypted(dev, vaddr, 1 << page_order))
+ if (dma_set_encrypted(dev, vaddr, size))
return;
__dma_direct_free_pages(dev, page, size);
}
@@ -423,9 +400,7 @@ void dma_direct_sync_sg_for_device(struct device *dev,
for_each_sg(sgl, sg, nents, i) {
phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
- if (unlikely(is_swiotlb_buffer(dev, paddr)))
- swiotlb_sync_single_for_device(dev, paddr, sg->length,
- dir);
+ swiotlb_sync_single_for_device(dev, paddr, sg->length, dir);
if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_device(paddr, sg->length,
@@ -449,9 +424,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_cpu(paddr, sg->length, dir);
- if (unlikely(is_swiotlb_buffer(dev, paddr)))
- swiotlb_sync_single_for_cpu(dev, paddr, sg->length,
- dir);
+ swiotlb_sync_single_for_cpu(dev, paddr, sg->length, dir);
if (dir == DMA_FROM_DEVICE)
arch_dma_mark_clean(paddr, sg->length);
@@ -461,29 +434,60 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
arch_sync_dma_for_cpu_all();
}
+/*
+ * Unmaps segments, except for ones marked as pci_p2pdma which do not
+ * require any further action as they contain a bus address.
+ */
void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
int nents, enum dma_data_direction dir, unsigned long attrs)
{
struct scatterlist *sg;
int i;
- for_each_sg(sgl, sg, nents, i)
- dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir,
- attrs);
+ for_each_sg(sgl, sg, nents, i) {
+ if (sg_dma_is_bus_address(sg))
+ sg_dma_unmark_bus_address(sg);
+ else
+ dma_direct_unmap_phys(dev, sg->dma_address,
+ sg_dma_len(sg), dir, attrs);
+ }
}
#endif
int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
enum dma_data_direction dir, unsigned long attrs)
{
- int i;
+ struct pci_p2pdma_map_state p2pdma_state = {};
struct scatterlist *sg;
+ int i, ret;
for_each_sg(sgl, sg, nents, i) {
- sg->dma_address = dma_direct_map_page(dev, sg_page(sg),
- sg->offset, sg->length, dir, attrs);
- if (sg->dma_address == DMA_MAPPING_ERROR)
+ switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(sg))) {
+ case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+ /*
+ * Any P2P mapping that traverses the PCI host bridge
+ * must be mapped with CPU physical address and not PCI
+ * bus addresses.
+ */
+ break;
+ case PCI_P2PDMA_MAP_NONE:
+ sg->dma_address = dma_direct_map_phys(dev, sg_phys(sg),
+ sg->length, dir, attrs);
+ if (sg->dma_address == DMA_MAPPING_ERROR) {
+ ret = -EIO;
+ goto out_unmap;
+ }
+ break;
+ case PCI_P2PDMA_MAP_BUS_ADDR:
+ sg->dma_address = pci_p2pdma_bus_addr_map(
+ p2pdma_state.mem, sg_phys(sg));
+ sg_dma_len(sg) = sg->length;
+ sg_dma_mark_bus_address(sg);
+ continue;
+ default:
+ ret = -EREMOTEIO;
goto out_unmap;
+ }
sg_dma_len(sg) = sg->length;
}
@@ -491,23 +495,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
out_unmap:
dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
- return -EIO;
-}
-
-dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
- size_t size, enum dma_data_direction dir, unsigned long attrs)
-{
- dma_addr_t dma_addr = paddr;
-
- if (unlikely(!dma_capable(dev, dma_addr, size, false))) {
- dev_err_once(dev,
- "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n",
- &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit);
- WARN_ON_ONCE(1);
- return DMA_MAPPING_ERROR;
- }
-
- return dma_addr;
+ return ret;
}
int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt,
@@ -539,6 +527,8 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma,
int ret = -ENXIO;
vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs);
+ if (force_dma_unencrypted(dev))
+ vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret))
return ret;
@@ -570,10 +560,58 @@ int dma_direct_supported(struct device *dev, u64 mask)
* part of the check.
*/
if (IS_ENABLED(CONFIG_ZONE_DMA))
- min_mask = min_t(u64, min_mask, DMA_BIT_MASK(zone_dma_bits));
+ min_mask = min_t(u64, min_mask, zone_dma_limit);
return mask >= phys_to_dma_unencrypted(dev, min_mask);
}
+static const struct bus_dma_region *dma_find_range(struct device *dev,
+ unsigned long start_pfn)
+{
+ const struct bus_dma_region *m;
+
+ for (m = dev->dma_range_map; PFN_DOWN(m->size); m++) {
+ unsigned long cpu_start_pfn = PFN_DOWN(m->cpu_start);
+
+ if (start_pfn >= cpu_start_pfn &&
+ start_pfn - cpu_start_pfn < PFN_DOWN(m->size))
+ return m;
+ }
+
+ return NULL;
+}
+
+/*
+ * To check whether all ram resource ranges are covered by dma range map
+ * Returns 0 when further check is needed
+ * Returns 1 if there is some RAM range can't be covered by dma_range_map
+ */
+static int check_ram_in_range_map(unsigned long start_pfn,
+ unsigned long nr_pages, void *data)
+{
+ unsigned long end_pfn = start_pfn + nr_pages;
+ struct device *dev = data;
+
+ while (start_pfn < end_pfn) {
+ const struct bus_dma_region *bdr;
+
+ bdr = dma_find_range(dev, start_pfn);
+ if (!bdr)
+ return 1;
+
+ start_pfn = PFN_DOWN(bdr->cpu_start) + PFN_DOWN(bdr->size);
+ }
+
+ return 0;
+}
+
+bool dma_direct_all_ram_mapped(struct device *dev)
+{
+ if (!dev->dma_range_map)
+ return true;
+ return !walk_system_ram_range(0, PFN_DOWN(ULONG_MAX) + 1, dev,
+ check_ram_in_range_map);
+}
+
size_t dma_direct_max_mapping_size(struct device *dev)
{
/* If SWIOTLB is active, use its maximum mapping size */
@@ -586,7 +624,7 @@ size_t dma_direct_max_mapping_size(struct device *dev)
bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr)
{
return !dev_is_dma_coherent(dev) ||
- is_swiotlb_buffer(dev, dma_to_phys(dev, dma_addr));
+ swiotlb_find_pool(dev, dma_to_phys(dev, dma_addr));
}
/**
@@ -624,7 +662,6 @@ int dma_direct_set_offset(struct device *dev, phys_addr_t cpu_start,
return -ENOMEM;
map[0].cpu_start = cpu_start;
map[0].dma_start = dma_start;
- map[0].offset = offset;
map[0].size = size;
dev->dma_range_map = map;
return 0;
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index 4632b0f4f72e..da2fadf45bcd 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -8,6 +8,7 @@
#define _KERNEL_DMA_DIRECT_H
#include <linux/dma-direct.h>
+#include <linux/memremap.h>
int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt,
void *cpu_addr, dma_addr_t dma_addr, size_t size,
@@ -19,6 +20,7 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma,
bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr);
int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
enum dma_data_direction dir, unsigned long attrs);
+bool dma_direct_all_ram_mapped(struct device *dev);
size_t dma_direct_max_mapping_size(struct device *dev);
#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
@@ -56,8 +58,7 @@ static inline void dma_direct_sync_single_for_device(struct device *dev,
{
phys_addr_t paddr = dma_to_phys(dev, addr);
- if (unlikely(is_swiotlb_buffer(dev, paddr)))
- swiotlb_sync_single_for_device(dev, paddr, size, dir);
+ swiotlb_sync_single_for_device(dev, paddr, size, dir);
if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_device(paddr, size, dir);
@@ -73,47 +74,67 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev,
arch_sync_dma_for_cpu_all();
}
- if (unlikely(is_swiotlb_buffer(dev, paddr)))
- swiotlb_sync_single_for_cpu(dev, paddr, size, dir);
+ swiotlb_sync_single_for_cpu(dev, paddr, size, dir);
if (dir == DMA_FROM_DEVICE)
arch_dma_mark_clean(paddr, size);
}
-static inline dma_addr_t dma_direct_map_page(struct device *dev,
- struct page *page, unsigned long offset, size_t size,
- enum dma_data_direction dir, unsigned long attrs)
+static inline dma_addr_t dma_direct_map_phys(struct device *dev,
+ phys_addr_t phys, size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
{
- phys_addr_t phys = page_to_phys(page) + offset;
- dma_addr_t dma_addr = phys_to_dma(dev, phys);
+ dma_addr_t dma_addr;
- if (is_swiotlb_force_bounce(dev))
- return swiotlb_map(dev, phys, size, dir, attrs);
+ if (is_swiotlb_force_bounce(dev)) {
+ if (attrs & DMA_ATTR_MMIO)
+ goto err_overflow;
- if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
- if (swiotlb_force != SWIOTLB_NO_FORCE)
- return swiotlb_map(dev, phys, size, dir, attrs);
+ return swiotlb_map(dev, phys, size, dir, attrs);
+ }
- dev_WARN_ONCE(dev, 1,
- "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n",
- &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit);
- return DMA_MAPPING_ERROR;
+ if (attrs & DMA_ATTR_MMIO) {
+ dma_addr = phys;
+ if (unlikely(!dma_capable(dev, dma_addr, size, false)))
+ goto err_overflow;
+ } else {
+ dma_addr = phys_to_dma(dev, phys);
+ if (unlikely(!dma_capable(dev, dma_addr, size, true)) ||
+ dma_kmalloc_needs_bounce(dev, size, dir)) {
+ if (is_swiotlb_active(dev))
+ return swiotlb_map(dev, phys, size, dir, attrs);
+
+ goto err_overflow;
+ }
}
- if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ if (!dev_is_dma_coherent(dev) &&
+ !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
arch_sync_dma_for_device(phys, size, dir);
return dma_addr;
+
+err_overflow:
+ dev_WARN_ONCE(
+ dev, 1,
+ "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n",
+ &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit);
+ return DMA_MAPPING_ERROR;
}
-static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
+static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
size_t size, enum dma_data_direction dir, unsigned long attrs)
{
- phys_addr_t phys = dma_to_phys(dev, addr);
+ phys_addr_t phys;
+
+ if (attrs & DMA_ATTR_MMIO)
+ /* nothing to do: uncached and no swiotlb */
+ return;
+ phys = dma_to_phys(dev, addr);
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
dma_direct_sync_single_for_cpu(dev, addr, size, dir);
- if (unlikely(is_swiotlb_buffer(dev, phys)))
- swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
+ swiotlb_tbl_unmap_single(dev, phys, size, dir,
+ attrs | DMA_ATTR_SKIP_CPU_SYNC);
}
#endif /* _KERNEL_DMA_DIRECT_H */
diff --git a/kernel/dma/dummy.c b/kernel/dma/dummy.c
index b492d59ac77e..16a51736a2a3 100644
--- a/kernel/dma/dummy.c
+++ b/kernel/dma/dummy.c
@@ -11,12 +11,20 @@ static int dma_dummy_mmap(struct device *dev, struct vm_area_struct *vma,
return -ENXIO;
}
-static dma_addr_t dma_dummy_map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size, enum dma_data_direction dir,
- unsigned long attrs)
+static dma_addr_t dma_dummy_map_phys(struct device *dev, phys_addr_t phys,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
{
return DMA_MAPPING_ERROR;
}
+static void dma_dummy_unmap_phys(struct device *dev, dma_addr_t dma_handle,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ /*
+ * Dummy ops doesn't support map_phys, so unmap_page should never be
+ * called.
+ */
+ WARN_ON_ONCE(true);
+}
static int dma_dummy_map_sg(struct device *dev, struct scatterlist *sgl,
int nelems, enum dma_data_direction dir,
@@ -25,6 +33,16 @@ static int dma_dummy_map_sg(struct device *dev, struct scatterlist *sgl,
return -EINVAL;
}
+static void dma_dummy_unmap_sg(struct device *dev, struct scatterlist *sgl,
+ int nelems, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ /*
+ * Dummy ops doesn't support map_sg, so unmap_sg should never be called.
+ */
+ WARN_ON_ONCE(true);
+}
+
static int dma_dummy_supported(struct device *hwdev, u64 mask)
{
return 0;
@@ -32,7 +50,9 @@ static int dma_dummy_supported(struct device *hwdev, u64 mask)
const struct dma_map_ops dma_dummy_ops = {
.mmap = dma_dummy_mmap,
- .map_page = dma_dummy_map_page,
+ .map_phys = dma_dummy_map_phys,
+ .unmap_phys = dma_dummy_unmap_phys,
.map_sg = dma_dummy_map_sg,
+ .unmap_sg = dma_dummy_unmap_sg,
.dma_supported = dma_dummy_supported,
};
diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c
index 9b9af1bd6be3..794041a39e65 100644
--- a/kernel/dma/map_benchmark.c
+++ b/kernel/dma/map_benchmark.c
@@ -17,30 +17,7 @@
#include <linux/platform_device.h>
#include <linux/slab.h>
#include <linux/timekeeping.h>
-
-#define DMA_MAP_BENCHMARK _IOWR('d', 1, struct map_benchmark)
-#define DMA_MAP_MAX_THREADS 1024
-#define DMA_MAP_MAX_SECONDS 300
-#define DMA_MAP_MAX_TRANS_DELAY (10 * NSEC_PER_MSEC)
-
-#define DMA_MAP_BIDIRECTIONAL 0
-#define DMA_MAP_TO_DEVICE 1
-#define DMA_MAP_FROM_DEVICE 2
-
-struct map_benchmark {
- __u64 avg_map_100ns; /* average map latency in 100ns */
- __u64 map_stddev; /* standard deviation of map latency */
- __u64 avg_unmap_100ns; /* as above */
- __u64 unmap_stddev;
- __u32 threads; /* how many threads will do map/unmap in parallel */
- __u32 seconds; /* how long the test will last */
- __s32 node; /* which numa node this benchmark will run on */
- __u32 dma_bits; /* DMA addressing capability */
- __u32 dma_dir; /* DMA data direction */
- __u32 dma_trans_ns; /* time for DMA transmission in ns */
- __u32 granule; /* how many PAGE_SIZE will do map/unmap once a time */
- __u8 expansion[76]; /* For future use */
-};
+#include <uapi/linux/map_benchmark.h>
struct map_benchmark_data {
struct map_benchmark bparam;
@@ -112,6 +89,22 @@ static int map_benchmark_thread(void *data)
atomic64_add(map_sq, &map->sum_sq_map);
atomic64_add(unmap_sq, &map->sum_sq_unmap);
atomic64_inc(&map->loops);
+
+ /*
+ * We may test for a long time so periodically check whether
+ * we need to schedule to avoid starving the others. Otherwise
+ * we may hangup the kernel in a non-preemptible kernel when
+ * the test kthreads number >= CPU number, the test kthreads
+ * will run endless on every CPU since the thread resposible
+ * for notifying the kthread stop (in do_map_benchmark())
+ * could not be scheduled.
+ *
+ * Note this may degrade the test concurrency since the test
+ * threads may need to share the CPU time with other load
+ * in the system. So it's recommended to run this benchmark
+ * on an idle system.
+ */
+ cond_resched();
}
out:
@@ -124,7 +117,6 @@ static int do_map_benchmark(struct map_benchmark_data *map)
struct task_struct **tsk;
int threads = map->bparam.threads;
int node = map->bparam.node;
- const cpumask_t *cpu_mask = cpumask_of_node(node);
u64 loops;
int ret = 0;
int i;
@@ -141,11 +133,13 @@ static int do_map_benchmark(struct map_benchmark_data *map)
if (IS_ERR(tsk[i])) {
pr_err("create dma_map thread failed\n");
ret = PTR_ERR(tsk[i]);
+ while (--i >= 0)
+ kthread_stop(tsk[i]);
goto out;
}
if (node != NUMA_NO_NODE)
- kthread_bind_mask(tsk[i], cpu_mask);
+ kthread_bind_mask(tsk[i], cpumask_of_node(node));
}
/* clear the old value in the previous benchmark */
@@ -162,13 +156,17 @@ static int do_map_benchmark(struct map_benchmark_data *map)
msleep_interruptible(map->bparam.seconds * 1000);
- /* wait for the completion of benchmark threads */
+ /* wait for the completion of all started benchmark threads */
for (i = 0; i < threads; i++) {
- ret = kthread_stop(tsk[i]);
- if (ret)
- goto out;
+ int kthread_ret = kthread_stop_put(tsk[i]);
+
+ if (kthread_ret)
+ ret = kthread_ret;
}
+ if (ret)
+ goto out;
+
loops = atomic64_read(&map->loops);
if (likely(loops > 0)) {
u64 map_variance, unmap_variance;
@@ -193,8 +191,6 @@ static int do_map_benchmark(struct map_benchmark_data *map)
}
out:
- for (i = 0; i < threads; i++)
- put_task_struct(tsk[i]);
put_device(map->dev);
kfree(tsk);
return ret;
@@ -231,7 +227,8 @@ static long map_benchmark_ioctl(struct file *file, unsigned int cmd,
}
if (map->bparam.node != NUMA_NO_NODE &&
- !node_possible(map->bparam.node)) {
+ (map->bparam.node < 0 || map->bparam.node >= MAX_NUMNODES ||
+ !node_possible(map->bparam.node))) {
pr_err("invalid numa node\n");
return -EINVAL;
}
@@ -275,6 +272,9 @@ static long map_benchmark_ioctl(struct file *file, unsigned int cmd,
* dma_mask changed by benchmark
*/
dma_set_mask(map->dev, old_dma_mask);
+
+ if (ret)
+ return ret;
break;
default:
return -EINVAL;
@@ -379,4 +379,3 @@ module_exit(map_benchmark_cleanup);
MODULE_AUTHOR("Barry Song <song.bao.hua@hisilicon.com>");
MODULE_DESCRIPTION("dma_map benchmark driver");
-MODULE_LICENSE("GPL");
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 9478eccd1c8e..37163eb49f9f 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -10,13 +10,22 @@
#include <linux/dma-map-ops.h>
#include <linux/export.h>
#include <linux/gfp.h>
+#include <linux/iommu-dma.h>
+#include <linux/kmsan.h>
#include <linux/of_device.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include "debug.h"
#include "direct.h"
-bool dma_default_coherent;
+#define CREATE_TRACE_POINTS
+#include <trace/events/dma.h>
+
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
+ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
+ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
+bool dma_default_coherent = IS_ENABLED(CONFIG_ARCH_DMA_DEFAULT_COHERENT);
+#endif
/*
* Managed DMA API
@@ -62,8 +71,8 @@ void dmam_free_coherent(struct device *dev, size_t size, void *vaddr,
{
struct dma_devres match_data = { size, vaddr, dma_handle };
- dma_free_coherent(dev, size, vaddr, dma_handle);
WARN_ON(devres_destroy(dev, dmam_release, dmam_match, &match_data));
+ dma_free_coherent(dev, size, vaddr, dma_handle);
}
EXPORT_SYMBOL(dmam_free_coherent);
@@ -111,8 +120,12 @@ EXPORT_SYMBOL(dmam_alloc_attrs);
static bool dma_go_direct(struct device *dev, dma_addr_t mask,
const struct dma_map_ops *ops)
{
+ if (use_dma_iommu(dev))
+ return false;
+
if (likely(!ops))
return true;
+
#ifdef CONFIG_DMA_OPS_BYPASS
if (dev->dma_ops_bypass)
return min_not_zero(mask, dev->bus_dma_limit) >=
@@ -139,12 +152,12 @@ static inline bool dma_map_direct(struct device *dev,
return dma_go_direct(dev, *dev->dma_mask, ops);
}
-dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
- size_t offset, size_t size, enum dma_data_direction dir,
- unsigned long attrs)
+dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size,
+ enum dma_data_direction dir, unsigned long attrs)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
- dma_addr_t addr;
+ bool is_mmio = attrs & DMA_ATTR_MMIO;
+ dma_addr_t addr = DMA_MAPPING_ERROR;
BUG_ON(!valid_dma_direction(dir));
@@ -152,28 +165,65 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
return DMA_MAPPING_ERROR;
if (dma_map_direct(dev, ops) ||
- arch_dma_map_page_direct(dev, page_to_phys(page) + offset + size))
- addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
- else
- addr = ops->map_page(dev, page, offset, size, dir, attrs);
- debug_dma_map_page(dev, page, offset, size, dir, addr, attrs);
+ (!is_mmio && arch_dma_map_phys_direct(dev, phys + size)))
+ addr = dma_direct_map_phys(dev, phys, size, dir, attrs);
+ else if (use_dma_iommu(dev))
+ addr = iommu_dma_map_phys(dev, phys, size, dir, attrs);
+ else if (ops->map_phys)
+ addr = ops->map_phys(dev, phys, size, dir, attrs);
+
+ if (!is_mmio)
+ kmsan_handle_dma(phys, size, dir);
+ trace_dma_map_phys(dev, phys, addr, size, dir, attrs);
+ debug_dma_map_phys(dev, phys, size, dir, addr, attrs);
return addr;
}
+EXPORT_SYMBOL_GPL(dma_map_phys);
+
+dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
+ size_t offset, size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ phys_addr_t phys = page_to_phys(page) + offset;
+
+ if (unlikely(attrs & DMA_ATTR_MMIO))
+ return DMA_MAPPING_ERROR;
+
+ if (IS_ENABLED(CONFIG_DMA_API_DEBUG) &&
+ WARN_ON_ONCE(is_zone_device_page(page)))
+ return DMA_MAPPING_ERROR;
+
+ return dma_map_phys(dev, phys, size, dir, attrs);
+}
EXPORT_SYMBOL(dma_map_page_attrs);
-void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size,
+void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size,
enum dma_data_direction dir, unsigned long attrs)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
+ bool is_mmio = attrs & DMA_ATTR_MMIO;
BUG_ON(!valid_dma_direction(dir));
if (dma_map_direct(dev, ops) ||
- arch_dma_unmap_page_direct(dev, addr + size))
- dma_direct_unmap_page(dev, addr, size, dir, attrs);
- else if (ops->unmap_page)
- ops->unmap_page(dev, addr, size, dir, attrs);
- debug_dma_unmap_page(dev, addr, size, dir);
+ (!is_mmio && arch_dma_unmap_phys_direct(dev, addr + size)))
+ dma_direct_unmap_phys(dev, addr, size, dir, attrs);
+ else if (use_dma_iommu(dev))
+ iommu_dma_unmap_phys(dev, addr, size, dir, attrs);
+ else if (ops->unmap_phys)
+ ops->unmap_phys(dev, addr, size, dir, attrs);
+ trace_dma_unmap_phys(dev, addr, size, dir, attrs);
+ debug_dma_unmap_phys(dev, addr, size, dir);
+}
+EXPORT_SYMBOL_GPL(dma_unmap_phys);
+
+void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ if (unlikely(attrs & DMA_ATTR_MMIO))
+ return;
+
+ dma_unmap_phys(dev, addr, size, dir, attrs);
}
EXPORT_SYMBOL(dma_unmap_page_attrs);
@@ -191,14 +241,20 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
if (dma_map_direct(dev, ops) ||
arch_dma_map_sg_direct(dev, sg, nents))
ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);
+ else if (use_dma_iommu(dev))
+ ents = iommu_dma_map_sg(dev, sg, nents, dir, attrs);
else
ents = ops->map_sg(dev, sg, nents, dir, attrs);
- if (ents > 0)
+ if (ents > 0) {
+ kmsan_handle_dma_sg(sg, nents, dir);
+ trace_dma_map_sg(dev, sg, nents, ents, dir, attrs);
debug_dma_map_sg(dev, sg, nents, ents, dir, attrs);
- else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM &&
- ents != -EIO))
+ } else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM &&
+ ents != -EIO && ents != -EREMOTEIO)) {
+ trace_dma_map_sg_err(dev, sg, nents, ents, dir, attrs);
return -EIO;
+ }
return ents;
}
@@ -249,12 +305,15 @@ EXPORT_SYMBOL(dma_map_sg_attrs);
* Returns 0 on success or a negative error code on error. The following
* error codes are supported with the given meaning:
*
- * -EINVAL An invalid argument, unaligned access or other error
- * in usage. Will not succeed if retried.
- * -ENOMEM Insufficient resources (like memory or IOVA space) to
- * complete the mapping. Should succeed if retried later.
- * -EIO Legacy error code with an unknown meaning. eg. this is
- * returned if a lower level call returned DMA_MAPPING_ERROR.
+ * -EINVAL An invalid argument, unaligned access or other error
+ * in usage. Will not succeed if retried.
+ * -ENOMEM Insufficient resources (like memory or IOVA space) to
+ * complete the mapping. Should succeed if retried later.
+ * -EIO Legacy error code with an unknown meaning. eg. this is
+ * returned if a lower level call returned
+ * DMA_MAPPING_ERROR.
+ * -EREMOTEIO The DMA device cannot access P2PDMA memory specified
+ * in the sg_table. This will not succeed if retried.
*/
int dma_map_sgtable(struct device *dev, struct sg_table *sgt,
enum dma_data_direction dir, unsigned long attrs)
@@ -276,10 +335,13 @@ void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
const struct dma_map_ops *ops = get_dma_ops(dev);
BUG_ON(!valid_dma_direction(dir));
+ trace_dma_unmap_sg(dev, sg, nents, dir, attrs);
debug_dma_unmap_sg(dev, sg, nents, dir);
if (dma_map_direct(dev, ops) ||
arch_dma_unmap_sg_direct(dev, sg, nents))
dma_direct_unmap_sg(dev, sg, nents, dir, attrs);
+ else if (use_dma_iommu(dev))
+ iommu_dma_unmap_sg(dev, sg, nents, dir, attrs);
else if (ops->unmap_sg)
ops->unmap_sg(dev, sg, nents, dir, attrs);
}
@@ -288,37 +350,23 @@ EXPORT_SYMBOL(dma_unmap_sg_attrs);
dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr,
size_t size, enum dma_data_direction dir, unsigned long attrs)
{
- const struct dma_map_ops *ops = get_dma_ops(dev);
- dma_addr_t addr = DMA_MAPPING_ERROR;
-
- BUG_ON(!valid_dma_direction(dir));
-
- if (WARN_ON_ONCE(!dev->dma_mask))
+ if (IS_ENABLED(CONFIG_DMA_API_DEBUG) &&
+ WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr))))
return DMA_MAPPING_ERROR;
- if (dma_map_direct(dev, ops))
- addr = dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
- else if (ops->map_resource)
- addr = ops->map_resource(dev, phys_addr, size, dir, attrs);
-
- debug_dma_map_resource(dev, phys_addr, size, dir, addr, attrs);
- return addr;
+ return dma_map_phys(dev, phys_addr, size, dir, attrs | DMA_ATTR_MMIO);
}
EXPORT_SYMBOL(dma_map_resource);
void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size,
enum dma_data_direction dir, unsigned long attrs)
{
- const struct dma_map_ops *ops = get_dma_ops(dev);
-
- BUG_ON(!valid_dma_direction(dir));
- if (!dma_map_direct(dev, ops) && ops->unmap_resource)
- ops->unmap_resource(dev, addr, size, dir, attrs);
- debug_dma_unmap_resource(dev, addr, size, dir);
+ dma_unmap_phys(dev, addr, size, dir, attrs | DMA_ATTR_MMIO);
}
EXPORT_SYMBOL(dma_unmap_resource);
-void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
+#ifdef CONFIG_DMA_NEED_SYNC
+void __dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
enum dma_data_direction dir)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
@@ -326,13 +374,16 @@ void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
BUG_ON(!valid_dma_direction(dir));
if (dma_map_direct(dev, ops))
dma_direct_sync_single_for_cpu(dev, addr, size, dir);
+ else if (use_dma_iommu(dev))
+ iommu_dma_sync_single_for_cpu(dev, addr, size, dir);
else if (ops->sync_single_for_cpu)
ops->sync_single_for_cpu(dev, addr, size, dir);
+ trace_dma_sync_single_for_cpu(dev, addr, size, dir);
debug_dma_sync_single_for_cpu(dev, addr, size, dir);
}
-EXPORT_SYMBOL(dma_sync_single_for_cpu);
+EXPORT_SYMBOL(__dma_sync_single_for_cpu);
-void dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
+void __dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
size_t size, enum dma_data_direction dir)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
@@ -340,13 +391,16 @@ void dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
BUG_ON(!valid_dma_direction(dir));
if (dma_map_direct(dev, ops))
dma_direct_sync_single_for_device(dev, addr, size, dir);
+ else if (use_dma_iommu(dev))
+ iommu_dma_sync_single_for_device(dev, addr, size, dir);
else if (ops->sync_single_for_device)
ops->sync_single_for_device(dev, addr, size, dir);
+ trace_dma_sync_single_for_device(dev, addr, size, dir);
debug_dma_sync_single_for_device(dev, addr, size, dir);
}
-EXPORT_SYMBOL(dma_sync_single_for_device);
+EXPORT_SYMBOL(__dma_sync_single_for_device);
-void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
+void __dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
int nelems, enum dma_data_direction dir)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
@@ -354,13 +408,16 @@ void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
BUG_ON(!valid_dma_direction(dir));
if (dma_map_direct(dev, ops))
dma_direct_sync_sg_for_cpu(dev, sg, nelems, dir);
+ else if (use_dma_iommu(dev))
+ iommu_dma_sync_sg_for_cpu(dev, sg, nelems, dir);
else if (ops->sync_sg_for_cpu)
ops->sync_sg_for_cpu(dev, sg, nelems, dir);
+ trace_dma_sync_sg_for_cpu(dev, sg, nelems, dir);
debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir);
}
-EXPORT_SYMBOL(dma_sync_sg_for_cpu);
+EXPORT_SYMBOL(__dma_sync_sg_for_cpu);
-void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
+void __dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
int nelems, enum dma_data_direction dir)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
@@ -368,11 +425,72 @@ void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
BUG_ON(!valid_dma_direction(dir));
if (dma_map_direct(dev, ops))
dma_direct_sync_sg_for_device(dev, sg, nelems, dir);
+ else if (use_dma_iommu(dev))
+ iommu_dma_sync_sg_for_device(dev, sg, nelems, dir);
else if (ops->sync_sg_for_device)
ops->sync_sg_for_device(dev, sg, nelems, dir);
+ trace_dma_sync_sg_for_device(dev, sg, nelems, dir);
debug_dma_sync_sg_for_device(dev, sg, nelems, dir);
}
-EXPORT_SYMBOL(dma_sync_sg_for_device);
+EXPORT_SYMBOL(__dma_sync_sg_for_device);
+
+bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (dma_map_direct(dev, ops))
+ /*
+ * dma_skip_sync could've been reset on first SWIOTLB buffer
+ * mapping, but @dma_addr is not necessary an SWIOTLB buffer.
+ * In this case, fall back to more granular check.
+ */
+ return dma_direct_need_sync(dev, dma_addr);
+ return true;
+}
+EXPORT_SYMBOL_GPL(__dma_need_sync);
+
+/**
+ * dma_need_unmap - does this device need dma_unmap_* operations
+ * @dev: device to check
+ *
+ * If this function returns %false, drivers can skip calling dma_unmap_* after
+ * finishing an I/O. This function must be called after all mappings that might
+ * need to be unmapped have been performed.
+ */
+bool dma_need_unmap(struct device *dev)
+{
+ if (!dma_map_direct(dev, get_dma_ops(dev)))
+ return true;
+ if (!dev->dma_skip_sync)
+ return true;
+ return IS_ENABLED(CONFIG_DMA_API_DEBUG);
+}
+EXPORT_SYMBOL_GPL(dma_need_unmap);
+
+static void dma_setup_need_sync(struct device *dev)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (dma_map_direct(dev, ops) || use_dma_iommu(dev))
+ /*
+ * dma_skip_sync will be reset to %false on first SWIOTLB buffer
+ * mapping, if any. During the device initialization, it's
+ * enough to check only for the DMA coherence.
+ */
+ dev->dma_skip_sync = dev_is_dma_coherent(dev);
+ else if (!ops->sync_single_for_device && !ops->sync_single_for_cpu &&
+ !ops->sync_sg_for_device && !ops->sync_sg_for_cpu)
+ /*
+ * Synchronization is not possible when none of DMA sync ops
+ * is set.
+ */
+ dev->dma_skip_sync = true;
+ else
+ dev->dma_skip_sync = false;
+}
+#else /* !CONFIG_DMA_NEED_SYNC */
+static inline void dma_setup_need_sync(struct device *dev) { }
+#endif /* !CONFIG_DMA_NEED_SYNC */
/*
* The whole dma_get_sgtable() idea is fundamentally unsafe - it seems
@@ -394,6 +512,9 @@ int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt,
if (dma_alloc_direct(dev, ops))
return dma_direct_get_sgtable(dev, sgt, cpu_addr, dma_addr,
size, attrs);
+ if (use_dma_iommu(dev))
+ return iommu_dma_get_sgtable(dev, sgt, cpu_addr, dma_addr,
+ size, attrs);
if (!ops->get_sgtable)
return -ENXIO;
return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size, attrs);
@@ -407,8 +528,6 @@ EXPORT_SYMBOL(dma_get_sgtable_attrs);
*/
pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs)
{
- if (force_dma_unencrypted(dev))
- prot = pgprot_decrypted(prot);
if (dev_is_dma_coherent(dev))
return prot;
#ifdef CONFIG_ARCH_HAS_DMA_WRITE_COMBINE
@@ -432,6 +551,8 @@ bool dma_can_mmap(struct device *dev)
if (dma_alloc_direct(dev, ops))
return dma_direct_can_mmap(dev);
+ if (use_dma_iommu(dev))
+ return true;
return ops->mmap != NULL;
}
EXPORT_SYMBOL_GPL(dma_can_mmap);
@@ -458,6 +579,9 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
if (dma_alloc_direct(dev, ops))
return dma_direct_mmap(dev, vma, cpu_addr, dma_addr, size,
attrs);
+ if (use_dma_iommu(dev))
+ return iommu_dma_mmap(dev, vma, cpu_addr, dma_addr, size,
+ attrs);
if (!ops->mmap)
return -ENXIO;
return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
@@ -470,6 +594,10 @@ u64 dma_get_required_mask(struct device *dev)
if (dma_alloc_direct(dev, ops))
return dma_direct_get_required_mask(dev);
+
+ if (use_dma_iommu(dev))
+ return DMA_BIT_MASK(32);
+
if (ops->get_required_mask)
return ops->get_required_mask(dev);
@@ -493,19 +621,37 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
WARN_ON_ONCE(!dev->coherent_dma_mask);
- if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr))
+ /*
+ * DMA allocations can never be turned back into a page pointer, so
+ * requesting compound pages doesn't make sense (and can't even be
+ * supported at all by various backends).
+ */
+ if (WARN_ON_ONCE(flag & __GFP_COMP))
+ return NULL;
+
+ if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr)) {
+ trace_dma_alloc(dev, cpu_addr, *dma_handle, size,
+ DMA_BIDIRECTIONAL, flag, attrs);
return cpu_addr;
+ }
/* let the implementation decide on the zone to allocate from: */
flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
- if (dma_alloc_direct(dev, ops))
+ if (dma_alloc_direct(dev, ops)) {
cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs);
- else if (ops->alloc)
+ } else if (use_dma_iommu(dev)) {
+ cpu_addr = iommu_dma_alloc(dev, size, dma_handle, flag, attrs);
+ } else if (ops->alloc) {
cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
- else
+ } else {
+ trace_dma_alloc(dev, NULL, 0, size, DMA_BIDIRECTIONAL, flag,
+ attrs);
return NULL;
+ }
+ trace_dma_alloc(dev, cpu_addr, *dma_handle, size, DMA_BIDIRECTIONAL,
+ flag, attrs);
debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr, attrs);
return cpu_addr;
}
@@ -527,12 +673,16 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
*/
WARN_ON(irqs_disabled());
+ trace_dma_free(dev, cpu_addr, dma_handle, size, DMA_BIDIRECTIONAL,
+ attrs);
if (!cpu_addr)
return;
debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
if (dma_alloc_direct(dev, ops))
dma_direct_free(dev, size, cpu_addr, dma_handle, attrs);
+ else if (use_dma_iommu(dev))
+ iommu_dma_free(dev, size, cpu_addr, dma_handle, attrs);
else if (ops->free)
ops->free(dev, size, cpu_addr, dma_handle, attrs);
}
@@ -547,13 +697,17 @@ static struct page *__dma_alloc_pages(struct device *dev, size_t size,
return NULL;
if (WARN_ON_ONCE(gfp & (__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM)))
return NULL;
+ if (WARN_ON_ONCE(gfp & __GFP_COMP))
+ return NULL;
size = PAGE_ALIGN(size);
if (dma_alloc_direct(dev, ops))
return dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp);
- if (!ops->alloc_pages)
+ if (use_dma_iommu(dev))
+ return dma_common_alloc_pages(dev, size, dma_handle, dir, gfp);
+ if (!ops->alloc_pages_op)
return NULL;
- return ops->alloc_pages(dev, size, dma_handle, dir, gfp);
+ return ops->alloc_pages_op(dev, size, dma_handle, dir, gfp);
}
struct page *dma_alloc_pages(struct device *dev, size_t size,
@@ -561,8 +715,13 @@ struct page *dma_alloc_pages(struct device *dev, size_t size,
{
struct page *page = __dma_alloc_pages(dev, size, dma_handle, dir, gfp);
- if (page)
- debug_dma_map_page(dev, page, 0, size, dir, *dma_handle, 0);
+ if (page) {
+ trace_dma_alloc_pages(dev, page_to_virt(page), *dma_handle,
+ size, dir, gfp, 0);
+ debug_dma_alloc_pages(dev, page, size, dir, *dma_handle, 0);
+ } else {
+ trace_dma_alloc_pages(dev, NULL, 0, size, dir, gfp, 0);
+ }
return page;
}
EXPORT_SYMBOL_GPL(dma_alloc_pages);
@@ -575,6 +734,8 @@ static void __dma_free_pages(struct device *dev, size_t size, struct page *page,
size = PAGE_ALIGN(size);
if (dma_alloc_direct(dev, ops))
dma_direct_free_pages(dev, size, page, dma_handle, dir);
+ else if (use_dma_iommu(dev))
+ dma_common_free_pages(dev, size, page, dma_handle, dir);
else if (ops->free_pages)
ops->free_pages(dev, size, page, dma_handle, dir);
}
@@ -582,7 +743,8 @@ static void __dma_free_pages(struct device *dev, size_t size, struct page *page,
void dma_free_pages(struct device *dev, size_t size, struct page *page,
dma_addr_t dma_handle, enum dma_data_direction dir)
{
- debug_dma_unmap_page(dev, dma_handle, size, dir);
+ trace_dma_free_pages(dev, page_to_virt(page), dma_handle, size, dir, 0);
+ debug_dma_free_pages(dev, page, size, dir, dma_handle);
__dma_free_pages(dev, size, page, dma_handle, dir);
}
EXPORT_SYMBOL_GPL(dma_free_pages);
@@ -627,20 +789,24 @@ out_free_sgt:
struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size,
enum dma_data_direction dir, gfp_t gfp, unsigned long attrs)
{
- const struct dma_map_ops *ops = get_dma_ops(dev);
struct sg_table *sgt;
if (WARN_ON_ONCE(attrs & ~DMA_ATTR_ALLOC_SINGLE_PAGES))
return NULL;
+ if (WARN_ON_ONCE(gfp & __GFP_COMP))
+ return NULL;
- if (ops && ops->alloc_noncontiguous)
- sgt = ops->alloc_noncontiguous(dev, size, dir, gfp, attrs);
+ if (use_dma_iommu(dev))
+ sgt = iommu_dma_alloc_noncontiguous(dev, size, dir, gfp, attrs);
else
sgt = alloc_single_sgt(dev, size, dir, gfp);
if (sgt) {
sgt->nents = 1;
+ trace_dma_alloc_sgt(dev, sgt, size, dir, gfp, attrs);
debug_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir, attrs);
+ } else {
+ trace_dma_alloc_sgt_err(dev, NULL, 0, size, dir, gfp, attrs);
}
return sgt;
}
@@ -658,11 +824,11 @@ static void free_single_sgt(struct device *dev, size_t size,
void dma_free_noncontiguous(struct device *dev, size_t size,
struct sg_table *sgt, enum dma_data_direction dir)
{
- const struct dma_map_ops *ops = get_dma_ops(dev);
-
+ trace_dma_free_sgt(dev, sgt, size, dir);
debug_dma_unmap_sg(dev, sgt->sgl, sgt->orig_nents, dir);
- if (ops && ops->free_noncontiguous)
- ops->free_noncontiguous(dev, size, sgt, dir);
+
+ if (use_dma_iommu(dev))
+ iommu_dma_free_noncontiguous(dev, size, sgt, dir);
else
free_single_sgt(dev, size, sgt, dir);
}
@@ -671,62 +837,67 @@ EXPORT_SYMBOL_GPL(dma_free_noncontiguous);
void *dma_vmap_noncontiguous(struct device *dev, size_t size,
struct sg_table *sgt)
{
- const struct dma_map_ops *ops = get_dma_ops(dev);
- unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
- if (ops && ops->alloc_noncontiguous)
- return vmap(sgt_handle(sgt)->pages, count, VM_MAP, PAGE_KERNEL);
+ if (use_dma_iommu(dev))
+ return iommu_dma_vmap_noncontiguous(dev, size, sgt);
+
return page_address(sg_page(sgt->sgl));
}
EXPORT_SYMBOL_GPL(dma_vmap_noncontiguous);
void dma_vunmap_noncontiguous(struct device *dev, void *vaddr)
{
- const struct dma_map_ops *ops = get_dma_ops(dev);
-
- if (ops && ops->alloc_noncontiguous)
- vunmap(vaddr);
+ if (use_dma_iommu(dev))
+ iommu_dma_vunmap_noncontiguous(dev, vaddr);
}
EXPORT_SYMBOL_GPL(dma_vunmap_noncontiguous);
int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma,
size_t size, struct sg_table *sgt)
{
+ if (use_dma_iommu(dev))
+ return iommu_dma_mmap_noncontiguous(dev, vma, size, sgt);
+ return dma_mmap_pages(dev, vma, size, sg_page(sgt->sgl));
+}
+EXPORT_SYMBOL_GPL(dma_mmap_noncontiguous);
+
+static int dma_supported(struct device *dev, u64 mask)
+{
const struct dma_map_ops *ops = get_dma_ops(dev);
- if (ops && ops->alloc_noncontiguous) {
- unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ if (use_dma_iommu(dev)) {
+ if (WARN_ON(ops))
+ return false;
+ return true;
+ }
- if (vma->vm_pgoff >= count ||
- vma_pages(vma) > count - vma->vm_pgoff)
- return -ENXIO;
- return vm_map_pages(vma, sgt_handle(sgt)->pages, count);
+ /*
+ * ->dma_supported sets and clears the bypass flag, so ignore it here
+ * and always call into the method if there is one.
+ */
+ if (ops) {
+ if (!ops->dma_supported)
+ return true;
+ return ops->dma_supported(dev, mask);
}
- return dma_mmap_pages(dev, vma, size, sg_page(sgt->sgl));
+
+ return dma_direct_supported(dev, mask);
}
-EXPORT_SYMBOL_GPL(dma_mmap_noncontiguous);
-int dma_supported(struct device *dev, u64 mask)
+bool dma_pci_p2pdma_supported(struct device *dev)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
/*
- * ->dma_supported sets the bypass flag, so we must always call
- * into the method here unless the device is truly direct mapped.
+ * Note: dma_ops_bypass is not checked here because P2PDMA should
+ * not be used with dma mapping ops that do not have support even
+ * if the specific device is bypassing them.
*/
- if (!ops)
- return dma_direct_supported(dev, mask);
- if (!ops->dma_supported)
- return 1;
- return ops->dma_supported(dev, mask);
-}
-EXPORT_SYMBOL(dma_supported);
-#ifdef CONFIG_ARCH_HAS_DMA_SET_MASK
-void arch_dma_set_mask(struct device *dev, u64 mask);
-#else
-#define arch_dma_set_mask(dev, mask) do { } while (0)
-#endif
+ /* if ops is not set, dma direct and default IOMMU support P2PDMA */
+ return !ops;
+}
+EXPORT_SYMBOL_GPL(dma_pci_p2pdma_supported);
int dma_set_mask(struct device *dev, u64 mask)
{
@@ -741,11 +912,12 @@ int dma_set_mask(struct device *dev, u64 mask)
arch_dma_set_mask(dev, mask);
*dev->dma_mask = mask;
+ dma_setup_need_sync(dev);
+
return 0;
}
EXPORT_SYMBOL(dma_set_mask);
-#ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK
int dma_set_coherent_mask(struct device *dev, u64 mask)
{
/*
@@ -761,7 +933,37 @@ int dma_set_coherent_mask(struct device *dev, u64 mask)
return 0;
}
EXPORT_SYMBOL(dma_set_coherent_mask);
-#endif
+
+static bool __dma_addressing_limited(struct device *dev)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) <
+ dma_get_required_mask(dev))
+ return true;
+
+ if (unlikely(ops) || use_dma_iommu(dev))
+ return false;
+ return !dma_direct_all_ram_mapped(dev);
+}
+
+/**
+ * dma_addressing_limited - return if the device is addressing limited
+ * @dev: device to check
+ *
+ * Return %true if the devices DMA mask is too small to address all memory in
+ * the system, else %false. Lack of addressing bits is the prime reason for
+ * bounce buffering, but might not be the only one.
+ */
+bool dma_addressing_limited(struct device *dev)
+{
+ if (!__dma_addressing_limited(dev))
+ return false;
+
+ dev_dbg(dev, "device is DMA addressing limited\n");
+ return true;
+}
+EXPORT_SYMBOL_GPL(dma_addressing_limited);
size_t dma_max_mapping_size(struct device *dev)
{
@@ -770,6 +972,8 @@ size_t dma_max_mapping_size(struct device *dev)
if (dma_map_direct(dev, ops))
size = dma_direct_max_mapping_size(dev);
+ else if (use_dma_iommu(dev))
+ size = iommu_dma_max_mapping_size(dev);
else if (ops && ops->max_mapping_size)
size = ops->max_mapping_size(dev);
@@ -777,20 +981,27 @@ size_t dma_max_mapping_size(struct device *dev)
}
EXPORT_SYMBOL_GPL(dma_max_mapping_size);
-bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
+size_t dma_opt_mapping_size(struct device *dev)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
+ size_t size = SIZE_MAX;
- if (dma_map_direct(dev, ops))
- return dma_direct_need_sync(dev, dma_addr);
- return ops->sync_single_for_cpu || ops->sync_single_for_device;
+ if (use_dma_iommu(dev))
+ size = iommu_dma_opt_mapping_size();
+ else if (ops && ops->opt_mapping_size)
+ size = ops->opt_mapping_size();
+
+ return min(dma_max_mapping_size(dev), size);
}
-EXPORT_SYMBOL_GPL(dma_need_sync);
+EXPORT_SYMBOL_GPL(dma_opt_mapping_size);
unsigned long dma_get_merge_boundary(struct device *dev)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
+ if (use_dma_iommu(dev))
+ return iommu_dma_get_merge_boundary(dev);
+
if (!ops || !ops->get_merge_boundary)
return 0; /* can't merge */
diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c
index af4a6ef48ce0..20caf9cabf69 100644
--- a/kernel/dma/ops_helpers.c
+++ b/kernel/dma/ops_helpers.c
@@ -4,6 +4,7 @@
* the allocated memory contains normal pages in the direct kernel mapping.
*/
#include <linux/dma-map-ops.h>
+#include <linux/iommu-dma.h>
static struct page *dma_common_vaddr_to_page(void *cpu_addr)
{
@@ -63,6 +64,7 @@ struct page *dma_common_alloc_pages(struct device *dev, size_t size,
{
const struct dma_map_ops *ops = get_dma_ops(dev);
struct page *page;
+ phys_addr_t phys;
page = dma_alloc_contiguous(dev, size, gfp);
if (!page)
@@ -70,8 +72,13 @@ struct page *dma_common_alloc_pages(struct device *dev, size_t size,
if (!page)
return NULL;
- *dma_handle = ops->map_page(dev, page, 0, size, dir,
- DMA_ATTR_SKIP_CPU_SYNC);
+ phys = page_to_phys(page);
+ if (use_dma_iommu(dev))
+ *dma_handle = iommu_dma_map_phys(dev, phys, size, dir,
+ DMA_ATTR_SKIP_CPU_SYNC);
+ else
+ *dma_handle = ops->map_phys(dev, phys, size, dir,
+ DMA_ATTR_SKIP_CPU_SYNC);
if (*dma_handle == DMA_MAPPING_ERROR) {
dma_free_contiguous(dev, page, size);
return NULL;
@@ -86,8 +93,11 @@ void dma_common_free_pages(struct device *dev, size_t size, struct page *page,
{
const struct dma_map_ops *ops = get_dma_ops(dev);
- if (ops->unmap_page)
- ops->unmap_page(dev, dma_handle, size, dir,
+ if (use_dma_iommu(dev))
+ iommu_dma_unmap_phys(dev, dma_handle, size, dir,
+ DMA_ATTR_SKIP_CPU_SYNC);
+ else if (ops->unmap_phys)
+ ops->unmap_phys(dev, dma_handle, size, dir,
DMA_ATTR_SKIP_CPU_SYNC);
dma_free_contiguous(dev, page, size);
}
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index 4d40dcce7604..ee45dee33d49 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -70,9 +70,9 @@ static bool cma_in_zone(gfp_t gfp)
/* CMA can't cross zone boundaries, see cma_activate_area() */
end = cma_get_base(cma) + size - 1;
if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
- return end <= DMA_BIT_MASK(zone_dma_bits);
+ return end <= zone_dma_limit;
if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
- return end <= DMA_BIT_MASK(32);
+ return end <= max(DMA_BIT_MASK(32), zone_dma_limit);
return true;
}
@@ -84,8 +84,8 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
void *addr;
int ret = -ENOMEM;
- /* Cannot allocate larger than MAX_ORDER-1 */
- order = min(get_order(pool_size), MAX_ORDER-1);
+ /* Cannot allocate larger than MAX_PAGE_ORDER */
+ order = min(get_order(pool_size), MAX_PAGE_ORDER);
do {
pool_size = 1 << (PAGE_SHIFT + order);
@@ -102,8 +102,8 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
#ifdef CONFIG_DMA_DIRECT_REMAP
addr = dma_common_contiguous_remap(page, pool_size,
- pgprot_dmacoherent(PAGE_KERNEL),
- __builtin_return_address(0));
+ pgprot_decrypted(pgprot_dmacoherent(PAGE_KERNEL)),
+ __builtin_return_address(0));
if (!addr)
goto free_page;
#else
@@ -135,9 +135,9 @@ encrypt_mapping:
remove_mapping:
#ifdef CONFIG_DMA_DIRECT_REMAP
dma_common_free_remap(addr, pool_size);
-#endif
-free_page: __maybe_unused
+free_page:
__free_pages(page, order);
+#endif
out:
return ret;
}
@@ -190,7 +190,7 @@ static int __init dma_atomic_pool_init(void)
/*
* If coherent_pool was not used on the command line, default the pool
- * sizes to 128KB per 1GB of memory, min 128KB, max MAX_ORDER-1.
+ * sizes to 128KB per 1GB of memory, min 128KB, max MAX_PAGE_ORDER.
*/
if (!atomic_pool_size) {
unsigned long pages = totalram_pages() / (SZ_1G / SZ_128K);
diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
index b4526668072e..b7c1c0c92d0c 100644
--- a/kernel/dma/remap.c
+++ b/kernel/dma/remap.c
@@ -10,8 +10,10 @@ struct page **dma_common_find_pages(void *cpu_addr)
{
struct vm_struct *area = find_vm_area(cpu_addr);
- if (!area || area->flags != VM_DMA_COHERENT)
+ if (!area || !(area->flags & VM_DMA_COHERENT))
return NULL;
+ WARN(area->flags != VM_DMA_COHERENT,
+ "unexpected flags in area: %p\n", cpu_addr);
return area->pages;
}
@@ -43,13 +45,13 @@ void *dma_common_contiguous_remap(struct page *page, size_t size,
void *vaddr;
int i;
- pages = kmalloc_array(count, sizeof(struct page *), GFP_KERNEL);
+ pages = kvmalloc_array(count, sizeof(struct page *), GFP_KERNEL);
if (!pages)
return NULL;
for (i = 0; i < count; i++)
- pages[i] = nth_page(page, i);
+ pages[i] = page++;
vaddr = vmap(pages, count, VM_DMA_COHERENT, prot);
- kfree(pages);
+ kvfree(pages);
return vaddr;
}
@@ -61,7 +63,7 @@ void dma_common_free_remap(void *cpu_addr, size_t size)
{
struct vm_struct *area = find_vm_area(cpu_addr);
- if (!area || area->flags != VM_DMA_COHERENT) {
+ if (!area || !(area->flags & VM_DMA_COHERENT)) {
WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr);
return;
}
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index f1e7ea160b43..a547c7693135 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -21,40 +21,34 @@
#define pr_fmt(fmt) "software IO TLB: " fmt
#include <linux/cache.h>
+#include <linux/cc_platform.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
#include <linux/dma-direct.h>
#include <linux/dma-map-ops.h>
-#include <linux/mm.h>
#include <linux/export.h>
+#include <linux/gfp.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <linux/iommu-helper.h>
+#include <linux/init.h>
+#include <linux/memblock.h>
+#include <linux/mm.h>
+#include <linux/pfn.h>
+#include <linux/rculist.h>
+#include <linux/scatterlist.h>
+#include <linux/set_memory.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/swiotlb.h>
-#include <linux/pfn.h>
#include <linux/types.h>
-#include <linux/ctype.h>
-#include <linux/highmem.h>
-#include <linux/gfp.h>
-#include <linux/scatterlist.h>
-#include <linux/cc_platform.h>
-#include <linux/set_memory.h>
-#ifdef CONFIG_DEBUG_FS
-#include <linux/debugfs.h>
-#endif
#ifdef CONFIG_DMA_RESTRICTED_POOL
-#include <linux/io.h>
#include <linux/of.h>
#include <linux/of_fdt.h>
#include <linux/of_reserved_mem.h>
#include <linux/slab.h>
#endif
-#include <asm/io.h>
-#include <asm/dma.h>
-
-#include <linux/io.h>
-#include <linux/init.h>
-#include <linux/memblock.h>
-#include <linux/iommu-helper.h>
-
#define CREATE_TRACE_POINTS
#include <trace/events/swiotlb.h>
@@ -67,21 +61,123 @@
*/
#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT)
-#define INVALID_PHYS_ADDR (~(phys_addr_t)0)
+/**
+ * struct io_tlb_slot - IO TLB slot descriptor
+ * @orig_addr: The original address corresponding to a mapped entry.
+ * @alloc_size: Size of the allocated buffer.
+ * @list: The free list describing the number of free entries available
+ * from each index.
+ * @pad_slots: Number of preceding padding slots. Valid only in the first
+ * allocated non-padding slot.
+ */
+struct io_tlb_slot {
+ phys_addr_t orig_addr;
+ size_t alloc_size;
+ unsigned short list;
+ unsigned short pad_slots;
+};
+
+static bool swiotlb_force_bounce;
+static bool swiotlb_force_disable;
+
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+
+static void swiotlb_dyn_alloc(struct work_struct *work);
+
+static struct io_tlb_mem io_tlb_default_mem = {
+ .lock = __SPIN_LOCK_UNLOCKED(io_tlb_default_mem.lock),
+ .pools = LIST_HEAD_INIT(io_tlb_default_mem.pools),
+ .dyn_alloc = __WORK_INITIALIZER(io_tlb_default_mem.dyn_alloc,
+ swiotlb_dyn_alloc),
+};
+
+#else /* !CONFIG_SWIOTLB_DYNAMIC */
-enum swiotlb_force swiotlb_force;
+static struct io_tlb_mem io_tlb_default_mem;
-struct io_tlb_mem io_tlb_default_mem;
+#endif /* CONFIG_SWIOTLB_DYNAMIC */
-phys_addr_t swiotlb_unencrypted_base;
+static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT;
+static unsigned long default_nareas;
+
+/**
+ * struct io_tlb_area - IO TLB memory area descriptor
+ *
+ * This is a single area with a single lock.
+ *
+ * @used: The number of used IO TLB block.
+ * @index: The slot index to start searching in this area for next round.
+ * @lock: The lock to protect the above data structures in the map and
+ * unmap calls.
+ */
+struct io_tlb_area {
+ unsigned long used;
+ unsigned int index;
+ spinlock_t lock;
+};
/*
- * Max segment that we can provide which (if pages are contingous) will
- * not be bounced (unless SWIOTLB_FORCE is set).
+ * Round up number of slabs to the next power of 2. The last area is going
+ * be smaller than the rest if default_nslabs is not power of two.
+ * The number of slot in an area should be a multiple of IO_TLB_SEGSIZE,
+ * otherwise a segment may span two or more areas. It conflicts with free
+ * contiguous slots tracking: free slots are treated contiguous no matter
+ * whether they cross an area boundary.
+ *
+ * Return true if default_nslabs is rounded up.
+ */
+static bool round_up_default_nslabs(void)
+{
+ if (!default_nareas)
+ return false;
+
+ if (default_nslabs < IO_TLB_SEGSIZE * default_nareas)
+ default_nslabs = IO_TLB_SEGSIZE * default_nareas;
+ else if (is_power_of_2(default_nslabs))
+ return false;
+ default_nslabs = roundup_pow_of_two(default_nslabs);
+ return true;
+}
+
+/**
+ * swiotlb_adjust_nareas() - adjust the number of areas and slots
+ * @nareas: Desired number of areas. Zero is treated as 1.
+ *
+ * Adjust the default number of areas in a memory pool.
+ * The default size of the memory pool may also change to meet minimum area
+ * size requirements.
*/
-static unsigned int max_segment;
+static void swiotlb_adjust_nareas(unsigned int nareas)
+{
+ if (!nareas)
+ nareas = 1;
+ else if (!is_power_of_2(nareas))
+ nareas = roundup_pow_of_two(nareas);
-static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT;
+ default_nareas = nareas;
+
+ pr_info("area num %d.\n", nareas);
+ if (round_up_default_nslabs())
+ pr_info("SWIOTLB bounce buffer size roundup to %luMB",
+ (default_nslabs << IO_TLB_SHIFT) >> 20);
+}
+
+/**
+ * limit_nareas() - get the maximum number of areas for a given memory pool size
+ * @nareas: Desired number of areas.
+ * @nslots: Total number of slots in the memory pool.
+ *
+ * Limit the number of areas to the maximum possible number of areas in
+ * a memory pool of the given size.
+ *
+ * Return: Maximum possible number of areas.
+ */
+static unsigned int limit_nareas(unsigned int nareas, unsigned long nslots)
+{
+ if (nslots < nareas * IO_TLB_SEGSIZE)
+ return nslots / IO_TLB_SEGSIZE;
+ return nareas;
+}
static int __init
setup_io_tlb_npages(char *str)
@@ -93,29 +189,19 @@ setup_io_tlb_npages(char *str)
}
if (*str == ',')
++str;
+ if (isdigit(*str))
+ swiotlb_adjust_nareas(simple_strtoul(str, &str, 0));
+ if (*str == ',')
+ ++str;
if (!strcmp(str, "force"))
- swiotlb_force = SWIOTLB_FORCE;
+ swiotlb_force_bounce = true;
else if (!strcmp(str, "noforce"))
- swiotlb_force = SWIOTLB_NO_FORCE;
+ swiotlb_force_disable = true;
return 0;
}
early_param("swiotlb", setup_io_tlb_npages);
-unsigned int swiotlb_max_segment(void)
-{
- return io_tlb_default_mem.nslabs ? max_segment : 0;
-}
-EXPORT_SYMBOL_GPL(swiotlb_max_segment);
-
-void swiotlb_set_max_segment(unsigned int val)
-{
- if (swiotlb_force == SWIOTLB_FORCE)
- max_segment = 1;
- else
- max_segment = rounddown(val, PAGE_SIZE);
-}
-
unsigned long swiotlb_size_or_default(void)
{
return default_nslabs << IO_TLB_SHIFT;
@@ -130,14 +216,17 @@ void __init swiotlb_adjust_size(unsigned long size)
*/
if (default_nslabs != IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT)
return;
+
size = ALIGN(size, IO_TLB_SIZE);
default_nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
+ if (round_up_default_nslabs())
+ size = default_nslabs << IO_TLB_SHIFT;
pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20);
}
void swiotlb_print_info(void)
{
- struct io_tlb_mem *mem = &io_tlb_default_mem;
+ struct io_tlb_pool *mem = &io_tlb_default_mem.defpool;
if (!mem->nslabs) {
pr_warn("No low mem\n");
@@ -159,34 +248,6 @@ static inline unsigned long nr_slots(u64 val)
}
/*
- * Remap swioltb memory in the unencrypted physical address space
- * when swiotlb_unencrypted_base is set. (e.g. for Hyper-V AMD SEV-SNP
- * Isolation VMs).
- */
-#ifdef CONFIG_HAS_IOMEM
-static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes)
-{
- void *vaddr = NULL;
-
- if (swiotlb_unencrypted_base) {
- phys_addr_t paddr = mem->start + swiotlb_unencrypted_base;
-
- vaddr = memremap(paddr, bytes, MEMREMAP_WB);
- if (!vaddr)
- pr_err("Failed to map the unencrypted memory %pa size %lx.\n",
- &paddr, bytes);
- }
-
- return vaddr;
-}
-#else
-static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes)
-{
- return NULL;
-}
-#endif
-
-/*
* Early SWIOTLB allocation may be too early to allow an architecture to
* perform the desired operations. This function allows the architecture to
* call SWIOTLB when the operations are possible. It needs to be called
@@ -194,25 +255,17 @@ static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes)
*/
void __init swiotlb_update_mem_attributes(void)
{
- struct io_tlb_mem *mem = &io_tlb_default_mem;
- void *vaddr;
+ struct io_tlb_pool *mem = &io_tlb_default_mem.defpool;
unsigned long bytes;
if (!mem->nslabs || mem->late_alloc)
return;
- vaddr = phys_to_virt(mem->start);
bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT);
- set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT);
-
- mem->vaddr = swiotlb_mem_remap(mem, bytes);
- if (!mem->vaddr)
- mem->vaddr = vaddr;
-
- memset(mem->vaddr, 0, bytes);
+ set_memory_decrypted((unsigned long)mem->vaddr, bytes >> PAGE_SHIFT);
}
-static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
- unsigned long nslabs, bool late_alloc)
+static void swiotlb_init_io_tlb_pool(struct io_tlb_pool *mem, phys_addr_t start,
+ unsigned long nslabs, bool late_alloc, unsigned int nareas)
{
void *vaddr = phys_to_virt(start);
unsigned long bytes = nslabs << IO_TLB_SHIFT, i;
@@ -220,82 +273,151 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
mem->nslabs = nslabs;
mem->start = start;
mem->end = mem->start + bytes;
- mem->index = 0;
mem->late_alloc = late_alloc;
+ mem->nareas = nareas;
+ mem->area_nslabs = nslabs / mem->nareas;
- if (swiotlb_force == SWIOTLB_FORCE)
- mem->force_bounce = true;
+ for (i = 0; i < mem->nareas; i++) {
+ spin_lock_init(&mem->areas[i].lock);
+ mem->areas[i].index = 0;
+ mem->areas[i].used = 0;
+ }
- spin_lock_init(&mem->lock);
for (i = 0; i < mem->nslabs; i++) {
- mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
+ mem->slots[i].list = min(IO_TLB_SEGSIZE - io_tlb_offset(i),
+ mem->nslabs - i);
mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
mem->slots[i].alloc_size = 0;
+ mem->slots[i].pad_slots = 0;
}
- /*
- * If swiotlb_unencrypted_base is set, the bounce buffer memory will
- * be remapped and cleared in swiotlb_update_mem_attributes.
- */
- if (swiotlb_unencrypted_base)
- return;
-
memset(vaddr, 0, bytes);
mem->vaddr = vaddr;
return;
}
-int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
+/**
+ * add_mem_pool() - add a memory pool to the allocator
+ * @mem: Software IO TLB allocator.
+ * @pool: Memory pool to be added.
+ */
+static void add_mem_pool(struct io_tlb_mem *mem, struct io_tlb_pool *pool)
{
- struct io_tlb_mem *mem = &io_tlb_default_mem;
- size_t alloc_size;
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ spin_lock(&mem->lock);
+ list_add_rcu(&pool->node, &mem->pools);
+ mem->nslabs += pool->nslabs;
+ spin_unlock(&mem->lock);
+#else
+ mem->nslabs = pool->nslabs;
+#endif
+}
- if (swiotlb_force == SWIOTLB_NO_FORCE)
- return 0;
+static void __init *swiotlb_memblock_alloc(unsigned long nslabs,
+ unsigned int flags,
+ int (*remap)(void *tlb, unsigned long nslabs))
+{
+ size_t bytes = PAGE_ALIGN(nslabs << IO_TLB_SHIFT);
+ void *tlb;
- /* protect against double initialization */
- if (WARN_ON_ONCE(mem->nslabs))
- return -ENOMEM;
+ /*
+ * By default allocate the bounce buffer memory from low memory, but
+ * allow to pick a location everywhere for hypervisors with guest
+ * memory encryption.
+ */
+ if (flags & SWIOTLB_ANY)
+ tlb = memblock_alloc(bytes, PAGE_SIZE);
+ else
+ tlb = memblock_alloc_low(bytes, PAGE_SIZE);
- alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs));
- mem->slots = memblock_alloc(alloc_size, PAGE_SIZE);
- if (!mem->slots)
- panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
- __func__, alloc_size, PAGE_SIZE);
+ if (!tlb) {
+ pr_warn("%s: Failed to allocate %zu bytes tlb structure\n",
+ __func__, bytes);
+ return NULL;
+ }
- swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, false);
+ if (remap && remap(tlb, nslabs) < 0) {
+ memblock_free(tlb, PAGE_ALIGN(bytes));
+ pr_warn("%s: Failed to remap %zu bytes\n", __func__, bytes);
+ return NULL;
+ }
- if (verbose)
- swiotlb_print_info();
- swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT);
- return 0;
+ return tlb;
}
/*
* Statically reserve bounce buffer space and initialize bounce buffer data
* structures for the software IO TLB used to implement the DMA API.
*/
-void __init
-swiotlb_init(int verbose)
+void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
+ int (*remap)(void *tlb, unsigned long nslabs))
{
- size_t bytes = PAGE_ALIGN(default_nslabs << IO_TLB_SHIFT);
+ struct io_tlb_pool *mem = &io_tlb_default_mem.defpool;
+ unsigned long nslabs;
+ unsigned int nareas;
+ size_t alloc_size;
void *tlb;
- if (swiotlb_force == SWIOTLB_NO_FORCE)
+ if (!addressing_limit && !swiotlb_force_bounce)
+ return;
+ if (swiotlb_force_disable)
return;
- /* Get IO TLB memory from the low pages */
- tlb = memblock_alloc_low(bytes, PAGE_SIZE);
- if (!tlb)
- goto fail;
- if (swiotlb_init_with_tbl(tlb, default_nslabs, verbose))
- goto fail_free_mem;
- return;
+ io_tlb_default_mem.force_bounce =
+ swiotlb_force_bounce || (flags & SWIOTLB_FORCE);
+
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ if (!remap)
+ io_tlb_default_mem.can_grow = true;
+ if (flags & SWIOTLB_ANY)
+ io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1);
+ else
+ io_tlb_default_mem.phys_limit = ARCH_LOW_ADDRESS_LIMIT;
+#endif
-fail_free_mem:
- memblock_free(tlb, bytes);
-fail:
- pr_warn("Cannot allocate buffer");
+ if (!default_nareas)
+ swiotlb_adjust_nareas(num_possible_cpus());
+
+ nslabs = default_nslabs;
+ nareas = limit_nareas(default_nareas, nslabs);
+ while ((tlb = swiotlb_memblock_alloc(nslabs, flags, remap)) == NULL) {
+ if (nslabs <= IO_TLB_MIN_SLABS)
+ return;
+ nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE);
+ nareas = limit_nareas(nareas, nslabs);
+ }
+
+ if (default_nslabs != nslabs) {
+ pr_info("SWIOTLB bounce buffer size adjusted %lu -> %lu slabs",
+ default_nslabs, nslabs);
+ default_nslabs = nslabs;
+ }
+
+ alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs));
+ mem->slots = memblock_alloc(alloc_size, PAGE_SIZE);
+ if (!mem->slots) {
+ pr_warn("%s: Failed to allocate %zu bytes align=0x%lx\n",
+ __func__, alloc_size, PAGE_SIZE);
+ return;
+ }
+
+ mem->areas = memblock_alloc(array_size(sizeof(struct io_tlb_area),
+ nareas), SMP_CACHE_BYTES);
+ if (!mem->areas) {
+ pr_warn("%s: Failed to allocate mem->areas.\n", __func__);
+ return;
+ }
+
+ swiotlb_init_io_tlb_pool(mem, __pa(tlb), nslabs, false, nareas);
+ add_mem_pool(&io_tlb_default_mem, mem);
+
+ if (flags & SWIOTLB_VERBOSE)
+ swiotlb_print_info();
+}
+
+void __init swiotlb_init(bool addressing_limit, unsigned int flags)
+{
+ swiotlb_init_remap(addressing_limit, flags, NULL);
}
/*
@@ -303,80 +425,110 @@ fail:
* initialize the swiotlb later using the slab allocator if needed.
* This should be just like above, but with some error catching.
*/
-int
-swiotlb_late_init_with_default_size(size_t default_size)
+int swiotlb_init_late(size_t size, gfp_t gfp_mask,
+ int (*remap)(void *tlb, unsigned long nslabs))
{
- unsigned long nslabs =
- ALIGN(default_size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
- unsigned long bytes;
+ struct io_tlb_pool *mem = &io_tlb_default_mem.defpool;
+ unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
+ unsigned int nareas;
unsigned char *vstart = NULL;
- unsigned int order;
+ unsigned int order, area_order;
+ bool retried = false;
int rc = 0;
- if (swiotlb_force == SWIOTLB_NO_FORCE)
+ if (io_tlb_default_mem.nslabs)
return 0;
- /*
- * Get IO TLB memory from the low pages
- */
+ if (swiotlb_force_disable)
+ return 0;
+
+ io_tlb_default_mem.force_bounce = swiotlb_force_bounce;
+
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ if (!remap)
+ io_tlb_default_mem.can_grow = true;
+ if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA))
+ io_tlb_default_mem.phys_limit = zone_dma_limit;
+ else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32))
+ io_tlb_default_mem.phys_limit = max(DMA_BIT_MASK(32), zone_dma_limit);
+ else
+ io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1);
+#endif
+
+ if (!default_nareas)
+ swiotlb_adjust_nareas(num_possible_cpus());
+
+retry:
order = get_order(nslabs << IO_TLB_SHIFT);
nslabs = SLABS_PER_PAGE << order;
- bytes = nslabs << IO_TLB_SHIFT;
while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
- vstart = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN,
+ vstart = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
order);
if (vstart)
break;
order--;
+ nslabs = SLABS_PER_PAGE << order;
+ retried = true;
}
if (!vstart)
return -ENOMEM;
- if (order != get_order(bytes)) {
- pr_warn("only able to allocate %ld MB\n",
- (PAGE_SIZE << order) >> 20);
- nslabs = SLABS_PER_PAGE << order;
- }
- rc = swiotlb_late_init_with_tbl(vstart, nslabs);
- if (rc)
+ if (remap)
+ rc = remap(vstart, nslabs);
+ if (rc) {
free_pages((unsigned long)vstart, order);
- return rc;
-}
-
-int
-swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
-{
- struct io_tlb_mem *mem = &io_tlb_default_mem;
- unsigned long bytes = nslabs << IO_TLB_SHIFT;
+ nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE);
+ if (nslabs < IO_TLB_MIN_SLABS)
+ return rc;
+ retried = true;
+ goto retry;
+ }
- if (swiotlb_force == SWIOTLB_NO_FORCE)
- return 0;
+ if (retried) {
+ pr_warn("only able to allocate %ld MB\n",
+ (PAGE_SIZE << order) >> 20);
+ }
- /* protect against double initialization */
- if (WARN_ON_ONCE(mem->nslabs))
- return -ENOMEM;
+ nareas = limit_nareas(default_nareas, nslabs);
+ area_order = get_order(array_size(sizeof(*mem->areas), nareas));
+ mem->areas = (struct io_tlb_area *)
+ __get_free_pages(GFP_KERNEL | __GFP_ZERO, area_order);
+ if (!mem->areas)
+ goto error_area;
mem->slots = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
get_order(array_size(sizeof(*mem->slots), nslabs)));
if (!mem->slots)
- return -ENOMEM;
+ goto error_slots;
- set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT);
- swiotlb_init_io_tlb_mem(mem, virt_to_phys(tlb), nslabs, true);
+ set_memory_decrypted((unsigned long)vstart,
+ (nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT);
+ swiotlb_init_io_tlb_pool(mem, virt_to_phys(vstart), nslabs, true,
+ nareas);
+ add_mem_pool(&io_tlb_default_mem, mem);
swiotlb_print_info();
- swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT);
return 0;
+
+error_slots:
+ free_pages((unsigned long)mem->areas, area_order);
+error_area:
+ free_pages((unsigned long)vstart, order);
+ return -ENOMEM;
}
void __init swiotlb_exit(void)
{
- struct io_tlb_mem *mem = &io_tlb_default_mem;
+ struct io_tlb_pool *mem = &io_tlb_default_mem.defpool;
unsigned long tbl_vaddr;
size_t tbl_size, slots_size;
+ unsigned int area_order;
+
+ if (swiotlb_force_bounce)
+ return;
if (!mem->nslabs)
return;
@@ -388,9 +540,14 @@ void __init swiotlb_exit(void)
set_memory_encrypted(tbl_vaddr, tbl_size >> PAGE_SHIFT);
if (mem->late_alloc) {
+ area_order = get_order(array_size(sizeof(*mem->areas),
+ mem->nareas));
+ free_pages((unsigned long)mem->areas, area_order);
free_pages(tbl_vaddr, get_order(tbl_size));
free_pages((unsigned long)mem->slots, get_order(slots_size));
} else {
+ memblock_free_late(__pa(mem->areas),
+ array_size(sizeof(*mem->areas), mem->nareas));
memblock_free_late(mem->start, tbl_size);
memblock_free_late(__pa(mem->slots), slots_size);
}
@@ -398,47 +555,330 @@ void __init swiotlb_exit(void)
memset(mem, 0, sizeof(*mem));
}
-/*
- * Return the offset into a iotlb slot required to keep the device happy.
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+
+/**
+ * alloc_dma_pages() - allocate pages to be used for DMA
+ * @gfp: GFP flags for the allocation.
+ * @bytes: Size of the buffer.
+ * @phys_limit: Maximum allowed physical address of the buffer.
+ *
+ * Allocate pages from the buddy allocator. If successful, make the allocated
+ * pages decrypted that they can be used for DMA.
+ *
+ * Return: Decrypted pages, %NULL on allocation failure, or ERR_PTR(-EAGAIN)
+ * if the allocated physical address was above @phys_limit.
+ */
+static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes, u64 phys_limit)
+{
+ unsigned int order = get_order(bytes);
+ struct page *page;
+ phys_addr_t paddr;
+ void *vaddr;
+
+ page = alloc_pages(gfp, order);
+ if (!page)
+ return NULL;
+
+ paddr = page_to_phys(page);
+ if (paddr + bytes - 1 > phys_limit) {
+ __free_pages(page, order);
+ return ERR_PTR(-EAGAIN);
+ }
+
+ vaddr = phys_to_virt(paddr);
+ if (set_memory_decrypted((unsigned long)vaddr, PFN_UP(bytes)))
+ goto error;
+ return page;
+
+error:
+ /* Intentional leak if pages cannot be encrypted again. */
+ if (!set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes)))
+ __free_pages(page, order);
+ return NULL;
+}
+
+/**
+ * swiotlb_alloc_tlb() - allocate a dynamic IO TLB buffer
+ * @dev: Device for which a memory pool is allocated.
+ * @bytes: Size of the buffer.
+ * @phys_limit: Maximum allowed physical address of the buffer.
+ * @gfp: GFP flags for the allocation.
+ *
+ * Return: Allocated pages, or %NULL on allocation failure.
+ */
+static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes,
+ u64 phys_limit, gfp_t gfp)
+{
+ struct page *page;
+
+ /*
+ * Allocate from the atomic pools if memory is encrypted and
+ * the allocation is atomic, because decrypting may block.
+ */
+ if (!gfpflags_allow_blocking(gfp) && dev && force_dma_unencrypted(dev)) {
+ void *vaddr;
+
+ if (!IS_ENABLED(CONFIG_DMA_COHERENT_POOL))
+ return NULL;
+
+ return dma_alloc_from_pool(dev, bytes, &vaddr, gfp,
+ dma_coherent_ok);
+ }
+
+ gfp &= ~GFP_ZONEMASK;
+ if (phys_limit <= zone_dma_limit)
+ gfp |= __GFP_DMA;
+ else if (phys_limit <= DMA_BIT_MASK(32))
+ gfp |= __GFP_DMA32;
+
+ while (IS_ERR(page = alloc_dma_pages(gfp, bytes, phys_limit))) {
+ if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
+ phys_limit < DMA_BIT_MASK(64) &&
+ !(gfp & (__GFP_DMA32 | __GFP_DMA)))
+ gfp |= __GFP_DMA32;
+ else if (IS_ENABLED(CONFIG_ZONE_DMA) &&
+ !(gfp & __GFP_DMA))
+ gfp = (gfp & ~__GFP_DMA32) | __GFP_DMA;
+ else
+ return NULL;
+ }
+
+ return page;
+}
+
+/**
+ * swiotlb_free_tlb() - free a dynamically allocated IO TLB buffer
+ * @vaddr: Virtual address of the buffer.
+ * @bytes: Size of the buffer.
+ */
+static void swiotlb_free_tlb(void *vaddr, size_t bytes)
+{
+ if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
+ dma_free_from_pool(NULL, vaddr, bytes))
+ return;
+
+ /* Intentional leak if pages cannot be encrypted again. */
+ if (!set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes)))
+ __free_pages(virt_to_page(vaddr), get_order(bytes));
+}
+
+/**
+ * swiotlb_alloc_pool() - allocate a new IO TLB memory pool
+ * @dev: Device for which a memory pool is allocated.
+ * @minslabs: Minimum number of slabs.
+ * @nslabs: Desired (maximum) number of slabs.
+ * @nareas: Number of areas.
+ * @phys_limit: Maximum DMA buffer physical address.
+ * @gfp: GFP flags for the allocations.
+ *
+ * Allocate and initialize a new IO TLB memory pool. The actual number of
+ * slabs may be reduced if allocation of @nslabs fails. If even
+ * @minslabs cannot be allocated, this function fails.
+ *
+ * Return: New memory pool, or %NULL on allocation failure.
+ */
+static struct io_tlb_pool *swiotlb_alloc_pool(struct device *dev,
+ unsigned long minslabs, unsigned long nslabs,
+ unsigned int nareas, u64 phys_limit, gfp_t gfp)
+{
+ struct io_tlb_pool *pool;
+ unsigned int slot_order;
+ struct page *tlb;
+ size_t pool_size;
+ size_t tlb_size;
+
+ if (nslabs > SLABS_PER_PAGE << MAX_PAGE_ORDER) {
+ nslabs = SLABS_PER_PAGE << MAX_PAGE_ORDER;
+ nareas = limit_nareas(nareas, nslabs);
+ }
+
+ pool_size = sizeof(*pool) + array_size(sizeof(*pool->areas), nareas);
+ pool = kzalloc(pool_size, gfp);
+ if (!pool)
+ goto error;
+ pool->areas = (void *)pool + sizeof(*pool);
+
+ tlb_size = nslabs << IO_TLB_SHIFT;
+ while (!(tlb = swiotlb_alloc_tlb(dev, tlb_size, phys_limit, gfp))) {
+ if (nslabs <= minslabs)
+ goto error_tlb;
+ nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE);
+ nareas = limit_nareas(nareas, nslabs);
+ tlb_size = nslabs << IO_TLB_SHIFT;
+ }
+
+ slot_order = get_order(array_size(sizeof(*pool->slots), nslabs));
+ pool->slots = (struct io_tlb_slot *)
+ __get_free_pages(gfp, slot_order);
+ if (!pool->slots)
+ goto error_slots;
+
+ swiotlb_init_io_tlb_pool(pool, page_to_phys(tlb), nslabs, true, nareas);
+ return pool;
+
+error_slots:
+ swiotlb_free_tlb(page_address(tlb), tlb_size);
+error_tlb:
+ kfree(pool);
+error:
+ return NULL;
+}
+
+/**
+ * swiotlb_dyn_alloc() - dynamic memory pool allocation worker
+ * @work: Pointer to dyn_alloc in struct io_tlb_mem.
+ */
+static void swiotlb_dyn_alloc(struct work_struct *work)
+{
+ struct io_tlb_mem *mem =
+ container_of(work, struct io_tlb_mem, dyn_alloc);
+ struct io_tlb_pool *pool;
+
+ pool = swiotlb_alloc_pool(NULL, IO_TLB_MIN_SLABS, default_nslabs,
+ default_nareas, mem->phys_limit, GFP_KERNEL);
+ if (!pool) {
+ pr_warn_ratelimited("Failed to allocate new pool");
+ return;
+ }
+
+ add_mem_pool(mem, pool);
+}
+
+/**
+ * swiotlb_dyn_free() - RCU callback to free a memory pool
+ * @rcu: RCU head in the corresponding struct io_tlb_pool.
+ */
+static void swiotlb_dyn_free(struct rcu_head *rcu)
+{
+ struct io_tlb_pool *pool = container_of(rcu, struct io_tlb_pool, rcu);
+ size_t slots_size = array_size(sizeof(*pool->slots), pool->nslabs);
+ size_t tlb_size = pool->end - pool->start;
+
+ free_pages((unsigned long)pool->slots, get_order(slots_size));
+ swiotlb_free_tlb(pool->vaddr, tlb_size);
+ kfree(pool);
+}
+
+/**
+ * __swiotlb_find_pool() - find the IO TLB pool for a physical address
+ * @dev: Device which has mapped the DMA buffer.
+ * @paddr: Physical address within the DMA buffer.
+ *
+ * Find the IO TLB memory pool descriptor which contains the given physical
+ * address, if any. This function is for use only when the dev is known to
+ * be using swiotlb. Use swiotlb_find_pool() for the more general case
+ * when this condition is not met.
+ *
+ * Return: Memory pool which contains @paddr, or %NULL if none.
+ */
+struct io_tlb_pool *__swiotlb_find_pool(struct device *dev, phys_addr_t paddr)
+{
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ struct io_tlb_pool *pool;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pool, &mem->pools, node) {
+ if (paddr >= pool->start && paddr < pool->end)
+ goto out;
+ }
+
+ list_for_each_entry_rcu(pool, &dev->dma_io_tlb_pools, node) {
+ if (paddr >= pool->start && paddr < pool->end)
+ goto out;
+ }
+ pool = NULL;
+out:
+ rcu_read_unlock();
+ return pool;
+}
+
+/**
+ * swiotlb_del_pool() - remove an IO TLB pool from a device
+ * @dev: Owning device.
+ * @pool: Memory pool to be removed.
+ */
+static void swiotlb_del_pool(struct device *dev, struct io_tlb_pool *pool)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->dma_io_tlb_lock, flags);
+ list_del_rcu(&pool->node);
+ spin_unlock_irqrestore(&dev->dma_io_tlb_lock, flags);
+
+ call_rcu(&pool->rcu, swiotlb_dyn_free);
+}
+
+#endif /* CONFIG_SWIOTLB_DYNAMIC */
+
+/**
+ * swiotlb_dev_init() - initialize swiotlb fields in &struct device
+ * @dev: Device to be initialized.
+ */
+void swiotlb_dev_init(struct device *dev)
+{
+ dev->dma_io_tlb_mem = &io_tlb_default_mem;
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ INIT_LIST_HEAD(&dev->dma_io_tlb_pools);
+ spin_lock_init(&dev->dma_io_tlb_lock);
+ dev->dma_uses_io_tlb = false;
+#endif
+}
+
+/**
+ * swiotlb_align_offset() - Get required offset into an IO TLB allocation.
+ * @dev: Owning device.
+ * @align_mask: Allocation alignment mask.
+ * @addr: DMA address.
+ *
+ * Return the minimum offset from the start of an IO TLB allocation which is
+ * required for a given buffer address and allocation alignment to keep the
+ * device happy.
+ *
+ * First, the address bits covered by min_align_mask must be identical in the
+ * original address and the bounce buffer address. High bits are preserved by
+ * choosing a suitable IO TLB slot, but bits below IO_TLB_SHIFT require extra
+ * padding bytes before the bounce buffer.
+ *
+ * Second, @align_mask specifies which bits of the first allocated slot must
+ * be zero. This may require allocating additional padding slots, and then the
+ * offset (in bytes) from the first such padding slot is returned.
*/
-static unsigned int swiotlb_align_offset(struct device *dev, u64 addr)
+static unsigned int swiotlb_align_offset(struct device *dev,
+ unsigned int align_mask, u64 addr)
{
- return addr & dma_get_min_align_mask(dev) & (IO_TLB_SIZE - 1);
+ return addr & dma_get_min_align_mask(dev) &
+ (align_mask | (IO_TLB_SIZE - 1));
}
/*
* Bounce: copy the swiotlb buffer from or back to the original dma location
*/
static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size,
- enum dma_data_direction dir)
+ enum dma_data_direction dir, struct io_tlb_pool *mem)
{
- struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
int index = (tlb_addr - mem->start) >> IO_TLB_SHIFT;
phys_addr_t orig_addr = mem->slots[index].orig_addr;
size_t alloc_size = mem->slots[index].alloc_size;
unsigned long pfn = PFN_DOWN(orig_addr);
unsigned char *vaddr = mem->vaddr + tlb_addr - mem->start;
- unsigned int tlb_offset, orig_addr_offset;
+ int tlb_offset;
if (orig_addr == INVALID_PHYS_ADDR)
return;
- tlb_offset = tlb_addr & (IO_TLB_SIZE - 1);
- orig_addr_offset = swiotlb_align_offset(dev, orig_addr);
- if (tlb_offset < orig_addr_offset) {
- dev_WARN_ONCE(dev, 1,
- "Access before mapping start detected. orig offset %u, requested offset %u.\n",
- orig_addr_offset, tlb_offset);
- return;
- }
-
- tlb_offset -= orig_addr_offset;
- if (tlb_offset > alloc_size) {
- dev_WARN_ONCE(dev, 1,
- "Buffer overflow detected. Allocation size: %zu. Mapping size: %zu+%u.\n",
- alloc_size, size, tlb_offset);
- return;
- }
+ /*
+ * It's valid for tlb_offset to be negative. This can happen when the
+ * "offset" returned by swiotlb_align_offset() is non-zero, and the
+ * tlb_addr is pointing within the first "offset" bytes of the second
+ * or subsequent slots of the allocated swiotlb area. While it's not
+ * valid for tlb_addr to be pointing within the first "offset" bytes
+ * of the first slot, there's no way to check for such an error since
+ * this function can't distinguish the first slot from the second and
+ * subsequent slots.
+ */
+ tlb_offset = (tlb_addr & (IO_TLB_SIZE - 1)) -
+ swiotlb_align_offset(dev, 0, orig_addr);
orig_addr += tlb_offset;
alloc_size -= tlb_offset;
@@ -451,9 +891,8 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size
}
if (PageHighMem(pfn_to_page(pfn))) {
- /* The buffer does not have a mapping. Map it in and copy */
unsigned int offset = orig_addr & ~PAGE_MASK;
- char *buffer;
+ struct page *page;
unsigned int sz = 0;
unsigned long flags;
@@ -461,12 +900,11 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size
sz = min_t(size_t, PAGE_SIZE - offset, size);
local_irq_save(flags);
- buffer = kmap_atomic(pfn_to_page(pfn));
+ page = pfn_to_page(pfn);
if (dir == DMA_TO_DEVICE)
- memcpy(vaddr, buffer + offset, sz);
+ memcpy_from_page(vaddr, page, offset, sz);
else
- memcpy(buffer + offset, vaddr, sz);
- kunmap_atomic(buffer);
+ memcpy_to_page(page, offset, vaddr, sz);
local_irq_restore(flags);
size -= sz;
@@ -481,174 +919,549 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size
}
}
-#define slot_addr(start, idx) ((start) + ((idx) << IO_TLB_SHIFT))
+static inline phys_addr_t slot_addr(phys_addr_t start, phys_addr_t idx)
+{
+ return start + (idx << IO_TLB_SHIFT);
+}
/*
* Carefully handle integer overflow which can occur when boundary_mask == ~0UL.
*/
static inline unsigned long get_max_slots(unsigned long boundary_mask)
{
- if (boundary_mask == ~0UL)
- return 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
- return nr_slots(boundary_mask + 1);
+ return (boundary_mask >> IO_TLB_SHIFT) + 1;
}
-static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index)
+static unsigned int wrap_area_index(struct io_tlb_pool *mem, unsigned int index)
{
- if (index >= mem->nslabs)
+ if (index >= mem->area_nslabs)
return 0;
return index;
}
/*
- * Find a suitable number of IO TLB entries size that will fit this request and
- * allocate a buffer from that IO TLB pool.
+ * Track the total used slots with a global atomic value in order to have
+ * correct information to determine the high water mark. The mem_used()
+ * function gives imprecise results because there's no locking across
+ * multiple areas.
*/
-static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
- size_t alloc_size, unsigned int alloc_align_mask)
+#ifdef CONFIG_DEBUG_FS
+static void inc_used_and_hiwater(struct io_tlb_mem *mem, unsigned int nslots)
{
- struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ unsigned long old_hiwater, new_used;
+
+ new_used = atomic_long_add_return(nslots, &mem->total_used);
+ old_hiwater = atomic_long_read(&mem->used_hiwater);
+ do {
+ if (new_used <= old_hiwater)
+ break;
+ } while (!atomic_long_try_cmpxchg(&mem->used_hiwater,
+ &old_hiwater, new_used));
+}
+
+static void dec_used(struct io_tlb_mem *mem, unsigned int nslots)
+{
+ atomic_long_sub(nslots, &mem->total_used);
+}
+
+#else /* !CONFIG_DEBUG_FS */
+static void inc_used_and_hiwater(struct io_tlb_mem *mem, unsigned int nslots)
+{
+}
+static void dec_used(struct io_tlb_mem *mem, unsigned int nslots)
+{
+}
+#endif /* CONFIG_DEBUG_FS */
+
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+#ifdef CONFIG_DEBUG_FS
+static void inc_transient_used(struct io_tlb_mem *mem, unsigned int nslots)
+{
+ atomic_long_add(nslots, &mem->transient_nslabs);
+}
+
+static void dec_transient_used(struct io_tlb_mem *mem, unsigned int nslots)
+{
+ atomic_long_sub(nslots, &mem->transient_nslabs);
+}
+
+#else /* !CONFIG_DEBUG_FS */
+static void inc_transient_used(struct io_tlb_mem *mem, unsigned int nslots)
+{
+}
+static void dec_transient_used(struct io_tlb_mem *mem, unsigned int nslots)
+{
+}
+#endif /* CONFIG_DEBUG_FS */
+#endif /* CONFIG_SWIOTLB_DYNAMIC */
+
+/**
+ * swiotlb_search_pool_area() - search one memory area in one pool
+ * @dev: Device which maps the buffer.
+ * @pool: Memory pool to be searched.
+ * @area_index: Index of the IO TLB memory area to be searched.
+ * @orig_addr: Original (non-bounced) IO buffer address.
+ * @alloc_size: Total requested size of the bounce buffer,
+ * including initial alignment padding.
+ * @alloc_align_mask: Required alignment of the allocated buffer.
+ *
+ * Find a suitable sequence of IO TLB entries for the request and allocate
+ * a buffer from the given IO TLB memory area.
+ * This function takes care of locking.
+ *
+ * Return: Index of the first allocated slot, or -1 on error.
+ */
+static int swiotlb_search_pool_area(struct device *dev, struct io_tlb_pool *pool,
+ int area_index, phys_addr_t orig_addr, size_t alloc_size,
+ unsigned int alloc_align_mask)
+{
+ struct io_tlb_area *area = pool->areas + area_index;
unsigned long boundary_mask = dma_get_seg_boundary(dev);
dma_addr_t tbl_dma_addr =
- phys_to_dma_unencrypted(dev, mem->start) & boundary_mask;
+ phys_to_dma_unencrypted(dev, pool->start) & boundary_mask;
unsigned long max_slots = get_max_slots(boundary_mask);
- unsigned int iotlb_align_mask =
- dma_get_min_align_mask(dev) & ~(IO_TLB_SIZE - 1);
+ unsigned int iotlb_align_mask = dma_get_min_align_mask(dev);
unsigned int nslots = nr_slots(alloc_size), stride;
- unsigned int index, wrap, count = 0, i;
- unsigned int offset = swiotlb_align_offset(dev, orig_addr);
+ unsigned int offset = swiotlb_align_offset(dev, 0, orig_addr);
+ unsigned int index, slots_checked, count = 0, i;
unsigned long flags;
+ unsigned int slot_base;
+ unsigned int slot_index;
BUG_ON(!nslots);
+ BUG_ON(area_index >= pool->nareas);
+
+ /*
+ * Historically, swiotlb allocations >= PAGE_SIZE were guaranteed to be
+ * page-aligned in the absence of any other alignment requirements.
+ * 'alloc_align_mask' was later introduced to specify the alignment
+ * explicitly, however this is passed as zero for streaming mappings
+ * and so we preserve the old behaviour there in case any drivers are
+ * relying on it.
+ */
+ if (!alloc_align_mask && !iotlb_align_mask && alloc_size >= PAGE_SIZE)
+ alloc_align_mask = PAGE_SIZE - 1;
+
+ /*
+ * Ensure that the allocation is at least slot-aligned and update
+ * 'iotlb_align_mask' to ignore bits that will be preserved when
+ * offsetting into the allocation.
+ */
+ alloc_align_mask |= (IO_TLB_SIZE - 1);
+ iotlb_align_mask &= ~alloc_align_mask;
/*
* For mappings with an alignment requirement don't bother looping to
- * unaligned slots once we found an aligned one. For allocations of
- * PAGE_SIZE or larger only look for page aligned allocations.
+ * unaligned slots once we found an aligned one.
*/
- stride = (iotlb_align_mask >> IO_TLB_SHIFT) + 1;
- if (alloc_size >= PAGE_SIZE)
- stride = max(stride, stride << (PAGE_SHIFT - IO_TLB_SHIFT));
- stride = max(stride, (alloc_align_mask >> IO_TLB_SHIFT) + 1);
+ stride = get_max_slots(max(alloc_align_mask, iotlb_align_mask));
- spin_lock_irqsave(&mem->lock, flags);
- if (unlikely(nslots > mem->nslabs - mem->used))
+ spin_lock_irqsave(&area->lock, flags);
+ if (unlikely(nslots > pool->area_nslabs - area->used))
goto not_found;
- index = wrap = wrap_index(mem, ALIGN(mem->index, stride));
- do {
- if (orig_addr &&
- (slot_addr(tbl_dma_addr, index) & iotlb_align_mask) !=
- (orig_addr & iotlb_align_mask)) {
- index = wrap_index(mem, index + 1);
+ slot_base = area_index * pool->area_nslabs;
+ index = area->index;
+
+ for (slots_checked = 0; slots_checked < pool->area_nslabs; ) {
+ phys_addr_t tlb_addr;
+
+ slot_index = slot_base + index;
+ tlb_addr = slot_addr(tbl_dma_addr, slot_index);
+
+ if ((tlb_addr & alloc_align_mask) ||
+ (orig_addr && (tlb_addr & iotlb_align_mask) !=
+ (orig_addr & iotlb_align_mask))) {
+ index = wrap_area_index(pool, index + 1);
+ slots_checked++;
continue;
}
- /*
- * If we find a slot that indicates we have 'nslots' number of
- * contiguous buffers, we allocate the buffers from that slot
- * and mark the entries as '0' indicating unavailable.
- */
- if (!iommu_is_span_boundary(index, nslots,
+ if (!iommu_is_span_boundary(slot_index, nslots,
nr_slots(tbl_dma_addr),
max_slots)) {
- if (mem->slots[index].list >= nslots)
+ if (pool->slots[slot_index].list >= nslots)
goto found;
}
- index = wrap_index(mem, index + stride);
- } while (index != wrap);
+ index = wrap_area_index(pool, index + stride);
+ slots_checked += stride;
+ }
not_found:
- spin_unlock_irqrestore(&mem->lock, flags);
+ spin_unlock_irqrestore(&area->lock, flags);
return -1;
found:
- for (i = index; i < index + nslots; i++) {
- mem->slots[i].list = 0;
- mem->slots[i].alloc_size =
- alloc_size - (offset + ((i - index) << IO_TLB_SHIFT));
+ /*
+ * If we find a slot that indicates we have 'nslots' number of
+ * contiguous buffers, we allocate the buffers from that slot onwards
+ * and set the list of free entries to '0' indicating unavailable.
+ */
+ for (i = slot_index; i < slot_index + nslots; i++) {
+ pool->slots[i].list = 0;
+ pool->slots[i].alloc_size = alloc_size - (offset +
+ ((i - slot_index) << IO_TLB_SHIFT));
}
- for (i = index - 1;
+ for (i = slot_index - 1;
io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 &&
- mem->slots[i].list; i--)
- mem->slots[i].list = ++count;
+ pool->slots[i].list; i--)
+ pool->slots[i].list = ++count;
/*
* Update the indices to avoid searching in the next round.
*/
- if (index + nslots < mem->nslabs)
- mem->index = index + nslots;
- else
- mem->index = 0;
- mem->used += nslots;
+ area->index = wrap_area_index(pool, index + nslots);
+ area->used += nslots;
+ spin_unlock_irqrestore(&area->lock, flags);
+
+ inc_used_and_hiwater(dev->dma_io_tlb_mem, nslots);
+ return slot_index;
+}
+
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+
+/**
+ * swiotlb_search_area() - search one memory area in all pools
+ * @dev: Device which maps the buffer.
+ * @start_cpu: Start CPU number.
+ * @cpu_offset: Offset from @start_cpu.
+ * @orig_addr: Original (non-bounced) IO buffer address.
+ * @alloc_size: Total requested size of the bounce buffer,
+ * including initial alignment padding.
+ * @alloc_align_mask: Required alignment of the allocated buffer.
+ * @retpool: Used memory pool, updated on return.
+ *
+ * Search one memory area in all pools for a sequence of slots that match the
+ * allocation constraints.
+ *
+ * Return: Index of the first allocated slot, or -1 on error.
+ */
+static int swiotlb_search_area(struct device *dev, int start_cpu,
+ int cpu_offset, phys_addr_t orig_addr, size_t alloc_size,
+ unsigned int alloc_align_mask, struct io_tlb_pool **retpool)
+{
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ struct io_tlb_pool *pool;
+ int area_index;
+ int index = -1;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pool, &mem->pools, node) {
+ if (cpu_offset >= pool->nareas)
+ continue;
+ area_index = (start_cpu + cpu_offset) & (pool->nareas - 1);
+ index = swiotlb_search_pool_area(dev, pool, area_index,
+ orig_addr, alloc_size,
+ alloc_align_mask);
+ if (index >= 0) {
+ *retpool = pool;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return index;
+}
+
+/**
+ * swiotlb_find_slots() - search for slots in the whole swiotlb
+ * @dev: Device which maps the buffer.
+ * @orig_addr: Original (non-bounced) IO buffer address.
+ * @alloc_size: Total requested size of the bounce buffer,
+ * including initial alignment padding.
+ * @alloc_align_mask: Required alignment of the allocated buffer.
+ * @retpool: Used memory pool, updated on return.
+ *
+ * Search through the whole software IO TLB to find a sequence of slots that
+ * match the allocation constraints.
+ *
+ * Return: Index of the first allocated slot, or -1 on error.
+ */
+static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
+ size_t alloc_size, unsigned int alloc_align_mask,
+ struct io_tlb_pool **retpool)
+{
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ struct io_tlb_pool *pool;
+ unsigned long nslabs;
+ unsigned long flags;
+ u64 phys_limit;
+ int cpu, i;
+ int index;
+
+ if (alloc_size > IO_TLB_SEGSIZE * IO_TLB_SIZE)
+ return -1;
+
+ cpu = raw_smp_processor_id();
+ for (i = 0; i < default_nareas; ++i) {
+ index = swiotlb_search_area(dev, cpu, i, orig_addr, alloc_size,
+ alloc_align_mask, &pool);
+ if (index >= 0)
+ goto found;
+ }
+
+ if (!mem->can_grow)
+ return -1;
+
+ schedule_work(&mem->dyn_alloc);
+
+ nslabs = nr_slots(alloc_size);
+ phys_limit = min_not_zero(*dev->dma_mask, dev->bus_dma_limit);
+ pool = swiotlb_alloc_pool(dev, nslabs, nslabs, 1, phys_limit,
+ GFP_NOWAIT);
+ if (!pool)
+ return -1;
+
+ index = swiotlb_search_pool_area(dev, pool, 0, orig_addr,
+ alloc_size, alloc_align_mask);
+ if (index < 0) {
+ swiotlb_dyn_free(&pool->rcu);
+ return -1;
+ }
+
+ pool->transient = true;
+ spin_lock_irqsave(&dev->dma_io_tlb_lock, flags);
+ list_add_rcu(&pool->node, &dev->dma_io_tlb_pools);
+ spin_unlock_irqrestore(&dev->dma_io_tlb_lock, flags);
+ inc_transient_used(mem, pool->nslabs);
+
+found:
+ WRITE_ONCE(dev->dma_uses_io_tlb, true);
- spin_unlock_irqrestore(&mem->lock, flags);
+ /*
+ * The general barrier orders reads and writes against a presumed store
+ * of the SWIOTLB buffer address by a device driver (to a driver private
+ * data structure). It serves two purposes.
+ *
+ * First, the store to dev->dma_uses_io_tlb must be ordered before the
+ * presumed store. This guarantees that the returned buffer address
+ * cannot be passed to another CPU before updating dev->dma_uses_io_tlb.
+ *
+ * Second, the load from mem->pools must be ordered before the same
+ * presumed store. This guarantees that the returned buffer address
+ * cannot be observed by another CPU before an update of the RCU list
+ * that was made by swiotlb_dyn_alloc() on a third CPU (cf. multicopy
+ * atomicity).
+ *
+ * See also the comment in swiotlb_find_pool().
+ */
+ smp_mb();
+
+ *retpool = pool;
return index;
}
+#else /* !CONFIG_SWIOTLB_DYNAMIC */
+
+static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
+ size_t alloc_size, unsigned int alloc_align_mask,
+ struct io_tlb_pool **retpool)
+{
+ struct io_tlb_pool *pool;
+ int start, i;
+ int index;
+
+ *retpool = pool = &dev->dma_io_tlb_mem->defpool;
+ i = start = raw_smp_processor_id() & (pool->nareas - 1);
+ do {
+ index = swiotlb_search_pool_area(dev, pool, i, orig_addr,
+ alloc_size, alloc_align_mask);
+ if (index >= 0)
+ return index;
+ if (++i >= pool->nareas)
+ i = 0;
+ } while (i != start);
+ return -1;
+}
+
+#endif /* CONFIG_SWIOTLB_DYNAMIC */
+
+#ifdef CONFIG_DEBUG_FS
+
+/**
+ * mem_used() - get number of used slots in an allocator
+ * @mem: Software IO TLB allocator.
+ *
+ * The result is accurate in this version of the function, because an atomic
+ * counter is available if CONFIG_DEBUG_FS is set.
+ *
+ * Return: Number of used slots.
+ */
+static unsigned long mem_used(struct io_tlb_mem *mem)
+{
+ return atomic_long_read(&mem->total_used);
+}
+
+#else /* !CONFIG_DEBUG_FS */
+
+/**
+ * mem_pool_used() - get number of used slots in a memory pool
+ * @pool: Software IO TLB memory pool.
+ *
+ * The result is not accurate, see mem_used().
+ *
+ * Return: Approximate number of used slots.
+ */
+static unsigned long mem_pool_used(struct io_tlb_pool *pool)
+{
+ int i;
+ unsigned long used = 0;
+
+ for (i = 0; i < pool->nareas; i++)
+ used += pool->areas[i].used;
+ return used;
+}
+
+/**
+ * mem_used() - get number of used slots in an allocator
+ * @mem: Software IO TLB allocator.
+ *
+ * The result is not accurate, because there is no locking of individual
+ * areas.
+ *
+ * Return: Approximate number of used slots.
+ */
+static unsigned long mem_used(struct io_tlb_mem *mem)
+{
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ struct io_tlb_pool *pool;
+ unsigned long used = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pool, &mem->pools, node)
+ used += mem_pool_used(pool);
+ rcu_read_unlock();
+
+ return used;
+#else
+ return mem_pool_used(&mem->defpool);
+#endif
+}
+
+#endif /* CONFIG_DEBUG_FS */
+
+/**
+ * swiotlb_tbl_map_single() - bounce buffer map a single contiguous physical area
+ * @dev: Device which maps the buffer.
+ * @orig_addr: Original (non-bounced) physical IO buffer address
+ * @mapping_size: Requested size of the actual bounce buffer, excluding
+ * any pre- or post-padding for alignment
+ * @alloc_align_mask: Required start and end alignment of the allocated buffer
+ * @dir: DMA direction
+ * @attrs: Optional DMA attributes for the map operation
+ *
+ * Find and allocate a suitable sequence of IO TLB slots for the request.
+ * The allocated space starts at an alignment specified by alloc_align_mask,
+ * and the size of the allocated space is rounded up so that the total amount
+ * of allocated space is a multiple of (alloc_align_mask + 1). If
+ * alloc_align_mask is zero, the allocated space may be at any alignment and
+ * the size is not rounded up.
+ *
+ * The returned address is within the allocated space and matches the bits
+ * of orig_addr that are specified in the DMA min_align_mask for the device. As
+ * such, this returned address may be offset from the beginning of the allocated
+ * space. The bounce buffer space starting at the returned address for
+ * mapping_size bytes is initialized to the contents of the original IO buffer
+ * area. Any pre-padding (due to an offset) and any post-padding (due to
+ * rounding-up the size) is not initialized.
+ */
phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
- size_t mapping_size, size_t alloc_size,
- unsigned int alloc_align_mask, enum dma_data_direction dir,
- unsigned long attrs)
+ size_t mapping_size, unsigned int alloc_align_mask,
+ enum dma_data_direction dir, unsigned long attrs)
{
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
- unsigned int offset = swiotlb_align_offset(dev, orig_addr);
+ unsigned int offset;
+ struct io_tlb_pool *pool;
unsigned int i;
+ size_t size;
int index;
phys_addr_t tlb_addr;
+ unsigned short pad_slots;
- if (!mem)
- panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
+ if (!mem || !mem->nslabs) {
+ dev_warn_ratelimited(dev,
+ "Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
+ return (phys_addr_t)DMA_MAPPING_ERROR;
+ }
if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n");
- if (mapping_size > alloc_size) {
- dev_warn_once(dev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)",
- mapping_size, alloc_size);
- return (phys_addr_t)DMA_MAPPING_ERROR;
- }
+ /*
+ * The default swiotlb memory pool is allocated with PAGE_SIZE
+ * alignment. If a mapping is requested with larger alignment,
+ * the mapping may be unable to use the initial slot(s) in all
+ * sets of IO_TLB_SEGSIZE slots. In such case, a mapping request
+ * of or near the maximum mapping size would always fail.
+ */
+ dev_WARN_ONCE(dev, alloc_align_mask > ~PAGE_MASK,
+ "Alloc alignment may prevent fulfilling requests with max mapping_size\n");
- index = swiotlb_find_slots(dev, orig_addr,
- alloc_size + offset, alloc_align_mask);
+ offset = swiotlb_align_offset(dev, alloc_align_mask, orig_addr);
+ size = ALIGN(mapping_size + offset, alloc_align_mask + 1);
+ index = swiotlb_find_slots(dev, orig_addr, size, alloc_align_mask, &pool);
if (index == -1) {
if (!(attrs & DMA_ATTR_NO_WARN))
dev_warn_ratelimited(dev,
"swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n",
- alloc_size, mem->nslabs, mem->used);
+ size, mem->nslabs, mem_used(mem));
return (phys_addr_t)DMA_MAPPING_ERROR;
}
/*
+ * If dma_skip_sync was set, reset it on first SWIOTLB buffer
+ * mapping to always sync SWIOTLB buffers.
+ */
+ dma_reset_need_sync(dev);
+
+ /*
* Save away the mapping from the original address to the DMA address.
* This is needed when we sync the memory. Then we sync the buffer if
* needed.
*/
- for (i = 0; i < nr_slots(alloc_size + offset); i++)
- mem->slots[index + i].orig_addr = slot_addr(orig_addr, i);
- tlb_addr = slot_addr(mem->start, index) + offset;
- if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
- (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
- swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE);
+ pad_slots = offset >> IO_TLB_SHIFT;
+ offset &= (IO_TLB_SIZE - 1);
+ index += pad_slots;
+ pool->slots[index].pad_slots = pad_slots;
+ for (i = 0; i < (nr_slots(size) - pad_slots); i++)
+ pool->slots[index + i].orig_addr = slot_addr(orig_addr, i);
+ tlb_addr = slot_addr(pool->start, index) + offset;
+ /*
+ * When the device is writing memory, i.e. dir == DMA_FROM_DEVICE, copy
+ * the original buffer to the TLB buffer before initiating DMA in order
+ * to preserve the original's data if the device does a partial write,
+ * i.e. if the device doesn't overwrite the entire buffer. Preserving
+ * the original data, even if it's garbage, is necessary to match
+ * hardware behavior. Use of swiotlb is supposed to be transparent,
+ * i.e. swiotlb must not corrupt memory by clobbering unwritten bytes.
+ */
+ swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE, pool);
return tlb_addr;
}
-static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
+static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr,
+ struct io_tlb_pool *mem)
{
- struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
unsigned long flags;
- unsigned int offset = swiotlb_align_offset(dev, tlb_addr);
- int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT;
- int nslots = nr_slots(mem->slots[index].alloc_size + offset);
+ unsigned int offset = swiotlb_align_offset(dev, 0, tlb_addr);
+ int index, nslots, aindex;
+ struct io_tlb_area *area;
int count, i;
+ index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT;
+ index -= mem->slots[index].pad_slots;
+ nslots = nr_slots(mem->slots[index].alloc_size + offset);
+ aindex = index / mem->area_nslabs;
+ area = &mem->areas[aindex];
+
/*
* Return the buffer to the free list by setting the corresponding
* entries to indicate the number of contiguous entries available.
* While returning the entries to the free list, we merge the entries
* with slots below and above the pool being returned.
*/
- spin_lock_irqsave(&mem->lock, flags);
+ BUG_ON(aindex >= mem->nareas);
+
+ spin_lock_irqsave(&area->lock, flags);
if (index + nslots < ALIGN(index + 1, IO_TLB_SEGSIZE))
count = mem->slots[index + nslots].list;
else
@@ -662,6 +1475,7 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
mem->slots[i].list = ++count;
mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
mem->slots[i].alloc_size = 0;
+ mem->slots[i].pad_slots = 0;
}
/*
@@ -672,41 +1486,83 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && mem->slots[i].list;
i--)
mem->slots[i].list = ++count;
- mem->used -= nslots;
- spin_unlock_irqrestore(&mem->lock, flags);
+ area->used -= nslots;
+ spin_unlock_irqrestore(&area->lock, flags);
+
+ dec_used(dev->dma_io_tlb_mem, nslots);
}
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+
+/**
+ * swiotlb_del_transient() - delete a transient memory pool
+ * @dev: Device which mapped the buffer.
+ * @tlb_addr: Physical address within a bounce buffer.
+ * @pool: Pointer to the transient memory pool to be checked and deleted.
+ *
+ * Check whether the address belongs to a transient SWIOTLB memory pool.
+ * If yes, then delete the pool.
+ *
+ * Return: %true if @tlb_addr belonged to a transient pool that was released.
+ */
+static bool swiotlb_del_transient(struct device *dev, phys_addr_t tlb_addr,
+ struct io_tlb_pool *pool)
+{
+ if (!pool->transient)
+ return false;
+
+ dec_used(dev->dma_io_tlb_mem, pool->nslabs);
+ swiotlb_del_pool(dev, pool);
+ dec_transient_used(dev->dma_io_tlb_mem, pool->nslabs);
+ return true;
+}
+
+#else /* !CONFIG_SWIOTLB_DYNAMIC */
+
+static inline bool swiotlb_del_transient(struct device *dev,
+ phys_addr_t tlb_addr, struct io_tlb_pool *pool)
+{
+ return false;
+}
+
+#endif /* CONFIG_SWIOTLB_DYNAMIC */
+
/*
* tlb_addr is the physical address of the bounce buffer to unmap.
*/
-void swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr,
- size_t mapping_size, enum dma_data_direction dir,
- unsigned long attrs)
+void __swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr,
+ size_t mapping_size, enum dma_data_direction dir,
+ unsigned long attrs, struct io_tlb_pool *pool)
{
/*
* First, sync the memory before unmapping the entry
*/
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
- swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_FROM_DEVICE);
+ swiotlb_bounce(dev, tlb_addr, mapping_size,
+ DMA_FROM_DEVICE, pool);
- swiotlb_release_slots(dev, tlb_addr);
+ if (swiotlb_del_transient(dev, tlb_addr, pool))
+ return;
+ swiotlb_release_slots(dev, tlb_addr, pool);
}
-void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr,
- size_t size, enum dma_data_direction dir)
+void __swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr,
+ size_t size, enum dma_data_direction dir,
+ struct io_tlb_pool *pool)
{
if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
- swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE);
+ swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE, pool);
else
BUG_ON(dir != DMA_FROM_DEVICE);
}
-void swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr,
- size_t size, enum dma_data_direction dir)
+void __swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr,
+ size_t size, enum dma_data_direction dir,
+ struct io_tlb_pool *pool)
{
if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
- swiotlb_bounce(dev, tlb_addr, size, DMA_FROM_DEVICE);
+ swiotlb_bounce(dev, tlb_addr, size, DMA_FROM_DEVICE, pool);
else
BUG_ON(dir != DMA_TO_DEVICE);
}
@@ -721,19 +1577,18 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
phys_addr_t swiotlb_addr;
dma_addr_t dma_addr;
- trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size,
- swiotlb_force);
+ trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size);
- swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, 0, dir,
- attrs);
+ swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, 0, dir, attrs);
if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR)
return DMA_MAPPING_ERROR;
/* Ensure that the address returned is DMA'ble */
dma_addr = phys_to_dma_unencrypted(dev, swiotlb_addr);
if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
- swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, dir,
- attrs | DMA_ATTR_SKIP_CPU_SYNC);
+ __swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, dir,
+ attrs | DMA_ATTR_SKIP_CPU_SYNC,
+ swiotlb_find_pool(dev, swiotlb_addr));
dev_WARN_ONCE(dev, 1,
"swiotlb addr %pad+%zu overflow (mask %llx, bus limit %llx).\n",
&dma_addr, size, *dev->dma_mask, dev->bus_dma_limit);
@@ -747,7 +1602,26 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
size_t swiotlb_max_mapping_size(struct device *dev)
{
- return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE;
+ int min_align_mask = dma_get_min_align_mask(dev);
+ int min_align = 0;
+
+ /*
+ * swiotlb_find_slots() skips slots according to
+ * min align mask. This affects max mapping size.
+ * Take it into acount here.
+ */
+ if (min_align_mask)
+ min_align = roundup(min_align_mask, IO_TLB_SIZE);
+
+ return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE - min_align;
+}
+
+/**
+ * is_swiotlb_allocated() - check if the default software IO TLB is initialized
+ */
+bool is_swiotlb_allocated(void)
+{
+ return io_tlb_default_mem.nslabs;
}
bool is_swiotlb_active(struct device *dev)
@@ -756,63 +1630,145 @@ bool is_swiotlb_active(struct device *dev)
return mem && mem->nslabs;
}
-EXPORT_SYMBOL_GPL(is_swiotlb_active);
-#ifdef CONFIG_DEBUG_FS
-static struct dentry *debugfs_dir;
+/**
+ * default_swiotlb_base() - get the base address of the default SWIOTLB
+ *
+ * Get the lowest physical address used by the default software IO TLB pool.
+ */
+phys_addr_t default_swiotlb_base(void)
+{
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ io_tlb_default_mem.can_grow = false;
+#endif
+ return io_tlb_default_mem.defpool.start;
+}
-static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem)
+/**
+ * default_swiotlb_limit() - get the address limit of the default SWIOTLB
+ *
+ * Get the highest physical address used by the default software IO TLB pool.
+ */
+phys_addr_t default_swiotlb_limit(void)
{
- debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs);
- debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &mem->used);
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ return io_tlb_default_mem.phys_limit;
+#else
+ return io_tlb_default_mem.defpool.end - 1;
+#endif
}
-static int __init swiotlb_create_default_debugfs(void)
+#ifdef CONFIG_DEBUG_FS
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+static unsigned long mem_transient_used(struct io_tlb_mem *mem)
{
- struct io_tlb_mem *mem = &io_tlb_default_mem;
+ return atomic_long_read(&mem->transient_nslabs);
+}
- debugfs_dir = debugfs_create_dir("swiotlb", NULL);
- if (mem->nslabs) {
- mem->debugfs = debugfs_dir;
- swiotlb_create_debugfs_files(mem);
- }
+static int io_tlb_transient_used_get(void *data, u64 *val)
+{
+ struct io_tlb_mem *mem = data;
+
+ *val = mem_transient_used(mem);
return 0;
}
-late_initcall(swiotlb_create_default_debugfs);
+DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_transient_used, io_tlb_transient_used_get,
+ NULL, "%llu\n");
+#endif /* CONFIG_SWIOTLB_DYNAMIC */
-#endif
+static int io_tlb_used_get(void *data, u64 *val)
+{
+ struct io_tlb_mem *mem = data;
-#ifdef CONFIG_DMA_RESTRICTED_POOL
+ *val = mem_used(mem);
+ return 0;
+}
-#ifdef CONFIG_DEBUG_FS
-static void rmem_swiotlb_debugfs_init(struct reserved_mem *rmem)
+static int io_tlb_hiwater_get(void *data, u64 *val)
{
- struct io_tlb_mem *mem = rmem->priv;
+ struct io_tlb_mem *mem = data;
- mem->debugfs = debugfs_create_dir(rmem->name, debugfs_dir);
- swiotlb_create_debugfs_files(mem);
+ *val = atomic_long_read(&mem->used_hiwater);
+ return 0;
}
-#else
-static void rmem_swiotlb_debugfs_init(struct reserved_mem *rmem)
+
+static int io_tlb_hiwater_set(void *data, u64 val)
{
+ struct io_tlb_mem *mem = data;
+
+ /* Only allow setting to zero */
+ if (val != 0)
+ return -EINVAL;
+
+ atomic_long_set(&mem->used_hiwater, val);
+ return 0;
}
+
+DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_used, io_tlb_used_get, NULL, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_hiwater, io_tlb_hiwater_get,
+ io_tlb_hiwater_set, "%llu\n");
+
+static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
+ const char *dirname)
+{
+ mem->debugfs = debugfs_create_dir(dirname, io_tlb_default_mem.debugfs);
+ if (!mem->nslabs)
+ return;
+
+ debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs);
+ debugfs_create_file("io_tlb_used", 0400, mem->debugfs, mem,
+ &fops_io_tlb_used);
+ debugfs_create_file("io_tlb_used_hiwater", 0600, mem->debugfs, mem,
+ &fops_io_tlb_hiwater);
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ debugfs_create_file("io_tlb_transient_nslabs", 0400, mem->debugfs,
+ mem, &fops_io_tlb_transient_used);
#endif
+}
+
+static int __init swiotlb_create_default_debugfs(void)
+{
+ swiotlb_create_debugfs_files(&io_tlb_default_mem, "swiotlb");
+ return 0;
+}
+
+late_initcall(swiotlb_create_default_debugfs);
+
+#else /* !CONFIG_DEBUG_FS */
+
+static inline void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
+ const char *dirname)
+{
+}
+
+#endif /* CONFIG_DEBUG_FS */
+
+#ifdef CONFIG_DMA_RESTRICTED_POOL
struct page *swiotlb_alloc(struct device *dev, size_t size)
{
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ struct io_tlb_pool *pool;
phys_addr_t tlb_addr;
+ unsigned int align;
int index;
if (!mem)
return NULL;
- index = swiotlb_find_slots(dev, 0, size, 0);
+ align = (1 << (get_order(size) + PAGE_SHIFT)) - 1;
+ index = swiotlb_find_slots(dev, 0, size, align, &pool);
if (index == -1)
return NULL;
- tlb_addr = slot_addr(mem->start, index);
+ tlb_addr = slot_addr(pool->start, index);
+ if (unlikely(!PAGE_ALIGNED(tlb_addr))) {
+ dev_WARN_ONCE(dev, 1, "Cannot allocate pages from non page-aligned swiotlb addr 0x%pa.\n",
+ &tlb_addr);
+ swiotlb_release_slots(dev, tlb_addr, pool);
+ return NULL;
+ }
return pfn_to_page(PFN_DOWN(tlb_addr));
}
@@ -820,11 +1776,13 @@ struct page *swiotlb_alloc(struct device *dev, size_t size)
bool swiotlb_free(struct device *dev, struct page *page, size_t size)
{
phys_addr_t tlb_addr = page_to_phys(page);
+ struct io_tlb_pool *pool;
- if (!is_swiotlb_buffer(dev, tlb_addr))
+ pool = swiotlb_find_pool(dev, tlb_addr);
+ if (!pool)
return false;
- swiotlb_release_slots(dev, tlb_addr);
+ swiotlb_release_slots(dev, tlb_addr, pool);
return true;
}
@@ -835,32 +1793,56 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
struct io_tlb_mem *mem = rmem->priv;
unsigned long nslabs = rmem->size >> IO_TLB_SHIFT;
+ /* Set Per-device io tlb area to one */
+ unsigned int nareas = 1;
+
+ if (PageHighMem(pfn_to_page(PHYS_PFN(rmem->base)))) {
+ dev_err(dev, "Restricted DMA pool must be accessible within the linear mapping.");
+ return -EINVAL;
+ }
+
/*
* Since multiple devices can share the same pool, the private data,
* io_tlb_mem struct, will be initialized by the first device attached
* to it.
*/
if (!mem) {
+ struct io_tlb_pool *pool;
+
mem = kzalloc(sizeof(*mem), GFP_KERNEL);
if (!mem)
return -ENOMEM;
+ pool = &mem->defpool;
- mem->slots = kzalloc(array_size(sizeof(*mem->slots), nslabs),
- GFP_KERNEL);
- if (!mem->slots) {
+ pool->slots = kcalloc(nslabs, sizeof(*pool->slots), GFP_KERNEL);
+ if (!pool->slots) {
+ kfree(mem);
+ return -ENOMEM;
+ }
+
+ pool->areas = kcalloc(nareas, sizeof(*pool->areas),
+ GFP_KERNEL);
+ if (!pool->areas) {
+ kfree(pool->slots);
kfree(mem);
return -ENOMEM;
}
set_memory_decrypted((unsigned long)phys_to_virt(rmem->base),
rmem->size >> PAGE_SHIFT);
- swiotlb_init_io_tlb_mem(mem, rmem->base, nslabs, false);
+ swiotlb_init_io_tlb_pool(pool, rmem->base, nslabs,
+ false, nareas);
mem->force_bounce = true;
mem->for_alloc = true;
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ spin_lock_init(&mem->lock);
+ INIT_LIST_HEAD_RCU(&mem->pools);
+#endif
+ add_mem_pool(mem, pool);
rmem->priv = mem;
- rmem_swiotlb_debugfs_init(rmem);
+ swiotlb_create_debugfs_files(mem, rmem->name);
}
dev->dma_io_tlb_mem = mem;
@@ -889,11 +1871,6 @@ static int __init rmem_swiotlb_setup(struct reserved_mem *rmem)
of_get_flat_dt_prop(node, "no-map", NULL))
return -EINVAL;
- if (PageHighMem(pfn_to_page(PHYS_PFN(rmem->base)))) {
- pr_err("Restricted DMA pool must be accessible within the linear mapping.");
- return -EINVAL;
- }
-
rmem->ops = &rmem_swiotlb_ops;
pr_info("Reserved memory: created restricted DMA pool at %pa, size %ld MiB\n",
&rmem->base, (unsigned long)rmem->size / SZ_1M);
diff --git a/kernel/crash_dump.c b/kernel/elfcorehdr.c
index 92da32275af5..92da32275af5 100644
--- a/kernel/crash_dump.c
+++ b/kernel/elfcorehdr.c
diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile
index 095c775e001e..2333d70802e4 100644
--- a/kernel/entry/Makefile
+++ b/kernel/entry/Makefile
@@ -6,8 +6,12 @@ KASAN_SANITIZE := n
UBSAN_SANITIZE := n
KCOV_INSTRUMENT := n
+# Branch profiling isn't noinstr-safe
+ccflags-$(CONFIG_TRACE_BRANCH_PROFILING) += -DDISABLE_BRANCH_PROFILING
+
CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong
CFLAGS_common.o += -fno-stack-protector
-obj-$(CONFIG_GENERIC_ENTRY) += common.o syscall_user_dispatch.o
-obj-$(CONFIG_KVM_XFER_TO_GUEST_WORK) += kvm.o
+obj-$(CONFIG_GENERIC_IRQ_ENTRY) += common.o
+obj-$(CONFIG_GENERIC_SYSCALL) += syscall-common.o syscall_user_dispatch.o
+obj-$(CONFIG_VIRT_XFER_TO_GUEST_WORK) += virt.o
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index bad713684c2e..5c792b30c58a 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -1,165 +1,34 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/context_tracking.h>
-#include <linux/entry-common.h>
+#include <linux/irq-entry-common.h>
+#include <linux/resume_user_mode.h>
#include <linux/highmem.h>
+#include <linux/jump_label.h>
+#include <linux/kmsan.h>
#include <linux/livepatch.h>
-#include <linux/audit.h>
#include <linux/tick.h>
-#include "common.h"
-
-#define CREATE_TRACE_POINTS
-#include <trace/events/syscalls.h>
-
-/* See comment for enter_from_user_mode() in entry-common.h */
-static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
-{
- arch_check_user_regs(regs);
- lockdep_hardirqs_off(CALLER_ADDR0);
-
- CT_WARN_ON(ct_state() != CONTEXT_USER);
- user_exit_irqoff();
-
- instrumentation_begin();
- trace_hardirqs_off_finish();
- instrumentation_end();
-}
-
-void noinstr enter_from_user_mode(struct pt_regs *regs)
-{
- __enter_from_user_mode(regs);
-}
-
-static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
-{
- if (unlikely(audit_context())) {
- unsigned long args[6];
-
- syscall_get_arguments(current, regs, args);
- audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
- }
-}
-
-static long syscall_trace_enter(struct pt_regs *regs, long syscall,
- unsigned long work)
-{
- long ret = 0;
-
- /*
- * Handle Syscall User Dispatch. This must comes first, since
- * the ABI here can be something that doesn't make sense for
- * other syscall_work features.
- */
- if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
- if (syscall_user_dispatch(regs))
- return -1L;
- }
-
- /* Handle ptrace */
- if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
- ret = arch_syscall_enter_tracehook(regs);
- if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
- return -1L;
- }
-
- /* Do seccomp after ptrace, to catch any tracer changes. */
- if (work & SYSCALL_WORK_SECCOMP) {
- ret = __secure_computing(NULL);
- if (ret == -1L)
- return ret;
- }
-
- /* Either of the above might have changed the syscall number */
- syscall = syscall_get_nr(current, regs);
-
- if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
- trace_sys_enter(regs, syscall);
-
- syscall_enter_audit(regs, syscall);
-
- return ret ? : syscall;
-}
-
-static __always_inline long
-__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
-{
- unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
-
- if (work & SYSCALL_WORK_ENTER)
- syscall = syscall_trace_enter(regs, syscall, work);
-
- return syscall;
-}
-
-long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
-{
- return __syscall_enter_from_user_work(regs, syscall);
-}
-
-noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
-{
- long ret;
-
- __enter_from_user_mode(regs);
-
- instrumentation_begin();
- local_irq_enable();
- ret = __syscall_enter_from_user_work(regs, syscall);
- instrumentation_end();
-
- return ret;
-}
-
-noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
-{
- __enter_from_user_mode(regs);
- instrumentation_begin();
- local_irq_enable();
- instrumentation_end();
-}
-
-/* See comment for exit_to_user_mode() in entry-common.h */
-static __always_inline void __exit_to_user_mode(void)
-{
- instrumentation_begin();
- trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare(CALLER_ADDR0);
- instrumentation_end();
-
- user_enter_irqoff();
- arch_exit_to_user_mode();
- lockdep_hardirqs_on(CALLER_ADDR0);
-}
-
-void noinstr exit_to_user_mode(void)
-{
- __exit_to_user_mode();
-}
-
/* Workaround to allow gradual conversion of architecture code */
-void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { }
-
-static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work)
-{
- if (ti_work & _TIF_NOTIFY_SIGNAL)
- tracehook_notify_signal();
+void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
- arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING);
-}
+#ifdef CONFIG_HAVE_GENERIC_TIF_BITS
+#define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK & ~_TIF_RSEQ)
+#else
+#define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK)
+#endif
-static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
- unsigned long ti_work)
+static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *regs,
+ unsigned long ti_work)
{
/*
* Before returning to user space ensure that all pending work
* items have been completed.
*/
- while (ti_work & EXIT_TO_USER_MODE_WORK) {
+ while (ti_work & EXIT_TO_USER_MODE_WORK_LOOP) {
local_irq_enable_exit_to_user(ti_work);
- if (ti_work & _TIF_NEED_RESCHED)
+ if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
schedule();
if (ti_work & _TIF_UPROBE)
@@ -169,10 +38,10 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
klp_update_patch_state(current);
if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
- handle_signal_work(regs, ti_work);
+ arch_do_signal_or_restart(regs);
if (ti_work & _TIF_NOTIFY_RESUME)
- tracehook_notify_resume(regs);
+ resume_user_mode_work(regs);
/* Architecture specific TIF work */
arch_exit_to_user_mode_work(regs, ti_work);
@@ -194,125 +63,21 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
return ti_work;
}
-static void exit_to_user_mode_prepare(struct pt_regs *regs)
-{
- unsigned long ti_work = read_thread_flags();
-
- lockdep_assert_irqs_disabled();
-
- /* Flush pending rcuog wakeup before the last need_resched() check */
- tick_nohz_user_enter_prepare();
-
- if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
- ti_work = exit_to_user_mode_loop(regs, ti_work);
-
- arch_exit_to_user_mode_prepare(regs, ti_work);
-
- /* Ensure that the address limit is intact and no locks are held */
- addr_limit_user_check();
- kmap_assert_nomap();
- lockdep_assert_irqs_disabled();
- lockdep_sys_exit();
-}
-
-/*
- * If SYSCALL_EMU is set, then the only reason to report is when
- * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
- * instruction has been already reported in syscall_enter_from_user_mode().
+/**
+ * exit_to_user_mode_loop - do any pending work before leaving to user space
+ * @regs: Pointer to pt_regs on entry stack
+ * @ti_work: TIF work flags as read by the caller
*/
-static inline bool report_single_step(unsigned long work)
-{
- if (work & SYSCALL_WORK_SYSCALL_EMU)
- return false;
-
- return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
-}
-
-static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
+__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
+ unsigned long ti_work)
{
- bool step;
+ for (;;) {
+ ti_work = __exit_to_user_mode_loop(regs, ti_work);
- /*
- * If the syscall was rolled back due to syscall user dispatching,
- * then the tracers below are not invoked for the same reason as
- * the entry side was not invoked in syscall_trace_enter(): The ABI
- * of these syscalls is unknown.
- */
- if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
- if (unlikely(current->syscall_dispatch.on_dispatch)) {
- current->syscall_dispatch.on_dispatch = false;
- return;
- }
+ if (likely(!rseq_exit_to_user_mode_restart(regs, ti_work)))
+ return ti_work;
+ ti_work = read_thread_flags();
}
-
- audit_syscall_exit(regs);
-
- if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
- trace_sys_exit(regs, syscall_get_return_value(current, regs));
-
- step = report_single_step(work);
- if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
- arch_syscall_exit_tracehook(regs, step);
-}
-
-/*
- * Syscall specific exit to user mode preparation. Runs with interrupts
- * enabled.
- */
-static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
-{
- unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
- unsigned long nr = syscall_get_nr(current, regs);
-
- CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
-
- if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
- if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
- local_irq_enable();
- }
-
- rseq_syscall(regs);
-
- /*
- * Do one-time syscall specific work. If these work items are
- * enabled, we want to run them exactly once per syscall exit with
- * interrupts enabled.
- */
- if (unlikely(work & SYSCALL_WORK_EXIT))
- syscall_exit_work(regs, work);
-}
-
-static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs)
-{
- syscall_exit_to_user_mode_prepare(regs);
- local_irq_disable_exit_to_user();
- exit_to_user_mode_prepare(regs);
-}
-
-void syscall_exit_to_user_mode_work(struct pt_regs *regs)
-{
- __syscall_exit_to_user_mode_work(regs);
-}
-
-__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
-{
- instrumentation_begin();
- __syscall_exit_to_user_mode_work(regs);
- instrumentation_end();
- __exit_to_user_mode();
-}
-
-noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
-{
- __enter_from_user_mode(regs);
-}
-
-noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
-{
- instrumentation_begin();
- exit_to_user_mode_prepare(regs);
- instrumentation_end();
- __exit_to_user_mode();
}
noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
@@ -327,7 +92,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
}
/*
- * If this entry hit the idle task invoke rcu_irq_enter() whether
+ * If this entry hit the idle task invoke ct_irq_enter() whether
* RCU is watching or not.
*
* Interrupts can nest when the first interrupt invokes softirq
@@ -338,26 +103,28 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
* not nested into another interrupt.
*
* Checking for rcu_is_watching() here would prevent the nesting
- * interrupt to invoke rcu_irq_enter(). If that nested interrupt is
+ * interrupt to invoke ct_irq_enter(). If that nested interrupt is
* the tick then rcu_flavor_sched_clock_irq() would wrongfully
* assume that it is the first interrupt and eventually claim
* quiescent state and end grace periods prematurely.
*
- * Unconditionally invoke rcu_irq_enter() so RCU state stays
+ * Unconditionally invoke ct_irq_enter() so RCU state stays
* consistent.
*
* TINY_RCU does not support EQS, so let the compiler eliminate
* this part when enabled.
*/
- if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
+ if (!IS_ENABLED(CONFIG_TINY_RCU) &&
+ (is_idle_task(current) || arch_in_rcu_eqs())) {
/*
* If RCU is not watching then the same careful
* sequence vs. lockdep and tracing is required
* as in irqentry_enter_from_user_mode().
*/
lockdep_hardirqs_off(CALLER_ADDR0);
- rcu_irq_enter();
+ ct_irq_enter();
instrumentation_begin();
+ kmsan_unpoison_entry_regs(regs);
trace_hardirqs_off_finish();
instrumentation_end();
@@ -373,6 +140,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
*/
lockdep_hardirqs_off(CALLER_ADDR0);
instrumentation_begin();
+ kmsan_unpoison_entry_regs(regs);
rcu_irq_enter_check_tick();
trace_hardirqs_off_finish();
instrumentation_end();
@@ -380,19 +148,43 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
return ret;
}
-void irqentry_exit_cond_resched(void)
+/**
+ * arch_irqentry_exit_need_resched - Architecture specific need resched function
+ *
+ * Invoked from raw_irqentry_exit_cond_resched() to check if resched is needed.
+ * Defaults return true.
+ *
+ * The main purpose is to permit arch to avoid preemption of a task from an IRQ.
+ */
+static inline bool arch_irqentry_exit_need_resched(void);
+
+#ifndef arch_irqentry_exit_need_resched
+static inline bool arch_irqentry_exit_need_resched(void) { return true; }
+#endif
+
+void raw_irqentry_exit_cond_resched(void)
{
if (!preempt_count()) {
/* Sanity check RCU and thread stack */
rcu_irq_exit_check_preempt();
if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
WARN_ON_ONCE(!on_thread_stack());
- if (need_resched())
+ if (need_resched() && arch_irqentry_exit_need_resched())
preempt_schedule_irq();
}
}
#ifdef CONFIG_PREEMPT_DYNAMIC
-DEFINE_STATIC_CALL(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+DEFINE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
+void dynamic_irqentry_exit_cond_resched(void)
+{
+ if (!static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched))
+ return;
+ raw_irqentry_exit_cond_resched();
+}
+#endif
#endif
noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
@@ -412,21 +204,17 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
instrumentation_begin();
/* Tell the tracer that IRET will enable interrupts */
trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ lockdep_hardirqs_on_prepare();
instrumentation_end();
- rcu_irq_exit();
+ ct_irq_exit();
lockdep_hardirqs_on(CALLER_ADDR0);
return;
}
instrumentation_begin();
- if (IS_ENABLED(CONFIG_PREEMPTION)) {
-#ifdef CONFIG_PREEMPT_DYNAMIC
- static_call(irqentry_exit_cond_resched)();
-#else
+ if (IS_ENABLED(CONFIG_PREEMPTION))
irqentry_exit_cond_resched();
-#endif
- }
+
/* Covers both tracing and lockdep */
trace_hardirqs_on();
instrumentation_end();
@@ -436,7 +224,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
* was not watching on entry.
*/
if (state.exit_rcu)
- rcu_irq_exit();
+ ct_irq_exit();
}
}
@@ -449,9 +237,10 @@ irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
__nmi_enter();
lockdep_hardirqs_off(CALLER_ADDR0);
lockdep_hardirq_enter();
- rcu_nmi_enter();
+ ct_nmi_enter();
instrumentation_begin();
+ kmsan_unpoison_entry_regs(regs);
trace_hardirqs_off_finish();
ftrace_nmi_enter();
instrumentation_end();
@@ -465,11 +254,11 @@ void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
ftrace_nmi_exit();
if (irq_state.lockdep) {
trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ lockdep_hardirqs_on_prepare();
}
instrumentation_end();
- rcu_nmi_exit();
+ ct_nmi_exit();
lockdep_hardirq_exit();
if (irq_state.lockdep)
lockdep_hardirqs_on(CALLER_ADDR0);
diff --git a/kernel/entry/syscall-common.c b/kernel/entry/syscall-common.c
new file mode 100644
index 000000000000..940a597ded40
--- /dev/null
+++ b/kernel/entry/syscall-common.c
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/audit.h>
+#include <linux/entry-common.h>
+#include "common.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/syscalls.h>
+
+static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
+{
+ if (unlikely(audit_context())) {
+ unsigned long args[6];
+
+ syscall_get_arguments(current, regs, args);
+ audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
+ }
+}
+
+long syscall_trace_enter(struct pt_regs *regs, long syscall,
+ unsigned long work)
+{
+ long ret = 0;
+
+ /*
+ * Handle Syscall User Dispatch. This must comes first, since
+ * the ABI here can be something that doesn't make sense for
+ * other syscall_work features.
+ */
+ if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
+ if (syscall_user_dispatch(regs))
+ return -1L;
+ }
+
+ /* Handle ptrace */
+ if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
+ ret = ptrace_report_syscall_entry(regs);
+ if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
+ return -1L;
+ }
+
+ /* Do seccomp after ptrace, to catch any tracer changes. */
+ if (work & SYSCALL_WORK_SECCOMP) {
+ ret = __secure_computing();
+ if (ret == -1L)
+ return ret;
+ }
+
+ /* Either of the above might have changed the syscall number */
+ syscall = syscall_get_nr(current, regs);
+
+ if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) {
+ trace_sys_enter(regs, syscall);
+ /*
+ * Probes or BPF hooks in the tracepoint may have changed the
+ * system call number as well.
+ */
+ syscall = syscall_get_nr(current, regs);
+ }
+
+ syscall_enter_audit(regs, syscall);
+
+ return ret ? : syscall;
+}
+
+/*
+ * If SYSCALL_EMU is set, then the only reason to report is when
+ * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
+ * instruction has been already reported in syscall_enter_from_user_mode().
+ */
+static inline bool report_single_step(unsigned long work)
+{
+ if (work & SYSCALL_WORK_SYSCALL_EMU)
+ return false;
+
+ return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
+}
+
+void syscall_exit_work(struct pt_regs *regs, unsigned long work)
+{
+ bool step;
+
+ /*
+ * If the syscall was rolled back due to syscall user dispatching,
+ * then the tracers below are not invoked for the same reason as
+ * the entry side was not invoked in syscall_trace_enter(): The ABI
+ * of these syscalls is unknown.
+ */
+ if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
+ if (unlikely(current->syscall_dispatch.on_dispatch)) {
+ current->syscall_dispatch.on_dispatch = false;
+ return;
+ }
+ }
+
+ audit_syscall_exit(regs);
+
+ if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
+ trace_sys_exit(regs, syscall_get_return_value(current, regs));
+
+ step = report_single_step(work);
+ if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
+ ptrace_report_syscall_exit(regs, step);
+}
diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c
index 0b6379adff6b..a9055eccb27e 100644
--- a/kernel/entry/syscall_user_dispatch.c
+++ b/kernel/entry/syscall_user_dispatch.c
@@ -4,6 +4,7 @@
*/
#include <linux/sched.h>
#include <linux/prctl.h>
+#include <linux/ptrace.h>
#include <linux/syscall_user_dispatch.h>
#include <linux/uaccess.h>
#include <linux/signal.h>
@@ -68,15 +69,16 @@ bool syscall_user_dispatch(struct pt_regs *regs)
return true;
}
-int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
- unsigned long len, char __user *selector)
+static int task_set_syscall_user_dispatch(struct task_struct *task, unsigned long mode,
+ unsigned long offset, unsigned long len,
+ char __user *selector)
{
switch (mode) {
case PR_SYS_DISPATCH_OFF:
if (offset || len || selector)
return -EINVAL;
break;
- case PR_SYS_DISPATCH_ON:
+ case PR_SYS_DISPATCH_EXCLUSIVE_ON:
/*
* Validate the direct dispatcher region just for basic
* sanity against overflow and a 0-sized dispatcher
@@ -85,24 +87,88 @@ int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
*/
if (offset && offset + len <= offset)
return -EINVAL;
-
- if (selector && !access_ok(selector, sizeof(*selector)))
- return -EFAULT;
-
+ break;
+ case PR_SYS_DISPATCH_INCLUSIVE_ON:
+ if (len == 0 || offset + len <= offset)
+ return -EINVAL;
+ /*
+ * Invert the range, the check in syscall_user_dispatch()
+ * supports wrap-around.
+ */
+ offset = offset + len;
+ len = -len;
break;
default:
return -EINVAL;
}
- current->syscall_dispatch.selector = selector;
- current->syscall_dispatch.offset = offset;
- current->syscall_dispatch.len = len;
- current->syscall_dispatch.on_dispatch = false;
+ /*
+ * access_ok() will clear memory tags for tagged addresses
+ * if current has memory tagging enabled.
+ *
+ * To enable a tracer to set a tracees selector the
+ * selector address must be untagged for access_ok(),
+ * otherwise an untagged tracer will always fail to set a
+ * tagged tracees selector.
+ */
+ if (mode != PR_SYS_DISPATCH_OFF && selector &&
+ !access_ok(untagged_addr(selector), sizeof(*selector)))
+ return -EFAULT;
+
+ task->syscall_dispatch.selector = selector;
+ task->syscall_dispatch.offset = offset;
+ task->syscall_dispatch.len = len;
+ task->syscall_dispatch.on_dispatch = false;
+
+ if (mode != PR_SYS_DISPATCH_OFF)
+ set_task_syscall_work(task, SYSCALL_USER_DISPATCH);
+ else
+ clear_task_syscall_work(task, SYSCALL_USER_DISPATCH);
+
+ return 0;
+}
+
+int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
+ unsigned long len, char __user *selector)
+{
+ return task_set_syscall_user_dispatch(current, mode, offset, len, selector);
+}
+
+int syscall_user_dispatch_get_config(struct task_struct *task, unsigned long size,
+ void __user *data)
+{
+ struct syscall_user_dispatch *sd = &task->syscall_dispatch;
+ struct ptrace_sud_config cfg;
+
+ if (size != sizeof(cfg))
+ return -EINVAL;
- if (mode == PR_SYS_DISPATCH_ON)
- set_syscall_work(SYSCALL_USER_DISPATCH);
+ if (test_task_syscall_work(task, SYSCALL_USER_DISPATCH))
+ cfg.mode = PR_SYS_DISPATCH_ON;
else
- clear_syscall_work(SYSCALL_USER_DISPATCH);
+ cfg.mode = PR_SYS_DISPATCH_OFF;
+
+ cfg.offset = sd->offset;
+ cfg.len = sd->len;
+ cfg.selector = (__u64)(uintptr_t)sd->selector;
+
+ if (copy_to_user(data, &cfg, sizeof(cfg)))
+ return -EFAULT;
return 0;
}
+
+int syscall_user_dispatch_set_config(struct task_struct *task, unsigned long size,
+ void __user *data)
+{
+ struct ptrace_sud_config cfg;
+
+ if (size != sizeof(cfg))
+ return -EINVAL;
+
+ if (copy_from_user(&cfg, data, sizeof(cfg)))
+ return -EFAULT;
+
+ return task_set_syscall_user_dispatch(task, cfg.mode, cfg.offset, cfg.len,
+ (char __user *)(uintptr_t)cfg.selector);
+}
diff --git a/kernel/entry/kvm.c b/kernel/entry/virt.c
index 96d476e06c77..c52f99249763 100644
--- a/kernel/entry/kvm.c
+++ b/kernel/entry/virt.c
@@ -1,37 +1,31 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/entry-kvm.h>
-#include <linux/kvm_host.h>
+#include <linux/entry-virt.h>
-static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
+static int xfer_to_guest_mode_work(unsigned long ti_work)
{
do {
int ret;
- if (ti_work & _TIF_NOTIFY_SIGNAL)
- tracehook_notify_signal();
-
- if (ti_work & _TIF_SIGPENDING) {
- kvm_handle_signal_exit(vcpu);
+ if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
return -EINTR;
- }
- if (ti_work & _TIF_NEED_RESCHED)
+ if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
schedule();
if (ti_work & _TIF_NOTIFY_RESUME)
- tracehook_notify_resume(NULL);
+ resume_user_mode_work(NULL);
- ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work);
+ ret = arch_xfer_to_guest_mode_handle_work(ti_work);
if (ret)
return ret;
ti_work = read_thread_flags();
- } while (ti_work & XFER_TO_GUEST_MODE_WORK || need_resched());
+ } while (ti_work & XFER_TO_GUEST_MODE_WORK);
return 0;
}
-int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu)
+int xfer_to_guest_mode_handle_work(void)
{
unsigned long ti_work;
@@ -47,6 +41,6 @@ int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu)
if (!(ti_work & XFER_TO_GUEST_MODE_WORK))
return 0;
- return xfer_to_guest_mode_work(vcpu, ti_work);
+ return xfer_to_guest_mode_work(ti_work);
}
EXPORT_SYMBOL_GPL(xfer_to_guest_mode_handle_work);
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 8591c180b52b..91a62f566743 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -2,4 +2,5 @@
obj-y := core.o ring_buffer.o callchain.o
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
+obj-$(CONFIG_HW_BREAKPOINT_KUNIT_TEST) += hw_breakpoint_test.o
obj-$(CONFIG_UPROBES) += uprobes.o
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 58cbe357fb2b..b9c7e00725d6 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -11,6 +11,7 @@
#include <linux/perf_event.h>
#include <linux/slab.h>
#include <linux/sched/task_stack.h>
+#include <linux/uprobes.h>
#include "internal.h"
@@ -21,6 +22,7 @@ struct callchain_cpus_entries {
int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
int sysctl_perf_event_max_contexts_per_stack __read_mostly = PERF_MAX_CONTEXTS_PER_STACK;
+static const int six_hundred_forty_kb = 640 * 1024;
static inline size_t perf_callchain_entry__sizeof(void)
{
@@ -29,7 +31,7 @@ static inline size_t perf_callchain_entry__sizeof(void)
sysctl_perf_event_max_contexts_per_stack));
}
-static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
+static DEFINE_PER_CPU(u8, callchain_recursion[PERF_NR_CONTEXTS]);
static atomic_t nr_callchain_events;
static DEFINE_MUTEX(callchain_mutex);
static struct callchain_cpus_entries *callchain_cpus_entries;
@@ -176,23 +178,65 @@ put_callchain_entry(int rctx)
put_recursion_context(this_cpu_ptr(callchain_recursion), rctx);
}
+static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entry,
+ int start_entry_idx)
+{
+#ifdef CONFIG_UPROBES
+ struct uprobe_task *utask = current->utask;
+ struct return_instance *ri;
+ __u64 *cur_ip, *last_ip, tramp_addr;
+
+ if (likely(!utask || !utask->return_instances))
+ return;
+
+ cur_ip = &entry->ip[start_entry_idx];
+ last_ip = &entry->ip[entry->nr - 1];
+ ri = utask->return_instances;
+ tramp_addr = uprobe_get_trampoline_vaddr();
+
+ /*
+ * If there are pending uretprobes for the current thread, they are
+ * recorded in a list inside utask->return_instances; each such
+ * pending uretprobe replaces traced user function's return address on
+ * the stack, so when stack trace is captured, instead of seeing
+ * actual function's return address, we'll have one or many uretprobe
+ * trampoline addresses in the stack trace, which are not helpful and
+ * misleading to users.
+ * So here we go over the pending list of uretprobes, and each
+ * encountered trampoline address is replaced with actual return
+ * address.
+ */
+ while (ri && cur_ip <= last_ip) {
+ if (*cur_ip == tramp_addr) {
+ *cur_ip = ri->orig_ret_vaddr;
+ ri = ri->next;
+ }
+ cur_ip++;
+ }
+#endif
+}
+
struct perf_callchain_entry *
-get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
- u32 max_stack, bool crosstask, bool add_mark)
+get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
+ u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie)
{
struct perf_callchain_entry *entry;
struct perf_callchain_entry_ctx ctx;
- int rctx;
+ int rctx, start_entry_idx;
+
+ /* crosstask is not supported for user stacks */
+ if (crosstask && user && !kernel)
+ return NULL;
entry = get_callchain_entry(&rctx);
if (!entry)
return NULL;
- ctx.entry = entry;
- ctx.max_stack = max_stack;
- ctx.nr = entry->nr = init_nr;
- ctx.contexts = 0;
- ctx.contexts_maxed = false;
+ ctx.entry = entry;
+ ctx.max_stack = max_stack;
+ ctx.nr = entry->nr = 0;
+ ctx.contexts = 0;
+ ctx.contexts_maxed = false;
if (kernel && !user_mode(regs)) {
if (add_mark)
@@ -200,27 +244,31 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
perf_callchain_kernel(&ctx, regs);
}
- if (user) {
+ if (user && !crosstask) {
if (!user_mode(regs)) {
- if (current->mm)
- regs = task_pt_regs(current);
- else
- regs = NULL;
+ if (current->flags & (PF_KTHREAD | PF_USER_WORKER))
+ goto exit_put;
+ regs = task_pt_regs(current);
}
- if (regs) {
- mm_segment_t fs;
-
- if (crosstask)
- goto exit_put;
+ if (defer_cookie) {
+ /*
+ * Foretell the coming of PERF_RECORD_CALLCHAIN_DEFERRED
+ * which can be stitched to this one, and add
+ * the cookie after it (it will be cut off when the
+ * user stack is copied to the callchain).
+ */
+ perf_callchain_store_context(&ctx, PERF_CONTEXT_USER_DEFERRED);
+ perf_callchain_store_context(&ctx, defer_cookie);
+ goto exit_put;
+ }
- if (add_mark)
- perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
+ if (add_mark)
+ perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
- fs = force_uaccess_begin();
- perf_callchain_user(&ctx, regs);
- force_uaccess_end(fs);
- }
+ start_entry_idx = entry->nr;
+ perf_callchain_user(&ctx, regs);
+ fixup_uretprobe_trampoline_entries(entry, start_entry_idx);
}
exit_put:
@@ -229,12 +277,8 @@ exit_put:
return entry;
}
-/*
- * Used for sysctl_perf_event_max_stack and
- * sysctl_perf_event_max_contexts_per_stack.
- */
-int perf_event_max_stack_handler(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+static int perf_event_max_stack_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int *value = table->data;
int new_value = *value, ret;
@@ -255,3 +299,32 @@ int perf_event_max_stack_handler(struct ctl_table *table, int write,
return ret;
}
+
+static const struct ctl_table callchain_sysctl_table[] = {
+ {
+ .procname = "perf_event_max_stack",
+ .data = &sysctl_perf_event_max_stack,
+ .maxlen = sizeof(sysctl_perf_event_max_stack),
+ .mode = 0644,
+ .proc_handler = perf_event_max_stack_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = (void *)&six_hundred_forty_kb,
+ },
+ {
+ .procname = "perf_event_max_contexts_per_stack",
+ .data = &sysctl_perf_event_max_contexts_per_stack,
+ .maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack),
+ .mode = 0644,
+ .proc_handler = perf_event_max_stack_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_THOUSAND,
+ },
+};
+
+static int __init init_callchain_sysctls(void)
+{
+ register_sysctl_init("kernel", callchain_sysctl_table);
+ return 0;
+}
+core_initcall(init_callchain_sysctls);
+
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fc18664f49b0..ece716879cbc 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -54,6 +54,9 @@
#include <linux/highmem.h>
#include <linux/pgtable.h>
#include <linux/buildid.h>
+#include <linux/task_work.h>
+#include <linux/percpu-rwsem.h>
+#include <linux/unwind_deferred.h>
#include "internal.h"
@@ -154,28 +157,70 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info)
return data.ret;
}
-static inline struct perf_cpu_context *
-__get_cpu_context(struct perf_event_context *ctx)
+enum event_type_t {
+ EVENT_FLEXIBLE = 0x01,
+ EVENT_PINNED = 0x02,
+ EVENT_TIME = 0x04,
+ EVENT_FROZEN = 0x08,
+ /* see ctx_resched() for details */
+ EVENT_CPU = 0x10,
+ EVENT_CGROUP = 0x20,
+
+ /* compound helpers */
+ EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
+ EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN,
+};
+
+static inline void __perf_ctx_lock(struct perf_event_context *ctx)
{
- return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
+ raw_spin_lock(&ctx->lock);
+ WARN_ON_ONCE(ctx->is_active & EVENT_FROZEN);
}
static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
- raw_spin_lock(&cpuctx->ctx.lock);
+ __perf_ctx_lock(&cpuctx->ctx);
if (ctx)
- raw_spin_lock(&ctx->lock);
+ __perf_ctx_lock(ctx);
+}
+
+static inline void __perf_ctx_unlock(struct perf_event_context *ctx)
+{
+ /*
+ * If ctx_sched_in() didn't again set any ALL flags, clean up
+ * after ctx_sched_out() by clearing is_active.
+ */
+ if (ctx->is_active & EVENT_FROZEN) {
+ if (!(ctx->is_active & EVENT_ALL))
+ ctx->is_active = 0;
+ else
+ ctx->is_active &= ~EVENT_FROZEN;
+ }
+ raw_spin_unlock(&ctx->lock);
}
static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
if (ctx)
- raw_spin_unlock(&ctx->lock);
- raw_spin_unlock(&cpuctx->ctx.lock);
+ __perf_ctx_unlock(ctx);
+ __perf_ctx_unlock(&cpuctx->ctx);
}
+typedef struct {
+ struct perf_cpu_context *cpuctx;
+ struct perf_event_context *ctx;
+} class_perf_ctx_lock_t;
+
+static inline void class_perf_ctx_lock_destructor(class_perf_ctx_lock_t *_T)
+{ perf_ctx_unlock(_T->cpuctx, _T->ctx); }
+
+static inline class_perf_ctx_lock_t
+class_perf_ctx_lock_constructor(struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{ perf_ctx_lock(cpuctx, ctx); return (class_perf_ctx_lock_t){ cpuctx, ctx }; }
+
#define TASK_TOMBSTONE ((void *)-1L)
static bool is_kernel_event(struct perf_event *event)
@@ -183,6 +228,14 @@ static bool is_kernel_event(struct perf_event *event)
return READ_ONCE(event->owner) == TASK_TOMBSTONE;
}
+static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
+
+struct perf_event_context *perf_cpu_task_ctx(void)
+{
+ lockdep_assert_irqs_disabled();
+ return this_cpu_ptr(&perf_cpu_context)->task_ctx;
+}
+
/*
* On task ctx scheduling...
*
@@ -216,7 +269,7 @@ static int event_function(void *info)
struct event_function_struct *efs = info;
struct perf_event *event = efs->event;
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
int ret = 0;
@@ -261,6 +314,7 @@ static void event_function_call(struct perf_event *event, event_f func, void *da
{
struct perf_event_context *ctx = event->ctx;
struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
+ struct perf_cpu_context *cpuctx;
struct event_function_struct efs = {
.event = event,
.func = func,
@@ -288,22 +342,25 @@ again:
if (!task_function_call(task, event_function, &efs))
return;
- raw_spin_lock_irq(&ctx->lock);
+ local_irq_disable();
+ cpuctx = this_cpu_ptr(&perf_cpu_context);
+ perf_ctx_lock(cpuctx, ctx);
/*
* Reload the task pointer, it might have been changed by
* a concurrent perf_event_context_sched_out().
*/
task = ctx->task;
- if (task == TASK_TOMBSTONE) {
- raw_spin_unlock_irq(&ctx->lock);
- return;
- }
+ if (task == TASK_TOMBSTONE)
+ goto unlock;
if (ctx->is_active) {
- raw_spin_unlock_irq(&ctx->lock);
+ perf_ctx_unlock(cpuctx, ctx);
+ local_irq_enable();
goto again;
}
func(event, NULL, ctx, data);
- raw_spin_unlock_irq(&ctx->lock);
+unlock:
+ perf_ctx_unlock(cpuctx, ctx);
+ local_irq_enable();
}
/*
@@ -313,7 +370,7 @@ again:
static void event_function_local(struct perf_event *event, event_f func, void *data)
{
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct task_struct *task = READ_ONCE(ctx->task);
struct perf_event_context *task_ctx = NULL;
@@ -366,18 +423,8 @@ unlock:
(PERF_SAMPLE_BRANCH_KERNEL |\
PERF_SAMPLE_BRANCH_HV)
-enum event_type_t {
- EVENT_FLEXIBLE = 0x1,
- EVENT_PINNED = 0x2,
- EVENT_TIME = 0x4,
- /* see ctx_resched() for details */
- EVENT_CPU = 0x8,
- EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
-};
-
/*
* perf_sched_events : >0 events exist
- * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
*/
static void perf_sched_delayed(struct work_struct *work);
@@ -386,8 +433,6 @@ static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
static DEFINE_MUTEX(perf_sched_mutex);
static atomic_t perf_sched_count;
-static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
-static DEFINE_PER_CPU(int, perf_sched_cb_usages);
static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
static atomic_t nr_mmap_events __read_mostly;
@@ -406,6 +451,11 @@ static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;
static cpumask_var_t perf_online_mask;
+static cpumask_var_t perf_online_core_mask;
+static cpumask_var_t perf_online_die_mask;
+static cpumask_var_t perf_online_cluster_mask;
+static cpumask_var_t perf_online_pkg_mask;
+static cpumask_var_t perf_online_sys_mask;
static struct kmem_cache *perf_event_cache;
/*
@@ -417,8 +467,8 @@ static struct kmem_cache *perf_event_cache;
*/
int sysctl_perf_event_paranoid __read_mostly = 2;
-/* Minimum for 512 kiB + 1 user control page */
-int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
+/* Minimum for 512 kiB + 1 user control page. 'free' kiB per user. */
+static int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024);
/*
* max perf event sample rate
@@ -428,6 +478,7 @@ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free'
#define DEFAULT_CPU_TIME_MAX_PERCENT 25
int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
+static int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
@@ -447,10 +498,10 @@ static void update_perf_cpu_limits(void)
WRITE_ONCE(perf_sample_allowed_ns, tmp);
}
-static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
+static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc);
-int perf_proc_update_handler(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+static int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
int perf_cpu = sysctl_perf_cpu_time_max_percent;
@@ -471,9 +522,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
return 0;
}
-int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
-
-int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
+static int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
@@ -493,6 +542,52 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
return 0;
}
+static const struct ctl_table events_core_sysctl_table[] = {
+ /*
+ * User-space relies on this file as a feature check for
+ * perf_events being enabled. It's an ABI, do not remove!
+ */
+ {
+ .procname = "perf_event_paranoid",
+ .data = &sysctl_perf_event_paranoid,
+ .maxlen = sizeof(sysctl_perf_event_paranoid),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "perf_event_mlock_kb",
+ .data = &sysctl_perf_event_mlock,
+ .maxlen = sizeof(sysctl_perf_event_mlock),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "perf_event_max_sample_rate",
+ .data = &sysctl_perf_event_sample_rate,
+ .maxlen = sizeof(sysctl_perf_event_sample_rate),
+ .mode = 0644,
+ .proc_handler = perf_event_max_sample_rate_handler,
+ .extra1 = SYSCTL_ONE,
+ },
+ {
+ .procname = "perf_cpu_time_max_percent",
+ .data = &sysctl_perf_cpu_time_max_percent,
+ .maxlen = sizeof(sysctl_perf_cpu_time_max_percent),
+ .mode = 0644,
+ .proc_handler = perf_cpu_time_max_percent_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
+ },
+};
+
+static int __init init_events_core_sysctls(void)
+{
+ register_sysctl_init("kernel", events_core_sysctl_table);
+ return 0;
+}
+core_initcall(init_events_core_sysctls);
+
+
/*
* perf samples are done in some very critical code paths (NMIs).
* If they take too much CPU time, the system can lock up and not
@@ -533,7 +628,7 @@ void perf_sample_event_took(u64 sample_len_ns)
__this_cpu_write(running_sample_length, running_len);
/*
- * Note: this will be biased artifically low until we have
+ * Note: this will be biased artificially low until we have
* seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
* from having to maintain a count.
*/
@@ -570,13 +665,6 @@ void perf_sample_event_took(u64 sample_len_ns)
static atomic64_t perf_event_id;
-static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type);
-
-static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type,
- struct task_struct *task);
-
static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);
@@ -602,10 +690,10 @@ static inline u64 perf_event_clock(struct perf_event *event)
*
* Event groups make things a little more complicated, but not terribly so. The
* rules for a group are that if the group leader is OFF the entire group is
- * OFF, irrespecive of what the group member states are. This results in
+ * OFF, irrespective of what the group member states are. This results in
* __perf_effective_state().
*
- * A futher ramification is that when a group leader flips between OFF and
+ * A further ramification is that when a group leader flips between OFF and
* !OFF, we need to update all group member times.
*
*
@@ -674,13 +762,56 @@ perf_event_set_state(struct perf_event *event, enum perf_event_state state)
WRITE_ONCE(event->state, state);
}
+/*
+ * UP store-release, load-acquire
+ */
+
+#define __store_release(ptr, val) \
+do { \
+ barrier(); \
+ WRITE_ONCE(*(ptr), (val)); \
+} while (0)
+
+#define __load_acquire(ptr) \
+({ \
+ __unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr)); \
+ barrier(); \
+ ___p; \
+})
+
+#define for_each_epc(_epc, _ctx, _pmu, _cgroup) \
+ list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \
+ if (_cgroup && !_epc->nr_cgroups) \
+ continue; \
+ else if (_pmu && _epc->pmu != _pmu) \
+ continue; \
+ else
+
+static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
+{
+ struct perf_event_pmu_context *pmu_ctx;
+
+ for_each_epc(pmu_ctx, ctx, NULL, cgroup)
+ perf_pmu_disable(pmu_ctx->pmu);
+}
+
+static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
+{
+ struct perf_event_pmu_context *pmu_ctx;
+
+ for_each_epc(pmu_ctx, ctx, NULL, cgroup)
+ perf_pmu_enable(pmu_ctx->pmu);
+}
+
+static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
+static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
+
#ifdef CONFIG_CGROUP_PERF
static inline bool
perf_cgroup_match(struct perf_event *event)
{
- struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
/* @event doesn't care about cgroup */
if (!event->cgrp)
@@ -719,35 +850,51 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event)
return t->time;
}
-static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
+static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
{
- struct perf_cgroup_info *info;
- u64 now;
-
- now = perf_clock();
+ struct perf_cgroup_info *t;
- info = this_cpu_ptr(cgrp->info);
+ t = per_cpu_ptr(event->cgrp->info, event->cpu);
+ if (!__load_acquire(&t->active))
+ return t->time;
+ now += READ_ONCE(t->timeoffset);
+ return now;
+}
- info->time += now - info->timestamp;
+static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv)
+{
+ if (adv)
+ info->time += now - info->timestamp;
info->timestamp = now;
+ /*
+ * see update_context_time()
+ */
+ WRITE_ONCE(info->timeoffset, info->time - info->timestamp);
}
-static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
{
struct perf_cgroup *cgrp = cpuctx->cgrp;
struct cgroup_subsys_state *css;
+ struct perf_cgroup_info *info;
if (cgrp) {
+ u64 now = perf_clock();
+
for (css = &cgrp->css; css; css = css->parent) {
cgrp = container_of(css, struct perf_cgroup, css);
- __update_cgrp_time(cgrp);
+ info = this_cpu_ptr(cgrp->info);
+
+ __update_cgrp_time(info, now, true);
+ if (final)
+ __store_release(&info->active, 0);
}
}
}
static inline void update_cgrp_time_from_event(struct perf_event *event)
{
- struct perf_cgroup *cgrp;
+ struct perf_cgroup_info *info;
/*
* ensure we access cgroup data only when needed and
@@ -756,19 +903,19 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
if (!is_cgroup_event(event))
return;
- cgrp = perf_cgroup_from_task(current, event->ctx);
+ info = this_cpu_ptr(event->cgrp->info);
/*
* Do not update time when cgroup is not active
*/
- if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
- __update_cgrp_time(event->cgrp);
+ if (info->active)
+ __update_cgrp_time(info, perf_clock(), true);
}
static inline void
-perf_cgroup_set_timestamp(struct task_struct *task,
- struct perf_event_context *ctx)
+perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
{
- struct perf_cgroup *cgrp;
+ struct perf_event_context *ctx = &cpuctx->ctx;
+ struct perf_cgroup *cgrp = cpuctx->cgrp;
struct perf_cgroup_info *info;
struct cgroup_subsys_state *css;
@@ -777,127 +924,64 @@ perf_cgroup_set_timestamp(struct task_struct *task,
* ensure we do not access cgroup data
* unless we have the cgroup pinned (css_get)
*/
- if (!task || !ctx->nr_cgroups)
+ if (!cgrp)
return;
- cgrp = perf_cgroup_from_task(task, ctx);
+ WARN_ON_ONCE(!ctx->nr_cgroups);
for (css = &cgrp->css; css; css = css->parent) {
cgrp = container_of(css, struct perf_cgroup, css);
info = this_cpu_ptr(cgrp->info);
- info->timestamp = ctx->timestamp;
+ __update_cgrp_time(info, ctx->timestamp, false);
+ __store_release(&info->active, 1);
}
}
-static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
-
-#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
-#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
-
/*
* reschedule events based on the cgroup constraint of task.
- *
- * mode SWOUT : schedule out everything
- * mode SWIN : schedule in based on cgroup for next
*/
-static void perf_cgroup_switch(struct task_struct *task, int mode)
+static void perf_cgroup_switch(struct task_struct *task)
{
- struct perf_cpu_context *cpuctx;
- struct list_head *list;
- unsigned long flags;
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_cgroup *cgrp;
/*
- * Disable interrupts and preemption to avoid this CPU's
- * cgrp_cpuctx_entry to change under us.
+ * cpuctx->cgrp is set when the first cgroup event enabled,
+ * and is cleared when the last cgroup event disabled.
*/
- local_irq_save(flags);
-
- list = this_cpu_ptr(&cgrp_cpuctx_list);
- list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
- WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
-
- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
- perf_pmu_disable(cpuctx->ctx.pmu);
-
- if (mode & PERF_CGROUP_SWOUT) {
- cpu_ctx_sched_out(cpuctx, EVENT_ALL);
- /*
- * must not be done before ctxswout due
- * to event_filter_match() in event_sched_out()
- */
- cpuctx->cgrp = NULL;
- }
-
- if (mode & PERF_CGROUP_SWIN) {
- WARN_ON_ONCE(cpuctx->cgrp);
- /*
- * set cgrp before ctxsw in to allow
- * event_filter_match() to not have to pass
- * task around
- * we pass the cpuctx->ctx to perf_cgroup_from_task()
- * because cgorup events are only per-cpu
- */
- cpuctx->cgrp = perf_cgroup_from_task(task,
- &cpuctx->ctx);
- cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
- }
- perf_pmu_enable(cpuctx->ctx.pmu);
- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
- }
-
- local_irq_restore(flags);
-}
-
-static inline void perf_cgroup_sched_out(struct task_struct *task,
- struct task_struct *next)
-{
- struct perf_cgroup *cgrp1;
- struct perf_cgroup *cgrp2 = NULL;
+ if (READ_ONCE(cpuctx->cgrp) == NULL)
+ return;
- rcu_read_lock();
- /*
- * we come here when we know perf_cgroup_events > 0
- * we do not need to pass the ctx here because we know
- * we are holding the rcu lock
- */
- cgrp1 = perf_cgroup_from_task(task, NULL);
- cgrp2 = perf_cgroup_from_task(next, NULL);
+ cgrp = perf_cgroup_from_task(task, NULL);
+ if (READ_ONCE(cpuctx->cgrp) == cgrp)
+ return;
+ guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx);
/*
- * only schedule out current cgroup events if we know
- * that we are switching to a different cgroup. Otherwise,
- * do no touch the cgroup events.
+ * Re-check, could've raced vs perf_remove_from_context().
*/
- if (cgrp1 != cgrp2)
- perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
+ if (READ_ONCE(cpuctx->cgrp) == NULL)
+ return;
- rcu_read_unlock();
-}
+ WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
-static inline void perf_cgroup_sched_in(struct task_struct *prev,
- struct task_struct *task)
-{
- struct perf_cgroup *cgrp1;
- struct perf_cgroup *cgrp2 = NULL;
+ perf_ctx_disable(&cpuctx->ctx, true);
- rcu_read_lock();
+ ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
/*
- * we come here when we know perf_cgroup_events > 0
- * we do not need to pass the ctx here because we know
- * we are holding the rcu lock
+ * must not be done before ctxswout due
+ * to update_cgrp_time_from_cpuctx() in
+ * ctx_sched_out()
*/
- cgrp1 = perf_cgroup_from_task(task, NULL);
- cgrp2 = perf_cgroup_from_task(prev, NULL);
-
+ cpuctx->cgrp = cgrp;
/*
- * only need to schedule in cgroup events if we are changing
- * cgroup during ctxsw. Cgroup events were not scheduled
- * out of ctxsw out if that was not the case.
+ * set cgrp before ctxsw in to allow
+ * perf_cgroup_set_timestamp() in ctx_sched_in()
+ * to not have to pass task around
*/
- if (cgrp1 != cgrp2)
- perf_cgroup_switch(task, PERF_CGROUP_SWIN);
+ ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
- rcu_read_unlock();
+ perf_ctx_enable(&cpuctx->ctx, true);
}
static int perf_cgroup_ensure_storage(struct perf_event *event,
@@ -908,14 +992,14 @@ static int perf_cgroup_ensure_storage(struct perf_event *event,
int cpu, heap_size, ret = 0;
/*
- * Allow storage to have sufficent space for an iterator for each
+ * Allow storage to have sufficient space for an iterator for each
* possibly nested cgroup plus an iterator for events with no cgroup.
*/
for (heap_size = 1; css; css = css->parent)
heap_size++;
for_each_possible_cpu(cpu) {
- cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu);
+ cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
if (heap_size <= cpuctx->heap_size)
continue;
@@ -947,22 +1031,20 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
{
struct perf_cgroup *cgrp;
struct cgroup_subsys_state *css;
- struct fd f = fdget(fd);
+ CLASS(fd, f)(fd);
int ret = 0;
- if (!f.file)
+ if (fd_empty(f))
return -EBADF;
- css = css_tryget_online_from_dir(f.file->f_path.dentry,
+ css = css_tryget_online_from_dir(fd_file(f)->f_path.dentry,
&perf_event_cgrp_subsys);
- if (IS_ERR(css)) {
- ret = PTR_ERR(css);
- goto out;
- }
+ if (IS_ERR(css))
+ return PTR_ERR(css);
ret = perf_cgroup_ensure_storage(event, css);
if (ret)
- goto out;
+ return ret;
cgrp = container_of(css, struct perf_cgroup, css);
event->cgrp = cgrp;
@@ -976,20 +1058,10 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
perf_detach_cgroup(event);
ret = -EINVAL;
}
-out:
- fdput(f);
return ret;
}
static inline void
-perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
-{
- struct perf_cgroup_info *t;
- t = per_cpu_ptr(event->cgrp->info, event->cpu);
- event->shadow_ctx_time = now - t->timestamp;
-}
-
-static inline void
perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
{
struct perf_cpu_context *cpuctx;
@@ -997,30 +1069,18 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ct
if (!is_cgroup_event(event))
return;
+ event->pmu_ctx->nr_cgroups++;
+
/*
* Because cgroup events are always per-cpu events,
* @ctx == &cpuctx->ctx.
*/
cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
- /*
- * Since setting cpuctx->cgrp is conditional on the current @cgrp
- * matching the event's cgroup, we must do this for every new event,
- * because if the first would mismatch, the second would not try again
- * and we would leave cpuctx->cgrp unset.
- */
- if (ctx->is_active && !cpuctx->cgrp) {
- struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
-
- if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
- cpuctx->cgrp = cgrp;
- }
-
if (ctx->nr_cgroups++)
return;
- list_add(&cpuctx->cgrp_cpuctx_entry,
- per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
+ cpuctx->cgrp = perf_cgroup_from_task(current, ctx);
}
static inline void
@@ -1031,6 +1091,8 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c
if (!is_cgroup_event(event))
return;
+ event->pmu_ctx->nr_cgroups--;
+
/*
* Because cgroup events are always per-cpu events,
* @ctx == &cpuctx->ctx.
@@ -1040,10 +1102,7 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c
if (--ctx->nr_cgroups)
return;
- if (ctx->is_active && cpuctx->cgrp)
- cpuctx->cgrp = NULL;
-
- list_del(&cpuctx->cgrp_cpuctx_entry);
+ cpuctx->cgrp = NULL;
}
#else /* !CONFIG_CGROUP_PERF */
@@ -1066,17 +1125,8 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
{
}
-static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
-{
-}
-
-static inline void perf_cgroup_sched_out(struct task_struct *task,
- struct task_struct *next)
-{
-}
-
-static inline void perf_cgroup_sched_in(struct task_struct *prev,
- struct task_struct *task)
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
+ bool final)
{
}
@@ -1088,22 +1138,16 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
}
static inline void
-perf_cgroup_set_timestamp(struct task_struct *task,
- struct perf_event_context *ctx)
-{
-}
-
-static inline void
-perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
+perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
{
}
-static inline void
-perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
+static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
+ return 0;
}
-static inline u64 perf_cgroup_event_time(struct perf_event *event)
+static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
{
return 0;
}
@@ -1117,6 +1161,10 @@ static inline void
perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
{
}
+
+static void perf_cgroup_switch(struct task_struct *task)
+{
+}
#endif
/*
@@ -1129,34 +1177,30 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c
*/
static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
{
- struct perf_cpu_context *cpuctx;
+ struct perf_cpu_pmu_context *cpc;
bool rotations;
lockdep_assert_irqs_disabled();
- cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
- rotations = perf_rotate_context(cpuctx);
+ cpc = container_of(hr, struct perf_cpu_pmu_context, hrtimer);
+ rotations = perf_rotate_context(cpc);
- raw_spin_lock(&cpuctx->hrtimer_lock);
+ raw_spin_lock(&cpc->hrtimer_lock);
if (rotations)
- hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
+ hrtimer_forward_now(hr, cpc->hrtimer_interval);
else
- cpuctx->hrtimer_active = 0;
- raw_spin_unlock(&cpuctx->hrtimer_lock);
+ cpc->hrtimer_active = 0;
+ raw_spin_unlock(&cpc->hrtimer_lock);
return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
}
-static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
+static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu)
{
- struct hrtimer *timer = &cpuctx->hrtimer;
- struct pmu *pmu = cpuctx->ctx.pmu;
+ struct hrtimer *timer = &cpc->hrtimer;
+ struct pmu *pmu = cpc->epc.pmu;
u64 interval;
- /* no multiplexing needed for SW PMU */
- if (pmu->task_ctx_nr == perf_sw_context)
- return;
-
/*
* check default is sane, if not set then force to
* default interval (1/tick)
@@ -1165,74 +1209,63 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
if (interval < 1)
interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
- cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
+ cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
- raw_spin_lock_init(&cpuctx->hrtimer_lock);
- hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
- timer->function = perf_mux_hrtimer_handler;
+ raw_spin_lock_init(&cpc->hrtimer_lock);
+ hrtimer_setup(timer, perf_mux_hrtimer_handler, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_PINNED_HARD);
}
-static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
+static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc)
{
- struct hrtimer *timer = &cpuctx->hrtimer;
- struct pmu *pmu = cpuctx->ctx.pmu;
+ struct hrtimer *timer = &cpc->hrtimer;
unsigned long flags;
- /* not for SW PMU */
- if (pmu->task_ctx_nr == perf_sw_context)
- return 0;
-
- raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
- if (!cpuctx->hrtimer_active) {
- cpuctx->hrtimer_active = 1;
- hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
+ raw_spin_lock_irqsave(&cpc->hrtimer_lock, flags);
+ if (!cpc->hrtimer_active) {
+ cpc->hrtimer_active = 1;
+ hrtimer_forward_now(timer, cpc->hrtimer_interval);
hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
}
- raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
+ raw_spin_unlock_irqrestore(&cpc->hrtimer_lock, flags);
return 0;
}
+static int perf_mux_hrtimer_restart_ipi(void *arg)
+{
+ return perf_mux_hrtimer_restart(arg);
+}
+
+static __always_inline struct perf_cpu_pmu_context *this_cpc(struct pmu *pmu)
+{
+ return *this_cpu_ptr(pmu->cpu_pmu_context);
+}
+
void perf_pmu_disable(struct pmu *pmu)
{
- int *count = this_cpu_ptr(pmu->pmu_disable_count);
+ int *count = &this_cpc(pmu)->pmu_disable_count;
if (!(*count)++)
pmu->pmu_disable(pmu);
}
void perf_pmu_enable(struct pmu *pmu)
{
- int *count = this_cpu_ptr(pmu->pmu_disable_count);
+ int *count = &this_cpc(pmu)->pmu_disable_count;
if (!--(*count))
pmu->pmu_enable(pmu);
}
-static DEFINE_PER_CPU(struct list_head, active_ctx_list);
-
-/*
- * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
- * perf_event_task_tick() are fully serialized because they're strictly cpu
- * affine and perf_event_ctx{activate,deactivate} are called with IRQs
- * disabled, while perf_event_task_tick is called from IRQ context.
- */
-static void perf_event_ctx_activate(struct perf_event_context *ctx)
+static void perf_assert_pmu_disabled(struct pmu *pmu)
{
- struct list_head *head = this_cpu_ptr(&active_ctx_list);
-
- lockdep_assert_irqs_disabled();
-
- WARN_ON(!list_empty(&ctx->active_ctx_list));
-
- list_add(&ctx->active_ctx_list, head);
+ int *count = &this_cpc(pmu)->pmu_disable_count;
+ WARN_ON_ONCE(*count == 0);
}
-static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
+static inline void perf_pmu_read(struct perf_event *event)
{
- lockdep_assert_irqs_disabled();
-
- WARN_ON(list_empty(&ctx->active_ctx_list));
-
- list_del_init(&ctx->active_ctx_list);
+ if (event->state == PERF_EVENT_STATE_ACTIVE)
+ event->pmu->read(event);
}
static void get_ctx(struct perf_event_context *ctx)
@@ -1240,26 +1273,11 @@ static void get_ctx(struct perf_event_context *ctx)
refcount_inc(&ctx->refcount);
}
-static void *alloc_task_ctx_data(struct pmu *pmu)
-{
- if (pmu->task_ctx_cache)
- return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
-
- return NULL;
-}
-
-static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
-{
- if (pmu->task_ctx_cache && task_ctx_data)
- kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
-}
-
static void free_ctx(struct rcu_head *head)
{
struct perf_event_context *ctx;
ctx = container_of(head, struct perf_event_context, rcu_head);
- free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);
kfree(ctx);
}
@@ -1271,6 +1289,10 @@ static void put_ctx(struct perf_event_context *ctx)
if (ctx->task && ctx->task != TASK_TOMBSTONE)
put_task_struct(ctx->task);
call_rcu(&ctx->rcu_head, free_ctx);
+ } else {
+ smp_mb__after_atomic(); /* pairs with wait_var_event() */
+ if (ctx->task == TASK_TOMBSTONE)
+ wake_up_var(&ctx->refcount);
}
}
@@ -1332,8 +1354,9 @@ static void put_ctx(struct perf_event_context *ctx)
* perf_event_context::mutex
* perf_event::child_mutex;
* perf_event_context::lock
- * perf_event::mmap_mutex
* mmap_lock
+ * perf_event::mmap_mutex
+ * perf_buffer::aux_mutex
* perf_addr_filters_head::lock
*
* cpu_hotplug_lock
@@ -1444,7 +1467,7 @@ static u64 primary_event_id(struct perf_event *event)
* the context could get moved to another task.
*/
static struct perf_event_context *
-perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
+perf_lock_task_context(struct task_struct *task, unsigned long *flags)
{
struct perf_event_context *ctx;
@@ -1460,7 +1483,7 @@ retry:
*/
local_irq_save(*flags);
rcu_read_lock();
- ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
+ ctx = rcu_dereference(task->perf_event_ctxp);
if (ctx) {
/*
* If this context is a clone of another, it might
@@ -1473,7 +1496,7 @@ retry:
* can't get swapped on us any more.
*/
raw_spin_lock(&ctx->lock);
- if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
+ if (ctx != rcu_dereference(task->perf_event_ctxp)) {
raw_spin_unlock(&ctx->lock);
rcu_read_unlock();
local_irq_restore(*flags);
@@ -1500,12 +1523,12 @@ retry:
* reference count so that the context can't get freed.
*/
static struct perf_event_context *
-perf_pin_task_context(struct task_struct *task, int ctxn)
+perf_pin_task_context(struct task_struct *task)
{
struct perf_event_context *ctx;
unsigned long flags;
- ctx = perf_lock_task_context(task, ctxn, &flags);
+ ctx = perf_lock_task_context(task, &flags);
if (ctx) {
++ctx->pin_count;
raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -1525,22 +1548,61 @@ static void perf_unpin_context(struct perf_event_context *ctx)
/*
* Update the record of the current time in a context.
*/
-static void update_context_time(struct perf_event_context *ctx)
+static void __update_context_time(struct perf_event_context *ctx, bool adv)
{
u64 now = perf_clock();
- ctx->time += now - ctx->timestamp;
+ lockdep_assert_held(&ctx->lock);
+
+ if (adv)
+ ctx->time += now - ctx->timestamp;
ctx->timestamp = now;
+
+ /*
+ * The above: time' = time + (now - timestamp), can be re-arranged
+ * into: time` = now + (time - timestamp), which gives a single value
+ * offset to compute future time without locks on.
+ *
+ * See perf_event_time_now(), which can be used from NMI context where
+ * it's (obviously) not possible to acquire ctx->lock in order to read
+ * both the above values in a consistent manner.
+ */
+ WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
+}
+
+static void update_context_time(struct perf_event_context *ctx)
+{
+ __update_context_time(ctx, true);
}
static u64 perf_event_time(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
+ if (unlikely(!ctx))
+ return 0;
+
if (is_cgroup_event(event))
return perf_cgroup_event_time(event);
- return ctx ? ctx->time : 0;
+ return ctx->time;
+}
+
+static u64 perf_event_time_now(struct perf_event *event, u64 now)
+{
+ struct perf_event_context *ctx = event->ctx;
+
+ if (unlikely(!ctx))
+ return 0;
+
+ if (is_cgroup_event(event))
+ return perf_cgroup_event_time_now(event, now);
+
+ if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
+ return ctx->time;
+
+ now += READ_ONCE(ctx->timeoffset);
+ return now;
}
static enum event_type_t get_event_type(struct perf_event *event)
@@ -1614,14 +1676,22 @@ static inline struct cgroup *event_cgroup(const struct perf_event *event)
* which provides ordering when rotating groups for the same CPU.
*/
static __always_inline int
-perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup,
- const u64 left_group_index, const struct perf_event *right)
+perf_event_groups_cmp(const int left_cpu, const struct pmu *left_pmu,
+ const struct cgroup *left_cgroup, const u64 left_group_index,
+ const struct perf_event *right)
{
if (left_cpu < right->cpu)
return -1;
if (left_cpu > right->cpu)
return 1;
+ if (left_pmu) {
+ if (left_pmu < right->pmu_ctx->pmu)
+ return -1;
+ if (left_pmu > right->pmu_ctx->pmu)
+ return 1;
+ }
+
#ifdef CONFIG_CGROUP_PERF
{
const struct cgroup *right_cgroup = event_cgroup(right);
@@ -1664,12 +1734,13 @@ perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup,
static inline bool __group_less(struct rb_node *a, const struct rb_node *b)
{
struct perf_event *e = __node_2_pe(a);
- return perf_event_groups_cmp(e->cpu, event_cgroup(e), e->group_index,
- __node_2_pe(b)) < 0;
+ return perf_event_groups_cmp(e->cpu, e->pmu_ctx->pmu, event_cgroup(e),
+ e->group_index, __node_2_pe(b)) < 0;
}
struct __group_key {
int cpu;
+ struct pmu *pmu;
struct cgroup *cgroup;
};
@@ -1678,14 +1749,25 @@ static inline int __group_cmp(const void *key, const struct rb_node *node)
const struct __group_key *a = key;
const struct perf_event *b = __node_2_pe(node);
- /* partial/subtree match: @cpu, @cgroup; ignore: @group_index */
- return perf_event_groups_cmp(a->cpu, a->cgroup, b->group_index, b);
+ /* partial/subtree match: @cpu, @pmu, @cgroup; ignore: @group_index */
+ return perf_event_groups_cmp(a->cpu, a->pmu, a->cgroup, b->group_index, b);
+}
+
+static inline int
+__group_cmp_ignore_cgroup(const void *key, const struct rb_node *node)
+{
+ const struct __group_key *a = key;
+ const struct perf_event *b = __node_2_pe(node);
+
+ /* partial/subtree match: @cpu, @pmu, ignore: @cgroup, @group_index */
+ return perf_event_groups_cmp(a->cpu, a->pmu, event_cgroup(b),
+ b->group_index, b);
}
/*
- * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
- * key (see perf_event_groups_less). This places it last inside the CPU
- * subtree.
+ * Insert @event into @groups' tree; using
+ * {@event->cpu, @event->pmu_ctx->pmu, event_cgroup(@event), ++@groups->index}
+ * as key. This places it last inside the {cpu,pmu,cgroup} subtree.
*/
static void
perf_event_groups_insert(struct perf_event_groups *groups,
@@ -1735,14 +1817,15 @@ del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
}
/*
- * Get the leftmost event in the cpu/cgroup subtree.
+ * Get the leftmost event in the {cpu,pmu,cgroup} subtree.
*/
static struct perf_event *
perf_event_groups_first(struct perf_event_groups *groups, int cpu,
- struct cgroup *cgrp)
+ struct pmu *pmu, struct cgroup *cgrp)
{
struct __group_key key = {
.cpu = cpu,
+ .pmu = pmu,
.cgroup = cgrp,
};
struct rb_node *node;
@@ -1754,14 +1837,12 @@ perf_event_groups_first(struct perf_event_groups *groups, int cpu,
return NULL;
}
-/*
- * Like rb_entry_next_safe() for the @cpu subtree.
- */
static struct perf_event *
-perf_event_groups_next(struct perf_event *event)
+perf_event_groups_next(struct perf_event *event, struct pmu *pmu)
{
struct __group_key key = {
.cpu = event->cpu,
+ .pmu = pmu,
.cgroup = event_cgroup(event),
};
struct rb_node *next;
@@ -1773,6 +1854,10 @@ perf_event_groups_next(struct perf_event *event)
return NULL;
}
+#define perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) \
+ for (event = perf_event_groups_first(groups, cpu, pmu, NULL); \
+ event; event = perf_event_groups_next(event, pmu))
+
/*
* Iterate through the whole groups tree.
*/
@@ -1783,6 +1868,14 @@ perf_event_groups_next(struct perf_event *event)
typeof(*event), group_node))
/*
+ * Does the event attribute request inherit with PERF_SAMPLE_READ
+ */
+static inline bool has_inherit_and_sample_read(struct perf_event_attr *attr)
+{
+ return attr->inherit && (attr->sample_type & PERF_SAMPLE_READ);
+}
+
+/*
* Add an event from the lists for its context.
* Must be called with ctx->mutex and ctx->lock held.
*/
@@ -1812,11 +1905,14 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
ctx->nr_user++;
if (event->attr.inherit_stat)
ctx->nr_stat++;
+ if (has_inherit_and_sample_read(&event->attr))
+ local_inc(&ctx->nr_no_switch_fast);
if (event->state > PERF_EVENT_STATE_OFF)
perf_cgroup_event_enable(event, ctx);
ctx->generation++;
+ event->pmu_ctx->nr_events++;
}
/*
@@ -1828,28 +1924,34 @@ static inline void perf_event__state_init(struct perf_event *event)
PERF_EVENT_STATE_INACTIVE;
}
-static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
+static int __perf_event_read_size(u64 read_format, int nr_siblings)
{
int entry = sizeof(u64); /* value */
int size = 0;
int nr = 1;
- if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
size += sizeof(u64);
- if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
size += sizeof(u64);
- if (event->attr.read_format & PERF_FORMAT_ID)
+ if (read_format & PERF_FORMAT_ID)
+ entry += sizeof(u64);
+
+ if (read_format & PERF_FORMAT_LOST)
entry += sizeof(u64);
- if (event->attr.read_format & PERF_FORMAT_GROUP) {
+ if (read_format & PERF_FORMAT_GROUP) {
nr += nr_siblings;
size += sizeof(u64);
}
- size += entry * nr;
- event->read_size = size;
+ /*
+ * Since perf_event_validate_size() limits this to 16k and inhibits
+ * adding more siblings, this will never overflow.
+ */
+ return size + nr * entry;
}
static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
@@ -1899,8 +2001,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
*/
static void perf_event__header_size(struct perf_event *event)
{
- __perf_event_read_size(event,
- event->group_leader->nr_siblings);
+ event->read_size =
+ __perf_event_read_size(event->attr.read_format,
+ event->group_leader->nr_siblings);
__perf_event_header_size(event, event->attr.sample_type);
}
@@ -1931,23 +2034,44 @@ static void perf_event__id_header_size(struct perf_event *event)
event->id_header_size = size;
}
+/*
+ * Check that adding an event to the group does not result in anybody
+ * overflowing the 64k event limit imposed by the output buffer.
+ *
+ * Specifically, check that the read_size for the event does not exceed 16k,
+ * read_size being the one term that grows with groups size. Since read_size
+ * depends on per-event read_format, also (re)check the existing events.
+ *
+ * This leaves 48k for the constant size fields and things like callchains,
+ * branch stacks and register sets.
+ */
static bool perf_event_validate_size(struct perf_event *event)
{
- /*
- * The values computed here will be over-written when we actually
- * attach the event.
- */
- __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
- __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
- perf_event__id_header_size(event);
+ struct perf_event *sibling, *group_leader = event->group_leader;
+
+ if (__perf_event_read_size(event->attr.read_format,
+ group_leader->nr_siblings + 1) > 16*1024)
+ return false;
+
+ if (__perf_event_read_size(group_leader->attr.read_format,
+ group_leader->nr_siblings + 1) > 16*1024)
+ return false;
/*
- * Sum the lot; should not exceed the 64k limit we have on records.
- * Conservative limit to allow for callchains and other variable fields.
+ * When creating a new group leader, group_leader->ctx is initialized
+ * after the size has been validated, but we cannot safely use
+ * for_each_sibling_event() until group_leader->ctx is set. A new group
+ * leader cannot have any siblings yet, so we can safely skip checking
+ * the non-existent siblings.
*/
- if (event->read_size + event->header_size +
- event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
- return false;
+ if (event == group_leader)
+ return true;
+
+ for_each_sibling_event(sibling, group_leader) {
+ if (__perf_event_read_size(sibling->attr.read_format,
+ group_leader->nr_siblings + 1) > 16*1024)
+ return false;
+ }
return true;
}
@@ -1959,7 +2083,8 @@ static void perf_group_attach(struct perf_event *event)
lockdep_assert_held(&event->ctx->lock);
/*
- * We can have double attach due to group movement in perf_event_open.
+ * We can have double attach due to group movement (move_group) in
+ * perf_event_open().
*/
if (event->attach_state & PERF_ATTACH_GROUP)
return;
@@ -1975,6 +2100,7 @@ static void perf_group_attach(struct perf_event *event)
list_add_tail(&event->sibling_list, &group_leader->sibling_list);
group_leader->nr_siblings++;
+ group_leader->group_generation++;
perf_event__header_size(group_leader);
@@ -2005,25 +2131,16 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
ctx->nr_user--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
+ if (has_inherit_and_sample_read(&event->attr))
+ local_dec(&ctx->nr_no_switch_fast);
list_del_rcu(&event->event_entry);
if (event->group_leader == event)
del_event_from_groups(event, ctx);
- /*
- * If event was in error state, then keep it
- * that way, otherwise bogus counts will be
- * returned on read(). The only way to get out
- * of error state is by explicit re-enabling
- * of the event
- */
- if (event->state > PERF_EVENT_STATE_OFF) {
- perf_cgroup_event_disable(event, ctx);
- perf_event_set_state(event, PERF_EVENT_STATE_OFF);
- }
-
ctx->generation++;
+ event->pmu_ctx->nr_events--;
}
static int
@@ -2039,14 +2156,13 @@ perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
}
static void put_event(struct perf_event *event);
-static void event_sched_out(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx);
+static void __event_disable(struct perf_event *event,
+ struct perf_event_context *ctx,
+ enum perf_event_state state);
static void perf_put_aux_event(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
struct perf_event *iter;
/*
@@ -2063,7 +2179,7 @@ static void perf_put_aux_event(struct perf_event *event)
* If the event is an aux_event, tear down all links to
* it from other events.
*/
- for_each_sibling_event(iter, event->group_leader) {
+ for_each_sibling_event(iter, event) {
if (iter->aux_event != event)
continue;
@@ -2075,14 +2191,13 @@ static void perf_put_aux_event(struct perf_event *event)
* state so that we don't try to schedule it again. Note
* that perf_event_enable() will clear the ERROR status.
*/
- event_sched_out(iter, cpuctx, ctx);
- perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+ __event_disable(iter, ctx, PERF_EVENT_STATE_ERROR);
}
}
static bool perf_need_aux_event(struct perf_event *event)
{
- return !!event->attr.aux_output || !!event->attr.aux_sample_size;
+ return event->attr.aux_output || has_aux_action(event);
}
static int perf_get_aux_event(struct perf_event *event,
@@ -2107,6 +2222,10 @@ static int perf_get_aux_event(struct perf_event *event,
!perf_aux_output_match(event, group_leader))
return 0;
+ if ((event->attr.aux_pause || event->attr.aux_resume) &&
+ !(group_leader->pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE))
+ return 0;
+
if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
return 0;
@@ -2126,23 +2245,8 @@ static int perf_get_aux_event(struct perf_event *event,
static inline struct list_head *get_event_list(struct perf_event *event)
{
- struct perf_event_context *ctx = event->ctx;
- return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;
-}
-
-/*
- * Events that have PERF_EV_CAP_SIBLING require being part of a group and
- * cannot exist on their own, schedule them out and move them into the ERROR
- * state. Also see _perf_event_enable(), it will not be able to recover
- * this ERROR state.
- */
-static inline void perf_remove_sibling_event(struct perf_event *event)
-{
- struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
-
- event_sched_out(event, cpuctx, ctx);
- perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+ return event->attr.pinned ? &event->pmu_ctx->pinned_active :
+ &event->pmu_ctx->flexible_active;
}
static void perf_group_detach(struct perf_event *event)
@@ -2169,6 +2273,7 @@ static void perf_group_detach(struct perf_event *event)
if (leader != event) {
list_del_init(&event->sibling_list);
event->group_leader->nr_siblings--;
+ event->group_leader->group_generation++;
goto out;
}
@@ -2179,8 +2284,15 @@ static void perf_group_detach(struct perf_event *event)
*/
list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
+ /*
+ * Events that have PERF_EV_CAP_SIBLING require being part of
+ * a group and cannot exist on their own, schedule them out
+ * and move them into the ERROR state. Also see
+ * _perf_event_enable(), it will not be able to recover this
+ * ERROR state.
+ */
if (sibling->event_caps & PERF_EV_CAP_SIBLING)
- perf_remove_sibling_event(sibling);
+ __event_disable(sibling, ctx, PERF_EVENT_STATE_ERROR);
sibling->group_leader = sibling;
list_del_init(&sibling->sibling_list);
@@ -2188,7 +2300,7 @@ static void perf_group_detach(struct perf_event *event)
/* Inherit group flags from the previous leader */
sibling->group_caps = event->group_caps;
- if (!RB_EMPTY_NODE(&event->group_node)) {
+ if (sibling->attach_state & PERF_ATTACH_CONTEXT) {
add_event_to_groups(sibling, event->ctx);
if (sibling->state == PERF_EVENT_STATE_ACTIVE)
@@ -2219,7 +2331,11 @@ static void perf_child_detach(struct perf_event *event)
if (WARN_ON_ONCE(!parent_event))
return;
+ /*
+ * Can't check this from an IPI, the holder is likey another CPU.
+ *
lockdep_assert_held(&parent_event->child_mutex);
+ */
sync_child_event(event);
list_del_init(&event->child_list);
@@ -2230,47 +2346,27 @@ static bool is_orphaned_event(struct perf_event *event)
return event->state == PERF_EVENT_STATE_DEAD;
}
-static inline int __pmu_filter_match(struct perf_event *event)
-{
- struct pmu *pmu = event->pmu;
- return pmu->filter_match ? pmu->filter_match(event) : 1;
-}
-
-/*
- * Check whether we should attempt to schedule an event group based on
- * PMU-specific filtering. An event group can consist of HW and SW events,
- * potentially with a SW leader, so we must check all the filters, to
- * determine whether a group is schedulable:
- */
-static inline int pmu_filter_match(struct perf_event *event)
-{
- struct perf_event *sibling;
-
- if (!__pmu_filter_match(event))
- return 0;
-
- for_each_sibling_event(sibling, event) {
- if (!__pmu_filter_match(sibling))
- return 0;
- }
-
- return 1;
-}
-
static inline int
event_filter_match(struct perf_event *event)
{
return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
- perf_cgroup_match(event) && pmu_filter_match(event);
+ perf_cgroup_match(event);
+}
+
+static inline bool is_event_in_freq_mode(struct perf_event *event)
+{
+ return event->attr.freq && event->attr.sample_freq;
}
static void
-event_sched_out(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
{
+ struct perf_event_pmu_context *epc = event->pmu_ctx;
+ struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu);
enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
+ // XXX cpc serialization, probably per-cpu IRQ disabled
+
WARN_ON_ONCE(event->ctx != ctx);
lockdep_assert_held(&ctx->lock);
@@ -2289,50 +2385,89 @@ event_sched_out(struct perf_event *event,
event->pmu->del(event, 0);
event->oncpu = -1;
- if (READ_ONCE(event->pending_disable) >= 0) {
- WRITE_ONCE(event->pending_disable, -1);
+ if (event->pending_disable) {
+ event->pending_disable = 0;
perf_cgroup_event_disable(event, ctx);
state = PERF_EVENT_STATE_OFF;
}
+
perf_event_set_state(event, state);
if (!is_software_event(event))
- cpuctx->active_oncpu--;
- if (!--ctx->nr_active)
- perf_event_ctx_deactivate(ctx);
- if (event->attr.freq && event->attr.sample_freq)
+ cpc->active_oncpu--;
+ if (is_event_in_freq_mode(event)) {
ctx->nr_freq--;
- if (event->attr.exclusive || !cpuctx->active_oncpu)
- cpuctx->exclusive = 0;
+ epc->nr_freq--;
+ }
+ if (event->attr.exclusive || !cpc->active_oncpu)
+ cpc->exclusive = 0;
perf_pmu_enable(event->pmu);
}
static void
-group_sched_out(struct perf_event *group_event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx)
{
struct perf_event *event;
if (group_event->state != PERF_EVENT_STATE_ACTIVE)
return;
- perf_pmu_disable(ctx->pmu);
+ perf_assert_pmu_disabled(group_event->pmu_ctx->pmu);
- event_sched_out(group_event, cpuctx, ctx);
+ event_sched_out(group_event, ctx);
/*
* Schedule out siblings (if any):
*/
for_each_sibling_event(event, group_event)
- event_sched_out(event, cpuctx, ctx);
+ event_sched_out(event, ctx);
+}
+
+static inline void
+__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final)
+{
+ if (ctx->is_active & EVENT_TIME) {
+ if (ctx->is_active & EVENT_FROZEN)
+ return;
+ update_context_time(ctx);
+ update_cgrp_time_from_cpuctx(cpuctx, final);
+ }
+}
- perf_pmu_enable(ctx->pmu);
+static inline void
+ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
+{
+ __ctx_time_update(cpuctx, ctx, false);
+}
+
+/*
+ * To be used inside perf_ctx_lock() / perf_ctx_unlock(). Lasts until perf_ctx_unlock().
+ */
+static inline void
+ctx_time_freeze(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
+{
+ ctx_time_update(cpuctx, ctx);
+ if (ctx->is_active & EVENT_TIME)
+ ctx->is_active |= EVENT_FROZEN;
+}
+
+static inline void
+ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event)
+{
+ if (ctx->is_active & EVENT_TIME) {
+ if (ctx->is_active & EVENT_FROZEN)
+ return;
+ update_context_time(ctx);
+ update_cgrp_time_from_event(event);
+ }
}
#define DETACH_GROUP 0x01UL
#define DETACH_CHILD 0x02UL
+#define DETACH_EXIT 0x04UL
+#define DETACH_REVOKE 0x08UL
+#define DETACH_DEAD 0x10UL
/*
* Cross CPU call to remove a performance event
@@ -2346,23 +2481,52 @@ __perf_remove_from_context(struct perf_event *event,
struct perf_event_context *ctx,
void *info)
{
+ struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
+ enum perf_event_state state = PERF_EVENT_STATE_OFF;
unsigned long flags = (unsigned long)info;
- if (ctx->is_active & EVENT_TIME) {
- update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx);
- }
+ ctx_time_update(cpuctx, ctx);
+
+ /*
+ * Ensure event_sched_out() switches to OFF, at the very least
+ * this avoids raising perf_pending_task() at this time.
+ */
+ if (flags & DETACH_EXIT)
+ state = PERF_EVENT_STATE_EXIT;
+ if (flags & DETACH_REVOKE)
+ state = PERF_EVENT_STATE_REVOKED;
+ if (flags & DETACH_DEAD)
+ state = PERF_EVENT_STATE_DEAD;
+
+ event_sched_out(event, ctx);
+
+ if (event->state > PERF_EVENT_STATE_OFF)
+ perf_cgroup_event_disable(event, ctx);
+
+ perf_event_set_state(event, min(event->state, state));
- event_sched_out(event, cpuctx, ctx);
if (flags & DETACH_GROUP)
perf_group_detach(event);
if (flags & DETACH_CHILD)
perf_child_detach(event);
list_del_event(event, ctx);
+ if (!pmu_ctx->nr_events) {
+ pmu_ctx->rotate_necessary = 0;
+
+ if (ctx->task && ctx->is_active) {
+ struct perf_cpu_pmu_context *cpc = this_cpc(pmu_ctx->pmu);
+
+ WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
+ cpc->task_epc = NULL;
+ }
+ }
+
if (!ctx->nr_events && ctx->is_active) {
+ if (ctx == &cpuctx->ctx)
+ update_cgrp_time_from_cpuctx(cpuctx, true);
+
ctx->is_active = 0;
- ctx->rotate_necessary = 0;
if (ctx->task) {
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
cpuctx->task_ctx = NULL;
@@ -2393,7 +2557,7 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla
*/
raw_spin_lock_irq(&ctx->lock);
if (!ctx->is_active) {
- __perf_remove_from_context(event, __get_cpu_context(ctx),
+ __perf_remove_from_context(event, this_cpu_ptr(&perf_cpu_context),
ctx, (void *)flags);
raw_spin_unlock_irq(&ctx->lock);
return;
@@ -2403,6 +2567,15 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla
event_function_call(event, __perf_remove_from_context, (void *)flags);
}
+static void __event_disable(struct perf_event *event,
+ struct perf_event_context *ctx,
+ enum perf_event_state state)
+{
+ event_sched_out(event, ctx);
+ perf_cgroup_event_disable(event, ctx);
+ perf_event_set_state(event, state);
+}
+
/*
* Cross CPU call to disable a performance event
*/
@@ -2414,18 +2587,23 @@ static void __perf_event_disable(struct perf_event *event,
if (event->state < PERF_EVENT_STATE_INACTIVE)
return;
- if (ctx->is_active & EVENT_TIME) {
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
- }
+ perf_pmu_disable(event->pmu_ctx->pmu);
+ ctx_time_update_event(ctx, event);
+ /*
+ * When disabling a group leader, the whole group becomes ineligible
+ * to run, so schedule out the full group.
+ */
if (event == event->group_leader)
- group_sched_out(event, cpuctx, ctx);
- else
- event_sched_out(event, cpuctx, ctx);
+ group_sched_out(event, ctx);
- perf_event_set_state(event, PERF_EVENT_STATE_OFF);
- perf_cgroup_event_disable(event, ctx);
+ /*
+ * But only mark the leader OFF; the siblings will remain
+ * INACTIVE.
+ */
+ __event_disable(event, ctx, PERF_EVENT_STATE_OFF);
+
+ perf_pmu_enable(event->pmu_ctx->pmu);
}
/*
@@ -2438,7 +2616,7 @@ static void __perf_event_disable(struct perf_event *event,
* hold the top-level event's child_mutex, so any descendant that
* goes to exit will block in perf_event_exit_event().
*
- * When called from perf_pending_event it's OK because event->ctx
+ * When called from perf_pending_disable it's OK because event->ctx
* is the current context on this CPU and preemption is disabled,
* hence we can't get into perf_event_task_sched_out for this context.
*/
@@ -2477,43 +2655,8 @@ EXPORT_SYMBOL_GPL(perf_event_disable);
void perf_event_disable_inatomic(struct perf_event *event)
{
- WRITE_ONCE(event->pending_disable, smp_processor_id());
- /* can fail, see perf_pending_event_disable() */
- irq_work_queue(&event->pending);
-}
-
-static void perf_set_shadow_time(struct perf_event *event,
- struct perf_event_context *ctx)
-{
- /*
- * use the correct time source for the time snapshot
- *
- * We could get by without this by leveraging the
- * fact that to get to this function, the caller
- * has most likely already called update_context_time()
- * and update_cgrp_time_xx() and thus both timestamp
- * are identical (or very close). Given that tstamp is,
- * already adjusted for cgroup, we could say that:
- * tstamp - ctx->timestamp
- * is equivalent to
- * tstamp - cgrp->timestamp.
- *
- * Then, in perf_output_read(), the calculation would
- * work with no changes because:
- * - event is guaranteed scheduled in
- * - no scheduled out in between
- * - thus the timestamp would be the same
- *
- * But this is a bit hairy.
- *
- * So instead, we have an explicit cgroup call to remain
- * within the time source all along. We believe it
- * is cleaner and simpler to understand.
- */
- if (is_cgroup_event(event))
- perf_cgroup_set_shadow_time(event, event->tstamp);
- else
- event->shadow_ctx_time = event->tstamp - ctx->timestamp;
+ event->pending_disable = 1;
+ irq_work_queue(&event->pending_disable_irq);
}
#define MAX_INTERRUPTS (~0ULL)
@@ -2521,11 +2664,52 @@ static void perf_set_shadow_time(struct perf_event *event,
static void perf_log_throttle(struct perf_event *event, int enable);
static void perf_log_itrace_start(struct perf_event *event);
+static void perf_event_unthrottle(struct perf_event *event, bool start)
+{
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ return;
+
+ event->hw.interrupts = 0;
+ if (start)
+ event->pmu->start(event, 0);
+ if (event == event->group_leader)
+ perf_log_throttle(event, 1);
+}
+
+static void perf_event_throttle(struct perf_event *event)
+{
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ return;
+
+ event->hw.interrupts = MAX_INTERRUPTS;
+ event->pmu->stop(event, 0);
+ if (event == event->group_leader)
+ perf_log_throttle(event, 0);
+}
+
+static void perf_event_unthrottle_group(struct perf_event *event, bool skip_start_event)
+{
+ struct perf_event *sibling, *leader = event->group_leader;
+
+ perf_event_unthrottle(leader, skip_start_event ? leader != event : true);
+ for_each_sibling_event(sibling, leader)
+ perf_event_unthrottle(sibling, skip_start_event ? sibling != event : true);
+}
+
+static void perf_event_throttle_group(struct perf_event *event)
+{
+ struct perf_event *sibling, *leader = event->group_leader;
+
+ perf_event_throttle(leader);
+ for_each_sibling_event(sibling, leader)
+ perf_event_throttle(sibling);
+}
+
static int
-event_sched_in(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
{
+ struct perf_event_pmu_context *epc = event->pmu_ctx;
+ struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu);
int ret = 0;
WARN_ON_ONCE(event->ctx != ctx);
@@ -2549,15 +2733,11 @@ event_sched_in(struct perf_event *event,
* ticks already, also for a heavily scheduling task there is little
* guarantee it'll get a tick in a timely manner.
*/
- if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
- perf_log_throttle(event, 1);
- event->hw.interrupts = 0;
- }
+ if (unlikely(event->hw.interrupts == MAX_INTERRUPTS))
+ perf_event_unthrottle(event, false);
perf_pmu_disable(event->pmu);
- perf_set_shadow_time(event, ctx);
-
perf_log_itrace_start(event);
if (event->pmu->add(event, PERF_EF_START)) {
@@ -2568,14 +2748,13 @@ event_sched_in(struct perf_event *event,
}
if (!is_software_event(event))
- cpuctx->active_oncpu++;
- if (!ctx->nr_active++)
- perf_event_ctx_activate(ctx);
- if (event->attr.freq && event->attr.sample_freq)
+ cpc->active_oncpu++;
+ if (is_event_in_freq_mode(event)) {
ctx->nr_freq++;
-
+ epc->nr_freq++;
+ }
if (event->attr.exclusive)
- cpuctx->exclusive = 1;
+ cpc->exclusive = 1;
out:
perf_pmu_enable(event->pmu);
@@ -2584,26 +2763,24 @@ out:
}
static int
-group_sched_in(struct perf_event *group_event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx)
{
struct perf_event *event, *partial_group = NULL;
- struct pmu *pmu = ctx->pmu;
+ struct pmu *pmu = group_event->pmu_ctx->pmu;
if (group_event->state == PERF_EVENT_STATE_OFF)
return 0;
pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
- if (event_sched_in(group_event, cpuctx, ctx))
+ if (event_sched_in(group_event, ctx))
goto error;
/*
* Schedule in siblings as one group (if any):
*/
for_each_sibling_event(event, group_event) {
- if (event_sched_in(event, cpuctx, ctx)) {
+ if (event_sched_in(event, ctx)) {
partial_group = event;
goto group_error;
}
@@ -2622,9 +2799,9 @@ group_error:
if (event == partial_group)
break;
- event_sched_out(event, cpuctx, ctx);
+ event_sched_out(event, ctx);
}
- event_sched_out(group_event, cpuctx, ctx);
+ event_sched_out(group_event, ctx);
error:
pmu->cancel_txn(pmu);
@@ -2634,10 +2811,11 @@ error:
/*
* Work out whether we can put this event group on the CPU now.
*/
-static int group_can_go_on(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
- int can_add_hw)
+static int group_can_go_on(struct perf_event *event, int can_add_hw)
{
+ struct perf_event_pmu_context *epc = event->pmu_ctx;
+ struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu);
+
/*
* Groups consisting entirely of software events can always go on.
*/
@@ -2647,7 +2825,7 @@ static int group_can_go_on(struct perf_event *event,
* If an exclusive group is already on, no other hardware
* events can go on.
*/
- if (cpuctx->exclusive)
+ if (cpc->exclusive)
return 0;
/*
* If this group is exclusive and there are already
@@ -2669,38 +2847,31 @@ static void add_event_to_ctx(struct perf_event *event,
perf_group_attach(event);
}
-static void ctx_sched_out(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx,
- enum event_type_t event_type);
-static void
-ctx_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx,
- enum event_type_t event_type,
- struct task_struct *task);
-
-static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx,
+static void task_ctx_sched_out(struct perf_event_context *ctx,
+ struct pmu *pmu,
enum event_type_t event_type)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+
if (!cpuctx->task_ctx)
return;
if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
return;
- ctx_sched_out(ctx, cpuctx, event_type);
+ ctx_sched_out(ctx, pmu, event_type);
}
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx,
- struct task_struct *task)
+ struct pmu *pmu)
{
- cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
+ ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED);
if (ctx)
- ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
- cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
+ ctx_sched_in(ctx, pmu, EVENT_PINNED);
+ ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
if (ctx)
- ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
+ ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE);
}
/*
@@ -2720,10 +2891,10 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
*/
static void ctx_resched(struct perf_cpu_context *cpuctx,
struct perf_event_context *task_ctx,
- enum event_type_t event_type)
+ struct pmu *pmu, enum event_type_t event_type)
{
- enum event_type_t ctx_event_type;
bool cpu_event = !!(event_type & EVENT_CPU);
+ struct perf_event_pmu_context *epc;
/*
* If pinned groups are involved, flexible groups also need to be
@@ -2732,11 +2903,17 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
if (event_type & EVENT_PINNED)
event_type |= EVENT_FLEXIBLE;
- ctx_event_type = event_type & EVENT_ALL;
+ event_type &= EVENT_ALL;
- perf_pmu_disable(cpuctx->ctx.pmu);
- if (task_ctx)
- task_ctx_sched_out(cpuctx, task_ctx, event_type);
+ for_each_epc(epc, &cpuctx->ctx, pmu, false)
+ perf_pmu_disable(epc->pmu);
+
+ if (task_ctx) {
+ for_each_epc(epc, task_ctx, pmu, false)
+ perf_pmu_disable(epc->pmu);
+
+ task_ctx_sched_out(task_ctx, pmu, event_type);
+ }
/*
* Decide which cpu ctx groups to schedule out based on the types
@@ -2746,21 +2923,28 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
* - otherwise, do nothing more.
*/
if (cpu_event)
- cpu_ctx_sched_out(cpuctx, ctx_event_type);
- else if (ctx_event_type & EVENT_PINNED)
- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+ ctx_sched_out(&cpuctx->ctx, pmu, event_type);
+ else if (event_type & EVENT_PINNED)
+ ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
+
+ perf_event_sched_in(cpuctx, task_ctx, pmu);
+
+ for_each_epc(epc, &cpuctx->ctx, pmu, false)
+ perf_pmu_enable(epc->pmu);
- perf_event_sched_in(cpuctx, task_ctx, current);
- perf_pmu_enable(cpuctx->ctx.pmu);
+ if (task_ctx) {
+ for_each_epc(epc, task_ctx, pmu, false)
+ perf_pmu_enable(epc->pmu);
+ }
}
void perf_pmu_resched(struct pmu *pmu)
{
- struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
perf_ctx_lock(cpuctx, task_ctx);
- ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
+ ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU);
perf_ctx_unlock(cpuctx, task_ctx);
}
@@ -2774,7 +2958,7 @@ static int __perf_install_in_context(void *info)
{
struct perf_event *event = info;
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
bool reprogram = true;
int ret = 0;
@@ -2816,9 +3000,10 @@ static int __perf_install_in_context(void *info)
#endif
if (reprogram) {
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_time_freeze(cpuctx, ctx);
add_event_to_ctx(event, ctx);
- ctx_resched(cpuctx, task_ctx, get_event_type(event));
+ ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
+ get_event_type(event));
} else {
add_event_to_ctx(event, ctx);
}
@@ -2849,7 +3034,7 @@ perf_install_in_context(struct perf_event_context *ctx,
WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
if (event->cpu != -1)
- event->cpu = cpu;
+ WARN_ON_ONCE(event->cpu != cpu);
/*
* Ensures that if we can observe event->ctx, both the event and ctx
@@ -2865,7 +3050,8 @@ perf_install_in_context(struct perf_event_context *ctx,
* The IOC_ENABLE that is sure to follow the creation of a disabled
* event will issue the IPI and reprogram the hardware.
*/
- if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) {
+ if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF &&
+ ctx->nr_events && !is_cgroup_event(event)) {
raw_spin_lock_irq(&ctx->lock);
if (ctx->task == TASK_TOMBSTONE) {
raw_spin_unlock_irq(&ctx->lock);
@@ -2960,8 +3146,7 @@ static void __perf_event_enable(struct perf_event *event,
event->state <= PERF_EVENT_STATE_ERROR)
return;
- if (ctx->is_active)
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_time_freeze(cpuctx, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
perf_cgroup_event_enable(event, ctx);
@@ -2969,25 +3154,21 @@ static void __perf_event_enable(struct perf_event *event,
if (!ctx->is_active)
return;
- if (!event_filter_match(event)) {
- ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+ if (!event_filter_match(event))
return;
- }
/*
* If the event is in a group and isn't the group leader,
* then don't put it on unless the group is on.
*/
- if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
- ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
return;
- }
task_ctx = cpuctx->task_ctx;
if (ctx->task)
WARN_ON_ONCE(task_ctx != ctx);
- ctx_resched(cpuctx, task_ctx, get_event_type(event));
+ ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event));
}
/*
@@ -3197,6 +3378,15 @@ static int perf_event_modify_breakpoint(struct perf_event *bp,
return err;
}
+/*
+ * Copy event-type-independent attributes that may be modified.
+ */
+static void perf_event_modify_copy_attr(struct perf_event_attr *to,
+ const struct perf_event_attr *from)
+{
+ to->sig_data = from->sig_data;
+}
+
static int perf_event_modify_attr(struct perf_event *event,
struct perf_event_attr *attr)
{
@@ -3219,10 +3409,17 @@ static int perf_event_modify_attr(struct perf_event *event,
WARN_ON_ONCE(event->ctx->parent_ctx);
mutex_lock(&event->child_mutex);
+ /*
+ * Event-type-independent attributes must be copied before event-type
+ * modification, which will validate that final attributes match the
+ * source attributes after all relevant attributes have been copied.
+ */
+ perf_event_modify_copy_attr(&event->attr, attr);
err = func(event, attr);
if (err)
goto out;
list_for_each_entry(child, &event->child_list, child_list) {
+ perf_event_modify_copy_attr(&child->attr, attr);
err = func(child, attr);
if (err)
goto out;
@@ -3232,12 +3429,64 @@ out:
return err;
}
-static void ctx_sched_out(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
+static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
+ enum event_type_t event_type)
{
+ struct perf_event_context *ctx = pmu_ctx->ctx;
struct perf_event *event, *tmp;
+ struct pmu *pmu = pmu_ctx->pmu;
+
+ if (ctx->task && !(ctx->is_active & EVENT_ALL)) {
+ struct perf_cpu_pmu_context *cpc = this_cpc(pmu);
+
+ WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
+ cpc->task_epc = NULL;
+ }
+
+ if (!(event_type & EVENT_ALL))
+ return;
+
+ perf_pmu_disable(pmu);
+ if (event_type & EVENT_PINNED) {
+ list_for_each_entry_safe(event, tmp,
+ &pmu_ctx->pinned_active,
+ active_list)
+ group_sched_out(event, ctx);
+ }
+
+ if (event_type & EVENT_FLEXIBLE) {
+ list_for_each_entry_safe(event, tmp,
+ &pmu_ctx->flexible_active,
+ active_list)
+ group_sched_out(event, ctx);
+ /*
+ * Since we cleared EVENT_FLEXIBLE, also clear
+ * rotate_necessary, is will be reset by
+ * ctx_flexible_sched_in() when needed.
+ */
+ pmu_ctx->rotate_necessary = 0;
+ }
+ perf_pmu_enable(pmu);
+}
+
+/*
+ * Be very careful with the @pmu argument since this will change ctx state.
+ * The @pmu argument works for ctx_resched(), because that is symmetric in
+ * ctx_sched_out() / ctx_sched_in() usage and the ctx state ends up invariant.
+ *
+ * However, if you were to be asymmetrical, you could end up with messed up
+ * state, eg. ctx->is_active cleared even though most EPCs would still actually
+ * be active.
+ */
+static void
+ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
+{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_pmu_context *pmu_ctx;
int is_active = ctx->is_active;
+ bool cgroup = event_type & EVENT_CGROUP;
+
+ event_type &= ~EVENT_CGROUP;
lockdep_assert_held(&ctx->lock);
@@ -3251,16 +3500,6 @@ static void ctx_sched_out(struct perf_event_context *ctx,
return;
}
- ctx->is_active &= ~event_type;
- if (!(ctx->is_active & EVENT_ALL))
- ctx->is_active = 0;
-
- if (ctx->task) {
- WARN_ON_ONCE(cpuctx->task_ctx != ctx);
- if (!ctx->is_active)
- cpuctx->task_ctx = NULL;
- }
-
/*
* Always update time if it was set; not only when it changes.
* Otherwise we can 'forget' to update time for any but the last
@@ -3271,35 +3510,36 @@ static void ctx_sched_out(struct perf_event_context *ctx,
*
* would only update time for the pinned events.
*/
- if (is_active & EVENT_TIME) {
- /* update (and stop) ctx time */
- update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx);
- }
+ __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx);
- is_active ^= ctx->is_active; /* changed bits */
+ /*
+ * CPU-release for the below ->is_active store,
+ * see __load_acquire() in perf_event_time_now()
+ */
+ barrier();
+ ctx->is_active &= ~event_type;
- if (!ctx->nr_active || !(is_active & EVENT_ALL))
- return;
+ if (!(ctx->is_active & EVENT_ALL)) {
+ /*
+ * For FROZEN, preserve TIME|FROZEN such that perf_event_time_now()
+ * does not observe a hole. perf_ctx_unlock() will clean up.
+ */
+ if (ctx->is_active & EVENT_FROZEN)
+ ctx->is_active &= EVENT_TIME_FROZEN;
+ else
+ ctx->is_active = 0;
+ }
- perf_pmu_disable(ctx->pmu);
- if (is_active & EVENT_PINNED) {
- list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
- group_sched_out(event, cpuctx, ctx);
+ if (ctx->task) {
+ WARN_ON_ONCE(cpuctx->task_ctx != ctx);
+ if (!(ctx->is_active & EVENT_ALL))
+ cpuctx->task_ctx = NULL;
}
- if (is_active & EVENT_FLEXIBLE) {
- list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
- group_sched_out(event, cpuctx, ctx);
+ is_active ^= ctx->is_active; /* changed bits */
- /*
- * Since we cleared EVENT_FLEXIBLE, also clear
- * rotate_necessary, is will be reset by
- * ctx_flexible_sched_in() when needed.
- */
- ctx->rotate_necessary = 0;
- }
- perf_pmu_enable(ctx->pmu);
+ for_each_epc(pmu_ctx, ctx, pmu, cgroup)
+ __pmu_ctx_sched_out(pmu_ctx, is_active);
}
/*
@@ -3355,8 +3595,7 @@ static void __perf_event_sync_stat(struct perf_event *event,
* we know the event must be on the current CPU, therefore we
* don't need to use it.
*/
- if (event->state == PERF_EVENT_STATE_ACTIVE)
- event->pmu->read(event);
+ perf_pmu_read(event);
perf_event_update_time(event);
@@ -3404,26 +3643,33 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
}
}
-static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
- struct task_struct *next)
+static void perf_ctx_sched_task_cb(struct perf_event_context *ctx,
+ struct task_struct *task, bool sched_in)
{
- struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
+ struct perf_event_pmu_context *pmu_ctx;
+ struct perf_cpu_pmu_context *cpc;
+
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ cpc = this_cpc(pmu_ctx->pmu);
+
+ if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task)
+ pmu_ctx->pmu->sched_task(pmu_ctx, task, sched_in);
+ }
+}
+
+static void
+perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
+{
+ struct perf_event_context *ctx = task->perf_event_ctxp;
struct perf_event_context *next_ctx;
struct perf_event_context *parent, *next_parent;
- struct perf_cpu_context *cpuctx;
int do_switch = 1;
- struct pmu *pmu;
if (likely(!ctx))
return;
- pmu = ctx->pmu;
- cpuctx = __get_cpu_context(ctx);
- if (!cpuctx->task_ctx)
- return;
-
rcu_read_lock();
- next_ctx = next->perf_event_ctxp[ctxn];
+ next_ctx = rcu_dereference(next->perf_event_ctxp);
if (!next_ctx)
goto unlock;
@@ -3448,36 +3694,41 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
if (context_equiv(ctx, next_ctx)) {
+ perf_ctx_disable(ctx, false);
+
+ /* PMIs are disabled; ctx->nr_no_switch_fast is stable. */
+ if (local_read(&ctx->nr_no_switch_fast) ||
+ local_read(&next_ctx->nr_no_switch_fast)) {
+ /*
+ * Must not swap out ctx when there's pending
+ * events that rely on the ctx->task relation.
+ *
+ * Likewise, when a context contains inherit +
+ * SAMPLE_READ events they should be switched
+ * out using the slow path so that they are
+ * treated as if they were distinct contexts.
+ */
+ raw_spin_unlock(&next_ctx->lock);
+ rcu_read_unlock();
+ goto inside_switch;
+ }
+
WRITE_ONCE(ctx->task, next);
WRITE_ONCE(next_ctx->task, task);
- perf_pmu_disable(pmu);
+ perf_ctx_sched_task_cb(ctx, task, false);
- if (cpuctx->sched_cb_usage && pmu->sched_task)
- pmu->sched_task(ctx, false);
-
- /*
- * PMU specific parts of task perf context can require
- * additional synchronization. As an example of such
- * synchronization see implementation details of Intel
- * LBR call stack data profiling;
- */
- if (pmu->swap_task_ctx)
- pmu->swap_task_ctx(ctx, next_ctx);
- else
- swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
-
- perf_pmu_enable(pmu);
+ perf_ctx_enable(ctx, false);
/*
* RCU_INIT_POINTER here is safe because we've not
* modified the ctx and the above modification of
- * ctx->task and ctx->task_ctx_data are immaterial
- * since those values are always verified under
- * ctx->lock which we're now holding.
+ * ctx->task is immaterial since this value is
+ * always verified under ctx->lock which we're now
+ * holding.
*/
- RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
- RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
+ RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx);
+ RCU_INIT_POINTER(next->perf_event_ctxp, ctx);
do_switch = 0;
@@ -3491,37 +3742,40 @@ unlock:
if (do_switch) {
raw_spin_lock(&ctx->lock);
- perf_pmu_disable(pmu);
+ perf_ctx_disable(ctx, false);
- if (cpuctx->sched_cb_usage && pmu->sched_task)
- pmu->sched_task(ctx, false);
- task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
+inside_switch:
+ perf_ctx_sched_task_cb(ctx, task, false);
+ task_ctx_sched_out(ctx, NULL, EVENT_ALL);
- perf_pmu_enable(pmu);
+ perf_ctx_enable(ctx, false);
raw_spin_unlock(&ctx->lock);
}
}
static DEFINE_PER_CPU(struct list_head, sched_cb_list);
+static DEFINE_PER_CPU(int, perf_sched_cb_usages);
void perf_sched_cb_dec(struct pmu *pmu)
{
- struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ struct perf_cpu_pmu_context *cpc = this_cpc(pmu);
this_cpu_dec(perf_sched_cb_usages);
+ barrier();
- if (!--cpuctx->sched_cb_usage)
- list_del(&cpuctx->sched_cb_entry);
+ if (!--cpc->sched_cb_usage)
+ list_del(&cpc->sched_cb_entry);
}
void perf_sched_cb_inc(struct pmu *pmu)
{
- struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ struct perf_cpu_pmu_context *cpc = this_cpc(pmu);
- if (!cpuctx->sched_cb_usage++)
- list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+ if (!cpc->sched_cb_usage++)
+ list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+ barrier();
this_cpu_inc(perf_sched_cb_usages);
}
@@ -3533,19 +3787,22 @@ void perf_sched_cb_inc(struct pmu *pmu)
* PEBS requires this to provide PID/TID information. This requires we flush
* all queued PEBS records before we context switch to a new task.
*/
-static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)
+static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc,
+ struct task_struct *task, bool sched_in)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct pmu *pmu;
- pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
+ pmu = cpc->epc.pmu;
+ /* software PMUs will not have sched_task */
if (WARN_ON_ONCE(!pmu->sched_task))
return;
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_pmu_disable(pmu);
- pmu->sched_task(cpuctx->task_ctx, sched_in);
+ pmu->sched_task(cpc->task_epc, task, sched_in);
perf_pmu_enable(pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -3555,26 +3812,20 @@ static void perf_pmu_sched_task(struct task_struct *prev,
struct task_struct *next,
bool sched_in)
{
- struct perf_cpu_context *cpuctx;
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_cpu_pmu_context *cpc;
- if (prev == next)
+ /* cpuctx->task_ctx will be handled in perf_event_context_sched_in/out */
+ if (prev == next || cpuctx->task_ctx)
return;
- list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
- /* will be handled in perf_event_context_sched_in/out */
- if (cpuctx->task_ctx)
- continue;
-
- __perf_pmu_sched_task(cpuctx, sched_in);
- }
+ list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry)
+ __perf_pmu_sched_task(cpc, sched_in ? next : prev, sched_in);
}
static void perf_event_switch(struct task_struct *task,
struct task_struct *next_prev, bool sched_in);
-#define for_each_task_context_nr(ctxn) \
- for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
-
/*
* Called from scheduler to remove the events of the current task,
* with interrupts disabled.
@@ -3589,36 +3840,23 @@ static void perf_event_switch(struct task_struct *task,
void __perf_event_task_sched_out(struct task_struct *task,
struct task_struct *next)
{
- int ctxn;
-
if (__this_cpu_read(perf_sched_cb_usages))
perf_pmu_sched_task(task, next, false);
if (atomic_read(&nr_switch_events))
perf_event_switch(task, next, false);
- for_each_task_context_nr(ctxn)
- perf_event_context_sched_out(task, ctxn, next);
+ perf_event_context_sched_out(task, next);
/*
* if cgroup events exist on this CPU, then we need
* to check if we have to switch out PMU state.
* cgroup event are system-wide mode only
*/
- if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
- perf_cgroup_sched_out(task, next);
+ perf_cgroup_switch(next);
}
-/*
- * Called with IRQs disabled
- */
-static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
-{
- ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
-}
-
-static bool perf_less_group_idx(const void *l, const void *r)
+static bool perf_less_group_idx(const void *l, const void *r, void __always_unused *args)
{
const struct perf_event *le = *(const struct perf_event **)l;
const struct perf_event *re = *(const struct perf_event **)r;
@@ -3626,20 +3864,14 @@ static bool perf_less_group_idx(const void *l, const void *r)
return le->group_index < re->group_index;
}
-static void swap_ptr(void *l, void *r)
-{
- void **lp = l, **rp = r;
-
- swap(*lp, *rp);
-}
+DEFINE_MIN_HEAP(struct perf_event *, perf_event_min_heap);
static const struct min_heap_callbacks perf_min_heap = {
- .elem_size = sizeof(struct perf_event *),
.less = perf_less_group_idx,
- .swp = swap_ptr,
+ .swp = NULL,
};
-static void __heap_add(struct min_heap *heap, struct perf_event *event)
+static void __heap_add(struct perf_event_min_heap *heap, struct perf_event *event)
{
struct perf_event **itrs = heap->data;
@@ -3649,22 +3881,40 @@ static void __heap_add(struct min_heap *heap, struct perf_event *event)
}
}
-static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
+static void __link_epc(struct perf_event_pmu_context *pmu_ctx)
+{
+ struct perf_cpu_pmu_context *cpc;
+
+ if (!pmu_ctx->ctx->task)
+ return;
+
+ cpc = this_cpc(pmu_ctx->pmu);
+ WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
+ cpc->task_epc = pmu_ctx;
+}
+
+static noinline int visit_groups_merge(struct perf_event_context *ctx,
struct perf_event_groups *groups, int cpu,
+ struct pmu *pmu,
int (*func)(struct perf_event *, void *),
void *data)
{
#ifdef CONFIG_CGROUP_PERF
struct cgroup_subsys_state *css = NULL;
#endif
+ struct perf_cpu_context *cpuctx = NULL;
/* Space for per CPU and/or any CPU event iterators. */
struct perf_event *itrs[2];
- struct min_heap event_heap;
+ struct perf_event_min_heap event_heap;
struct perf_event **evt;
int ret;
- if (cpuctx) {
- event_heap = (struct min_heap){
+ if (pmu->filter && pmu->filter(pmu, cpu))
+ return 0;
+
+ if (!ctx->task) {
+ cpuctx = this_cpu_ptr(&perf_cpu_context);
+ event_heap = (struct perf_event_min_heap){
.data = cpuctx->heap,
.nr = 0,
.size = cpuctx->heap_size,
@@ -3677,47 +3927,58 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
css = &cpuctx->cgrp->css;
#endif
} else {
- event_heap = (struct min_heap){
+ event_heap = (struct perf_event_min_heap){
.data = itrs,
.nr = 0,
.size = ARRAY_SIZE(itrs),
};
/* Events not within a CPU context may be on any CPU. */
- __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL));
+ __heap_add(&event_heap, perf_event_groups_first(groups, -1, pmu, NULL));
}
evt = event_heap.data;
- __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));
+ __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, NULL));
#ifdef CONFIG_CGROUP_PERF
for (; css; css = css->parent)
- __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup));
+ __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, css->cgroup));
#endif
- min_heapify_all(&event_heap, &perf_min_heap);
+ if (event_heap.nr) {
+ __link_epc((*evt)->pmu_ctx);
+ perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu);
+ }
+
+ min_heapify_all_inline(&event_heap, &perf_min_heap, NULL);
while (event_heap.nr) {
ret = func(*evt, data);
if (ret)
return ret;
- *evt = perf_event_groups_next(*evt);
+ *evt = perf_event_groups_next(*evt, pmu);
if (*evt)
- min_heapify(&event_heap, 0, &perf_min_heap);
+ min_heap_sift_down_inline(&event_heap, 0, &perf_min_heap, NULL);
else
- min_heap_pop(&event_heap, &perf_min_heap);
+ min_heap_pop_inline(&event_heap, &perf_min_heap, NULL);
}
return 0;
}
+/*
+ * Because the userpage is strictly per-event (there is no concept of context,
+ * so there cannot be a context indirection), every userpage must be updated
+ * when context time starts :-(
+ *
+ * IOW, we must not miss EVENT_TIME edges.
+ */
static inline bool event_update_userpage(struct perf_event *event)
{
- if (likely(!atomic_read(&event->mmap_count)))
+ if (likely(!refcount_read(&event->mmap_count)))
return false;
perf_event_update_time(event);
- perf_set_shadow_time(event, event->ctx);
perf_event_update_userpage(event);
return true;
@@ -3737,7 +3998,6 @@ static inline void group_update_userpage(struct perf_event *group_event)
static int merge_sched_in(struct perf_event *event, void *data)
{
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
int *can_add_hw = data;
if (event->state <= PERF_EVENT_STATE_OFF)
@@ -3746,8 +4006,8 @@ static int merge_sched_in(struct perf_event *event, void *data)
if (!event_filter_match(event))
return 0;
- if (group_can_go_on(event, cpuctx, *can_add_hw)) {
- if (!group_sched_in(event, cpuctx, ctx))
+ if (group_can_go_on(event, *can_add_hw)) {
+ if (!group_sched_in(event, ctx))
list_add_tail(&event->active_list, get_event_list(event));
}
@@ -3756,9 +4016,16 @@ static int merge_sched_in(struct perf_event *event, void *data)
if (event->attr.pinned) {
perf_cgroup_event_disable(event, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+
+ if (*perf_event_fasync(event))
+ event->pending_kill = POLL_ERR;
+
+ perf_event_wakeup(event);
} else {
- ctx->rotate_necessary = 1;
- perf_mux_hrtimer_restart(cpuctx);
+ struct perf_cpu_pmu_context *cpc = this_cpc(event->pmu_ctx->pmu);
+
+ event->pmu_ctx->rotate_necessary = 1;
+ perf_mux_hrtimer_restart(cpc);
group_update_userpage(event);
}
}
@@ -3766,51 +4033,55 @@ static int merge_sched_in(struct perf_event *event, void *data)
return 0;
}
-static void
-ctx_pinned_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx)
+static void pmu_groups_sched_in(struct perf_event_context *ctx,
+ struct perf_event_groups *groups,
+ struct pmu *pmu)
{
int can_add_hw = 1;
-
- if (ctx != &cpuctx->ctx)
- cpuctx = NULL;
-
- visit_groups_merge(cpuctx, &ctx->pinned_groups,
- smp_processor_id(),
+ visit_groups_merge(ctx, groups, smp_processor_id(), pmu,
merge_sched_in, &can_add_hw);
}
-static void
-ctx_flexible_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx)
+static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
+ enum event_type_t event_type)
{
- int can_add_hw = 1;
+ struct perf_event_context *ctx = pmu_ctx->ctx;
- if (ctx != &cpuctx->ctx)
- cpuctx = NULL;
-
- visit_groups_merge(cpuctx, &ctx->flexible_groups,
- smp_processor_id(),
- merge_sched_in, &can_add_hw);
+ if (event_type & EVENT_PINNED)
+ pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu);
+ if (event_type & EVENT_FLEXIBLE)
+ pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu);
}
static void
-ctx_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx,
- enum event_type_t event_type,
- struct task_struct *task)
+ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_pmu_context *pmu_ctx;
int is_active = ctx->is_active;
- u64 now;
+ bool cgroup = event_type & EVENT_CGROUP;
+
+ event_type &= ~EVENT_CGROUP;
lockdep_assert_held(&ctx->lock);
if (likely(!ctx->nr_events))
return;
+ if (!(is_active & EVENT_TIME)) {
+ /* start ctx time */
+ __update_context_time(ctx, false);
+ perf_cgroup_set_timestamp(cpuctx);
+ /*
+ * CPU-release for the below ->is_active store,
+ * see __load_acquire() in perf_event_time_now()
+ */
+ barrier();
+ }
+
ctx->is_active |= (event_type | EVENT_TIME);
if (ctx->task) {
- if (!is_active)
+ if (!(is_active & EVENT_ALL))
cpuctx->task_ctx = ctx;
else
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
@@ -3818,52 +4089,41 @@ ctx_sched_in(struct perf_event_context *ctx,
is_active ^= ctx->is_active; /* changed bits */
- if (is_active & EVENT_TIME) {
- /* start ctx time */
- now = perf_clock();
- ctx->timestamp = now;
- perf_cgroup_set_timestamp(task, ctx);
- }
-
/*
* First go through the list and put on any pinned groups
* in order to give them the best chance of going on.
*/
- if (is_active & EVENT_PINNED)
- ctx_pinned_sched_in(ctx, cpuctx);
+ if (is_active & EVENT_PINNED) {
+ for_each_epc(pmu_ctx, ctx, pmu, cgroup)
+ __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED);
+ }
/* Then walk through the lower prio flexible groups */
- if (is_active & EVENT_FLEXIBLE)
- ctx_flexible_sched_in(ctx, cpuctx);
+ if (is_active & EVENT_FLEXIBLE) {
+ for_each_epc(pmu_ctx, ctx, pmu, cgroup)
+ __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE);
+ }
}
-static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type,
- struct task_struct *task)
+static void perf_event_context_sched_in(struct task_struct *task)
{
- struct perf_event_context *ctx = &cpuctx->ctx;
-
- ctx_sched_in(ctx, cpuctx, event_type, task);
-}
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_context *ctx;
-static void perf_event_context_sched_in(struct perf_event_context *ctx,
- struct task_struct *task)
-{
- struct perf_cpu_context *cpuctx;
- struct pmu *pmu;
+ rcu_read_lock();
+ ctx = rcu_dereference(task->perf_event_ctxp);
+ if (!ctx)
+ goto rcu_unlock;
- cpuctx = __get_cpu_context(ctx);
+ if (cpuctx->task_ctx == ctx) {
+ perf_ctx_lock(cpuctx, ctx);
+ perf_ctx_disable(ctx, false);
- /*
- * HACK: for HETEROGENEOUS the task context might have switched to a
- * different PMU, force (re)set the context,
- */
- pmu = ctx->pmu = cpuctx->ctx.pmu;
+ perf_ctx_sched_task_cb(ctx, task, true);
- if (cpuctx->task_ctx == ctx) {
- if (cpuctx->sched_cb_usage)
- __perf_pmu_sched_task(cpuctx, true);
- return;
+ perf_ctx_enable(ctx, false);
+ perf_ctx_unlock(cpuctx, ctx);
+ goto rcu_unlock;
}
perf_ctx_lock(cpuctx, ctx);
@@ -3874,7 +4134,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
if (!ctx->nr_events)
goto unlock;
- perf_pmu_disable(pmu);
+ perf_ctx_disable(ctx, false);
/*
* We want to keep the following priority order:
* cpu pinned (that don't need to move), task pinned,
@@ -3883,17 +4143,24 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
* However, if task's ctx is not carrying any pinned
* events, no need to flip the cpuctx's events around.
*/
- if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
- perf_event_sched_in(cpuctx, ctx, task);
+ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
+ perf_ctx_disable(&cpuctx->ctx, false);
+ ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE);
+ }
- if (cpuctx->sched_cb_usage && pmu->sched_task)
- pmu->sched_task(cpuctx->task_ctx, true);
+ perf_event_sched_in(cpuctx, ctx, NULL);
- perf_pmu_enable(pmu);
+ perf_ctx_sched_task_cb(cpuctx->task_ctx, task, true);
+
+ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
+ perf_ctx_enable(&cpuctx->ctx, false);
+
+ perf_ctx_enable(ctx, false);
unlock:
perf_ctx_unlock(cpuctx, ctx);
+rcu_unlock:
+ rcu_read_unlock();
}
/*
@@ -3910,26 +4177,7 @@ unlock:
void __perf_event_task_sched_in(struct task_struct *prev,
struct task_struct *task)
{
- struct perf_event_context *ctx;
- int ctxn;
-
- /*
- * If cgroup events exist on this CPU, then we need to check if we have
- * to switch in PMU state; cgroup event are system-wide mode only.
- *
- * Since cgroup events are CPU events, we must schedule these in before
- * we schedule in the task events.
- */
- if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
- perf_cgroup_sched_in(prev, task);
-
- for_each_task_context_nr(ctxn) {
- ctx = task->perf_event_ctxp[ctxn];
- if (likely(!ctx))
- continue;
-
- perf_event_context_sched_in(ctx, task);
- }
+ perf_event_context_sched_in(task);
if (atomic_read(&nr_switch_events))
perf_event_switch(task, prev, true);
@@ -4023,7 +4271,11 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo
period = perf_calculate_period(event, nsec, count);
delta = (s64)(period - hwc->sample_period);
- delta = (delta + 7) / 8; /* low pass filter */
+ if (delta >= 0)
+ delta += 7;
+ else
+ delta -= 7;
+ delta /= 8; /* low pass filter */
sample_period = hwc->sample_period + delta;
@@ -4043,49 +4295,28 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo
}
}
-/*
- * combine freq adjustment with unthrottling to avoid two passes over the
- * events. At the same time, make sure, having freq events does not change
- * the rate of unthrottling as that would introduce bias.
- */
-static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
- int needs_unthr)
+static void perf_adjust_freq_unthr_events(struct list_head *event_list)
{
struct perf_event *event;
struct hw_perf_event *hwc;
u64 now, period = TICK_NSEC;
s64 delta;
- /*
- * only need to iterate over all events iff:
- * - context have events in frequency mode (needs freq adjust)
- * - there are events to unthrottle on this cpu
- */
- if (!(ctx->nr_freq || needs_unthr))
- return;
-
- raw_spin_lock(&ctx->lock);
- perf_pmu_disable(ctx->pmu);
-
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+ list_for_each_entry(event, event_list, active_list) {
if (event->state != PERF_EVENT_STATE_ACTIVE)
continue;
+ // XXX use visit thingy to avoid the -1,cpu match
if (!event_filter_match(event))
continue;
- perf_pmu_disable(event->pmu);
-
hwc = &event->hw;
- if (hwc->interrupts == MAX_INTERRUPTS) {
- hwc->interrupts = 0;
- perf_log_throttle(event, 1);
- event->pmu->start(event, 0);
- }
+ if (hwc->interrupts == MAX_INTERRUPTS)
+ perf_event_unthrottle_group(event, is_event_in_freq_mode(event));
- if (!event->attr.freq || !event->attr.sample_freq)
- goto next;
+ if (!is_event_in_freq_mode(event))
+ continue;
/*
* stop the event and update event->count
@@ -4107,11 +4338,43 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
perf_adjust_period(event, period, delta, false);
event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
- next:
- perf_pmu_enable(event->pmu);
+ }
+}
+
+/*
+ * combine freq adjustment with unthrottling to avoid two passes over the
+ * events. At the same time, make sure, having freq events does not change
+ * the rate of unthrottling as that would introduce bias.
+ */
+static void
+perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
+{
+ struct perf_event_pmu_context *pmu_ctx;
+
+ /*
+ * only need to iterate over all events iff:
+ * - context have events in frequency mode (needs freq adjust)
+ * - there are events to unthrottle on this cpu
+ */
+ if (!(ctx->nr_freq || unthrottle))
+ return;
+
+ raw_spin_lock(&ctx->lock);
+
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (!(pmu_ctx->nr_freq || unthrottle))
+ continue;
+ if (!perf_pmu_ctx_is_active(pmu_ctx))
+ continue;
+ if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT)
+ continue;
+
+ perf_pmu_disable(pmu_ctx->pmu);
+ perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
+ perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
+ perf_pmu_enable(pmu_ctx->pmu);
}
- perf_pmu_enable(ctx->pmu);
raw_spin_unlock(&ctx->lock);
}
@@ -4133,72 +4396,109 @@ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
/* pick an event from the flexible_groups to rotate */
static inline struct perf_event *
-ctx_event_to_rotate(struct perf_event_context *ctx)
+ctx_event_to_rotate(struct perf_event_pmu_context *pmu_ctx)
{
struct perf_event *event;
+ struct rb_node *node;
+ struct rb_root *tree;
+ struct __group_key key = {
+ .pmu = pmu_ctx->pmu,
+ };
/* pick the first active flexible event */
- event = list_first_entry_or_null(&ctx->flexible_active,
+ event = list_first_entry_or_null(&pmu_ctx->flexible_active,
struct perf_event, active_list);
+ if (event)
+ goto out;
/* if no active flexible event, pick the first event */
- if (!event) {
- event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
- typeof(*event), group_node);
+ tree = &pmu_ctx->ctx->flexible_groups.tree;
+
+ if (!pmu_ctx->ctx->task) {
+ key.cpu = smp_processor_id();
+
+ node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
+ if (node)
+ event = __node_2_pe(node);
+ goto out;
+ }
+
+ key.cpu = -1;
+ node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
+ if (node) {
+ event = __node_2_pe(node);
+ goto out;
}
+ key.cpu = smp_processor_id();
+ node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
+ if (node)
+ event = __node_2_pe(node);
+
+out:
/*
* Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
* finds there are unschedulable events, it will set it again.
*/
- ctx->rotate_necessary = 0;
+ pmu_ctx->rotate_necessary = 0;
return event;
}
-static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
+static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_pmu_context *cpu_epc, *task_epc = NULL;
struct perf_event *cpu_event = NULL, *task_event = NULL;
- struct perf_event_context *task_ctx = NULL;
int cpu_rotate, task_rotate;
+ struct pmu *pmu;
/*
* Since we run this from IRQ context, nobody can install new
* events, thus the event count values are stable.
*/
- cpu_rotate = cpuctx->ctx.rotate_necessary;
- task_ctx = cpuctx->task_ctx;
- task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
+ cpu_epc = &cpc->epc;
+ pmu = cpu_epc->pmu;
+ task_epc = cpc->task_epc;
+
+ cpu_rotate = cpu_epc->rotate_necessary;
+ task_rotate = task_epc ? task_epc->rotate_necessary : 0;
if (!(cpu_rotate || task_rotate))
return false;
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
- perf_pmu_disable(cpuctx->ctx.pmu);
+ perf_pmu_disable(pmu);
if (task_rotate)
- task_event = ctx_event_to_rotate(task_ctx);
+ task_event = ctx_event_to_rotate(task_epc);
if (cpu_rotate)
- cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
+ cpu_event = ctx_event_to_rotate(cpu_epc);
/*
* As per the order given at ctx_resched() first 'pop' task flexible
* and then, if needed CPU flexible.
*/
- if (task_event || (task_ctx && cpu_event))
- ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
- if (cpu_event)
- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+ if (task_event || (task_epc && cpu_event)) {
+ update_context_time(task_epc->ctx);
+ __pmu_ctx_sched_out(task_epc, EVENT_FLEXIBLE);
+ }
- if (task_event)
- rotate_ctx(task_ctx, task_event);
- if (cpu_event)
+ if (cpu_event) {
+ update_context_time(&cpuctx->ctx);
+ __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
rotate_ctx(&cpuctx->ctx, cpu_event);
+ __pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE);
+ }
- perf_event_sched_in(cpuctx, task_ctx, current);
+ if (task_event)
+ rotate_ctx(task_epc->ctx, task_event);
+
+ if (task_event || (task_epc && cpu_event))
+ __pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE);
- perf_pmu_enable(cpuctx->ctx.pmu);
+ perf_pmu_enable(pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
return true;
@@ -4206,8 +4506,8 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
void perf_event_task_tick(void)
{
- struct list_head *head = this_cpu_ptr(&active_ctx_list);
- struct perf_event_context *ctx, *tmp;
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_context *ctx;
int throttled;
lockdep_assert_irqs_disabled();
@@ -4216,8 +4516,13 @@ void perf_event_task_tick(void)
throttled = __this_cpu_xchg(perf_throttled_count, 0);
tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
- list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
- perf_adjust_freq_unthr_context(ctx, throttled);
+ perf_adjust_freq_unthr_context(&cpuctx->ctx, !!throttled);
+
+ rcu_read_lock();
+ ctx = rcu_dereference(current->perf_event_ctxp);
+ if (ctx)
+ perf_adjust_freq_unthr_context(ctx, !!throttled);
+ rcu_read_unlock();
}
static int event_enable_on_exec(struct perf_event *event,
@@ -4239,9 +4544,9 @@ static int event_enable_on_exec(struct perf_event *event,
* Enable all of a task's events that have been marked enable-on-exec.
* This expects task == current.
*/
-static void perf_event_enable_on_exec(int ctxn)
+static void perf_event_enable_on_exec(struct perf_event_context *ctx)
{
- struct perf_event_context *ctx, *clone_ctx = NULL;
+ struct perf_event_context *clone_ctx = NULL;
enum event_type_t event_type = 0;
struct perf_cpu_context *cpuctx;
struct perf_event *event;
@@ -4249,13 +4554,16 @@ static void perf_event_enable_on_exec(int ctxn)
int enabled = 0;
local_irq_save(flags);
- ctx = current->perf_event_ctxp[ctxn];
- if (!ctx || !ctx->nr_events)
+ if (WARN_ON_ONCE(current->perf_event_ctxp != ctx))
goto out;
- cpuctx = __get_cpu_context(ctx);
+ if (!ctx->nr_events)
+ goto out;
+
+ cpuctx = this_cpu_ptr(&perf_cpu_context);
perf_ctx_lock(cpuctx, ctx);
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_time_freeze(cpuctx, ctx);
+
list_for_each_entry(event, &ctx->event_list, event_entry) {
enabled |= event_enable_on_exec(event, ctx);
event_type |= get_event_type(event);
@@ -4266,9 +4574,7 @@ static void perf_event_enable_on_exec(int ctxn)
*/
if (enabled) {
clone_ctx = unclone_ctx(ctx);
- ctx_resched(cpuctx, ctx, event_type);
- } else {
- ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+ ctx_resched(cpuctx, ctx, NULL, event_type);
}
perf_ctx_unlock(cpuctx, ctx);
@@ -4281,24 +4587,20 @@ out:
static void perf_remove_from_owner(struct perf_event *event);
static void perf_event_exit_event(struct perf_event *event,
- struct perf_event_context *ctx);
+ struct perf_event_context *ctx,
+ bool revoke);
/*
* Removes all events from the current task that have been marked
* remove-on-exec, and feeds their values back to parent events.
*/
-static void perf_event_remove_on_exec(int ctxn)
+static void perf_event_remove_on_exec(struct perf_event_context *ctx)
{
- struct perf_event_context *ctx, *clone_ctx = NULL;
+ struct perf_event_context *clone_ctx = NULL;
struct perf_event *event, *next;
- LIST_HEAD(free_list);
unsigned long flags;
bool modified = false;
- ctx = perf_pin_task_context(current, ctxn);
- if (!ctx)
- return;
-
mutex_lock(&ctx->mutex);
if (WARN_ON_ONCE(ctx->task != current))
@@ -4313,19 +4615,17 @@ static void perf_event_remove_on_exec(int ctxn)
modified = true;
- perf_event_exit_event(event, ctx);
+ perf_event_exit_event(event, ctx, false);
}
raw_spin_lock_irqsave(&ctx->lock, flags);
if (modified)
clone_ctx = unclone_ctx(ctx);
- --ctx->pin_count;
raw_spin_unlock_irqrestore(&ctx->lock, flags);
unlock:
mutex_unlock(&ctx->mutex);
- put_ctx(ctx);
if (clone_ctx)
put_ctx(clone_ctx);
}
@@ -4336,13 +4636,24 @@ struct perf_read_data {
int ret;
};
+static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu);
+
static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
{
+ int local_cpu = smp_processor_id();
u16 local_pkg, event_pkg;
- if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
- int local_cpu = smp_processor_id();
+ if ((unsigned)event_cpu >= nr_cpu_ids)
+ return event_cpu;
+
+ if (event->group_caps & PERF_EV_CAP_READ_SCOPE) {
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(event->pmu->scope, event_cpu);
+
+ if (cpumask && cpumask_test_cpu(local_cpu, cpumask))
+ return local_cpu;
+ }
+ if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
event_pkg = topology_physical_package_id(event_cpu);
local_pkg = topology_physical_package_id(local_cpu);
@@ -4361,7 +4672,7 @@ static void __perf_event_read(void *info)
struct perf_read_data *data = info;
struct perf_event *sub, *event = data->event;
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct pmu *pmu = event->pmu;
/*
@@ -4375,10 +4686,7 @@ static void __perf_event_read(void *info)
return;
raw_spin_lock(&ctx->lock);
- if (ctx->is_active & EVENT_TIME) {
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
- }
+ ctx_time_update_event(ctx, event);
perf_event_update_time(event);
if (data->group)
@@ -4397,15 +4705,8 @@ static void __perf_event_read(void *info)
pmu->read(event);
- for_each_sibling_event(sub, event) {
- if (sub->state == PERF_EVENT_STATE_ACTIVE) {
- /*
- * Use sibling's PMU rather than @event's since
- * sibling could be on different (eg: software) PMU.
- */
- sub->pmu->read(sub);
- }
- }
+ for_each_sibling_event(sub, event)
+ perf_pmu_read(sub);
data->ret = pmu->commit_txn(pmu);
@@ -4413,11 +4714,26 @@ unlock:
raw_spin_unlock(&ctx->lock);
}
-static inline u64 perf_event_count(struct perf_event *event)
+static inline u64 perf_event_count(struct perf_event *event, bool self)
{
+ if (self)
+ return local64_read(&event->count);
+
return local64_read(&event->count) + atomic64_read(&event->child_count);
}
+static void calc_timer_values(struct perf_event *event,
+ u64 *now,
+ u64 *enabled,
+ u64 *running)
+{
+ u64 ctx_time;
+
+ *now = perf_clock();
+ ctx_time = perf_event_time_now(event, *now);
+ __perf_update_times(event, ctx_time, enabled, running);
+}
+
/*
* NMI-safe method to read a local event, that is an event that
* is:
@@ -4430,6 +4746,8 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
u64 *enabled, u64 *running)
{
unsigned long flags;
+ int event_oncpu;
+ int event_cpu;
int ret = 0;
/*
@@ -4454,15 +4772,22 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
goto out;
}
+ /*
+ * Get the event CPU numbers, and adjust them to local if the event is
+ * a per-package event that can be read locally
+ */
+ event_oncpu = __perf_event_read_cpu(event, event->oncpu);
+ event_cpu = __perf_event_read_cpu(event, event->cpu);
+
/* If this is a per-CPU event, it must be for this CPU */
if (!(event->attach_state & PERF_ATTACH_TASK) &&
- event->cpu != smp_processor_id()) {
+ event_cpu != smp_processor_id()) {
ret = -EINVAL;
goto out;
}
/* If this is a pinned event it must be running on this CPU */
- if (event->attr.pinned && event->oncpu != smp_processor_id()) {
+ if (event->attr.pinned && event_oncpu != smp_processor_id()) {
ret = -EBUSY;
goto out;
}
@@ -4472,15 +4797,14 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
* or local to this CPU. Furthermore it means its ACTIVE (otherwise
* oncpu == -1).
*/
- if (event->oncpu == smp_processor_id())
+ if (event_oncpu == smp_processor_id())
event->pmu->read(event);
*value = local64_read(&event->count);
if (enabled || running) {
- u64 now = event->shadow_ctx_time + perf_clock();
- u64 __enabled, __running;
+ u64 __enabled, __running, __now;
- __perf_update_times(event, now, &__enabled, &__running);
+ calc_timer_values(event, &__now, &__enabled, &__running);
if (enabled)
*enabled = __enabled;
if (running)
@@ -4555,10 +4879,7 @@ again:
* May read while context is not active (e.g., thread is
* blocked), in that case we cannot update context time
*/
- if (ctx->is_active & EVENT_TIME) {
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
- }
+ ctx_time_update_event(ctx, event);
perf_event_update_time(event);
if (group)
@@ -4576,17 +4897,25 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
{
raw_spin_lock_init(&ctx->lock);
mutex_init(&ctx->mutex);
- INIT_LIST_HEAD(&ctx->active_ctx_list);
+ INIT_LIST_HEAD(&ctx->pmu_ctx_list);
perf_event_groups_init(&ctx->pinned_groups);
perf_event_groups_init(&ctx->flexible_groups);
INIT_LIST_HEAD(&ctx->event_list);
- INIT_LIST_HEAD(&ctx->pinned_active);
- INIT_LIST_HEAD(&ctx->flexible_active);
refcount_set(&ctx->refcount, 1);
}
+static void
+__perf_init_event_pmu_context(struct perf_event_pmu_context *epc, struct pmu *pmu)
+{
+ epc->pmu = pmu;
+ INIT_LIST_HEAD(&epc->pmu_ctx_entry);
+ INIT_LIST_HEAD(&epc->pinned_active);
+ INIT_LIST_HEAD(&epc->flexible_active);
+ atomic_set(&epc->refcount, 1);
+}
+
static struct perf_event_context *
-alloc_perf_context(struct pmu *pmu, struct task_struct *task)
+alloc_perf_context(struct task_struct *task)
{
struct perf_event_context *ctx;
@@ -4597,7 +4926,6 @@ alloc_perf_context(struct pmu *pmu, struct task_struct *task)
__perf_event_init_context(ctx);
if (task)
ctx->task = get_task_struct(task);
- ctx->pmu = pmu;
return ctx;
}
@@ -4626,23 +4954,20 @@ find_lively_task_by_vpid(pid_t vpid)
* Returns a matching context with refcount and pincount.
*/
static struct perf_event_context *
-find_get_context(struct pmu *pmu, struct task_struct *task,
- struct perf_event *event)
+find_get_context(struct task_struct *task, struct perf_event *event)
{
struct perf_event_context *ctx, *clone_ctx = NULL;
struct perf_cpu_context *cpuctx;
- void *task_ctx_data = NULL;
unsigned long flags;
- int ctxn, err;
- int cpu = event->cpu;
+ int err;
if (!task) {
/* Must be root to operate on a CPU event: */
- err = perf_allow_cpu(&event->attr);
+ err = perf_allow_cpu();
if (err)
return ERR_PTR(err);
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+ cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);
ctx = &cpuctx->ctx;
get_ctx(ctx);
raw_spin_lock_irqsave(&ctx->lock, flags);
@@ -4653,43 +4978,22 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
}
err = -EINVAL;
- ctxn = pmu->task_ctx_nr;
- if (ctxn < 0)
- goto errout;
-
- if (event->attach_state & PERF_ATTACH_TASK_DATA) {
- task_ctx_data = alloc_task_ctx_data(pmu);
- if (!task_ctx_data) {
- err = -ENOMEM;
- goto errout;
- }
- }
-
retry:
- ctx = perf_lock_task_context(task, ctxn, &flags);
+ ctx = perf_lock_task_context(task, &flags);
if (ctx) {
clone_ctx = unclone_ctx(ctx);
++ctx->pin_count;
- if (task_ctx_data && !ctx->task_ctx_data) {
- ctx->task_ctx_data = task_ctx_data;
- task_ctx_data = NULL;
- }
raw_spin_unlock_irqrestore(&ctx->lock, flags);
if (clone_ctx)
put_ctx(clone_ctx);
} else {
- ctx = alloc_perf_context(pmu, task);
+ ctx = alloc_perf_context(task);
err = -ENOMEM;
if (!ctx)
goto errout;
- if (task_ctx_data) {
- ctx->task_ctx_data = task_ctx_data;
- task_ctx_data = NULL;
- }
-
err = 0;
mutex_lock(&task->perf_event_mutex);
/*
@@ -4698,12 +5002,12 @@ retry:
*/
if (task->flags & PF_EXITING)
err = -ESRCH;
- else if (task->perf_event_ctxp[ctxn])
+ else if (task->perf_event_ctxp)
err = -EAGAIN;
else {
get_ctx(ctx);
++ctx->pin_count;
- rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
+ rcu_assign_pointer(task->perf_event_ctxp, ctx);
}
mutex_unlock(&task->perf_event_mutex);
@@ -4716,21 +5020,149 @@ retry:
}
}
- free_task_ctx_data(pmu, task_ctx_data);
return ctx;
errout:
- free_task_ctx_data(pmu, task_ctx_data);
return ERR_PTR(err);
}
+static struct perf_event_pmu_context *
+find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
+ struct perf_event *event)
+{
+ struct perf_event_pmu_context *new = NULL, *pos = NULL, *epc;
+
+ if (!ctx->task) {
+ /*
+ * perf_pmu_migrate_context() / __perf_pmu_install_event()
+ * relies on the fact that find_get_pmu_context() cannot fail
+ * for CPU contexts.
+ */
+ struct perf_cpu_pmu_context *cpc;
+
+ cpc = *per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
+ epc = &cpc->epc;
+ raw_spin_lock_irq(&ctx->lock);
+ if (!epc->ctx) {
+ /*
+ * One extra reference for the pmu; see perf_pmu_free().
+ */
+ atomic_set(&epc->refcount, 2);
+ epc->embedded = 1;
+ list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
+ epc->ctx = ctx;
+ } else {
+ WARN_ON_ONCE(epc->ctx != ctx);
+ atomic_inc(&epc->refcount);
+ }
+ raw_spin_unlock_irq(&ctx->lock);
+ return epc;
+ }
+
+ new = kzalloc(sizeof(*epc), GFP_KERNEL);
+ if (!new)
+ return ERR_PTR(-ENOMEM);
+
+ __perf_init_event_pmu_context(new, pmu);
+
+ /*
+ * XXX
+ *
+ * lockdep_assert_held(&ctx->mutex);
+ *
+ * can't because perf_event_init_task() doesn't actually hold the
+ * child_ctx->mutex.
+ */
+
+ raw_spin_lock_irq(&ctx->lock);
+ list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (epc->pmu == pmu) {
+ WARN_ON_ONCE(epc->ctx != ctx);
+ atomic_inc(&epc->refcount);
+ goto found_epc;
+ }
+ /* Make sure the pmu_ctx_list is sorted by PMU type: */
+ if (!pos && epc->pmu->type > pmu->type)
+ pos = epc;
+ }
+
+ epc = new;
+ new = NULL;
+
+ if (!pos)
+ list_add_tail(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
+ else
+ list_add(&epc->pmu_ctx_entry, pos->pmu_ctx_entry.prev);
+
+ epc->ctx = ctx;
+
+found_epc:
+ raw_spin_unlock_irq(&ctx->lock);
+ kfree(new);
+
+ return epc;
+}
+
+static void get_pmu_ctx(struct perf_event_pmu_context *epc)
+{
+ WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount));
+}
+
+static void free_cpc_rcu(struct rcu_head *head)
+{
+ struct perf_cpu_pmu_context *cpc =
+ container_of(head, typeof(*cpc), epc.rcu_head);
+
+ kfree(cpc);
+}
+
+static void free_epc_rcu(struct rcu_head *head)
+{
+ struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head);
+
+ kfree(epc);
+}
+
+static void put_pmu_ctx(struct perf_event_pmu_context *epc)
+{
+ struct perf_event_context *ctx = epc->ctx;
+ unsigned long flags;
+
+ /*
+ * XXX
+ *
+ * lockdep_assert_held(&ctx->mutex);
+ *
+ * can't because of the call-site in _free_event()/put_event()
+ * which isn't always called under ctx->mutex.
+ */
+ if (!atomic_dec_and_raw_lock_irqsave(&epc->refcount, &ctx->lock, flags))
+ return;
+
+ WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry));
+
+ list_del_init(&epc->pmu_ctx_entry);
+ epc->ctx = NULL;
+
+ WARN_ON_ONCE(!list_empty(&epc->pinned_active));
+ WARN_ON_ONCE(!list_empty(&epc->flexible_active));
+
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
+
+ if (epc->embedded) {
+ call_rcu(&epc->rcu_head, free_cpc_rcu);
+ return;
+ }
+
+ call_rcu(&epc->rcu_head, free_epc_rcu);
+}
+
static void perf_event_free_filter(struct perf_event *event);
static void free_event_rcu(struct rcu_head *head)
{
- struct perf_event *event;
+ struct perf_event *event = container_of(head, typeof(*event), rcu_head);
- event = container_of(head, struct perf_event, rcu_head);
if (event->ns)
put_pid_ns(event->ns);
perf_event_free_filter(event);
@@ -4765,6 +5197,7 @@ static bool is_sb_event(struct perf_event *event)
attr->context_switch || attr->text_poke ||
attr->bpf_event)
return true;
+
return false;
}
@@ -4774,15 +5207,6 @@ static void unaccount_pmu_sb_event(struct perf_event *event)
detach_sb_event(event);
}
-static void unaccount_event_cpu(struct perf_event *event, int cpu)
-{
- if (event->parent)
- return;
-
- if (is_cgroup_event(event))
- atomic_dec(&per_cpu(perf_cgroup_events, cpu));
-}
-
#ifdef CONFIG_NO_HZ_FULL
static DEFINE_SPINLOCK(nr_freq_lock);
#endif
@@ -4805,6 +5229,225 @@ static void unaccount_freq_event(void)
atomic_dec(&nr_freq_events);
}
+
+static struct perf_ctx_data *
+alloc_perf_ctx_data(struct kmem_cache *ctx_cache, bool global)
+{
+ struct perf_ctx_data *cd;
+
+ cd = kzalloc(sizeof(*cd), GFP_KERNEL);
+ if (!cd)
+ return NULL;
+
+ cd->data = kmem_cache_zalloc(ctx_cache, GFP_KERNEL);
+ if (!cd->data) {
+ kfree(cd);
+ return NULL;
+ }
+
+ cd->global = global;
+ cd->ctx_cache = ctx_cache;
+ refcount_set(&cd->refcount, 1);
+
+ return cd;
+}
+
+static void free_perf_ctx_data(struct perf_ctx_data *cd)
+{
+ kmem_cache_free(cd->ctx_cache, cd->data);
+ kfree(cd);
+}
+
+static void __free_perf_ctx_data_rcu(struct rcu_head *rcu_head)
+{
+ struct perf_ctx_data *cd;
+
+ cd = container_of(rcu_head, struct perf_ctx_data, rcu_head);
+ free_perf_ctx_data(cd);
+}
+
+static inline void perf_free_ctx_data_rcu(struct perf_ctx_data *cd)
+{
+ call_rcu(&cd->rcu_head, __free_perf_ctx_data_rcu);
+}
+
+static int
+attach_task_ctx_data(struct task_struct *task, struct kmem_cache *ctx_cache,
+ bool global)
+{
+ struct perf_ctx_data *cd, *old = NULL;
+
+ cd = alloc_perf_ctx_data(ctx_cache, global);
+ if (!cd)
+ return -ENOMEM;
+
+ for (;;) {
+ if (try_cmpxchg((struct perf_ctx_data **)&task->perf_ctx_data, &old, cd)) {
+ if (old)
+ perf_free_ctx_data_rcu(old);
+ return 0;
+ }
+
+ if (!old) {
+ /*
+ * After seeing a dead @old, we raced with
+ * removal and lost, try again to install @cd.
+ */
+ continue;
+ }
+
+ if (refcount_inc_not_zero(&old->refcount)) {
+ free_perf_ctx_data(cd); /* unused */
+ return 0;
+ }
+
+ /*
+ * @old is a dead object, refcount==0 is stable, try and
+ * replace it with @cd.
+ */
+ }
+ return 0;
+}
+
+static void __detach_global_ctx_data(void);
+DEFINE_STATIC_PERCPU_RWSEM(global_ctx_data_rwsem);
+static refcount_t global_ctx_data_ref;
+
+static int
+attach_global_ctx_data(struct kmem_cache *ctx_cache)
+{
+ struct task_struct *g, *p;
+ struct perf_ctx_data *cd;
+ int ret;
+
+ if (refcount_inc_not_zero(&global_ctx_data_ref))
+ return 0;
+
+ guard(percpu_write)(&global_ctx_data_rwsem);
+ if (refcount_inc_not_zero(&global_ctx_data_ref))
+ return 0;
+again:
+ /* Allocate everything */
+ scoped_guard (rcu) {
+ for_each_process_thread(g, p) {
+ cd = rcu_dereference(p->perf_ctx_data);
+ if (cd && !cd->global) {
+ cd->global = 1;
+ if (!refcount_inc_not_zero(&cd->refcount))
+ cd = NULL;
+ }
+ if (!cd) {
+ get_task_struct(p);
+ goto alloc;
+ }
+ }
+ }
+
+ refcount_set(&global_ctx_data_ref, 1);
+
+ return 0;
+alloc:
+ ret = attach_task_ctx_data(p, ctx_cache, true);
+ put_task_struct(p);
+ if (ret) {
+ __detach_global_ctx_data();
+ return ret;
+ }
+ goto again;
+}
+
+static int
+attach_perf_ctx_data(struct perf_event *event)
+{
+ struct task_struct *task = event->hw.target;
+ struct kmem_cache *ctx_cache = event->pmu->task_ctx_cache;
+ int ret;
+
+ if (!ctx_cache)
+ return -ENOMEM;
+
+ if (task)
+ return attach_task_ctx_data(task, ctx_cache, false);
+
+ ret = attach_global_ctx_data(ctx_cache);
+ if (ret)
+ return ret;
+
+ event->attach_state |= PERF_ATTACH_GLOBAL_DATA;
+ return 0;
+}
+
+static void
+detach_task_ctx_data(struct task_struct *p)
+{
+ struct perf_ctx_data *cd;
+
+ scoped_guard (rcu) {
+ cd = rcu_dereference(p->perf_ctx_data);
+ if (!cd || !refcount_dec_and_test(&cd->refcount))
+ return;
+ }
+
+ /*
+ * The old ctx_data may be lost because of the race.
+ * Nothing is required to do for the case.
+ * See attach_task_ctx_data().
+ */
+ if (try_cmpxchg((struct perf_ctx_data **)&p->perf_ctx_data, &cd, NULL))
+ perf_free_ctx_data_rcu(cd);
+}
+
+static void __detach_global_ctx_data(void)
+{
+ struct task_struct *g, *p;
+ struct perf_ctx_data *cd;
+
+again:
+ scoped_guard (rcu) {
+ for_each_process_thread(g, p) {
+ cd = rcu_dereference(p->perf_ctx_data);
+ if (!cd || !cd->global)
+ continue;
+ cd->global = 0;
+ get_task_struct(p);
+ goto detach;
+ }
+ }
+ return;
+detach:
+ detach_task_ctx_data(p);
+ put_task_struct(p);
+ goto again;
+}
+
+static void detach_global_ctx_data(void)
+{
+ if (refcount_dec_not_one(&global_ctx_data_ref))
+ return;
+
+ guard(percpu_write)(&global_ctx_data_rwsem);
+ if (!refcount_dec_and_test(&global_ctx_data_ref))
+ return;
+
+ /* remove everything */
+ __detach_global_ctx_data();
+}
+
+static void detach_perf_ctx_data(struct perf_event *event)
+{
+ struct task_struct *task = event->hw.target;
+
+ event->attach_state &= ~PERF_ATTACH_TASK_DATA;
+
+ if (task)
+ return detach_task_ctx_data(task);
+
+ if (event->attach_state & PERF_ATTACH_GLOBAL_DATA) {
+ detach_global_ctx_data();
+ event->attach_state &= ~PERF_ATTACH_GLOBAL_DATA;
+ }
+}
+
static void unaccount_event(struct perf_event *event)
{
bool dec = false;
@@ -4848,8 +5491,6 @@ static void unaccount_event(struct perf_event *event)
schedule_delayed_work(&perf_sched_work, HZ);
}
- unaccount_event_cpu(event, event->cpu);
-
unaccount_pmu_sb_event(event);
}
@@ -4868,7 +5509,7 @@ static void perf_sched_delayed(struct work_struct *work)
*
* 1) cpu-wide events in the presence of per-task events,
* 2) per-task events in the presence of cpu-wide events,
- * 3) two matching events on the same context.
+ * 3) two matching events on the same perf_event_context.
*
* The former two cases are handled in the allocation path (perf_event_alloc(),
* _free_event()), the latter -- before the first perf_install_in_context().
@@ -4901,6 +5542,8 @@ static int exclusive_event_init(struct perf_event *event)
return -EBUSY;
}
+ event->attach_state |= PERF_ATTACH_EXCLUSIVE;
+
return 0;
}
@@ -4908,14 +5551,13 @@ static void exclusive_event_destroy(struct perf_event *event)
{
struct pmu *pmu = event->pmu;
- if (!is_exclusive_pmu(pmu))
- return;
-
/* see comment in exclusive_event_init() */
if (event->attach_state & PERF_ATTACH_TASK)
atomic_dec(&pmu->exclusive_cnt);
else
atomic_inc(&pmu->exclusive_cnt);
+
+ event->attach_state &= ~PERF_ATTACH_EXCLUSIVE;
}
static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
@@ -4947,40 +5589,26 @@ static bool exclusive_event_installable(struct perf_event *event,
return true;
}
-static void perf_addr_filters_splice(struct perf_event *event,
- struct list_head *head);
+static void perf_free_addr_filters(struct perf_event *event);
-static void _free_event(struct perf_event *event)
+/* vs perf_event_alloc() error */
+static void __free_event(struct perf_event *event)
{
- irq_work_sync(&event->pending);
+ struct pmu *pmu = event->pmu;
- unaccount_event(event);
+ if (event->attach_state & PERF_ATTACH_CALLCHAIN)
+ put_callchain_buffers();
- security_perf_event_free(event);
+ kfree(event->addr_filter_ranges);
- if (event->rb) {
- /*
- * Can happen when we close an event with re-directed output.
- *
- * Since we have a 0 refcount, perf_mmap_close() will skip
- * over us; possibly making our ring_buffer_put() the last.
- */
- mutex_lock(&event->mmap_mutex);
- ring_buffer_attach(event, NULL);
- mutex_unlock(&event->mmap_mutex);
- }
+ if (event->attach_state & PERF_ATTACH_EXCLUSIVE)
+ exclusive_event_destroy(event);
if (is_cgroup_event(event))
perf_detach_cgroup(event);
- if (!event->parent) {
- if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
- put_callchain_buffers();
- }
-
- perf_event_free_bpf_prog(event);
- perf_addr_filters_splice(event, NULL);
- kfree(event->addr_filter_ranges);
+ if (event->attach_state & PERF_ATTACH_TASK_DATA)
+ detach_perf_ctx_data(event);
if (event->destroy)
event->destroy(event);
@@ -4992,28 +5620,74 @@ static void _free_event(struct perf_event *event)
if (event->hw.target)
put_task_struct(event->hw.target);
+ if (event->pmu_ctx) {
+ /*
+ * put_pmu_ctx() needs an event->ctx reference, because of
+ * epc->ctx.
+ */
+ WARN_ON_ONCE(!pmu);
+ WARN_ON_ONCE(!event->ctx);
+ WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx);
+ put_pmu_ctx(event->pmu_ctx);
+ }
+
/*
- * perf_event_free_task() relies on put_ctx() being 'last', in particular
- * all task references must be cleaned up.
+ * perf_event_free_task() relies on put_ctx() being 'last', in
+ * particular all task references must be cleaned up.
*/
if (event->ctx)
put_ctx(event->ctx);
- exclusive_event_destroy(event);
- module_put(event->pmu->module);
+ if (pmu) {
+ module_put(pmu->module);
+ scoped_guard (spinlock, &pmu->events_lock) {
+ list_del(&event->pmu_list);
+ wake_up_var(pmu);
+ }
+ }
call_rcu(&event->rcu_head, free_event_rcu);
}
+DEFINE_FREE(__free_event, struct perf_event *, if (_T) __free_event(_T))
+
+/* vs perf_event_alloc() success */
+static void _free_event(struct perf_event *event)
+{
+ irq_work_sync(&event->pending_irq);
+ irq_work_sync(&event->pending_disable_irq);
+
+ unaccount_event(event);
+
+ security_perf_event_free(event);
+
+ if (event->rb) {
+ /*
+ * Can happen when we close an event with re-directed output.
+ *
+ * Since we have a 0 refcount, perf_mmap_close() will skip
+ * over us; possibly making our ring_buffer_put() the last.
+ */
+ mutex_lock(&event->mmap_mutex);
+ ring_buffer_attach(event, NULL);
+ mutex_unlock(&event->mmap_mutex);
+ }
+
+ perf_event_free_bpf_prog(event);
+ perf_free_addr_filters(event);
+
+ __free_event(event);
+}
+
/*
* Used to free events which have a known refcount of 1, such as in error paths
- * where the event isn't exposed yet and inherited events.
+ * of inherited events.
*/
static void free_event(struct perf_event *event)
{
if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
- "unexpected event refcount: %ld; ptr=%p\n",
- atomic_long_read(&event->refcount), event)) {
+ "unexpected event refcount: %ld; ptr=%p\n",
+ atomic_long_read(&event->refcount), event)) {
/* leak to avoid use-after-free */
return;
}
@@ -5074,10 +5748,17 @@ static void perf_remove_from_owner(struct perf_event *event)
static void put_event(struct perf_event *event)
{
+ struct perf_event *parent;
+
if (!atomic_long_dec_and_test(&event->refcount))
return;
+ parent = event->parent;
_free_event(event);
+
+ /* Matches the refcount bump in inherit_event() */
+ if (parent)
+ put_event(parent);
}
/*
@@ -5089,11 +5770,10 @@ int perf_event_release_kernel(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
struct perf_event *child, *tmp;
- LIST_HEAD(free_list);
/*
- * If we got here through err_file: fput(event_file); we will not have
- * attached to a context yet.
+ * If we got here through err_alloc: free_event(event); we will not
+ * have attached to a context yet.
*/
if (!ctx) {
WARN_ON_ONCE(event->attach_state &
@@ -5106,9 +5786,7 @@ int perf_event_release_kernel(struct perf_event *event)
ctx = perf_event_ctx_lock(event);
WARN_ON_ONCE(ctx->parent_ctx);
- perf_remove_from_context(event, DETACH_GROUP);
- raw_spin_lock_irq(&ctx->lock);
/*
* Mark this event as STATE_DEAD, there is no external reference to it
* anymore.
@@ -5120,15 +5798,17 @@ int perf_event_release_kernel(struct perf_event *event)
* Thus this guarantees that we will in fact observe and kill _ALL_
* child events.
*/
- event->state = PERF_EVENT_STATE_DEAD;
- raw_spin_unlock_irq(&ctx->lock);
+ if (event->state > PERF_EVENT_STATE_REVOKED) {
+ perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
+ } else {
+ event->state = PERF_EVENT_STATE_DEAD;
+ }
perf_event_ctx_unlock(event, ctx);
again:
mutex_lock(&event->child_mutex);
list_for_each_entry(child, &event->child_list, child_list) {
-
/*
* Cannot change, child events are not migrated, see the
* comment with perf_event_ctx_lock_nested().
@@ -5161,38 +5841,30 @@ again:
tmp = list_first_entry_or_null(&event->child_list,
struct perf_event, child_list);
if (tmp == child) {
- perf_remove_from_context(child, DETACH_GROUP);
- list_move(&child->child_list, &free_list);
- /*
- * This matches the refcount bump in inherit_event();
- * this can't be the last reference.
- */
- put_event(event);
+ perf_remove_from_context(child, DETACH_GROUP | DETACH_CHILD);
+ } else {
+ child = NULL;
}
mutex_unlock(&event->child_mutex);
mutex_unlock(&ctx->mutex);
+
+ if (child) {
+ /* Last reference unless ->pending_task work is pending */
+ put_event(child);
+ }
put_ctx(ctx);
+
goto again;
}
mutex_unlock(&event->child_mutex);
- list_for_each_entry_safe(child, tmp, &free_list, child_list) {
- void *var = &child->ctx->refcount;
-
- list_del(&child->child_list);
- free_event(child);
-
- /*
- * Wake any perf_event_free_task() waiting for this event to be
- * freed.
- */
- smp_mb(); /* pairs with wait_var_event() */
- wake_up_var(var);
- }
-
no_ctx:
- put_event(event); /* Must be the 'last' reference */
+ /*
+ * Last reference unless ->pending_task work is pending on this event
+ * or any of its children.
+ */
+ put_event(event);
return 0;
}
EXPORT_SYMBOL_GPL(perf_event_release_kernel);
@@ -5217,7 +5889,7 @@ static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *
mutex_lock(&event->child_mutex);
(void)perf_event_read(event, false);
- total += perf_event_count(event);
+ total += perf_event_count(event, false);
*enabled += event->total_time_enabled +
atomic64_read(&event->child_total_time_enabled);
@@ -5226,7 +5898,7 @@ static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *
list_for_each_entry(child, &event->child_list, child_list) {
(void)perf_event_read(child, false);
- total += perf_event_count(child);
+ total += perf_event_count(child, false);
*enabled += child->total_time_enabled;
*running += child->total_time_running;
}
@@ -5252,7 +5924,7 @@ static int __perf_read_group_add(struct perf_event *leader,
u64 read_format, u64 *values)
{
struct perf_event_context *ctx = leader->ctx;
- struct perf_event *sub;
+ struct perf_event *sub, *parent;
unsigned long flags;
int n = 1; /* skip @nr */
int ret;
@@ -5262,6 +5934,33 @@ static int __perf_read_group_add(struct perf_event *leader,
return ret;
raw_spin_lock_irqsave(&ctx->lock, flags);
+ /*
+ * Verify the grouping between the parent and child (inherited)
+ * events is still in tact.
+ *
+ * Specifically:
+ * - leader->ctx->lock pins leader->sibling_list
+ * - parent->child_mutex pins parent->child_list
+ * - parent->ctx->mutex pins parent->sibling_list
+ *
+ * Because parent->ctx != leader->ctx (and child_list nests inside
+ * ctx->mutex), group destruction is not atomic between children, also
+ * see perf_event_release_kernel(). Additionally, parent can grow the
+ * group.
+ *
+ * Therefore it is possible to have parent and child groups in a
+ * different configuration and summing over such a beast makes no sense
+ * what so ever.
+ *
+ * Reject this.
+ */
+ parent = leader->parent;
+ if (parent &&
+ (parent->group_generation != leader->group_generation ||
+ parent->nr_siblings != leader->nr_siblings)) {
+ ret = -ECHILD;
+ goto unlock;
+ }
/*
* Since we co-schedule groups, {enabled,running} times of siblings
@@ -5281,18 +5980,23 @@ static int __perf_read_group_add(struct perf_event *leader,
/*
* Write {count,id} tuples for every sibling.
*/
- values[n++] += perf_event_count(leader);
+ values[n++] += perf_event_count(leader, false);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&leader->lost_samples);
for_each_sibling_event(sub, leader) {
- values[n++] += perf_event_count(sub);
+ values[n++] += perf_event_count(sub, false);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&sub->lost_samples);
}
+unlock:
raw_spin_unlock_irqrestore(&ctx->lock, flags);
- return 0;
+ return ret;
}
static int perf_read_group(struct perf_event *event,
@@ -5311,10 +6015,6 @@ static int perf_read_group(struct perf_event *event,
values[0] = 1 + leader->nr_siblings;
- /*
- * By locking the child_mutex of the leader we effectively
- * lock the child list of all siblings.. XXX explain how.
- */
mutex_lock(&leader->child_mutex);
ret = __perf_read_group_add(leader, read_format, values);
@@ -5345,7 +6045,7 @@ static int perf_read_one(struct perf_event *event,
u64 read_format, char __user *buf)
{
u64 enabled, running;
- u64 values[4];
+ u64 values[5];
int n = 0;
values[n++] = __perf_event_read_value(event, &enabled, &running);
@@ -5355,6 +6055,8 @@ static int perf_read_one(struct perf_event *event,
values[n++] = running;
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(event);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&event->lost_samples);
if (copy_to_user(buf, values, n * sizeof(u64)))
return -EFAULT;
@@ -5428,11 +6130,21 @@ static __poll_t perf_poll(struct file *file, poll_table *wait)
struct perf_buffer *rb;
__poll_t events = EPOLLHUP;
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return EPOLLERR;
+
poll_wait(file, &event->waitq, wait);
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return EPOLLERR;
+
if (is_event_hup(event))
return events;
+ if (unlikely(READ_ONCE(event->state) == PERF_EVENT_STATE_ERROR &&
+ event->attr.pinned))
+ return EPOLLERR;
+
/*
* Pin the event->rb by taking event->mmap_mutex; otherwise
* perf_event_set_output() can swizzle our rb and make us miss wakeups.
@@ -5522,15 +6234,7 @@ static void __perf_event_period(struct perf_event *event,
active = (event->state == PERF_EVENT_STATE_ACTIVE);
if (active) {
- perf_pmu_disable(ctx->pmu);
- /*
- * We could be throttled; unthrottle now to avoid the tick
- * trying to unthrottle while we already re-started the event.
- */
- if (event->hw.interrupts == MAX_INTERRUPTS) {
- event->hw.interrupts = 0;
- perf_log_throttle(event, 1);
- }
+ perf_pmu_disable(event->pmu);
event->pmu->stop(event, PERF_EF_UPDATE);
}
@@ -5538,7 +6242,15 @@ static void __perf_event_period(struct perf_event *event,
if (active) {
event->pmu->start(event, PERF_EF_RELOAD);
- perf_pmu_enable(ctx->pmu);
+ /*
+ * Once the period is force-reset, the event starts immediately.
+ * But the event/group could be throttled. Unthrottle the
+ * event/group now to avoid the next tick trying to unthrottle
+ * while we already re-started the event/group.
+ */
+ if (event->hw.interrupts == MAX_INTERRUPTS)
+ perf_event_unthrottle_group(event, true);
+ perf_pmu_enable(event->pmu);
}
}
@@ -5555,14 +6267,15 @@ static int _perf_event_period(struct perf_event *event, u64 value)
if (!value)
return -EINVAL;
- if (event->attr.freq && value > sysctl_perf_event_sample_rate)
- return -EINVAL;
-
- if (perf_event_check_period(event, value))
- return -EINVAL;
-
- if (!event->attr.freq && (value & (1ULL << 63)))
- return -EINVAL;
+ if (event->attr.freq) {
+ if (value > sysctl_perf_event_sample_rate)
+ return -EINVAL;
+ } else {
+ if (perf_event_check_period(event, value))
+ return -EINVAL;
+ if (value & (1ULL << 63))
+ return -EINVAL;
+ }
event_function_call(event, __perf_event_period, &value);
@@ -5584,18 +6297,9 @@ EXPORT_SYMBOL_GPL(perf_event_period);
static const struct file_operations perf_fops;
-static inline int perf_fget_light(int fd, struct fd *p)
+static inline bool is_perf_file(struct fd f)
{
- struct fd f = fdget(fd);
- if (!f.file)
- return -EBADF;
-
- if (f.file->f_op != &perf_fops) {
- fdput(f);
- return -EBADF;
- }
- *p = f;
- return 0;
+ return !fd_empty(f) && fd_file(f)->f_op == &perf_fops;
}
static int perf_event_set_output(struct perf_event *event,
@@ -5603,12 +6307,18 @@ static int perf_event_set_output(struct perf_event *event,
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
static int perf_copy_attr(struct perf_event_attr __user *uattr,
struct perf_event_attr *attr);
+static int __perf_event_set_bpf_prog(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie);
static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
{
void (*func)(struct perf_event *);
u32 flags = arg;
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return -ENODEV;
+
switch (cmd) {
case PERF_EVENT_IOC_ENABLE:
func = _perf_event_enable;
@@ -5643,20 +6353,14 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
case PERF_EVENT_IOC_SET_OUTPUT:
{
- int ret;
+ CLASS(fd, output)(arg); // arg == -1 => empty
+ struct perf_event *output_event = NULL;
if (arg != -1) {
- struct perf_event *output_event;
- struct fd output;
- ret = perf_fget_light(arg, &output);
- if (ret)
- return ret;
- output_event = output.file->private_data;
- ret = perf_event_set_output(event, output_event);
- fdput(output);
- } else {
- ret = perf_event_set_output(event, NULL);
+ if (!is_perf_file(output))
+ return -EBADF;
+ output_event = fd_file(output)->private_data;
}
- return ret;
+ return perf_event_set_output(event, output_event);
}
case PERF_EVENT_IOC_SET_FILTER:
@@ -5671,7 +6375,7 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
if (IS_ERR(prog))
return PTR_ERR(prog);
- err = perf_event_set_bpf_prog(event, prog, 0);
+ err = __perf_event_set_bpf_prog(event, prog, 0);
if (err) {
bpf_prog_put(prog);
return err;
@@ -5802,18 +6506,6 @@ static int perf_event_index(struct perf_event *event)
return event->pmu->event_idx(event);
}
-static void calc_timer_values(struct perf_event *event,
- u64 *now,
- u64 *enabled,
- u64 *running)
-{
- u64 ctx_time;
-
- *now = perf_clock();
- ctx_time = event->shadow_ctx_time + *now;
- __perf_update_times(event, ctx_time, enabled, running);
-}
-
static void perf_event_init_userpage(struct perf_event *event)
{
struct perf_event_mmap_page *userpg;
@@ -5877,7 +6569,7 @@ void perf_event_update_userpage(struct perf_event *event)
++userpg->lock;
barrier();
userpg->index = perf_event_index(event);
- userpg->offset = perf_event_count(event);
+ userpg->offset = perf_event_count(event, false);
if (userpg->index)
userpg->offset -= local64_read(&event->hw.prev_count);
@@ -5897,47 +6589,14 @@ unlock:
}
EXPORT_SYMBOL_GPL(perf_event_update_userpage);
-static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
-{
- struct perf_event *event = vmf->vma->vm_file->private_data;
- struct perf_buffer *rb;
- vm_fault_t ret = VM_FAULT_SIGBUS;
-
- if (vmf->flags & FAULT_FLAG_MKWRITE) {
- if (vmf->pgoff == 0)
- ret = 0;
- return ret;
- }
-
- rcu_read_lock();
- rb = rcu_dereference(event->rb);
- if (!rb)
- goto unlock;
-
- if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
- goto unlock;
-
- vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
- if (!vmf->page)
- goto unlock;
-
- get_page(vmf->page);
- vmf->page->mapping = vmf->vma->vm_file->f_mapping;
- vmf->page->index = vmf->pgoff;
-
- ret = 0;
-unlock:
- rcu_read_unlock();
-
- return ret;
-}
-
static void ring_buffer_attach(struct perf_event *event,
struct perf_buffer *rb)
{
struct perf_buffer *old_rb = NULL;
unsigned long flags;
+ WARN_ON_ONCE(event->parent);
+
if (event->rb) {
/*
* Should be impossible, we set this when removing
@@ -5995,6 +6654,9 @@ static void ring_buffer_wakeup(struct perf_event *event)
{
struct perf_buffer *rb;
+ if (event->parent)
+ event = event->parent;
+
rcu_read_lock();
rb = rcu_dereference(event->rb);
if (rb) {
@@ -6008,6 +6670,9 @@ struct perf_buffer *ring_buffer_get(struct perf_event *event)
{
struct perf_buffer *rb;
+ if (event->parent)
+ event = event->parent;
+
rcu_read_lock();
rb = rcu_dereference(event->rb);
if (rb) {
@@ -6029,18 +6694,31 @@ void ring_buffer_put(struct perf_buffer *rb)
call_rcu(&rb->rcu_head, rb_free_rcu);
}
+typedef void (*mapped_f)(struct perf_event *event, struct mm_struct *mm);
+
+#define get_mapped(event, func) \
+({ struct pmu *pmu; \
+ mapped_f f = NULL; \
+ guard(rcu)(); \
+ pmu = READ_ONCE(event->pmu); \
+ if (pmu) \
+ f = pmu->func; \
+ f; \
+})
+
static void perf_mmap_open(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
+ mapped_f mapped = get_mapped(event, event_mapped);
- atomic_inc(&event->mmap_count);
- atomic_inc(&event->rb->mmap_count);
+ refcount_inc(&event->mmap_count);
+ refcount_inc(&event->rb->mmap_count);
if (vma->vm_pgoff)
- atomic_inc(&event->rb->aux_mmap_count);
+ refcount_inc(&event->rb->aux_mmap_count);
- if (event->pmu->event_mapped)
- event->pmu->event_mapped(event, vma->vm_mm);
+ if (mapped)
+ mapped(event, vma->vm_mm);
}
static void perf_pmu_output_stop(struct perf_event *event);
@@ -6056,22 +6734,23 @@ static void perf_pmu_output_stop(struct perf_event *event);
static void perf_mmap_close(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
+ mapped_f unmapped = get_mapped(event, event_unmapped);
struct perf_buffer *rb = ring_buffer_get(event);
struct user_struct *mmap_user = rb->mmap_user;
int mmap_locked = rb->mmap_locked;
unsigned long size = perf_data_size(rb);
bool detach_rest = false;
- if (event->pmu->event_unmapped)
- event->pmu->event_unmapped(event, vma->vm_mm);
+ /* FIXIES vs perf_pmu_unregister() */
+ if (unmapped)
+ unmapped(event, vma->vm_mm);
/*
- * rb->aux_mmap_count will always drop before rb->mmap_count and
- * event->mmap_count, so it is ok to use event->mmap_mutex to
- * serialize with perf_mmap here.
+ * The AUX buffer is strictly a sub-buffer, serialize using aux_mutex
+ * to avoid complications.
*/
if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
- atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
+ refcount_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) {
/*
* Stop all AUX events that are writing to this buffer,
* so that we can free its AUX pages and corresponding PMU
@@ -6088,13 +6767,13 @@ static void perf_mmap_close(struct vm_area_struct *vma)
rb_free_aux(rb);
WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
- mutex_unlock(&event->mmap_mutex);
+ mutex_unlock(&rb->aux_mutex);
}
- if (atomic_dec_and_test(&rb->mmap_count))
+ if (refcount_dec_and_test(&rb->mmap_count))
detach_rest = true;
- if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
+ if (!refcount_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
goto out_put;
ring_buffer_attach(event, NULL);
@@ -6164,101 +6843,148 @@ out_put:
ring_buffer_put(rb); /* could be last */
}
+static vm_fault_t perf_mmap_pfn_mkwrite(struct vm_fault *vmf)
+{
+ /* The first page is the user control page, others are read-only. */
+ return vmf->pgoff == 0 ? 0 : VM_FAULT_SIGBUS;
+}
+
+static int perf_mmap_may_split(struct vm_area_struct *vma, unsigned long addr)
+{
+ /*
+ * Forbid splitting perf mappings to prevent refcount leaks due to
+ * the resulting non-matching offsets and sizes. See open()/close().
+ */
+ return -EINVAL;
+}
+
static const struct vm_operations_struct perf_mmap_vmops = {
.open = perf_mmap_open,
.close = perf_mmap_close, /* non mergeable */
- .fault = perf_mmap_fault,
- .page_mkwrite = perf_mmap_fault,
+ .pfn_mkwrite = perf_mmap_pfn_mkwrite,
+ .may_split = perf_mmap_may_split,
};
-static int perf_mmap(struct file *file, struct vm_area_struct *vma)
+static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma)
{
- struct perf_event *event = file->private_data;
- unsigned long user_locked, user_lock_limit;
- struct user_struct *user = current_user();
- struct perf_buffer *rb = NULL;
- unsigned long locked, lock_limit;
- unsigned long vma_size;
- unsigned long nr_pages;
- long user_extra = 0, extra = 0;
- int ret = 0, flags = 0;
+ unsigned long nr_pages = vma_pages(vma);
+ int err = 0;
+ unsigned long pagenum;
/*
- * Don't allow mmap() of inherited per-task counters. This would
- * create a performance issue due to all children writing to the
- * same rb.
+ * We map this as a VM_PFNMAP VMA.
+ *
+ * This is not ideal as this is designed broadly for mappings of PFNs
+ * referencing memory-mapped I/O ranges or non-system RAM i.e. for which
+ * !pfn_valid(pfn).
+ *
+ * We are mapping kernel-allocated memory (memory we manage ourselves)
+ * which would more ideally be mapped using vm_insert_page() or a
+ * similar mechanism, that is as a VM_MIXEDMAP mapping.
+ *
+ * However this won't work here, because:
+ *
+ * 1. It uses vma->vm_page_prot, but this field has not been completely
+ * setup at the point of the f_op->mmp() hook, so we are unable to
+ * indicate that this should be mapped CoW in order that the
+ * mkwrite() hook can be invoked to make the first page R/W and the
+ * rest R/O as desired.
+ *
+ * 2. Anything other than a VM_PFNMAP of valid PFNs will result in
+ * vm_normal_page() returning a struct page * pointer, which means
+ * vm_ops->page_mkwrite() will be invoked rather than
+ * vm_ops->pfn_mkwrite(), and this means we have to set page->mapping
+ * to work around retry logic in the fault handler, however this
+ * field is no longer allowed to be used within struct page.
+ *
+ * 3. Having a struct page * made available in the fault logic also
+ * means that the page gets put on the rmap and becomes
+ * inappropriately accessible and subject to map and ref counting.
+ *
+ * Ideally we would have a mechanism that could explicitly express our
+ * desires, but this is not currently the case, so we instead use
+ * VM_PFNMAP.
+ *
+ * We manage the lifetime of these mappings with internal refcounts (see
+ * perf_mmap_open() and perf_mmap_close()) so we ensure the lifetime of
+ * this mapping is maintained correctly.
*/
- if (event->cpu == -1 && event->attr.inherit)
- return -EINVAL;
-
- if (!(vma->vm_flags & VM_SHARED))
- return -EINVAL;
-
- ret = security_perf_event_read(event);
- if (ret)
- return ret;
-
- vma_size = vma->vm_end - vma->vm_start;
-
- if (vma->vm_pgoff == 0) {
- nr_pages = (vma_size / PAGE_SIZE) - 1;
- } else {
- /*
- * AUX area mapping: if rb->aux_nr_pages != 0, it's already
- * mapped, all subsequent mappings should have the same size
- * and offset. Must be above the normal perf buffer.
- */
- u64 aux_offset, aux_size;
+ for (pagenum = 0; pagenum < nr_pages; pagenum++) {
+ unsigned long va = vma->vm_start + PAGE_SIZE * pagenum;
+ struct page *page = perf_mmap_to_page(rb, vma->vm_pgoff + pagenum);
- if (!event->rb)
- return -EINVAL;
+ if (page == NULL) {
+ err = -EINVAL;
+ break;
+ }
- nr_pages = vma_size / PAGE_SIZE;
+ /* Map readonly, perf_mmap_pfn_mkwrite() called on write fault. */
+ err = remap_pfn_range(vma, va, page_to_pfn(page), PAGE_SIZE,
+ vm_get_page_prot(vma->vm_flags & ~VM_SHARED));
+ if (err)
+ break;
+ }
- mutex_lock(&event->mmap_mutex);
- ret = -EINVAL;
+#ifdef CONFIG_MMU
+ /* Clear any partial mappings on error. */
+ if (err)
+ zap_page_range_single(vma, vma->vm_start, nr_pages * PAGE_SIZE, NULL);
+#endif
- rb = event->rb;
- if (!rb)
- goto aux_unlock;
+ return err;
+}
- aux_offset = READ_ONCE(rb->user_page->aux_offset);
- aux_size = READ_ONCE(rb->user_page->aux_size);
+static bool perf_mmap_calc_limits(struct vm_area_struct *vma, long *user_extra, long *extra)
+{
+ unsigned long user_locked, user_lock_limit, locked, lock_limit;
+ struct user_struct *user = current_user();
- if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
- goto aux_unlock;
+ user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
+ /* Increase the limit linearly with more CPUs */
+ user_lock_limit *= num_online_cpus();
- if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
- goto aux_unlock;
+ user_locked = atomic_long_read(&user->locked_vm);
- /* already mapped with a different offset */
- if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
- goto aux_unlock;
+ /*
+ * sysctl_perf_event_mlock may have changed, so that
+ * user->locked_vm > user_lock_limit
+ */
+ if (user_locked > user_lock_limit)
+ user_locked = user_lock_limit;
+ user_locked += *user_extra;
- if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
- goto aux_unlock;
+ if (user_locked > user_lock_limit) {
+ /*
+ * charge locked_vm until it hits user_lock_limit;
+ * charge the rest from pinned_vm
+ */
+ *extra = user_locked - user_lock_limit;
+ *user_extra -= *extra;
+ }
- /* already mapped with a different size */
- if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
- goto aux_unlock;
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
+ lock_limit >>= PAGE_SHIFT;
+ locked = atomic64_read(&vma->vm_mm->pinned_vm) + *extra;
- if (!is_power_of_2(nr_pages))
- goto aux_unlock;
+ return locked <= lock_limit || !perf_is_paranoid() || capable(CAP_IPC_LOCK);
+}
- if (!atomic_inc_not_zero(&rb->mmap_count))
- goto aux_unlock;
+static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long extra)
+{
+ struct user_struct *user = current_user();
- if (rb_has_aux(rb)) {
- atomic_inc(&rb->aux_mmap_count);
- ret = 0;
- goto unlock;
- }
+ atomic_long_add(user_extra, &user->locked_vm);
+ atomic64_add(extra, &vma->vm_mm->pinned_vm);
+}
- atomic_set(&rb->aux_mmap_count, 1);
- user_extra = nr_pages;
+static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event,
+ unsigned long nr_pages)
+{
+ long extra = 0, user_extra = nr_pages;
+ struct perf_buffer *rb;
+ int rb_flags = 0;
- goto accounting;
- }
+ nr_pages -= 1;
/*
* If we have rb pages ensure they're a power-of-two number, so we
@@ -6267,123 +6993,203 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
if (nr_pages != 0 && !is_power_of_2(nr_pages))
return -EINVAL;
- if (vma_size != PAGE_SIZE * (1 + nr_pages))
- return -EINVAL;
-
WARN_ON_ONCE(event->ctx->parent_ctx);
-again:
- mutex_lock(&event->mmap_mutex);
+
if (event->rb) {
- if (event->rb->nr_pages != nr_pages) {
- ret = -EINVAL;
- goto unlock;
- }
+ if (data_page_nr(event->rb) != nr_pages)
+ return -EINVAL;
- if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
+ if (refcount_inc_not_zero(&event->rb->mmap_count)) {
/*
- * Raced against perf_mmap_close() through
- * perf_event_set_output(). Try again, hope for better
- * luck.
+ * Success -- managed to mmap() the same buffer
+ * multiple times.
*/
- mutex_unlock(&event->mmap_mutex);
- goto again;
+ perf_mmap_account(vma, user_extra, extra);
+ refcount_inc(&event->mmap_count);
+ return 0;
}
- goto unlock;
+ /*
+ * Raced against perf_mmap_close()'s
+ * refcount_dec_and_mutex_lock() remove the
+ * event and continue as if !event->rb
+ */
+ ring_buffer_attach(event, NULL);
}
- user_extra = nr_pages + 1;
+ if (!perf_mmap_calc_limits(vma, &user_extra, &extra))
+ return -EPERM;
-accounting:
- user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
+ if (vma->vm_flags & VM_WRITE)
+ rb_flags |= RING_BUFFER_WRITABLE;
- /*
- * Increase the limit linearly with more CPUs:
- */
- user_lock_limit *= num_online_cpus();
+ rb = rb_alloc(nr_pages,
+ event->attr.watermark ? event->attr.wakeup_watermark : 0,
+ event->cpu, rb_flags);
- user_locked = atomic_long_read(&user->locked_vm);
+ if (!rb)
+ return -ENOMEM;
+
+ refcount_set(&rb->mmap_count, 1);
+ rb->mmap_user = get_current_user();
+ rb->mmap_locked = extra;
+
+ ring_buffer_attach(event, rb);
+
+ perf_event_update_time(event);
+ perf_event_init_userpage(event);
+ perf_event_update_userpage(event);
+
+ perf_mmap_account(vma, user_extra, extra);
+ refcount_set(&event->mmap_count, 1);
+
+ return 0;
+}
+
+static int perf_mmap_aux(struct vm_area_struct *vma, struct perf_event *event,
+ unsigned long nr_pages)
+{
+ long extra = 0, user_extra = nr_pages;
+ u64 aux_offset, aux_size;
+ struct perf_buffer *rb;
+ int ret, rb_flags = 0;
+
+ rb = event->rb;
+ if (!rb)
+ return -EINVAL;
+
+ guard(mutex)(&rb->aux_mutex);
/*
- * sysctl_perf_event_mlock may have changed, so that
- * user->locked_vm > user_lock_limit
+ * AUX area mapping: if rb->aux_nr_pages != 0, it's already
+ * mapped, all subsequent mappings should have the same size
+ * and offset. Must be above the normal perf buffer.
*/
- if (user_locked > user_lock_limit)
- user_locked = user_lock_limit;
- user_locked += user_extra;
+ aux_offset = READ_ONCE(rb->user_page->aux_offset);
+ aux_size = READ_ONCE(rb->user_page->aux_size);
- if (user_locked > user_lock_limit) {
- /*
- * charge locked_vm until it hits user_lock_limit;
- * charge the rest from pinned_vm
- */
- extra = user_locked - user_lock_limit;
- user_extra -= extra;
- }
+ if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
+ return -EINVAL;
- lock_limit = rlimit(RLIMIT_MEMLOCK);
- lock_limit >>= PAGE_SHIFT;
- locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
+ if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
+ return -EINVAL;
- if ((locked > lock_limit) && perf_is_paranoid() &&
- !capable(CAP_IPC_LOCK)) {
- ret = -EPERM;
- goto unlock;
- }
+ /* already mapped with a different offset */
+ if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
+ return -EINVAL;
+
+ if (aux_size != nr_pages * PAGE_SIZE)
+ return -EINVAL;
- WARN_ON(!rb && event->rb);
+ /* already mapped with a different size */
+ if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
+ return -EINVAL;
- if (vma->vm_flags & VM_WRITE)
- flags |= RING_BUFFER_WRITABLE;
+ if (!is_power_of_2(nr_pages))
+ return -EINVAL;
- if (!rb) {
- rb = rb_alloc(nr_pages,
- event->attr.watermark ? event->attr.wakeup_watermark : 0,
- event->cpu, flags);
+ if (!refcount_inc_not_zero(&rb->mmap_count))
+ return -EINVAL;
- if (!rb) {
- ret = -ENOMEM;
- goto unlock;
+ if (rb_has_aux(rb)) {
+ refcount_inc(&rb->aux_mmap_count);
+
+ } else {
+ if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) {
+ refcount_dec(&rb->mmap_count);
+ return -EPERM;
}
- atomic_set(&rb->mmap_count, 1);
- rb->mmap_user = get_current_user();
- rb->mmap_locked = extra;
+ WARN_ON(!rb && event->rb);
- ring_buffer_attach(event, rb);
+ if (vma->vm_flags & VM_WRITE)
+ rb_flags |= RING_BUFFER_WRITABLE;
- perf_event_update_time(event);
- perf_set_shadow_time(event, event->ctx);
- perf_event_init_userpage(event);
- perf_event_update_userpage(event);
- } else {
ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
- event->attr.aux_watermark, flags);
- if (!ret)
- rb->aux_mmap_locked = extra;
+ event->attr.aux_watermark, rb_flags);
+ if (ret) {
+ refcount_dec(&rb->mmap_count);
+ return ret;
+ }
+
+ refcount_set(&rb->aux_mmap_count, 1);
+ rb->aux_mmap_locked = extra;
}
-unlock:
- if (!ret) {
- atomic_long_add(user_extra, &user->locked_vm);
- atomic64_add(extra, &vma->vm_mm->pinned_vm);
+ perf_mmap_account(vma, user_extra, extra);
+ refcount_inc(&event->mmap_count);
- atomic_inc(&event->mmap_count);
- } else if (rb) {
- atomic_dec(&rb->mmap_count);
+ return 0;
+}
+
+static int perf_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct perf_event *event = file->private_data;
+ unsigned long vma_size, nr_pages;
+ mapped_f mapped;
+ int ret;
+
+ /*
+ * Don't allow mmap() of inherited per-task counters. This would
+ * create a performance issue due to all children writing to the
+ * same rb.
+ */
+ if (event->cpu == -1 && event->attr.inherit)
+ return -EINVAL;
+
+ if (!(vma->vm_flags & VM_SHARED))
+ return -EINVAL;
+
+ ret = security_perf_event_read(event);
+ if (ret)
+ return ret;
+
+ vma_size = vma->vm_end - vma->vm_start;
+ nr_pages = vma_size / PAGE_SIZE;
+
+ if (nr_pages > INT_MAX)
+ return -ENOMEM;
+
+ if (vma_size != PAGE_SIZE * nr_pages)
+ return -EINVAL;
+
+ scoped_guard (mutex, &event->mmap_mutex) {
+ /*
+ * This relies on __pmu_detach_event() taking mmap_mutex after marking
+ * the event REVOKED. Either we observe the state, or __pmu_detach_event()
+ * will detach the rb created here.
+ */
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return -ENODEV;
+
+ if (vma->vm_pgoff == 0)
+ ret = perf_mmap_rb(vma, event, nr_pages);
+ else
+ ret = perf_mmap_aux(vma, event, nr_pages);
+ if (ret)
+ return ret;
}
-aux_unlock:
- mutex_unlock(&event->mmap_mutex);
/*
* Since pinned accounting is per vm we cannot allow fork() to copy our
* vma.
*/
- vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
+ vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
vma->vm_ops = &perf_mmap_vmops;
- if (event->pmu->event_mapped)
- event->pmu->event_mapped(event, vma->vm_mm);
+ mapped = get_mapped(event, event_mapped);
+ if (mapped)
+ mapped(event, vma->vm_mm);
+
+ /*
+ * Try to map it into the page table. On fail, invoke
+ * perf_mmap_close() to undo the above, as the callsite expects
+ * full cleanup in this case and therefore does not invoke
+ * vmops::close().
+ */
+ ret = map_range(event->rb, vma);
+ if (ret)
+ perf_mmap_close(vma);
return ret;
}
@@ -6394,6 +7200,9 @@ static int perf_fasync(int fd, struct file *filp, int on)
struct perf_event *event = filp->private_data;
int retval;
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return -ENODEV;
+
inode_lock(inode);
retval = fasync_helper(fd, filp, on, &event->fasync);
inode_unlock(inode);
@@ -6405,7 +7214,6 @@ static int perf_fasync(int fd, struct file *filp, int on)
}
static const struct file_operations perf_fops = {
- .llseek = no_llseek,
.release = perf_release,
.read = perf_read,
.poll = perf_poll,
@@ -6422,14 +7230,6 @@ static const struct file_operations perf_fops = {
* to user-space before waking everybody up.
*/
-static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
-{
- /* only the parent has fasync state */
- if (event->parent)
- event = event->parent;
- return &event->fasync;
-}
-
void perf_event_wakeup(struct perf_event *event)
{
ring_buffer_wakeup(event);
@@ -6443,40 +7243,46 @@ void perf_event_wakeup(struct perf_event *event)
static void perf_sigtrap(struct perf_event *event)
{
/*
- * We'd expect this to only occur if the irq_work is delayed and either
- * ctx->task or current has changed in the meantime. This can be the
- * case on architectures that do not implement arch_irq_work_raise().
+ * Both perf_pending_task() and perf_pending_irq() can race with the
+ * task exiting.
*/
- if (WARN_ON_ONCE(event->ctx->task != current))
+ if (current->flags & PF_EXITING)
return;
/*
- * perf_pending_event() can race with the task exiting.
+ * We'd expect this to only occur if the irq_work is delayed and either
+ * ctx->task or current has changed in the meantime. This can be the
+ * case on architectures that do not implement arch_irq_work_raise().
*/
- if (current->flags & PF_EXITING)
+ if (WARN_ON_ONCE(event->ctx->task != current))
return;
- force_sig_perf((void __user *)event->pending_addr,
- event->attr.type, event->attr.sig_data);
+ send_sig_perf((void __user *)event->pending_addr,
+ event->orig_type, event->attr.sig_data);
}
-static void perf_pending_event_disable(struct perf_event *event)
+/*
+ * Deliver the pending work in-event-context or follow the context.
+ */
+static void __perf_pending_disable(struct perf_event *event)
{
- int cpu = READ_ONCE(event->pending_disable);
+ int cpu = READ_ONCE(event->oncpu);
+ /*
+ * If the event isn't running; we done. event_sched_out() will have
+ * taken care of things.
+ */
if (cpu < 0)
return;
+ /*
+ * Yay, we hit home and are in the context of the event.
+ */
if (cpu == smp_processor_id()) {
- WRITE_ONCE(event->pending_disable, -1);
-
- if (event->attr.sigtrap) {
- perf_sigtrap(event);
- atomic_set_release(&event->event_limit, 1); /* rearm event */
- return;
+ if (event->pending_disable) {
+ event->pending_disable = 0;
+ perf_event_disable_local(event);
}
-
- perf_event_disable_local(event);
return;
}
@@ -6484,38 +7290,55 @@ static void perf_pending_event_disable(struct perf_event *event)
* CPU-A CPU-B
*
* perf_event_disable_inatomic()
- * @pending_disable = CPU-A;
+ * @pending_disable = 1;
* irq_work_queue();
*
* sched-out
- * @pending_disable = -1;
+ * @pending_disable = 0;
*
* sched-in
* perf_event_disable_inatomic()
- * @pending_disable = CPU-B;
+ * @pending_disable = 1;
* irq_work_queue(); // FAILS
*
* irq_work_run()
- * perf_pending_event()
+ * perf_pending_disable()
*
* But the event runs on CPU-B and wants disabling there.
*/
- irq_work_queue_on(&event->pending, cpu);
+ irq_work_queue_on(&event->pending_disable_irq, cpu);
}
-static void perf_pending_event(struct irq_work *entry)
+static void perf_pending_disable(struct irq_work *entry)
{
- struct perf_event *event = container_of(entry, struct perf_event, pending);
+ struct perf_event *event = container_of(entry, struct perf_event, pending_disable_irq);
int rctx;
- rctx = perf_swevent_get_recursion_context();
/*
* If we 'fail' here, that's OK, it means recursion is already disabled
* and we won't recurse 'further'.
*/
+ rctx = perf_swevent_get_recursion_context();
+ __perf_pending_disable(event);
+ if (rctx >= 0)
+ perf_swevent_put_recursion_context(rctx);
+}
+
+static void perf_pending_irq(struct irq_work *entry)
+{
+ struct perf_event *event = container_of(entry, struct perf_event, pending_irq);
+ int rctx;
- perf_pending_event_disable(event);
+ /*
+ * If we 'fail' here, that's OK, it means recursion is already disabled
+ * and we won't recurse 'further'.
+ */
+ rctx = perf_swevent_get_recursion_context();
+ /*
+ * The wakeup isn't bound to the context of the event -- it can happen
+ * irrespective of where the event is.
+ */
if (event->pending_wakeup) {
event->pending_wakeup = 0;
perf_event_wakeup(event);
@@ -6525,6 +7348,28 @@ static void perf_pending_event(struct irq_work *entry)
perf_swevent_put_recursion_context(rctx);
}
+static void perf_pending_task(struct callback_head *head)
+{
+ struct perf_event *event = container_of(head, struct perf_event, pending_task);
+ int rctx;
+
+ /*
+ * If we 'fail' here, that's OK, it means recursion is already disabled
+ * and we won't recurse 'further'.
+ */
+ rctx = perf_swevent_get_recursion_context();
+
+ if (event->pending_work) {
+ event->pending_work = 0;
+ perf_sigtrap(event);
+ local_dec(&event->ctx->nr_no_switch_fast);
+ }
+ put_event(event);
+
+ if (rctx >= 0)
+ perf_swevent_put_recursion_context(rctx);
+}
+
#ifdef CONFIG_GUEST_PERF_EVENTS
struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
@@ -6563,6 +7408,29 @@ void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
#endif
+static bool should_sample_guest(struct perf_event *event)
+{
+ return !event->attr.exclude_guest && perf_guest_state();
+}
+
+unsigned long perf_misc_flags(struct perf_event *event,
+ struct pt_regs *regs)
+{
+ if (should_sample_guest(event))
+ return perf_arch_guest_misc_flags(regs);
+
+ return perf_arch_misc_flags(regs);
+}
+
+unsigned long perf_instruction_pointer(struct perf_event *event,
+ struct pt_regs *regs)
+{
+ if (should_sample_guest(event))
+ return perf_guest_get_ip();
+
+ return perf_arch_instruction_pointer(regs);
+}
+
static void
perf_output_sample_regs(struct perf_output_handle *handle,
struct pt_regs *regs, u64 mask)
@@ -6585,7 +7453,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user,
if (user_mode(regs)) {
regs_user->abi = perf_reg_abi(current);
regs_user->regs = regs;
- } else if (!(current->flags & PF_KTHREAD)) {
+ } else if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) {
perf_get_regs_user(regs_user, regs);
} else {
regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
@@ -6628,6 +7496,10 @@ perf_sample_ustack_size(u16 stack_size, u16 header_size,
if (!regs)
return 0;
+ /* No mm, no stack, no dump. */
+ if (!current->mm)
+ return 0;
+
/*
* Check if we fit in with the requested stack size into the:
* - TASK_SIZE
@@ -6669,7 +7541,6 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
unsigned long sp;
unsigned int rem;
u64 dyn_size;
- mm_segment_t fs;
/*
* We dump:
@@ -6687,9 +7558,7 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
/* Data. */
sp = perf_user_stack_pointer(regs);
- fs = force_uaccess_begin();
rem = __output_copy_user(handle, (void *) sp, dump_size);
- force_uaccess_end(fs);
dyn_size = dump_size - rem;
perf_output_skip(handle, rem);
@@ -6717,7 +7586,7 @@ static unsigned long perf_prepare_sample_aux(struct perf_event *event,
if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
goto out;
- rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
+ rb = ring_buffer_get(sampler);
if (!rb)
goto out;
@@ -6783,7 +7652,7 @@ static void perf_aux_sample_output(struct perf_event *event,
if (WARN_ON_ONCE(!sampler || !data->aux_size))
return;
- rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
+ rb = ring_buffer_get(sampler);
if (!rb)
return;
@@ -6815,14 +7684,20 @@ out_put:
ring_buffer_put(rb);
}
-static void __perf_event_header__init_id(struct perf_event_header *header,
- struct perf_sample_data *data,
- struct perf_event *event)
-{
- u64 sample_type = event->attr.sample_type;
+/*
+ * A set of common sample data types saved even for non-sample records
+ * when event->attr.sample_id_all is set.
+ */
+#define PERF_SAMPLE_ID_ALL (PERF_SAMPLE_TID | PERF_SAMPLE_TIME | \
+ PERF_SAMPLE_ID | PERF_SAMPLE_STREAM_ID | \
+ PERF_SAMPLE_CPU | PERF_SAMPLE_IDENTIFIER)
- data->type = sample_type;
- header->size += event->id_header_size;
+static void __perf_event_header__init_id(struct perf_sample_data *data,
+ struct perf_event *event,
+ u64 sample_type)
+{
+ data->type = event->attr.sample_type;
+ data->sample_flags |= data->type & PERF_SAMPLE_ID_ALL;
if (sample_type & PERF_SAMPLE_TID) {
/* namespace issues */
@@ -6849,8 +7724,10 @@ void perf_event_header__init_id(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event)
{
- if (event->attr.sample_id_all)
- __perf_event_header__init_id(header, data, event);
+ if (event->attr.sample_id_all) {
+ header->size += event->id_header_size;
+ __perf_event_header__init_id(data, event, event->attr.sample_type);
+ }
}
static void __perf_event__output_id_sample(struct perf_output_handle *handle,
@@ -6890,10 +7767,10 @@ static void perf_output_read_one(struct perf_output_handle *handle,
u64 enabled, u64 running)
{
u64 read_format = event->attr.read_format;
- u64 values[4];
+ u64 values[5];
int n = 0;
- values[n++] = perf_event_count(event);
+ values[n++] = perf_event_count(event, has_inherit_and_sample_read(&event->attr));
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
values[n++] = enabled +
atomic64_read(&event->child_total_time_enabled);
@@ -6904,18 +7781,28 @@ static void perf_output_read_one(struct perf_output_handle *handle,
}
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(event);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&event->lost_samples);
__output_copy(handle, values, n * sizeof(u64));
}
static void perf_output_read_group(struct perf_output_handle *handle,
- struct perf_event *event,
- u64 enabled, u64 running)
+ struct perf_event *event,
+ u64 enabled, u64 running)
{
struct perf_event *leader = event->group_leader, *sub;
u64 read_format = event->attr.read_format;
- u64 values[5];
+ unsigned long flags;
+ u64 values[6];
int n = 0;
+ bool self = has_inherit_and_sample_read(&event->attr);
+
+ /*
+ * Disabling interrupts avoids all counter scheduling
+ * (context switches, timer based rotation and IPIs).
+ */
+ local_irq_save(flags);
values[n++] = 1 + leader->nr_siblings;
@@ -6925,29 +7812,33 @@ static void perf_output_read_group(struct perf_output_handle *handle,
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
values[n++] = running;
- if ((leader != event) &&
- (leader->state == PERF_EVENT_STATE_ACTIVE))
- leader->pmu->read(leader);
+ if ((leader != event) && !handle->skip_read)
+ perf_pmu_read(leader);
- values[n++] = perf_event_count(leader);
+ values[n++] = perf_event_count(leader, self);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&leader->lost_samples);
__output_copy(handle, values, n * sizeof(u64));
for_each_sibling_event(sub, leader) {
n = 0;
- if ((sub != event) &&
- (sub->state == PERF_EVENT_STATE_ACTIVE))
- sub->pmu->read(sub);
+ if ((sub != event) && !handle->skip_read)
+ perf_pmu_read(sub);
- values[n++] = perf_event_count(sub);
+ values[n++] = perf_event_count(sub, self);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&sub->lost_samples);
__output_copy(handle, values, n * sizeof(u64));
}
+
+ local_irq_restore(flags);
}
#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
@@ -6959,6 +7850,10 @@ static void perf_output_read_group(struct perf_output_handle *handle,
* The problem is that its both hard and excessively expensive to iterate the
* child list, not to mention that its impossible to IPI the children running
* on another CPU, from interrupt/NMI context.
+ *
+ * Instead the combination of PERF_SAMPLE_READ and inherit will track per-thread
+ * counts rather than attempting to accumulate some value across all children on
+ * all cores.
*/
static void perf_output_read(struct perf_output_handle *handle,
struct perf_event *event)
@@ -6984,11 +7879,6 @@ static void perf_output_read(struct perf_output_handle *handle,
perf_output_read_one(handle, event, enabled, running);
}
-static inline bool perf_sample_save_hw_index(struct perf_event *event)
-{
- return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX;
-}
-
void perf_output_sample(struct perf_output_handle *handle,
struct perf_event_header *header,
struct perf_sample_data *data,
@@ -6996,6 +7886,9 @@ void perf_output_sample(struct perf_output_handle *handle,
{
u64 sample_type = data->type;
+ if (data->sample_flags & PERF_SAMPLE_READ)
+ handle->skip_read = 1;
+
perf_output_put(handle, *header);
if (sample_type & PERF_SAMPLE_IDENTIFIER)
@@ -7077,9 +7970,17 @@ void perf_output_sample(struct perf_output_handle *handle,
* sizeof(struct perf_branch_entry);
perf_output_put(handle, data->br_stack->nr);
- if (perf_sample_save_hw_index(event))
+ if (branch_sample_hw_index(event))
perf_output_put(handle, data->br_stack->hw_idx);
perf_output_copy(handle, data->br_stack->entries, size);
+ /*
+ * Add the extension space which is appended
+ * right after the struct perf_branch_stack.
+ */
+ if (data->br_stack_cntr) {
+ size = data->br_stack->nr * sizeof(u64);
+ perf_output_copy(handle, data->br_stack_cntr, size);
+ }
} else {
/*
* we always store at least the value of nr
@@ -7192,7 +8093,7 @@ static u64 perf_virt_to_phys(u64 virt)
* Try IRQ-safe get_user_page_fast_only first.
* If failed, leave phys_addr as 0.
*/
- if (current->mm != NULL) {
+ if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) {
struct page *p;
pagefault_disable();
@@ -7214,7 +8115,7 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
{
u64 size = 0;
-#ifdef CONFIG_HAVE_FAST_GUP
+#ifdef CONFIG_HAVE_GUP_FAST
pgd_t *pgdp, pgd;
p4d_t *p4dp, p4d;
pud_t *pudp, pud;
@@ -7246,7 +8147,8 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
return pud_leaf_size(pud);
pmdp = pmd_offset_lockless(pudp, pud, addr);
- pmd = READ_ONCE(*pmdp);
+again:
+ pmd = pmdp_get_lockless(pmdp);
if (!pmd_present(pmd))
return 0;
@@ -7254,11 +8156,14 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
return pmd_leaf_size(pmd);
ptep = pte_offset_map(&pmd, addr);
+ if (!ptep)
+ goto again;
+
pte = ptep_get_lockless(ptep);
if (pte_present(pte))
- size = pte_leaf_size(pte);
+ size = __pte_leaf_size(pmd, pte);
pte_unmap(ptep);
-#endif /* CONFIG_HAVE_FAST_GUP */
+#endif /* CONFIG_HAVE_GUP_FAST */
return size;
}
@@ -7296,94 +8201,100 @@ static u64 perf_get_page_size(unsigned long addr)
static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
+static struct unwind_work perf_unwind_work;
+
struct perf_callchain_entry *
perf_callchain(struct perf_event *event, struct pt_regs *regs)
{
bool kernel = !event->attr.exclude_callchain_kernel;
- bool user = !event->attr.exclude_callchain_user;
+ bool user = !event->attr.exclude_callchain_user &&
+ !(current->flags & (PF_KTHREAD | PF_USER_WORKER));
/* Disallow cross-task user callchains. */
bool crosstask = event->ctx->task && event->ctx->task != current;
+ bool defer_user = IS_ENABLED(CONFIG_UNWIND_USER) && user &&
+ event->attr.defer_callchain;
const u32 max_stack = event->attr.sample_max_stack;
struct perf_callchain_entry *callchain;
+ u64 defer_cookie;
+
+ if (!current->mm)
+ user = false;
if (!kernel && !user)
return &__empty_callchain;
- callchain = get_perf_callchain(regs, 0, kernel, user,
- max_stack, crosstask, true);
+ if (!(user && defer_user && !crosstask &&
+ unwind_deferred_request(&perf_unwind_work, &defer_cookie) >= 0))
+ defer_cookie = 0;
+
+ callchain = get_perf_callchain(regs, kernel, user, max_stack,
+ crosstask, true, defer_cookie);
+
return callchain ?: &__empty_callchain;
}
-void perf_prepare_sample(struct perf_event_header *header,
- struct perf_sample_data *data,
+static __always_inline u64 __cond_set(u64 flags, u64 s, u64 d)
+{
+ return d * !!(flags & s);
+}
+
+void perf_prepare_sample(struct perf_sample_data *data,
struct perf_event *event,
struct pt_regs *regs)
{
u64 sample_type = event->attr.sample_type;
+ u64 filtered_sample_type;
- header->type = PERF_RECORD_SAMPLE;
- header->size = sizeof(*header) + event->header_size;
-
- header->misc = 0;
- header->misc |= perf_misc_flags(regs);
-
- __perf_event_header__init_id(header, data, event);
-
- if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE))
- data->ip = perf_instruction_pointer(regs);
-
- if (sample_type & PERF_SAMPLE_CALLCHAIN) {
- int size = 1;
-
- if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
- data->callchain = perf_callchain(event, regs);
-
- size += data->callchain->nr;
+ /*
+ * Add the sample flags that are dependent to others. And clear the
+ * sample flags that have already been done by the PMU driver.
+ */
+ filtered_sample_type = sample_type;
+ filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_CODE_PAGE_SIZE,
+ PERF_SAMPLE_IP);
+ filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_DATA_PAGE_SIZE |
+ PERF_SAMPLE_PHYS_ADDR, PERF_SAMPLE_ADDR);
+ filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_STACK_USER,
+ PERF_SAMPLE_REGS_USER);
+ filtered_sample_type &= ~data->sample_flags;
- header->size += size * sizeof(u64);
+ if (filtered_sample_type == 0) {
+ /* Make sure it has the correct data->type for output */
+ data->type = event->attr.sample_type;
+ return;
}
- if (sample_type & PERF_SAMPLE_RAW) {
- struct perf_raw_record *raw = data->raw;
- int size;
+ __perf_event_header__init_id(data, event, filtered_sample_type);
- if (raw) {
- struct perf_raw_frag *frag = &raw->frag;
- u32 sum = 0;
-
- do {
- sum += frag->size;
- if (perf_raw_frag_last(frag))
- break;
- frag = frag->next;
- } while (1);
+ if (filtered_sample_type & PERF_SAMPLE_IP) {
+ data->ip = perf_instruction_pointer(event, regs);
+ data->sample_flags |= PERF_SAMPLE_IP;
+ }
- size = round_up(sum + sizeof(u32), sizeof(u64));
- raw->size = size - sizeof(u32);
- frag->pad = raw->size - sum;
- } else {
- size = sizeof(u64);
- }
+ if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN)
+ perf_sample_save_callchain(data, event, regs);
- header->size += size;
+ if (filtered_sample_type & PERF_SAMPLE_RAW) {
+ data->raw = NULL;
+ data->dyn_size += sizeof(u64);
+ data->sample_flags |= PERF_SAMPLE_RAW;
}
- if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
- int size = sizeof(u64); /* nr */
- if (data->br_stack) {
- if (perf_sample_save_hw_index(event))
- size += sizeof(u64);
-
- size += data->br_stack->nr
- * sizeof(struct perf_branch_entry);
- }
- header->size += size;
+ if (filtered_sample_type & PERF_SAMPLE_BRANCH_STACK) {
+ data->br_stack = NULL;
+ data->dyn_size += sizeof(u64);
+ data->sample_flags |= PERF_SAMPLE_BRANCH_STACK;
}
- if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
+ if (filtered_sample_type & PERF_SAMPLE_REGS_USER)
perf_sample_regs_user(&data->regs_user, regs);
- if (sample_type & PERF_SAMPLE_REGS_USER) {
+ /*
+ * It cannot use the filtered_sample_type here as REGS_USER can be set
+ * by STACK_USER (using __cond_set() above) and we don't want to update
+ * the dyn_size if it's not requested by users.
+ */
+ if ((sample_type & ~data->sample_flags) & PERF_SAMPLE_REGS_USER) {
/* regs dump ABI info */
int size = sizeof(u64);
@@ -7392,10 +8303,11 @@ void perf_prepare_sample(struct perf_event_header *header,
size += hweight64(mask) * sizeof(u64);
}
- header->size += size;
+ data->dyn_size += size;
+ data->sample_flags |= PERF_SAMPLE_REGS_USER;
}
- if (sample_type & PERF_SAMPLE_STACK_USER) {
+ if (filtered_sample_type & PERF_SAMPLE_STACK_USER) {
/*
* Either we need PERF_SAMPLE_STACK_USER bit to be always
* processed as the last one or have additional check added
@@ -7403,9 +8315,10 @@ void perf_prepare_sample(struct perf_event_header *header,
* up the rest of the sample size.
*/
u16 stack_size = event->attr.sample_stack_user;
+ u16 header_size = perf_sample_data_size(data, event);
u16 size = sizeof(u64);
- stack_size = perf_sample_ustack_size(stack_size, header->size,
+ stack_size = perf_sample_ustack_size(stack_size, header_size,
data->regs_user.regs);
/*
@@ -7417,10 +8330,31 @@ void perf_prepare_sample(struct perf_event_header *header,
size += sizeof(u64) + stack_size;
data->stack_user_size = stack_size;
- header->size += size;
+ data->dyn_size += size;
+ data->sample_flags |= PERF_SAMPLE_STACK_USER;
}
- if (sample_type & PERF_SAMPLE_REGS_INTR) {
+ if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
+ data->weight.full = 0;
+ data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
+ }
+
+ if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) {
+ data->data_src.val = PERF_MEM_NA;
+ data->sample_flags |= PERF_SAMPLE_DATA_SRC;
+ }
+
+ if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) {
+ data->txn = 0;
+ data->sample_flags |= PERF_SAMPLE_TRANSACTION;
+ }
+
+ if (filtered_sample_type & PERF_SAMPLE_ADDR) {
+ data->addr = 0;
+ data->sample_flags |= PERF_SAMPLE_ADDR;
+ }
+
+ if (filtered_sample_type & PERF_SAMPLE_REGS_INTR) {
/* regs dump ABI info */
int size = sizeof(u64);
@@ -7432,19 +8366,23 @@ void perf_prepare_sample(struct perf_event_header *header,
size += hweight64(mask) * sizeof(u64);
}
- header->size += size;
+ data->dyn_size += size;
+ data->sample_flags |= PERF_SAMPLE_REGS_INTR;
}
- if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+ if (filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) {
data->phys_addr = perf_virt_to_phys(data->addr);
+ data->sample_flags |= PERF_SAMPLE_PHYS_ADDR;
+ }
#ifdef CONFIG_CGROUP_PERF
- if (sample_type & PERF_SAMPLE_CGROUP) {
+ if (filtered_sample_type & PERF_SAMPLE_CGROUP) {
struct cgroup *cgrp;
/* protected by RCU */
cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
data->cgroup = cgroup_id(cgrp);
+ data->sample_flags |= PERF_SAMPLE_CGROUP;
}
#endif
@@ -7453,16 +8391,21 @@ void perf_prepare_sample(struct perf_event_header *header,
* require PERF_SAMPLE_ADDR, kernel implicitly retrieve the data->addr,
* but the value will not dump to the userspace.
*/
- if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+ if (filtered_sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) {
data->data_page_size = perf_get_page_size(data->addr);
+ data->sample_flags |= PERF_SAMPLE_DATA_PAGE_SIZE;
+ }
- if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
+ if (filtered_sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) {
data->code_page_size = perf_get_page_size(data->ip);
+ data->sample_flags |= PERF_SAMPLE_CODE_PAGE_SIZE;
+ }
- if (sample_type & PERF_SAMPLE_AUX) {
+ if (filtered_sample_type & PERF_SAMPLE_AUX) {
u64 size;
+ u16 header_size = perf_sample_data_size(data, event);
- header->size += sizeof(u64); /* size */
+ header_size += sizeof(u64); /* size */
/*
* Given the 16bit nature of header::size, an AUX sample can
@@ -7470,14 +8413,26 @@ void perf_prepare_sample(struct perf_event_header *header,
* Make sure this doesn't happen by using up to U16_MAX bytes
* per sample in total (rounded down to 8 byte boundary).
*/
- size = min_t(size_t, U16_MAX - header->size,
+ size = min_t(size_t, U16_MAX - header_size,
event->attr.aux_sample_size);
size = rounddown(size, 8);
size = perf_prepare_sample_aux(event, data, size);
- WARN_ON_ONCE(size + header->size > U16_MAX);
- header->size += size;
+ WARN_ON_ONCE(size + header_size > U16_MAX);
+ data->dyn_size += size + sizeof(u64); /* size above */
+ data->sample_flags |= PERF_SAMPLE_AUX;
}
+}
+
+void perf_prepare_header(struct perf_event_header *header,
+ struct perf_sample_data *data,
+ struct perf_event *event,
+ struct pt_regs *regs)
+{
+ header->type = PERF_RECORD_SAMPLE;
+ header->size = perf_sample_data_size(data, event);
+ header->misc = perf_misc_flags(event, regs);
+
/*
* If you're adding more sample types here, you likely need to do
* something about the overflowing header::size, like repurpose the
@@ -7489,6 +8444,49 @@ void perf_prepare_sample(struct perf_event_header *header,
WARN_ON_ONCE(header->size & 7);
}
+static void __perf_event_aux_pause(struct perf_event *event, bool pause)
+{
+ if (pause) {
+ if (!event->hw.aux_paused) {
+ event->hw.aux_paused = 1;
+ event->pmu->stop(event, PERF_EF_PAUSE);
+ }
+ } else {
+ if (event->hw.aux_paused) {
+ event->hw.aux_paused = 0;
+ event->pmu->start(event, PERF_EF_RESUME);
+ }
+ }
+}
+
+static void perf_event_aux_pause(struct perf_event *event, bool pause)
+{
+ struct perf_buffer *rb;
+
+ if (WARN_ON_ONCE(!event))
+ return;
+
+ rb = ring_buffer_get(event);
+ if (!rb)
+ return;
+
+ scoped_guard (irqsave) {
+ /*
+ * Guard against self-recursion here. Another event could trip
+ * this same from NMI context.
+ */
+ if (READ_ONCE(rb->aux_in_pause_resume))
+ break;
+
+ WRITE_ONCE(rb->aux_in_pause_resume, 1);
+ barrier();
+ __perf_event_aux_pause(event, pause);
+ barrier();
+ WRITE_ONCE(rb->aux_in_pause_resume, 0);
+ }
+ ring_buffer_put(rb);
+}
+
static __always_inline int
__perf_event_output(struct perf_event *event,
struct perf_sample_data *data,
@@ -7505,7 +8503,8 @@ __perf_event_output(struct perf_event *event,
/* protect the callchain buffers */
rcu_read_lock();
- perf_prepare_sample(&header, data, event, regs);
+ perf_prepare_sample(data, event, regs);
+ perf_prepare_header(&header, data, event, regs);
err = output_begin(&handle, data, event, header.size);
if (err)
@@ -7638,7 +8637,6 @@ perf_iterate_sb(perf_iterate_f output, void *data,
struct perf_event_context *task_ctx)
{
struct perf_event_context *ctx;
- int ctxn;
rcu_read_lock();
preempt_disable();
@@ -7655,11 +8653,9 @@ perf_iterate_sb(perf_iterate_f output, void *data,
perf_iterate_sb_cpu(output, data);
- for_each_task_context_nr(ctxn) {
- ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
- if (ctx)
- perf_iterate_ctx(ctx, output, data, false);
- }
+ ctx = rcu_dereference(current->perf_event_ctxp);
+ if (ctx)
+ perf_iterate_ctx(ctx, output, data, false);
done:
preempt_enable();
rcu_read_unlock();
@@ -7701,20 +8697,18 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
void perf_event_exec(void)
{
struct perf_event_context *ctx;
- int ctxn;
- for_each_task_context_nr(ctxn) {
- perf_event_enable_on_exec(ctxn);
- perf_event_remove_on_exec(ctxn);
+ ctx = perf_pin_task_context(current);
+ if (!ctx)
+ return;
- rcu_read_lock();
- ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
- if (ctx) {
- perf_iterate_ctx(ctx, perf_event_addr_filters_exec,
- NULL, true);
- }
- rcu_read_unlock();
- }
+ perf_event_enable_on_exec(ctx);
+ perf_event_remove_on_exec(ctx);
+ scoped_guard(rcu)
+ perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true);
+
+ perf_unpin_context(ctx);
+ put_ctx(ctx);
}
struct remote_output {
@@ -7754,8 +8748,7 @@ static void __perf_event_output_stop(struct perf_event *event, void *data)
static int __perf_pmu_output_stop(void *info)
{
struct perf_event *event = info;
- struct pmu *pmu = event->ctx->pmu;
- struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct remote_output ro = {
.rb = event->rb,
};
@@ -7904,10 +8897,58 @@ static void perf_event_task(struct task_struct *task,
task_ctx);
}
+/*
+ * Allocate data for a new task when profiling system-wide
+ * events which require PMU specific data
+ */
+static void
+perf_event_alloc_task_data(struct task_struct *child,
+ struct task_struct *parent)
+{
+ struct kmem_cache *ctx_cache = NULL;
+ struct perf_ctx_data *cd;
+
+ if (!refcount_read(&global_ctx_data_ref))
+ return;
+
+ scoped_guard (rcu) {
+ cd = rcu_dereference(parent->perf_ctx_data);
+ if (cd)
+ ctx_cache = cd->ctx_cache;
+ }
+
+ if (!ctx_cache)
+ return;
+
+ guard(percpu_read)(&global_ctx_data_rwsem);
+ scoped_guard (rcu) {
+ cd = rcu_dereference(child->perf_ctx_data);
+ if (!cd) {
+ /*
+ * A system-wide event may be unaccount,
+ * when attaching the perf_ctx_data.
+ */
+ if (!refcount_read(&global_ctx_data_ref))
+ return;
+ goto attach;
+ }
+
+ if (!cd->global) {
+ cd->global = 1;
+ refcount_inc(&cd->refcount);
+ }
+ }
+
+ return;
+attach:
+ attach_task_ctx_data(child, ctx_cache, true);
+}
+
void perf_event_fork(struct task_struct *task)
{
perf_event_task(task, NULL, 1);
perf_event_namespaces(task);
+ perf_event_alloc_task_data(task, current);
}
/*
@@ -7971,7 +9012,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
unsigned int size;
memset(comm, 0, sizeof(comm));
- strlcpy(comm, comm_event->task->comm, sizeof(comm));
+ strscpy(comm, comm_event->task->comm);
size = ALIGN(strlen(comm)+1, sizeof(u64));
comm_event->comm = comm;
@@ -8353,7 +9394,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
unsigned int size;
char tmp[16];
char *buf = NULL;
- char *name;
+ char *name = NULL;
if (vma->vm_flags & VM_READ)
prot |= PROT_READ;
@@ -8373,7 +9414,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
flags |= MAP_HUGETLB;
if (file) {
- struct inode *inode;
+ const struct inode *inode;
dev_t dev;
buf = kmalloc(PATH_MAX, GFP_KERNEL);
@@ -8386,12 +9427,12 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
* need to add enough zero bytes after the string to handle
* the 64bit alignment we do later.
*/
- name = file_path(file, buf, PATH_MAX - sizeof(u64));
+ name = d_path(file_user_path(file), buf, PATH_MAX - sizeof(u64));
if (IS_ERR(name)) {
name = "//toolong";
goto cpy_name;
}
- inode = file_inode(vma->vm_file);
+ inode = file_user_inode(vma->vm_file);
dev = inode->i_sb->s_dev;
ino = inode->i_ino;
gen = inode->i_generation;
@@ -8400,33 +9441,22 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
goto got_name;
} else {
- if (vma->vm_ops && vma->vm_ops->name) {
+ if (vma->vm_ops && vma->vm_ops->name)
name = (char *) vma->vm_ops->name(vma);
- if (name)
- goto cpy_name;
- }
-
- name = (char *)arch_vma_name(vma);
- if (name)
- goto cpy_name;
-
- if (vma->vm_start <= vma->vm_mm->start_brk &&
- vma->vm_end >= vma->vm_mm->brk) {
- name = "[heap]";
- goto cpy_name;
- }
- if (vma->vm_start <= vma->vm_mm->start_stack &&
- vma->vm_end >= vma->vm_mm->start_stack) {
- name = "[stack]";
- goto cpy_name;
+ if (!name)
+ name = (char *)arch_vma_name(vma);
+ if (!name) {
+ if (vma_is_initial_heap(vma))
+ name = "[heap]";
+ else if (vma_is_initial_stack(vma))
+ name = "[stack]";
+ else
+ name = "//anon";
}
-
- name = "//anon";
- goto cpy_name;
}
cpy_name:
- strlcpy(tmp, name, sizeof(tmp));
+ strscpy(tmp, name);
name = tmp;
got_name:
/*
@@ -8453,7 +9483,7 @@ got_name:
mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
if (atomic_read(&nr_build_id_events))
- build_id_parse(vma, mmap_event->build_id, &mmap_event->build_id_size);
+ build_id_parse_nofault(vma, mmap_event->build_id, &mmap_event->build_id_size);
perf_iterate_sb(perf_event_mmap_output,
mmap_event,
@@ -8473,7 +9503,7 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter,
if (!filter->path.dentry)
return false;
- if (d_inode(filter->path.dentry) != file_inode(file))
+ if (d_inode(filter->path.dentry) != file_user_inode(file))
return false;
if (filter->offset > offset + size)
@@ -8544,7 +9574,6 @@ static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
static void perf_addr_filters_adjust(struct vm_area_struct *vma)
{
struct perf_event_context *ctx;
- int ctxn;
/*
* Data tracing isn't supported yet and as such there is no need
@@ -8554,13 +9583,9 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma)
return;
rcu_read_lock();
- for_each_task_context_nr(ctxn) {
- ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
- if (!ctx)
- continue;
-
+ ctx = rcu_dereference(current->perf_event_ctxp);
+ if (ctx)
perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
- }
rcu_read_unlock();
}
@@ -8746,7 +9771,7 @@ static void perf_event_switch(struct task_struct *task,
},
};
- if (!sched_in && task->on_rq) {
+ if (!sched_in && task_is_runnable(task)) {
switch_event.event_id.header.misc |=
PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
}
@@ -8855,7 +9880,7 @@ void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
goto err;
- strlcpy(name, sym, KSYM_NAME_LEN);
+ strscpy(name, sym);
name_len = strlen(name) + 1;
while (!IS_ALIGNED(name_len, sizeof(u64)))
name[name_len++] = '\0';
@@ -8918,7 +9943,7 @@ static void perf_event_bpf_output(struct perf_event *event, void *data)
perf_event_header__init_id(&bpf_event->event_id.header,
&sample, event);
- ret = perf_output_begin(&handle, data, event,
+ ret = perf_output_begin(&handle, &sample, event,
bpf_event->event_id.header.size);
if (ret)
return;
@@ -8935,21 +9960,19 @@ static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
int i;
- if (prog->aux->func_cnt == 0) {
- perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
- (u64)(unsigned long)prog->bpf_func,
- prog->jited_len, unregister,
- prog->aux->ksym.name);
- } else {
- for (i = 0; i < prog->aux->func_cnt; i++) {
- struct bpf_prog *subprog = prog->aux->func[i];
-
- perf_event_ksymbol(
- PERF_RECORD_KSYMBOL_TYPE_BPF,
- (u64)(unsigned long)subprog->bpf_func,
- subprog->jited_len, unregister,
- prog->aux->ksym.name);
- }
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
+ (u64)(unsigned long)prog->bpf_func,
+ prog->jited_len, unregister,
+ prog->aux->ksym.name);
+
+ for (i = 1; i < prog->aux->func_cnt; i++) {
+ struct bpf_prog *subprog = prog->aux->func[i];
+
+ perf_event_ksymbol(
+ PERF_RECORD_KSYMBOL_TYPE_BPF,
+ (u64)(unsigned long)subprog->bpf_func,
+ subprog->jited_len, unregister,
+ subprog->aux->ksym.name);
}
}
@@ -8959,10 +9982,6 @@ void perf_event_bpf_event(struct bpf_prog *prog,
{
struct perf_bpf_event bpf_event;
- if (type <= PERF_BPF_EVENT_UNKNOWN ||
- type >= PERF_BPF_EVENT_MAX)
- return;
-
switch (type) {
case PERF_BPF_EVENT_PROG_LOAD:
case PERF_BPF_EVENT_PROG_UNLOAD:
@@ -8970,7 +9989,7 @@ void perf_event_bpf_event(struct bpf_prog *prog,
perf_event_bpf_emit_ksymbols(prog, type);
break;
default:
- break;
+ return;
}
if (!atomic_read(&nr_bpf_events))
@@ -8995,6 +10014,66 @@ void perf_event_bpf_event(struct bpf_prog *prog,
perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
}
+struct perf_callchain_deferred_event {
+ struct unwind_stacktrace *trace;
+ struct {
+ struct perf_event_header header;
+ u64 cookie;
+ u64 nr;
+ u64 ips[];
+ } event;
+};
+
+static void perf_callchain_deferred_output(struct perf_event *event, void *data)
+{
+ struct perf_callchain_deferred_event *deferred_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int ret, size = deferred_event->event.header.size;
+
+ if (!event->attr.defer_output)
+ return;
+
+ /* XXX do we really need sample_id_all for this ??? */
+ perf_event_header__init_id(&deferred_event->event.header, &sample, event);
+
+ ret = perf_output_begin(&handle, &sample, event,
+ deferred_event->event.header.size);
+ if (ret)
+ goto out;
+
+ perf_output_put(&handle, deferred_event->event);
+ for (int i = 0; i < deferred_event->trace->nr; i++) {
+ u64 entry = deferred_event->trace->entries[i];
+ perf_output_put(&handle, entry);
+ }
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+out:
+ deferred_event->event.header.size = size;
+}
+
+static void perf_unwind_deferred_callback(struct unwind_work *work,
+ struct unwind_stacktrace *trace, u64 cookie)
+{
+ struct perf_callchain_deferred_event deferred_event = {
+ .trace = trace,
+ .event = {
+ .header = {
+ .type = PERF_RECORD_CALLCHAIN_DEFERRED,
+ .misc = PERF_RECORD_MISC_USER,
+ .size = sizeof(deferred_event.event) +
+ (trace->nr * sizeof(u64)),
+ },
+ .cookie = cookie,
+ .nr = trace->nr,
+ },
+ };
+
+ perf_iterate_sb(perf_callchain_deferred_output, &deferred_event, NULL);
+}
+
struct perf_text_poke_event {
const void *old_bytes;
const void *new_bytes;
@@ -9081,7 +10160,7 @@ void perf_event_text_poke(const void *addr, const void *old_bytes,
void perf_event_itrace_started(struct perf_event *event)
{
- event->attach_state |= PERF_ATTACH_ITRACE;
+ WRITE_ONCE(event->attach_state, event->attach_state | PERF_ATTACH_ITRACE);
}
static void perf_log_itrace_start(struct perf_event *event)
@@ -9149,6 +10228,7 @@ void perf_report_aux_output_id(struct perf_event *event, u64 hw_id)
perf_output_end(&handle);
}
+EXPORT_SYMBOL_GPL(perf_report_aux_output_id);
static int
__perf_event_account_interrupt(struct perf_event *event, int throttle)
@@ -9163,14 +10243,13 @@ __perf_event_account_interrupt(struct perf_event *event, int throttle)
hwc->interrupts = 1;
} else {
hwc->interrupts++;
- if (unlikely(throttle
- && hwc->interrupts >= max_samples_per_tick)) {
- __this_cpu_inc(perf_throttled_count);
- tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
- hwc->interrupts = MAX_INTERRUPTS;
- perf_log_throttle(event, 0);
- ret = 1;
- }
+ }
+
+ if (unlikely(throttle && hwc->interrupts >= max_samples_per_tick)) {
+ __this_cpu_inc(perf_throttled_count);
+ tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
+ perf_event_throttle_group(event);
+ ret = 1;
}
if (event->attr.freq) {
@@ -9191,13 +10270,120 @@ int perf_event_account_interrupt(struct perf_event *event)
return __perf_event_account_interrupt(event, 1);
}
+static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
+{
+ /*
+ * Due to interrupt latency (AKA "skid"), we may enter the
+ * kernel before taking an overflow, even if the PMU is only
+ * counting user events.
+ */
+ if (event->attr.exclude_kernel && !user_mode(regs))
+ return false;
+
+ return true;
+}
+
+#ifdef CONFIG_BPF_SYSCALL
+static int bpf_overflow_handler(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct bpf_perf_event_data_kern ctx = {
+ .data = data,
+ .event = event,
+ };
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ ctx.regs = perf_arch_bpf_user_pt_regs(regs);
+ if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
+ goto out;
+ rcu_read_lock();
+ prog = READ_ONCE(event->prog);
+ if (prog) {
+ perf_prepare_sample(data, event, regs);
+ ret = bpf_prog_run(prog, &ctx);
+ }
+ rcu_read_unlock();
+out:
+ __this_cpu_dec(bpf_prog_active);
+
+ return ret;
+}
+
+static inline int perf_event_set_bpf_handler(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
+{
+ if (event->overflow_handler_context)
+ /* hw breakpoint or kernel counter */
+ return -EINVAL;
+
+ if (event->prog)
+ return -EEXIST;
+
+ if (prog->type != BPF_PROG_TYPE_PERF_EVENT)
+ return -EINVAL;
+
+ if (event->attr.precise_ip &&
+ prog->call_get_stack &&
+ (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) ||
+ event->attr.exclude_callchain_kernel ||
+ event->attr.exclude_callchain_user)) {
+ /*
+ * On perf_event with precise_ip, calling bpf_get_stack()
+ * may trigger unwinder warnings and occasional crashes.
+ * bpf_get_[stack|stackid] works around this issue by using
+ * callchain attached to perf_sample_data. If the
+ * perf_event does not full (kernel and user) callchain
+ * attached to perf_sample_data, do not allow attaching BPF
+ * program that calls bpf_get_[stack|stackid].
+ */
+ return -EPROTO;
+ }
+
+ event->prog = prog;
+ event->bpf_cookie = bpf_cookie;
+ return 0;
+}
+
+static inline void perf_event_free_bpf_handler(struct perf_event *event)
+{
+ struct bpf_prog *prog = event->prog;
+
+ if (!prog)
+ return;
+
+ event->prog = NULL;
+ bpf_prog_put(prog);
+}
+#else
+static inline int bpf_overflow_handler(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ return 1;
+}
+
+static inline int perf_event_set_bpf_handler(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void perf_event_free_bpf_handler(struct perf_event *event)
+{
+}
+#endif
+
/*
* Generic event overflow handling, sampling.
*/
static int __perf_event_overflow(struct perf_event *event,
- int throttle, struct perf_sample_data *data,
- struct pt_regs *regs)
+ int throttle, struct perf_sample_data *data,
+ struct pt_regs *regs)
{
int events = atomic_read(&event->event_limit);
int ret = 0;
@@ -9211,6 +10397,13 @@ static int __perf_event_overflow(struct perf_event *event,
ret = __perf_event_account_interrupt(event, throttle);
+ if (event->attr.aux_pause)
+ perf_event_aux_pause(event->aux_event, true);
+
+ if (event->prog && event->prog->type == BPF_PROG_TYPE_PERF_EVENT &&
+ !bpf_overflow_handler(event, data, regs))
+ goto out;
+
/*
* XXX event_limit might not quite work as expected on inherited
* events
@@ -9220,24 +10413,69 @@ static int __perf_event_overflow(struct perf_event *event,
if (events && atomic_dec_and_test(&event->event_limit)) {
ret = 1;
event->pending_kill = POLL_HUP;
- event->pending_addr = data->addr;
-
perf_event_disable_inatomic(event);
+ event->pmu->stop(event, 0);
+ }
+
+ if (event->attr.sigtrap) {
+ /*
+ * The desired behaviour of sigtrap vs invalid samples is a bit
+ * tricky; on the one hand, one should not loose the SIGTRAP if
+ * it is the first event, on the other hand, we should also not
+ * trigger the WARN or override the data address.
+ */
+ bool valid_sample = sample_is_allowed(event, regs);
+ unsigned int pending_id = 1;
+ enum task_work_notify_mode notify_mode;
+
+ if (regs)
+ pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1;
+
+ notify_mode = in_nmi() ? TWA_NMI_CURRENT : TWA_RESUME;
+
+ if (!event->pending_work &&
+ !task_work_add(current, &event->pending_task, notify_mode)) {
+ event->pending_work = pending_id;
+ local_inc(&event->ctx->nr_no_switch_fast);
+ WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
+
+ event->pending_addr = 0;
+ if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR))
+ event->pending_addr = data->addr;
+
+ } else if (event->attr.exclude_kernel && valid_sample) {
+ /*
+ * Should not be able to return to user space without
+ * consuming pending_work; with exceptions:
+ *
+ * 1. Where !exclude_kernel, events can overflow again
+ * in the kernel without returning to user space.
+ *
+ * 2. Events that can overflow again before the IRQ-
+ * work without user space progress (e.g. hrtimer).
+ * To approximate progress (with false negatives),
+ * check 32-bit hash of the current IP.
+ */
+ WARN_ON_ONCE(event->pending_work != pending_id);
+ }
}
READ_ONCE(event->overflow_handler)(event, data, regs);
if (*perf_event_fasync(event) && event->pending_kill) {
event->pending_wakeup = 1;
- irq_work_queue(&event->pending);
+ irq_work_queue(&event->pending_irq);
}
+out:
+ if (event->attr.aux_resume)
+ perf_event_aux_pause(event->aux_event, false);
return ret;
}
int perf_event_overflow(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
{
return __perf_event_overflow(event, 1, data, regs);
}
@@ -9250,11 +10488,7 @@ struct swevent_htable {
struct swevent_hlist *swevent_hlist;
struct mutex hlist_mutex;
int hlist_refcount;
-
- /* Recursion avoidance in each contexts */
- int recursion[PERF_NR_CONTEXTS];
};
-
static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
/*
@@ -9273,16 +10507,16 @@ u64 perf_swevent_set_period(struct perf_event *event)
hwc->last_period = hwc->sample_period;
-again:
- old = val = local64_read(&hwc->period_left);
- if (val < 0)
- return 0;
+ old = local64_read(&hwc->period_left);
+ do {
+ val = old;
+ if (val < 0)
+ return 0;
- nr = div64_u64(period + val, period);
- offset = nr * period;
- val -= offset;
- if (local64_cmpxchg(&hwc->period_left, old, val) != old)
- goto again;
+ nr = div64_u64(period + val, period);
+ offset = nr * period;
+ val -= offset;
+ } while (!local64_try_cmpxchg(&hwc->period_left, &old, val));
return nr;
}
@@ -9342,8 +10576,7 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
perf_swevent_overflow(event, 0, data, regs);
}
-static int perf_exclude_event(struct perf_event *event,
- struct pt_regs *regs)
+int perf_exclude_event(struct perf_event *event, struct pt_regs *regs)
{
if (event->hw.state & PERF_HES_STOPPED)
return 1;
@@ -9452,17 +10685,13 @@ DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
int perf_swevent_get_recursion_context(void)
{
- struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
-
- return get_recursion_context(swhash->recursion);
+ return get_recursion_context(current->perf_recursion);
}
EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
void perf_swevent_put_recursion_context(int rctx)
{
- struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
-
- put_recursion_context(swhash->recursion, rctx);
+ put_recursion_context(current->perf_recursion, rctx);
}
void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
@@ -9633,6 +10862,9 @@ static void sw_perf_event_destroy(struct perf_event *event)
swevent_hlist_put();
}
+static struct pmu perf_cpu_clock; /* fwd declaration */
+static struct pmu perf_task_clock;
+
static int perf_swevent_init(struct perf_event *event)
{
u64 event_id = event->attr.config;
@@ -9648,7 +10880,10 @@ static int perf_swevent_init(struct perf_event *event)
switch (event_id) {
case PERF_COUNT_SW_CPU_CLOCK:
+ event->attr.type = perf_cpu_clock.type;
+ return -ENOENT;
case PERF_COUNT_SW_TASK_CLOCK:
+ event->attr.type = perf_task_clock.type;
return -ENOENT;
default:
@@ -9687,10 +10922,48 @@ static struct pmu perf_swevent = {
#ifdef CONFIG_EVENT_TRACING
+static void tp_perf_event_destroy(struct perf_event *event)
+{
+ perf_trace_destroy(event);
+}
+
+static int perf_tp_event_init(struct perf_event *event)
+{
+ int err;
+
+ if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ return -ENOENT;
+
+ /*
+ * no branch sampling for tracepoint events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
+ err = perf_trace_init(event);
+ if (err)
+ return err;
+
+ event->destroy = tp_perf_event_destroy;
+
+ return 0;
+}
+
+static struct pmu perf_tracepoint = {
+ .task_ctx_nr = perf_sw_context,
+
+ .event_init = perf_tp_event_init,
+ .add = perf_trace_add,
+ .del = perf_trace_del,
+ .start = perf_swevent_start,
+ .stop = perf_swevent_stop,
+ .read = perf_swevent_read,
+};
+
static int perf_tp_filter_match(struct perf_event *event,
- struct perf_sample_data *data)
+ struct perf_raw_record *raw)
{
- void *record = data->raw->frag.data;
+ void *record = raw->frag.data;
/* only top level events have filters set */
if (event->parent)
@@ -9702,7 +10975,7 @@ static int perf_tp_filter_match(struct perf_event *event,
}
static int perf_tp_event_match(struct perf_event *event,
- struct perf_sample_data *data,
+ struct perf_raw_record *raw,
struct pt_regs *regs)
{
if (event->hw.state & PERF_HES_STOPPED)
@@ -9713,7 +10986,7 @@ static int perf_tp_event_match(struct perf_event *event,
if (event->attr.exclude_kernel && !user_mode(regs))
return 0;
- if (!perf_tp_filter_match(event, data))
+ if (!perf_tp_filter_match(event, raw))
return 0;
return 1;
@@ -9736,6 +11009,49 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
}
EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
+static void __perf_tp_event_target_task(u64 count, void *record,
+ struct pt_regs *regs,
+ struct perf_sample_data *data,
+ struct perf_raw_record *raw,
+ struct perf_event *event)
+{
+ struct trace_entry *entry = record;
+
+ if (event->attr.config != entry->type)
+ return;
+ /* Cannot deliver synchronous signal to other task. */
+ if (event->attr.sigtrap)
+ return;
+ if (perf_tp_event_match(event, raw, regs)) {
+ perf_sample_data_init(data, 0, 0);
+ perf_sample_save_raw_data(data, event, raw);
+ perf_swevent_event(event, count, data, regs);
+ }
+}
+
+static void perf_tp_event_target_task(u64 count, void *record,
+ struct pt_regs *regs,
+ struct perf_sample_data *data,
+ struct perf_raw_record *raw,
+ struct perf_event_context *ctx)
+{
+ unsigned int cpu = smp_processor_id();
+ struct pmu *pmu = &perf_tracepoint;
+ struct perf_event *event, *sibling;
+
+ perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) {
+ __perf_tp_event_target_task(count, record, regs, data, raw, event);
+ for_each_sibling_event(sibling, event)
+ __perf_tp_event_target_task(count, record, regs, data, raw, sibling);
+ }
+
+ perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) {
+ __perf_tp_event_target_task(count, record, regs, data, raw, event);
+ for_each_sibling_event(sibling, event)
+ __perf_tp_event_target_task(count, record, regs, data, raw, sibling);
+ }
+}
+
void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
struct pt_regs *regs, struct hlist_head *head, int rctx,
struct task_struct *task)
@@ -9750,14 +11066,22 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
},
};
- perf_sample_data_init(&data, 0, 0);
- data.raw = &raw;
-
perf_trace_buf_update(record, event_type);
hlist_for_each_entry_rcu(event, head, hlist_entry) {
- if (perf_tp_event_match(event, &data, regs))
+ if (perf_tp_event_match(event, &raw, regs)) {
+ /*
+ * Here use the same on-stack perf_sample_data,
+ * some members in data are event-specific and
+ * need to be re-computed for different sweveents.
+ * Re-initialize data->sample_flags safely to avoid
+ * the problem that next event skips preparing data
+ * because data->sample_flags is set.
+ */
+ perf_sample_data_init(&data, 0, 0);
+ perf_sample_save_raw_data(&data, event, &raw);
perf_swevent_event(event, count, &data, regs);
+ }
}
/*
@@ -9766,26 +11090,15 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
*/
if (task && task != current) {
struct perf_event_context *ctx;
- struct trace_entry *entry = record;
rcu_read_lock();
- ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
+ ctx = rcu_dereference(task->perf_event_ctxp);
if (!ctx)
goto unlock;
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
- if (event->cpu != smp_processor_id())
- continue;
- if (event->attr.type != PERF_TYPE_TRACEPOINT)
- continue;
- if (event->attr.config != entry->type)
- continue;
- /* Cannot deliver synchronous signal to other task. */
- if (event->attr.sigtrap)
- continue;
- if (perf_tp_event_match(event, &data, regs))
- perf_swevent_event(event, count, &data, regs);
- }
+ raw_spin_lock(&ctx->lock);
+ perf_tp_event_target_task(count, record, regs, &data, &raw, ctx);
+ raw_spin_unlock(&ctx->lock);
unlock:
rcu_read_unlock();
}
@@ -9794,44 +11107,6 @@ unlock:
}
EXPORT_SYMBOL_GPL(perf_tp_event);
-static void tp_perf_event_destroy(struct perf_event *event)
-{
- perf_trace_destroy(event);
-}
-
-static int perf_tp_event_init(struct perf_event *event)
-{
- int err;
-
- if (event->attr.type != PERF_TYPE_TRACEPOINT)
- return -ENOENT;
-
- /*
- * no branch sampling for tracepoint events
- */
- if (has_branch_stack(event))
- return -EOPNOTSUPP;
-
- err = perf_trace_init(event);
- if (err)
- return err;
-
- event->destroy = tp_perf_event_destroy;
-
- return 0;
-}
-
-static struct pmu perf_tracepoint = {
- .task_ctx_nr = perf_sw_context,
-
- .event_init = perf_tp_event_init,
- .add = perf_trace_add,
- .del = perf_trace_del,
- .start = perf_swevent_start,
- .stop = perf_swevent_stop,
- .read = perf_swevent_read,
-};
-
#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
/*
* Flags in config, used by dynamic PMU kprobe and uprobe
@@ -9952,7 +11227,7 @@ static int perf_uprobe_event_init(struct perf_event *event)
if (event->attr.type != perf_uprobe.type)
return -ENOENT;
- if (!perfmon_capable())
+ if (!capable(CAP_SYS_ADMIN))
return -EACCES;
/*
@@ -9989,95 +11264,6 @@ static void perf_event_free_filter(struct perf_event *event)
ftrace_profile_free_filter(event);
}
-#ifdef CONFIG_BPF_SYSCALL
-static void bpf_overflow_handler(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
-{
- struct bpf_perf_event_data_kern ctx = {
- .data = data,
- .event = event,
- };
- struct bpf_prog *prog;
- int ret = 0;
-
- ctx.regs = perf_arch_bpf_user_pt_regs(regs);
- if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
- goto out;
- rcu_read_lock();
- prog = READ_ONCE(event->prog);
- if (prog)
- ret = bpf_prog_run(prog, &ctx);
- rcu_read_unlock();
-out:
- __this_cpu_dec(bpf_prog_active);
- if (!ret)
- return;
-
- event->orig_overflow_handler(event, data, regs);
-}
-
-static int perf_event_set_bpf_handler(struct perf_event *event,
- struct bpf_prog *prog,
- u64 bpf_cookie)
-{
- if (event->overflow_handler_context)
- /* hw breakpoint or kernel counter */
- return -EINVAL;
-
- if (event->prog)
- return -EEXIST;
-
- if (prog->type != BPF_PROG_TYPE_PERF_EVENT)
- return -EINVAL;
-
- if (event->attr.precise_ip &&
- prog->call_get_stack &&
- (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) ||
- event->attr.exclude_callchain_kernel ||
- event->attr.exclude_callchain_user)) {
- /*
- * On perf_event with precise_ip, calling bpf_get_stack()
- * may trigger unwinder warnings and occasional crashes.
- * bpf_get_[stack|stackid] works around this issue by using
- * callchain attached to perf_sample_data. If the
- * perf_event does not full (kernel and user) callchain
- * attached to perf_sample_data, do not allow attaching BPF
- * program that calls bpf_get_[stack|stackid].
- */
- return -EPROTO;
- }
-
- event->prog = prog;
- event->bpf_cookie = bpf_cookie;
- event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
- WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
- return 0;
-}
-
-static void perf_event_free_bpf_handler(struct perf_event *event)
-{
- struct bpf_prog *prog = event->prog;
-
- if (!prog)
- return;
-
- WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
- event->prog = NULL;
- bpf_prog_put(prog);
-}
-#else
-static int perf_event_set_bpf_handler(struct perf_event *event,
- struct bpf_prog *prog,
- u64 bpf_cookie)
-{
- return -EOPNOTSUPP;
-}
-static void perf_event_free_bpf_handler(struct perf_event *event)
-{
-}
-#endif
-
/*
* returns true if the event is a tracepoint, or a kprobe/upprobe created
* with perf_event_open()
@@ -10097,29 +11283,41 @@ static inline bool perf_event_is_tracing(struct perf_event *event)
return false;
}
-int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
- u64 bpf_cookie)
+static int __perf_event_set_bpf_prog(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
{
- bool is_kprobe, is_tracepoint, is_syscall_tp;
+ bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp;
+
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return -ENODEV;
if (!perf_event_is_tracing(event))
return perf_event_set_bpf_handler(event, prog, bpf_cookie);
- is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
+ is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_KPROBE;
+ is_uprobe = event->tp_event->flags & TRACE_EVENT_FL_UPROBE;
is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
is_syscall_tp = is_syscall_trace_event(event->tp_event);
- if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
+ if (!is_kprobe && !is_uprobe && !is_tracepoint && !is_syscall_tp)
/* bpf programs can only be attached to u/kprobe or tracepoint */
return -EINVAL;
- if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
+ if (((is_kprobe || is_uprobe) && prog->type != BPF_PROG_TYPE_KPROBE) ||
(is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
(is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT))
return -EINVAL;
+ if (prog->type == BPF_PROG_TYPE_KPROBE && prog->sleepable && !is_uprobe)
+ /* only uprobe programs are allowed to be sleepable */
+ return -EINVAL;
+
/* Kprobe override only works for kprobes, not uprobes. */
- if (prog->kprobe_override &&
- !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
+ if (prog->kprobe_override && !is_kprobe)
+ return -EINVAL;
+
+ /* Writing to context allowed only for uprobes. */
+ if (prog->aux->kprobe_write_ctx && !is_uprobe)
return -EINVAL;
if (is_tracepoint || is_syscall_tp) {
@@ -10132,8 +11330,25 @@ int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
return perf_event_attach_bpf_prog(event, prog, bpf_cookie);
}
+int perf_event_set_bpf_prog(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
+{
+ struct perf_event_context *ctx;
+ int ret;
+
+ ctx = perf_event_ctx_lock(event);
+ ret = __perf_event_set_bpf_prog(event, prog, bpf_cookie);
+ perf_event_ctx_unlock(event, ctx);
+
+ return ret;
+}
+
void perf_event_free_bpf_prog(struct perf_event *event)
{
+ if (!event->prog)
+ return;
+
if (!perf_event_is_tracing(event)) {
perf_event_free_bpf_handler(event);
return;
@@ -10151,7 +11366,15 @@ static void perf_event_free_filter(struct perf_event *event)
{
}
-int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
+static int __perf_event_set_bpf_prog(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
+{
+ return -ENOENT;
+}
+
+int perf_event_set_bpf_prog(struct perf_event *event,
+ struct bpf_prog *prog,
u64 bpf_cookie)
{
return -ENOENT;
@@ -10232,6 +11455,17 @@ static void perf_addr_filters_splice(struct perf_event *event,
free_filters_list(&list);
}
+static void perf_free_addr_filters(struct perf_event *event)
+{
+ /*
+ * Used during free paths, there is no concurrency.
+ */
+ if (list_empty(&event->addr_filters.list))
+ return;
+
+ perf_addr_filters_splice(event, NULL);
+}
+
/*
* Scan through mm's vmas and see if one of them matches the
* @filter; if so, adjust filter's address range.
@@ -10242,8 +11476,9 @@ static void perf_addr_filter_apply(struct perf_addr_filter *filter,
struct perf_addr_filter_range *fr)
{
struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (!vma->vm_file)
continue;
@@ -10454,8 +11689,6 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
*/
if (state == IF_STATE_END) {
ret = -EINVAL;
- if (kernel && event->attr.exclude_kernel)
- goto fail;
/*
* ACTION "filter" must have a non-zero length region
@@ -10497,8 +11730,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
}
/* ready to consume more filters */
+ kfree(filename);
+ filename = NULL;
state = IF_STATE_ACTION;
filter = NULL;
+ kernel = 0;
}
}
@@ -10608,7 +11844,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
event = container_of(hrtimer, struct perf_event, hw.hrtimer);
- if (event->state != PERF_EVENT_STATE_ACTIVE)
+ if (event->state != PERF_EVENT_STATE_ACTIVE ||
+ event->hw.state & PERF_HES_STOPPED)
return HRTIMER_NORESTART;
event->pmu->read(event);
@@ -10653,11 +11890,21 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
- if (is_sampling_event(event)) {
+ /*
+ * Careful: this function can be triggered in the hrtimer handler,
+ * for cpu-clock events, so hrtimer_cancel() would cause a
+ * deadlock.
+ *
+ * So use hrtimer_try_to_cancel() to try to stop the hrtimer,
+ * and the cpu-clock handler also sets the PERF_HES_STOPPED flag,
+ * which guarantees that perf_swevent_hrtimer() will stop the
+ * hrtimer once it sees the PERF_HES_STOPPED flag.
+ */
+ if (is_sampling_event(event) && (hwc->interrupts != MAX_INTERRUPTS)) {
ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
local64_set(&hwc->period_left, ktime_to_ns(remaining));
- hrtimer_cancel(&hwc->hrtimer);
+ hrtimer_try_to_cancel(&hwc->hrtimer);
}
}
@@ -10668,8 +11915,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
if (!is_sampling_event(event))
return;
- hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
- hwc->hrtimer.function = perf_swevent_hrtimer;
+ hrtimer_setup(&hwc->hrtimer, perf_swevent_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
/*
* Since hrtimers have a fixed rate, we can do a static freq->period
@@ -10702,14 +11948,17 @@ static void cpu_clock_event_update(struct perf_event *event)
static void cpu_clock_event_start(struct perf_event *event, int flags)
{
+ event->hw.state = 0;
local64_set(&event->hw.prev_count, local_clock());
perf_swevent_start_hrtimer(event);
}
static void cpu_clock_event_stop(struct perf_event *event, int flags)
{
+ event->hw.state = PERF_HES_STOPPED;
perf_swevent_cancel_hrtimer(event);
- cpu_clock_event_update(event);
+ if (flags & PERF_EF_UPDATE)
+ cpu_clock_event_update(event);
}
static int cpu_clock_event_add(struct perf_event *event, int flags)
@@ -10723,7 +11972,7 @@ static int cpu_clock_event_add(struct perf_event *event, int flags)
static void cpu_clock_event_del(struct perf_event *event, int flags)
{
- cpu_clock_event_stop(event, flags);
+ cpu_clock_event_stop(event, PERF_EF_UPDATE);
}
static void cpu_clock_event_read(struct perf_event *event)
@@ -10733,7 +11982,7 @@ static void cpu_clock_event_read(struct perf_event *event)
static int cpu_clock_event_init(struct perf_event *event)
{
- if (event->attr.type != PERF_TYPE_SOFTWARE)
+ if (event->attr.type != perf_cpu_clock.type)
return -ENOENT;
if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
@@ -10754,6 +12003,7 @@ static struct pmu perf_cpu_clock = {
.task_ctx_nr = perf_sw_context,
.capabilities = PERF_PMU_CAP_NO_NMI,
+ .dev = PMU_NULL_DEV,
.event_init = cpu_clock_event_init,
.add = cpu_clock_event_add,
@@ -10779,14 +12029,17 @@ static void task_clock_event_update(struct perf_event *event, u64 now)
static void task_clock_event_start(struct perf_event *event, int flags)
{
+ event->hw.state = 0;
local64_set(&event->hw.prev_count, event->ctx->time);
perf_swevent_start_hrtimer(event);
}
static void task_clock_event_stop(struct perf_event *event, int flags)
{
+ event->hw.state = PERF_HES_STOPPED;
perf_swevent_cancel_hrtimer(event);
- task_clock_event_update(event, event->ctx->time);
+ if (flags & PERF_EF_UPDATE)
+ task_clock_event_update(event, event->ctx->time);
}
static int task_clock_event_add(struct perf_event *event, int flags)
@@ -10814,7 +12067,7 @@ static void task_clock_event_read(struct perf_event *event)
static int task_clock_event_init(struct perf_event *event)
{
- if (event->attr.type != PERF_TYPE_SOFTWARE)
+ if (event->attr.type != perf_task_clock.type)
return -ENOENT;
if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
@@ -10835,6 +12088,7 @@ static struct pmu perf_task_clock = {
.task_ctx_nr = perf_sw_context,
.capabilities = PERF_PMU_CAP_NO_NMI,
+ .dev = PMU_NULL_DEV,
.event_init = task_clock_event_init,
.add = task_clock_event_add,
@@ -10905,38 +12159,6 @@ static int perf_event_idx_default(struct perf_event *event)
}
/*
- * Ensures all contexts with the same task_ctx_nr have the same
- * pmu_cpu_context too.
- */
-static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
-{
- struct pmu *pmu;
-
- if (ctxn < 0)
- return NULL;
-
- list_for_each_entry(pmu, &pmus, entry) {
- if (pmu->task_ctx_nr == ctxn)
- return pmu->pmu_cpu_context;
- }
-
- return NULL;
-}
-
-static void free_pmu_context(struct pmu *pmu)
-{
- /*
- * Static contexts such as perf_sw_context have a global lifetime
- * and may be shared between different PMUs. Avoid freeing them
- * when a single PMU is going away.
- */
- if (pmu->task_ctx_nr > perf_invalid_context)
- return;
-
- free_percpu(pmu->pmu_cpu_context);
-}
-
-/*
* Let userspace know that this PMU supports address range filtering:
*/
static ssize_t nr_addr_filters_show(struct device *dev,
@@ -10945,7 +12167,7 @@ static ssize_t nr_addr_filters_show(struct device *dev,
{
struct pmu *pmu = dev_get_drvdata(dev);
- return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
+ return sysfs_emit(page, "%d\n", pmu->nr_addr_filters);
}
DEVICE_ATTR_RO(nr_addr_filters);
@@ -10956,7 +12178,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)
{
struct pmu *pmu = dev_get_drvdata(dev);
- return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
+ return sysfs_emit(page, "%d\n", pmu->type);
}
static DEVICE_ATTR_RO(type);
@@ -10967,7 +12189,7 @@ perf_event_mux_interval_ms_show(struct device *dev,
{
struct pmu *pmu = dev_get_drvdata(dev);
- return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
+ return sysfs_emit(page, "%d\n", pmu->hrtimer_interval_ms);
}
static DEFINE_MUTEX(mux_interval_mutex);
@@ -10997,12 +12219,11 @@ perf_event_mux_interval_ms_store(struct device *dev,
/* update all cpuctx for this PMU */
cpus_read_lock();
for_each_online_cpu(cpu) {
- struct perf_cpu_context *cpuctx;
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+ struct perf_cpu_pmu_context *cpc;
+ cpc = *per_cpu_ptr(pmu->cpu_pmu_context, cpu);
+ cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
- cpu_function_call(cpu,
- (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
+ cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc);
}
cpus_read_unlock();
mutex_unlock(&mux_interval_mutex);
@@ -11011,15 +12232,90 @@ perf_event_mux_interval_ms_store(struct device *dev,
}
static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
+static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu)
+{
+ switch (scope) {
+ case PERF_PMU_SCOPE_CORE:
+ return topology_sibling_cpumask(cpu);
+ case PERF_PMU_SCOPE_DIE:
+ return topology_die_cpumask(cpu);
+ case PERF_PMU_SCOPE_CLUSTER:
+ return topology_cluster_cpumask(cpu);
+ case PERF_PMU_SCOPE_PKG:
+ return topology_core_cpumask(cpu);
+ case PERF_PMU_SCOPE_SYS_WIDE:
+ return cpu_online_mask;
+ }
+
+ return NULL;
+}
+
+static inline struct cpumask *perf_scope_cpumask(unsigned int scope)
+{
+ switch (scope) {
+ case PERF_PMU_SCOPE_CORE:
+ return perf_online_core_mask;
+ case PERF_PMU_SCOPE_DIE:
+ return perf_online_die_mask;
+ case PERF_PMU_SCOPE_CLUSTER:
+ return perf_online_cluster_mask;
+ case PERF_PMU_SCOPE_PKG:
+ return perf_online_pkg_mask;
+ case PERF_PMU_SCOPE_SYS_WIDE:
+ return perf_online_sys_mask;
+ }
+
+ return NULL;
+}
+
+static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct pmu *pmu = dev_get_drvdata(dev);
+ struct cpumask *mask = perf_scope_cpumask(pmu->scope);
+
+ if (mask)
+ return cpumap_print_to_pagebuf(true, buf, mask);
+ return 0;
+}
+
+static DEVICE_ATTR_RO(cpumask);
+
static struct attribute *pmu_dev_attrs[] = {
&dev_attr_type.attr,
&dev_attr_perf_event_mux_interval_ms.attr,
+ &dev_attr_nr_addr_filters.attr,
+ &dev_attr_cpumask.attr,
+ NULL,
+};
+
+static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct pmu *pmu = dev_get_drvdata(dev);
+
+ if (n == 2 && !pmu->nr_addr_filters)
+ return 0;
+
+ /* cpumask */
+ if (n == 3 && pmu->scope == PERF_PMU_SCOPE_NONE)
+ return 0;
+
+ return a->mode;
+}
+
+static struct attribute_group pmu_dev_attr_group = {
+ .is_visible = pmu_dev_is_visible,
+ .attrs = pmu_dev_attrs,
+};
+
+static const struct attribute_group *pmu_dev_groups[] = {
+ &pmu_dev_attr_group,
NULL,
};
-ATTRIBUTE_GROUPS(pmu_dev);
static int pmu_bus_running;
-static struct bus_type pmu_bus = {
+static const struct bus_type pmu_bus = {
.name = "event_source",
.dev_groups = pmu_dev_groups,
};
@@ -11039,29 +12335,25 @@ static int pmu_dev_alloc(struct pmu *pmu)
pmu->dev->groups = pmu->attr_groups;
device_initialize(pmu->dev);
- ret = dev_set_name(pmu->dev, "%s", pmu->name);
- if (ret)
- goto free_dev;
dev_set_drvdata(pmu->dev, pmu);
pmu->dev->bus = &pmu_bus;
+ pmu->dev->parent = pmu->parent;
pmu->dev->release = pmu_dev_release;
- ret = device_add(pmu->dev);
+
+ ret = dev_set_name(pmu->dev, "%s", pmu->name);
if (ret)
goto free_dev;
- /* For PMUs with address filters, throw in an extra attribute: */
- if (pmu->nr_addr_filters)
- ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
-
+ ret = device_add(pmu->dev);
if (ret)
- goto del_dev;
+ goto free_dev;
- if (pmu->attr_update)
+ if (pmu->attr_update) {
ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
-
- if (ret)
- goto del_dev;
+ if (ret)
+ goto del_dev;
+ }
out:
return ret;
@@ -11071,89 +12363,111 @@ del_dev:
free_dev:
put_device(pmu->dev);
+ pmu->dev = NULL;
goto out;
}
static struct lock_class_key cpuctx_mutex;
static struct lock_class_key cpuctx_lock;
-int perf_pmu_register(struct pmu *pmu, const char *name, int type)
+static bool idr_cmpxchg(struct idr *idr, unsigned long id, void *old, void *new)
{
- int cpu, ret, max = PERF_TYPE_MAX;
+ void *tmp, *val = idr_find(idr, id);
- mutex_lock(&pmus_lock);
- ret = -ENOMEM;
- pmu->pmu_disable_count = alloc_percpu(int);
- if (!pmu->pmu_disable_count)
- goto unlock;
+ if (val != old)
+ return false;
- pmu->type = -1;
- if (!name)
- goto skip_type;
- pmu->name = name;
+ tmp = idr_replace(idr, new, id);
+ if (IS_ERR(tmp))
+ return false;
- if (type != PERF_TYPE_SOFTWARE) {
- if (type >= 0)
- max = type;
+ WARN_ON_ONCE(tmp != val);
+ return true;
+}
- ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
- if (ret < 0)
- goto free_pdc;
+static void perf_pmu_free(struct pmu *pmu)
+{
+ if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) {
+ if (pmu->nr_addr_filters)
+ device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
+ device_del(pmu->dev);
+ put_device(pmu->dev);
+ }
- WARN_ON(type >= 0 && ret != type);
+ if (pmu->cpu_pmu_context) {
+ int cpu;
- type = ret;
- }
- pmu->type = type;
+ for_each_possible_cpu(cpu) {
+ struct perf_cpu_pmu_context *cpc;
- if (pmu_bus_running) {
- ret = pmu_dev_alloc(pmu);
- if (ret)
- goto free_idr;
+ cpc = *per_cpu_ptr(pmu->cpu_pmu_context, cpu);
+ if (!cpc)
+ continue;
+ if (cpc->epc.embedded) {
+ /* refcount managed */
+ put_pmu_ctx(&cpc->epc);
+ continue;
+ }
+ kfree(cpc);
+ }
+ free_percpu(pmu->cpu_pmu_context);
}
+}
-skip_type:
- if (pmu->task_ctx_nr == perf_hw_context) {
- static int hw_context_taken = 0;
+DEFINE_FREE(pmu_unregister, struct pmu *, if (_T) perf_pmu_free(_T))
- /*
- * Other than systems with heterogeneous CPUs, it never makes
- * sense for two PMUs to share perf_hw_context. PMUs which are
- * uncore must use perf_invalid_context.
- */
- if (WARN_ON_ONCE(hw_context_taken &&
- !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
- pmu->task_ctx_nr = perf_invalid_context;
+int perf_pmu_register(struct pmu *_pmu, const char *name, int type)
+{
+ int cpu, max = PERF_TYPE_MAX;
- hw_context_taken = 1;
- }
+ struct pmu *pmu __free(pmu_unregister) = _pmu;
+ guard(mutex)(&pmus_lock);
- pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
- if (pmu->pmu_cpu_context)
- goto got_cpu_context;
+ if (WARN_ONCE(!name, "Can not register anonymous pmu.\n"))
+ return -EINVAL;
- ret = -ENOMEM;
- pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
- if (!pmu->pmu_cpu_context)
- goto free_dev;
+ if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE,
+ "Can not register a pmu with an invalid scope.\n"))
+ return -EINVAL;
- for_each_possible_cpu(cpu) {
- struct perf_cpu_context *cpuctx;
+ pmu->name = name;
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- __perf_event_init_context(&cpuctx->ctx);
- lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
- lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
- cpuctx->ctx.pmu = pmu;
- cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
+ if (type >= 0)
+ max = type;
- __perf_mux_hrtimer_init(cpuctx, cpu);
+ CLASS(idr_alloc, pmu_type)(&pmu_idr, NULL, max, 0, GFP_KERNEL);
+ if (pmu_type.id < 0)
+ return pmu_type.id;
- cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
- cpuctx->heap = cpuctx->heap_default;
+ WARN_ON(type >= 0 && pmu_type.id != type);
+
+ pmu->type = pmu_type.id;
+ atomic_set(&pmu->exclusive_cnt, 0);
+
+ if (pmu_bus_running && !pmu->dev) {
+ int ret = pmu_dev_alloc(pmu);
+ if (ret)
+ return ret;
+ }
+
+ pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context *);
+ if (!pmu->cpu_pmu_context)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ struct perf_cpu_pmu_context *cpc =
+ kmalloc_node(sizeof(struct perf_cpu_pmu_context),
+ GFP_KERNEL | __GFP_ZERO,
+ cpu_to_node(cpu));
+
+ if (!cpc)
+ return -ENOMEM;
+
+ *per_cpu_ptr(pmu->cpu_pmu_context, cpu) = cpc;
+ __perf_init_event_pmu_context(&cpc->epc, pmu);
+ __perf_mux_hrtimer_init(cpc, cpu);
}
-got_cpu_context:
if (!pmu->start_txn) {
if (pmu->pmu_enable) {
/*
@@ -11182,60 +12496,159 @@ got_cpu_context:
if (!pmu->event_idx)
pmu->event_idx = perf_event_idx_default;
+ INIT_LIST_HEAD(&pmu->events);
+ spin_lock_init(&pmu->events_lock);
+
/*
- * Ensure the TYPE_SOFTWARE PMUs are at the head of the list,
- * since these cannot be in the IDR. This way the linear search
- * is fast, provided a valid software event is provided.
+ * Now that the PMU is complete, make it visible to perf_try_init_event().
*/
- if (type == PERF_TYPE_SOFTWARE || !name)
- list_add_rcu(&pmu->entry, &pmus);
- else
- list_add_tail_rcu(&pmu->entry, &pmus);
+ if (!idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu))
+ return -EINVAL;
+ list_add_rcu(&pmu->entry, &pmus);
- atomic_set(&pmu->exclusive_cnt, 0);
- ret = 0;
-unlock:
- mutex_unlock(&pmus_lock);
+ take_idr_id(pmu_type);
+ _pmu = no_free_ptr(pmu); // let it rip
+ return 0;
+}
+EXPORT_SYMBOL_GPL(perf_pmu_register);
- return ret;
+static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ /*
+ * De-schedule the event and mark it REVOKED.
+ */
+ perf_event_exit_event(event, ctx, true);
-free_dev:
- device_del(pmu->dev);
- put_device(pmu->dev);
+ /*
+ * All _free_event() bits that rely on event->pmu:
+ *
+ * Notably, perf_mmap() relies on the ordering here.
+ */
+ scoped_guard (mutex, &event->mmap_mutex) {
+ WARN_ON_ONCE(pmu->event_unmapped);
+ /*
+ * Mostly an empty lock sequence, such that perf_mmap(), which
+ * relies on mmap_mutex, is sure to observe the state change.
+ */
+ }
-free_idr:
- if (pmu->type != PERF_TYPE_SOFTWARE)
- idr_remove(&pmu_idr, pmu->type);
+ perf_event_free_bpf_prog(event);
+ perf_free_addr_filters(event);
+
+ if (event->destroy) {
+ event->destroy(event);
+ event->destroy = NULL;
+ }
+
+ if (event->pmu_ctx) {
+ put_pmu_ctx(event->pmu_ctx);
+ event->pmu_ctx = NULL;
+ }
-free_pdc:
- free_percpu(pmu->pmu_disable_count);
- goto unlock;
+ exclusive_event_destroy(event);
+ module_put(pmu->module);
+
+ event->pmu = NULL; /* force fault instead of UAF */
}
-EXPORT_SYMBOL_GPL(perf_pmu_register);
-void perf_pmu_unregister(struct pmu *pmu)
+static void pmu_detach_event(struct pmu *pmu, struct perf_event *event)
{
- mutex_lock(&pmus_lock);
- list_del_rcu(&pmu->entry);
+ struct perf_event_context *ctx;
+
+ ctx = perf_event_ctx_lock(event);
+ __pmu_detach_event(pmu, event, ctx);
+ perf_event_ctx_unlock(event, ctx);
+
+ scoped_guard (spinlock, &pmu->events_lock)
+ list_del(&event->pmu_list);
+}
+
+static struct perf_event *pmu_get_event(struct pmu *pmu)
+{
+ struct perf_event *event;
+
+ guard(spinlock)(&pmu->events_lock);
+ list_for_each_entry(event, &pmu->events, pmu_list) {
+ if (atomic_long_inc_not_zero(&event->refcount))
+ return event;
+ }
+
+ return NULL;
+}
+
+static bool pmu_empty(struct pmu *pmu)
+{
+ guard(spinlock)(&pmu->events_lock);
+ return list_empty(&pmu->events);
+}
+
+static void pmu_detach_events(struct pmu *pmu)
+{
+ struct perf_event *event;
+
+ for (;;) {
+ event = pmu_get_event(pmu);
+ if (!event)
+ break;
+
+ pmu_detach_event(pmu, event);
+ put_event(event);
+ }
+
+ /*
+ * wait for pending _free_event()s
+ */
+ wait_var_event(pmu, pmu_empty(pmu));
+}
+
+int perf_pmu_unregister(struct pmu *pmu)
+{
+ scoped_guard (mutex, &pmus_lock) {
+ if (!idr_cmpxchg(&pmu_idr, pmu->type, pmu, NULL))
+ return -EINVAL;
+
+ list_del_rcu(&pmu->entry);
+ }
/*
* We dereference the pmu list under both SRCU and regular RCU, so
* synchronize against both of those.
+ *
+ * Notably, the entirety of event creation, from perf_init_event()
+ * (which will now fail, because of the above) until
+ * perf_install_in_context() should be under SRCU such that
+ * this synchronizes against event creation. This avoids trying to
+ * detach events that are not fully formed.
*/
synchronize_srcu(&pmus_srcu);
synchronize_rcu();
- free_percpu(pmu->pmu_disable_count);
- if (pmu->type != PERF_TYPE_SOFTWARE)
- idr_remove(&pmu_idr, pmu->type);
- if (pmu_bus_running) {
- if (pmu->nr_addr_filters)
- device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
- device_del(pmu->dev);
- put_device(pmu->dev);
+ if (pmu->event_unmapped && !pmu_empty(pmu)) {
+ /*
+ * Can't force remove events when pmu::event_unmapped()
+ * is used in perf_mmap_close().
+ */
+ guard(mutex)(&pmus_lock);
+ idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu);
+ list_add_rcu(&pmu->entry, &pmus);
+ return -EBUSY;
}
- free_pmu_context(pmu);
- mutex_unlock(&pmus_lock);
+
+ scoped_guard (mutex, &pmus_lock)
+ idr_remove(&pmu_idr, pmu->type);
+
+ /*
+ * PMU is removed from the pmus list, so no new events will
+ * be created, now take care of the existing ones.
+ */
+ pmu_detach_events(pmu);
+
+ /*
+ * PMU is unused, make it go away.
+ */
+ perf_pmu_free(pmu);
+ return 0;
}
EXPORT_SYMBOL_GPL(perf_pmu_unregister);
@@ -11275,39 +12688,74 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
if (ctx)
perf_event_ctx_unlock(event->group_leader, ctx);
- if (!ret) {
- if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
- has_extended_regs(event))
- ret = -EOPNOTSUPP;
+ if (ret)
+ goto err_pmu;
- if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
- event_has_any_exclude_flag(event))
- ret = -EINVAL;
+ if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
+ has_extended_regs(event)) {
+ ret = -EOPNOTSUPP;
+ goto err_destroy;
+ }
- if (ret && event->destroy)
- event->destroy(event);
+ if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
+ event_has_any_exclude_flag(event)) {
+ ret = -EINVAL;
+ goto err_destroy;
}
- if (ret)
- module_put(pmu->module);
+ if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) {
+ const struct cpumask *cpumask;
+ struct cpumask *pmu_cpumask;
+ int cpu;
+
+ cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu);
+ pmu_cpumask = perf_scope_cpumask(pmu->scope);
+
+ ret = -ENODEV;
+ if (!pmu_cpumask || !cpumask)
+ goto err_destroy;
+
+ cpu = cpumask_any_and(pmu_cpumask, cpumask);
+ if (cpu >= nr_cpu_ids)
+ goto err_destroy;
+
+ event->event_caps |= PERF_EV_CAP_READ_SCOPE;
+ }
+
+ return 0;
+
+err_destroy:
+ if (event->destroy) {
+ event->destroy(event);
+ event->destroy = NULL;
+ }
+err_pmu:
+ event->pmu = NULL;
+ module_put(pmu->module);
return ret;
}
static struct pmu *perf_init_event(struct perf_event *event)
{
bool extended_type = false;
- int idx, type, ret;
struct pmu *pmu;
+ int type, ret;
- idx = srcu_read_lock(&pmus_srcu);
+ guard(srcu)(&pmus_srcu); /* pmu idr/list access */
+
+ /*
+ * Save original type before calling pmu->event_init() since certain
+ * pmus overwrites event->attr.type to forward event to another pmu.
+ */
+ event->orig_type = event->attr.type;
/* Try parent's PMU first: */
if (event->parent && event->parent->pmu) {
pmu = event->parent->pmu;
ret = perf_try_init_event(pmu, event);
if (!ret)
- goto unlock;
+ return pmu;
}
/*
@@ -11326,13 +12774,12 @@ static struct pmu *perf_init_event(struct perf_event *event)
}
again:
- rcu_read_lock();
- pmu = idr_find(&pmu_idr, type);
- rcu_read_unlock();
+ scoped_guard (rcu)
+ pmu = idr_find(&pmu_idr, type);
if (pmu) {
if (event->attr.type != type && type != PERF_TYPE_RAW &&
!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE))
- goto fail;
+ return ERR_PTR(-ENOENT);
ret = perf_try_init_event(pmu, event);
if (ret == -ENOENT && event->attr.type != type && !extended_type) {
@@ -11341,27 +12788,21 @@ again:
}
if (ret)
- pmu = ERR_PTR(ret);
+ return ERR_PTR(ret);
- goto unlock;
+ return pmu;
}
list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
ret = perf_try_init_event(pmu, event);
if (!ret)
- goto unlock;
+ return pmu;
- if (ret != -ENOENT) {
- pmu = ERR_PTR(ret);
- goto unlock;
- }
+ if (ret != -ENOENT)
+ return ERR_PTR(ret);
}
-fail:
- pmu = ERR_PTR(-ENOENT);
-unlock:
- srcu_read_unlock(&pmus_srcu, idx);
- return pmu;
+ return ERR_PTR(-ENOENT);
}
static void attach_sb_event(struct perf_event *event)
@@ -11386,15 +12827,6 @@ static void account_pmu_sb_event(struct perf_event *event)
attach_sb_event(event);
}
-static void account_event_cpu(struct perf_event *event, int cpu)
-{
- if (event->parent)
- return;
-
- if (is_cgroup_event(event))
- atomic_inc(&per_cpu(perf_cgroup_events, cpu));
-}
-
/* Freq events need the tick to stay alive (see perf_event_task_tick). */
static void account_freq_event_nohz(void)
{
@@ -11482,8 +12914,6 @@ static void account_event(struct perf_event *event)
}
enabled:
- account_event_cpu(event, event->cpu);
-
account_pmu_sb_event(event);
}
@@ -11499,7 +12929,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
void *context, int cgroup_fd)
{
struct pmu *pmu;
- struct perf_event *event;
struct hw_perf_event *hwc;
long err = -EINVAL;
int node;
@@ -11514,8 +12943,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
}
node = (cpu >= 0) ? cpu_to_node(cpu) : -1;
- event = kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO,
- node);
+ struct perf_event *event __free(__free_event) =
+ kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO, node);
if (!event)
return ERR_PTR(-ENOMEM);
@@ -11537,11 +12966,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
INIT_LIST_HEAD(&event->active_entry);
INIT_LIST_HEAD(&event->addr_filters.list);
INIT_HLIST_NODE(&event->hlist_entry);
+ INIT_LIST_HEAD(&event->pmu_list);
init_waitqueue_head(&event->waitq);
- event->pending_disable = -1;
- init_irq_work(&event->pending, perf_pending_event);
+ init_irq_work(&event->pending_irq, perf_pending_irq);
+ event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable);
+ init_task_work(&event->pending_task, perf_pending_task);
mutex_init(&event->mmap_mutex);
raw_spin_lock_init(&event->addr_filters.lock);
@@ -11560,8 +12991,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
event->state = PERF_EVENT_STATE_INACTIVE;
- if (event->attr.sigtrap)
- atomic_set(&event->event_limit, 1);
+ if (parent_event)
+ event->event_caps = parent_event->event_caps;
if (task) {
event->attach_state = PERF_ATTACH_TASK;
@@ -11581,13 +13012,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
overflow_handler = parent_event->overflow_handler;
context = parent_event->overflow_handler_context;
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
- if (overflow_handler == bpf_overflow_handler) {
+ if (parent_event->prog) {
struct bpf_prog *prog = parent_event->prog;
bpf_prog_inc(prog);
event->prog = prog;
- event->orig_overflow_handler =
- parent_event->orig_overflow_handler;
}
#endif
}
@@ -11609,61 +13038,78 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
hwc = &event->hw;
hwc->sample_period = attr->sample_period;
- if (attr->freq && attr->sample_freq)
+ if (is_event_in_freq_mode(event))
hwc->sample_period = 1;
hwc->last_period = hwc->sample_period;
local64_set(&hwc->period_left, hwc->sample_period);
/*
- * We currently do not support PERF_SAMPLE_READ on inherited events.
+ * We do not support PERF_SAMPLE_READ on inherited events unless
+ * PERF_SAMPLE_TID is also selected, which allows inherited events to
+ * collect per-thread samples.
* See perf_output_read().
*/
- if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
- goto err_ns;
+ if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID))
+ return ERR_PTR(-EINVAL);
if (!has_branch_stack(event))
event->attr.branch_sample_type = 0;
pmu = perf_init_event(event);
- if (IS_ERR(pmu)) {
- err = PTR_ERR(pmu);
- goto err_ns;
- }
+ if (IS_ERR(pmu))
+ return (void*)pmu;
/*
- * Disallow uncore-cgroup events, they don't make sense as the cgroup will
- * be different on other CPUs in the uncore mask.
+ * The PERF_ATTACH_TASK_DATA is set in the event_init()->hw_config().
+ * The attach should be right after the perf_init_event().
+ * Otherwise, the __free_event() would mistakenly detach the non-exist
+ * perf_ctx_data because of the other errors between them.
*/
- if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) {
- err = -EINVAL;
- goto err_pmu;
+ if (event->attach_state & PERF_ATTACH_TASK_DATA) {
+ err = attach_perf_ctx_data(event);
+ if (err)
+ return ERR_PTR(err);
}
+ /*
+ * Disallow uncore-task events. Similarly, disallow uncore-cgroup
+ * events (they don't make sense as the cgroup will be different
+ * on other CPUs in the uncore mask).
+ */
+ if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1))
+ return ERR_PTR(-EINVAL);
+
if (event->attr.aux_output &&
- !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
- err = -EOPNOTSUPP;
- goto err_pmu;
+ (!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT) ||
+ event->attr.aux_pause || event->attr.aux_resume))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (event->attr.aux_pause && event->attr.aux_resume)
+ return ERR_PTR(-EINVAL);
+
+ if (event->attr.aux_start_paused) {
+ if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE))
+ return ERR_PTR(-EOPNOTSUPP);
+ event->hw.aux_paused = 1;
}
if (cgroup_fd != -1) {
err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
if (err)
- goto err_pmu;
+ return ERR_PTR(err);
}
err = exclusive_event_init(event);
if (err)
- goto err_pmu;
+ return ERR_PTR(err);
if (has_addr_filter(event)) {
event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
sizeof(struct perf_addr_filter_range),
GFP_KERNEL);
- if (!event->addr_filter_ranges) {
- err = -ENOMEM;
- goto err_per_task;
- }
+ if (!event->addr_filter_ranges)
+ return ERR_PTR(-ENOMEM);
/*
* Clone the parent's vma offsets: they are valid until exec()
@@ -11687,44 +13133,26 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
err = get_callchain_buffers(attr->sample_max_stack);
if (err)
- goto err_addr_filters;
+ return ERR_PTR(err);
+ event->attach_state |= PERF_ATTACH_CALLCHAIN;
}
}
err = security_perf_event_alloc(event);
if (err)
- goto err_callchain_buffer;
+ return ERR_PTR(err);
/* symmetric to unaccount_event() in _free_event() */
account_event(event);
- return event;
-
-err_callchain_buffer:
- if (!event->parent) {
- if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
- put_callchain_buffers();
- }
-err_addr_filters:
- kfree(event->addr_filter_ranges);
-
-err_per_task:
- exclusive_event_destroy(event);
-
-err_pmu:
- if (is_cgroup_event(event))
- perf_detach_cgroup(event);
- if (event->destroy)
- event->destroy(event);
- module_put(pmu->module);
-err_ns:
- if (event->ns)
- put_pid_ns(event->ns);
- if (event->hw.target)
- put_task_struct(event->hw.target);
- kmem_cache_free(perf_event_cache, event);
+ /*
+ * Event creation should be under SRCU, see perf_pmu_unregister().
+ */
+ lockdep_assert_held(&pmus_srcu);
+ scoped_guard (spinlock, &pmu->events_lock)
+ list_add(&event->pmu_list, &pmu->events);
- return ERR_PTR(err);
+ return_ptr(event);
}
static int perf_copy_attr(struct perf_event_attr __user *uattr,
@@ -11794,7 +13222,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
}
/* privileged levels capture (kernel, hv): check permissions */
if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) {
- ret = perf_allow_kernel(attr);
+ ret = perf_allow_kernel();
if (ret)
return ret;
}
@@ -11853,14 +13281,25 @@ err_size:
goto out;
}
+static void mutex_lock_double(struct mutex *a, struct mutex *b)
+{
+ if (b < a)
+ swap(a, b);
+
+ mutex_lock(a);
+ mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
+}
+
static int
perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
{
struct perf_buffer *rb = NULL;
int ret = -EINVAL;
- if (!output_event)
+ if (!output_event) {
+ mutex_lock(&event->mmap_mutex);
goto set;
+ }
/* don't allow circular references */
if (event == output_event)
@@ -11875,7 +13314,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
/*
* If its not a per-cpu rb, it must be the same task.
*/
- if (output_event->cpu == -1 && output_event->ctx != event->ctx)
+ if (output_event->cpu == -1 && output_event->hw.target != event->hw.target)
goto out;
/*
@@ -11898,17 +13337,33 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
event->pmu != output_event->pmu)
goto out;
+ /*
+ * Hold both mmap_mutex to serialize against perf_mmap_close(). Since
+ * output_event is already on rb->event_list, and the list iteration
+ * restarts after every removal, it is guaranteed this new event is
+ * observed *OR* if output_event is already removed, it's guaranteed we
+ * observe !rb->mmap_count.
+ */
+ mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
set:
- mutex_lock(&event->mmap_mutex);
/* Can't redirect output if we've got an active mmap() */
- if (atomic_read(&event->mmap_count))
+ if (refcount_read(&event->mmap_count))
goto unlock;
if (output_event) {
+ if (output_event->state <= PERF_EVENT_STATE_REVOKED)
+ goto unlock;
+
/* get the rb we want to redirect to */
rb = ring_buffer_get(output_event);
if (!rb)
goto unlock;
+
+ /* did we race against perf_mmap_close() */
+ if (!refcount_read(&rb->mmap_count)) {
+ ring_buffer_put(rb);
+ goto unlock;
+ }
}
ring_buffer_attach(event, rb);
@@ -11916,20 +13371,13 @@ set:
ret = 0;
unlock:
mutex_unlock(&event->mmap_mutex);
+ if (output_event)
+ mutex_unlock(&output_event->mmap_mutex);
out:
return ret;
}
-static void mutex_lock_double(struct mutex *a, struct mutex *b)
-{
- if (b < a)
- swap(a, b);
-
- mutex_lock(a);
- mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
-}
-
static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
{
bool nmi_safe = false;
@@ -11967,37 +13415,6 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
return 0;
}
-/*
- * Variation on perf_event_ctx_lock_nested(), except we take two context
- * mutexes.
- */
-static struct perf_event_context *
-__perf_event_ctx_lock_double(struct perf_event *group_leader,
- struct perf_event_context *ctx)
-{
- struct perf_event_context *gctx;
-
-again:
- rcu_read_lock();
- gctx = READ_ONCE(group_leader->ctx);
- if (!refcount_inc_not_zero(&gctx->refcount)) {
- rcu_read_unlock();
- goto again;
- }
- rcu_read_unlock();
-
- mutex_lock_double(&gctx->mutex, &ctx->mutex);
-
- if (group_leader->ctx != gctx) {
- mutex_unlock(&ctx->mutex);
- mutex_unlock(&gctx->mutex);
- put_ctx(gctx);
- goto again;
- }
-
- return gctx;
-}
-
static bool
perf_check_permission(struct perf_event_attr *attr, struct task_struct *task)
{
@@ -12043,11 +13460,11 @@ SYSCALL_DEFINE5(perf_event_open,
pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
struct perf_event *group_leader = NULL, *output_event = NULL;
+ struct perf_event_pmu_context *pmu_ctx;
struct perf_event *event, *sibling;
struct perf_event_attr attr;
- struct perf_event_context *ctx, *gctx;
+ struct perf_event_context *ctx;
struct file *event_file = NULL;
- struct fd group = {NULL, 0};
struct task_struct *task = NULL;
struct pmu *pmu;
int event_fd;
@@ -12060,17 +13477,17 @@ SYSCALL_DEFINE5(perf_event_open,
if (flags & ~PERF_FLAG_ALL)
return -EINVAL;
- /* Do we allow access to perf_event_open(2) ? */
- err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
+ err = perf_copy_attr(attr_uptr, &attr);
if (err)
return err;
- err = perf_copy_attr(attr_uptr, &attr);
+ /* Do we allow access to perf_event_open(2) ? */
+ err = security_perf_event_open(PERF_SECURITY_OPEN);
if (err)
return err;
if (!attr.exclude_kernel) {
- err = perf_allow_kernel(&attr);
+ err = perf_allow_kernel();
if (err)
return err;
}
@@ -12090,7 +13507,7 @@ SYSCALL_DEFINE5(perf_event_open,
/* Only privileged users can get physical addresses */
if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
- err = perf_allow_kernel(&attr);
+ err = perf_allow_kernel();
if (err)
return err;
}
@@ -12118,11 +13535,22 @@ SYSCALL_DEFINE5(perf_event_open,
if (event_fd < 0)
return event_fd;
+ /*
+ * Event creation should be under SRCU, see perf_pmu_unregister().
+ */
+ guard(srcu)(&pmus_srcu);
+
+ CLASS(fd, group)(group_fd); // group_fd == -1 => empty
if (group_fd != -1) {
- err = perf_fget_light(group_fd, &group);
- if (err)
+ if (!is_perf_file(group)) {
+ err = -EBADF;
+ goto err_fd;
+ }
+ group_leader = fd_file(group)->private_data;
+ if (group_leader->state <= PERF_EVENT_STATE_REVOKED) {
+ err = -ENODEV;
goto err_fd;
- group_leader = group.file->private_data;
+ }
if (flags & PERF_FLAG_FD_OUTPUT)
output_event = group_leader;
if (flags & PERF_FLAG_FD_NO_GROUP)
@@ -12133,7 +13561,7 @@ SYSCALL_DEFINE5(perf_event_open,
task = find_lively_task_by_vpid(pid);
if (IS_ERR(task)) {
err = PTR_ERR(task);
- goto err_group_fd;
+ goto err_fd;
}
}
@@ -12175,42 +13603,53 @@ SYSCALL_DEFINE5(perf_event_open,
if (pmu->task_ctx_nr == perf_sw_context)
event->event_caps |= PERF_EV_CAP_SOFTWARE;
- if (group_leader) {
- if (is_software_event(event) &&
- !in_software_context(group_leader)) {
- /*
- * If the event is a sw event, but the group_leader
- * is on hw context.
- *
- * Allow the addition of software events to hw
- * groups, this is safe because software events
- * never fail to schedule.
- */
- pmu = group_leader->ctx->pmu;
- } else if (!is_software_event(event) &&
- is_software_event(group_leader) &&
- (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
- /*
- * In case the group is a pure software group, and we
- * try to add a hardware event, move the whole group to
- * the hardware context.
- */
- move_group = 1;
- }
+ if (task) {
+ err = down_read_interruptible(&task->signal->exec_update_lock);
+ if (err)
+ goto err_alloc;
+
+ /*
+ * We must hold exec_update_lock across this and any potential
+ * perf_install_in_context() call for this new event to
+ * serialize against exec() altering our credentials (and the
+ * perf_event_exit_task() that could imply).
+ */
+ err = -EACCES;
+ if (!perf_check_permission(&attr, task))
+ goto err_cred;
}
/*
* Get the target context (task or percpu):
*/
- ctx = find_get_context(pmu, task, event);
+ ctx = find_get_context(task, event);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
- goto err_alloc;
+ goto err_cred;
+ }
+
+ mutex_lock(&ctx->mutex);
+
+ if (ctx->task == TASK_TOMBSTONE) {
+ err = -ESRCH;
+ goto err_locked;
+ }
+
+ if (!task) {
+ /*
+ * Check if the @cpu we're creating an event for is online.
+ *
+ * We use the perf_cpu_context::ctx::mutex to serialize against
+ * the hotplug notifiers. See perf_event_{init,exit}_cpu().
+ */
+ struct perf_cpu_context *cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);
+
+ if (!cpuctx->online) {
+ err = -ENODEV;
+ goto err_locked;
+ }
}
- /*
- * Look up the group leader (we will attach this event to it):
- */
if (group_leader) {
err = -EINVAL;
@@ -12219,11 +13658,11 @@ SYSCALL_DEFINE5(perf_event_open,
* becoming part of another group-sibling):
*/
if (group_leader->group_leader != group_leader)
- goto err_context;
+ goto err_locked;
/* All events in a group should have the same clock */
if (group_leader->clock != event->clock)
- goto err_context;
+ goto err_locked;
/*
* Make sure we're both events for the same CPU;
@@ -12231,131 +13670,76 @@ SYSCALL_DEFINE5(perf_event_open,
* you can never concurrently schedule them anyhow.
*/
if (group_leader->cpu != event->cpu)
- goto err_context;
-
- /*
- * Make sure we're both on the same task, or both
- * per-CPU events.
- */
- if (group_leader->ctx->task != ctx->task)
- goto err_context;
+ goto err_locked;
/*
- * Do not allow to attach to a group in a different task
- * or CPU context. If we're moving SW events, we'll fix
- * this up later, so allow that.
+ * Make sure we're both on the same context; either task or cpu.
*/
- if (!move_group && group_leader->ctx != ctx)
- goto err_context;
+ if (group_leader->ctx != ctx)
+ goto err_locked;
/*
* Only a group leader can be exclusive or pinned
*/
if (attr.exclusive || attr.pinned)
- goto err_context;
- }
-
- if (output_event) {
- err = perf_event_set_output(event, output_event);
- if (err)
- goto err_context;
- }
-
- event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
- f_flags);
- if (IS_ERR(event_file)) {
- err = PTR_ERR(event_file);
- event_file = NULL;
- goto err_context;
- }
-
- if (task) {
- err = down_read_interruptible(&task->signal->exec_update_lock);
- if (err)
- goto err_file;
-
- /*
- * We must hold exec_update_lock across this and any potential
- * perf_install_in_context() call for this new event to
- * serialize against exec() altering our credentials (and the
- * perf_event_exit_task() that could imply).
- */
- err = -EACCES;
- if (!perf_check_permission(&attr, task))
- goto err_cred;
- }
-
- if (move_group) {
- gctx = __perf_event_ctx_lock_double(group_leader, ctx);
-
- if (gctx->task == TASK_TOMBSTONE) {
- err = -ESRCH;
goto err_locked;
- }
- /*
- * Check if we raced against another sys_perf_event_open() call
- * moving the software group underneath us.
- */
- if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
+ if (is_software_event(event) &&
+ !in_software_context(group_leader)) {
/*
- * If someone moved the group out from under us, check
- * if this new event wound up on the same ctx, if so
- * its the regular !move_group case, otherwise fail.
+ * If the event is a sw event, but the group_leader
+ * is on hw context.
+ *
+ * Allow the addition of software events to hw
+ * groups, this is safe because software events
+ * never fail to schedule.
+ *
+ * Note the comment that goes with struct
+ * perf_event_pmu_context.
*/
- if (gctx != ctx) {
- err = -EINVAL;
- goto err_locked;
- } else {
- perf_event_ctx_unlock(group_leader, gctx);
- move_group = 0;
+ pmu = group_leader->pmu_ctx->pmu;
+ } else if (!is_software_event(event)) {
+ if (is_software_event(group_leader) &&
+ (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
+ /*
+ * In case the group is a pure software group, and we
+ * try to add a hardware event, move the whole group to
+ * the hardware context.
+ */
+ move_group = 1;
}
- }
-
- /*
- * Failure to create exclusive events returns -EBUSY.
- */
- err = -EBUSY;
- if (!exclusive_event_installable(group_leader, ctx))
- goto err_locked;
- for_each_sibling_event(sibling, group_leader) {
- if (!exclusive_event_installable(sibling, ctx))
+ /* Don't allow group of multiple hw events from different pmus */
+ if (!in_software_context(group_leader) &&
+ group_leader->pmu_ctx->pmu != pmu)
goto err_locked;
}
- } else {
- mutex_lock(&ctx->mutex);
}
- if (ctx->task == TASK_TOMBSTONE) {
- err = -ESRCH;
+ /*
+ * Now that we're certain of the pmu; find the pmu_ctx.
+ */
+ pmu_ctx = find_get_pmu_context(pmu, ctx, event);
+ if (IS_ERR(pmu_ctx)) {
+ err = PTR_ERR(pmu_ctx);
goto err_locked;
}
+ event->pmu_ctx = pmu_ctx;
- if (!perf_event_validate_size(event)) {
- err = -E2BIG;
- goto err_locked;
+ if (output_event) {
+ err = perf_event_set_output(event, output_event);
+ if (err)
+ goto err_context;
}
- if (!task) {
- /*
- * Check if the @cpu we're creating an event for is online.
- *
- * We use the perf_cpu_context::ctx::mutex to serialize against
- * the hotplug notifiers. See perf_event_{init,exit}_cpu().
- */
- struct perf_cpu_context *cpuctx =
- container_of(ctx, struct perf_cpu_context, ctx);
-
- if (!cpuctx->online) {
- err = -ENODEV;
- goto err_locked;
- }
+ if (!perf_event_validate_size(event)) {
+ err = -E2BIG;
+ goto err_context;
}
if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
err = -EINVAL;
- goto err_locked;
+ goto err_context;
}
/*
@@ -12364,36 +13748,33 @@ SYSCALL_DEFINE5(perf_event_open,
*/
if (!exclusive_event_installable(event, ctx)) {
err = -EBUSY;
- goto err_locked;
+ goto err_context;
}
WARN_ON_ONCE(ctx->parent_ctx);
+ event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, f_flags);
+ if (IS_ERR(event_file)) {
+ err = PTR_ERR(event_file);
+ event_file = NULL;
+ goto err_context;
+ }
+
/*
* This is the point on no return; we cannot fail hereafter. This is
* where we start modifying current state.
*/
if (move_group) {
- /*
- * See perf_event_ctx_lock() for comments on the details
- * of swizzling perf_event::ctx.
- */
perf_remove_from_context(group_leader, 0);
- put_ctx(gctx);
+ put_pmu_ctx(group_leader->pmu_ctx);
for_each_sibling_event(sibling, group_leader) {
perf_remove_from_context(sibling, 0);
- put_ctx(gctx);
+ put_pmu_ctx(sibling->pmu_ctx);
}
/*
- * Wait for everybody to stop referencing the events through
- * the old lists, before installing it on new lists.
- */
- synchronize_rcu();
-
- /*
* Install the group siblings before the group leader.
*
* Because a group leader will try and install the entire group
@@ -12404,9 +13785,10 @@ SYSCALL_DEFINE5(perf_event_open,
* reachable through the group lists.
*/
for_each_sibling_event(sibling, group_leader) {
+ sibling->pmu_ctx = pmu_ctx;
+ get_pmu_ctx(pmu_ctx);
perf_event__state_init(sibling);
perf_install_in_context(ctx, sibling, sibling->cpu);
- get_ctx(ctx);
}
/*
@@ -12414,9 +13796,10 @@ SYSCALL_DEFINE5(perf_event_open,
* event. What we want here is event in the initial
* startup state, ready to be add into new context.
*/
+ group_leader->pmu_ctx = pmu_ctx;
+ get_pmu_ctx(pmu_ctx);
perf_event__state_init(group_leader);
perf_install_in_context(ctx, group_leader, group_leader->cpu);
- get_ctx(ctx);
}
/*
@@ -12433,8 +13816,6 @@ SYSCALL_DEFINE5(perf_event_open,
perf_install_in_context(ctx, event, event->cpu);
perf_unpin_context(ctx);
- if (move_group)
- perf_event_ctx_unlock(group_leader, gctx);
mutex_unlock(&ctx->mutex);
if (task) {
@@ -12447,39 +13828,29 @@ SYSCALL_DEFINE5(perf_event_open,
mutex_unlock(&current->perf_event_mutex);
/*
- * Drop the reference on the group_event after placing the
- * new event on the sibling_list. This ensures destruction
- * of the group leader will find the pointer to itself in
- * perf_group_detach().
+ * File reference in group guarantees that group_leader has been
+ * kept alive until we place the new event on the sibling_list.
+ * This ensures destruction of the group leader will find
+ * the pointer to itself in perf_group_detach().
*/
- fdput(group);
fd_install(event_fd, event_file);
return event_fd;
+err_context:
+ put_pmu_ctx(event->pmu_ctx);
+ event->pmu_ctx = NULL; /* _free_event() */
err_locked:
- if (move_group)
- perf_event_ctx_unlock(group_leader, gctx);
mutex_unlock(&ctx->mutex);
+ perf_unpin_context(ctx);
+ put_ctx(ctx);
err_cred:
if (task)
up_read(&task->signal->exec_update_lock);
-err_file:
- fput(event_file);
-err_context:
- perf_unpin_context(ctx);
- put_ctx(ctx);
err_alloc:
- /*
- * If event_file is set, the fput() above will have called ->release()
- * and that will take care of freeing the event.
- */
- if (!event_file)
- free_event(event);
+ put_event(event);
err_task:
if (task)
put_task_struct(task);
-err_group_fd:
- fdput(group);
err_fd:
put_unused_fd(event_fd);
return err;
@@ -12500,17 +13871,24 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
perf_overflow_handler_t overflow_handler,
void *context)
{
+ struct perf_event_pmu_context *pmu_ctx;
struct perf_event_context *ctx;
struct perf_event *event;
+ struct pmu *pmu;
int err;
/*
* Grouping is not supported for kernel events, neither is 'AUX',
* make sure the caller's intentions are adjusted.
*/
- if (attr->aux_output)
+ if (attr->aux_output || attr->aux_action)
return ERR_PTR(-EINVAL);
+ /*
+ * Event creation should be under SRCU, see perf_pmu_unregister().
+ */
+ guard(srcu)(&pmus_srcu);
+
event = perf_event_alloc(attr, cpu, task, NULL, NULL,
overflow_handler, context, -1);
if (IS_ERR(event)) {
@@ -12520,14 +13898,18 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
/* Mark owner so we could distinguish it from user events. */
event->owner = TASK_TOMBSTONE;
+ pmu = event->pmu;
+
+ if (pmu->task_ctx_nr == perf_sw_context)
+ event->event_caps |= PERF_EV_CAP_SOFTWARE;
/*
* Get the target context (task or percpu):
*/
- ctx = find_get_context(event->pmu, task, event);
+ ctx = find_get_context(task, event);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
- goto err_free;
+ goto err_alloc;
}
WARN_ON_ONCE(ctx->parent_ctx);
@@ -12537,6 +13919,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
goto err_unlock;
}
+ pmu_ctx = find_get_pmu_context(pmu, ctx, event);
+ if (IS_ERR(pmu_ctx)) {
+ err = PTR_ERR(pmu_ctx);
+ goto err_unlock;
+ }
+ event->pmu_ctx = pmu_ctx;
+
if (!task) {
/*
* Check if the @cpu we're creating an event for is online.
@@ -12548,13 +13937,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
container_of(ctx, struct perf_cpu_context, ctx);
if (!cpuctx->online) {
err = -ENODEV;
- goto err_unlock;
+ goto err_pmu_ctx;
}
}
if (!exclusive_event_installable(event, ctx)) {
err = -EBUSY;
- goto err_unlock;
+ goto err_pmu_ctx;
}
perf_install_in_context(ctx, event, event->cpu);
@@ -12563,44 +13952,67 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
return event;
+err_pmu_ctx:
+ put_pmu_ctx(pmu_ctx);
+ event->pmu_ctx = NULL; /* _free_event() */
err_unlock:
mutex_unlock(&ctx->mutex);
perf_unpin_context(ctx);
put_ctx(ctx);
-err_free:
- free_event(event);
+err_alloc:
+ put_event(event);
err:
return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
-void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
+static void __perf_pmu_remove(struct perf_event_context *ctx,
+ int cpu, struct pmu *pmu,
+ struct perf_event_groups *groups,
+ struct list_head *events)
{
- struct perf_event_context *src_ctx;
- struct perf_event_context *dst_ctx;
- struct perf_event *event, *tmp;
- LIST_HEAD(events);
-
- src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
- dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
+ struct perf_event *event, *sibling;
- /*
- * See perf_event_ctx_lock() for comments on the details
- * of swizzling perf_event::ctx.
- */
- mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
- list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
- event_entry) {
+ perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) {
perf_remove_from_context(event, 0);
- unaccount_event_cpu(event, src_cpu);
- put_ctx(src_ctx);
- list_add(&event->migrate_entry, &events);
+ put_pmu_ctx(event->pmu_ctx);
+ list_add(&event->migrate_entry, events);
+
+ for_each_sibling_event(sibling, event) {
+ perf_remove_from_context(sibling, 0);
+ put_pmu_ctx(sibling->pmu_ctx);
+ list_add(&sibling->migrate_entry, events);
+ }
}
+}
+
+static void __perf_pmu_install_event(struct pmu *pmu,
+ struct perf_event_context *ctx,
+ int cpu, struct perf_event *event)
+{
+ struct perf_event_pmu_context *epc;
+ struct perf_event_context *old_ctx = event->ctx;
+
+ get_ctx(ctx); /* normally find_get_context() */
+
+ event->cpu = cpu;
+ epc = find_get_pmu_context(pmu, ctx, event);
+ event->pmu_ctx = epc;
+
+ if (event->state >= PERF_EVENT_STATE_OFF)
+ event->state = PERF_EVENT_STATE_INACTIVE;
+ perf_install_in_context(ctx, event, cpu);
/*
- * Wait for the events to quiesce before re-instating them.
+ * Now that event->ctx is updated and visible, put the old ctx.
*/
- synchronize_rcu();
+ put_ctx(old_ctx);
+}
+
+static void __perf_pmu_install(struct perf_event_context *ctx,
+ int cpu, struct pmu *pmu, struct list_head *events)
+{
+ struct perf_event *event, *tmp;
/*
* Re-instate events in 2 passes.
@@ -12610,30 +14022,54 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
* leader will enable its siblings, even if those are still on the old
* context.
*/
- list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+ list_for_each_entry_safe(event, tmp, events, migrate_entry) {
if (event->group_leader == event)
continue;
list_del(&event->migrate_entry);
- if (event->state >= PERF_EVENT_STATE_OFF)
- event->state = PERF_EVENT_STATE_INACTIVE;
- account_event_cpu(event, dst_cpu);
- perf_install_in_context(dst_ctx, event, dst_cpu);
- get_ctx(dst_ctx);
+ __perf_pmu_install_event(pmu, ctx, cpu, event);
}
/*
* Once all the siblings are setup properly, install the group leaders
* to make it go.
*/
- list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+ list_for_each_entry_safe(event, tmp, events, migrate_entry) {
list_del(&event->migrate_entry);
- if (event->state >= PERF_EVENT_STATE_OFF)
- event->state = PERF_EVENT_STATE_INACTIVE;
- account_event_cpu(event, dst_cpu);
- perf_install_in_context(dst_ctx, event, dst_cpu);
- get_ctx(dst_ctx);
+ __perf_pmu_install_event(pmu, ctx, cpu, event);
+ }
+}
+
+void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
+{
+ struct perf_event_context *src_ctx, *dst_ctx;
+ LIST_HEAD(events);
+
+ /*
+ * Since per-cpu context is persistent, no need to grab an extra
+ * reference.
+ */
+ src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx;
+ dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx;
+
+ /*
+ * See perf_event_ctx_lock() for comments on the details
+ * of swizzling perf_event::ctx.
+ */
+ mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
+
+ __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->pinned_groups, &events);
+ __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->flexible_groups, &events);
+
+ if (!list_empty(&events)) {
+ /*
+ * Wait for the events to quiesce before re-instating them.
+ */
+ synchronize_rcu();
+
+ __perf_pmu_install(dst_ctx, dst_cpu, pmu, &events);
}
+
mutex_unlock(&dst_ctx->mutex);
mutex_unlock(&src_ctx->mutex);
}
@@ -12651,7 +14087,7 @@ static void sync_child_event(struct perf_event *child_event)
perf_event_read_event(child_event, task);
}
- child_val = perf_event_count(child_event);
+ child_val = perf_event_count(child_event, false);
/*
* Add back the child's count to the parent's count:
@@ -12664,10 +14100,12 @@ static void sync_child_event(struct perf_event *child_event)
}
static void
-perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
+perf_event_exit_event(struct perf_event *event,
+ struct perf_event_context *ctx, bool revoke)
{
struct perf_event *parent_event = event->parent;
- unsigned long detach_flags = 0;
+ unsigned long detach_flags = DETACH_EXIT;
+ unsigned int attach_state;
if (parent_event) {
/*
@@ -12682,28 +14120,38 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
* Do destroy all inherited groups, we don't care about those
* and being thorough is better.
*/
- detach_flags = DETACH_GROUP | DETACH_CHILD;
+ detach_flags |= DETACH_GROUP | DETACH_CHILD;
mutex_lock(&parent_event->child_mutex);
+ /* PERF_ATTACH_ITRACE might be set concurrently */
+ attach_state = READ_ONCE(event->attach_state);
}
- perf_remove_from_context(event, detach_flags);
-
- raw_spin_lock_irq(&ctx->lock);
- if (event->state > PERF_EVENT_STATE_EXIT)
- perf_event_set_state(event, PERF_EVENT_STATE_EXIT);
- raw_spin_unlock_irq(&ctx->lock);
+ if (revoke)
+ detach_flags |= DETACH_GROUP | DETACH_REVOKE;
+ perf_remove_from_context(event, detach_flags);
/*
* Child events can be freed.
*/
if (parent_event) {
mutex_unlock(&parent_event->child_mutex);
+
/*
- * Kick perf_poll() for is_event_hup();
+ * Match the refcount initialization. Make sure it doesn't happen
+ * twice if pmu_detach_event() calls it on an already exited task.
*/
- perf_event_wakeup(parent_event);
- free_event(event);
- put_event(parent_event);
+ if (attach_state & PERF_ATTACH_CHILD) {
+ /*
+ * Kick perf_poll() for is_event_hup();
+ */
+ perf_event_wakeup(parent_event);
+ /*
+ * pmu_detach_event() will have an extra refcount.
+ * perf_pending_task() might have one too.
+ */
+ put_event(event);
+ }
+
return;
}
@@ -12713,15 +14161,13 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
perf_event_wakeup(event);
}
-static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
+static void perf_event_exit_task_context(struct task_struct *task, bool exit)
{
- struct perf_event_context *child_ctx, *clone_ctx = NULL;
+ struct perf_event_context *ctx, *clone_ctx = NULL;
struct perf_event *child_event, *next;
- WARN_ON_ONCE(child != current);
-
- child_ctx = perf_pin_task_context(child, ctxn);
- if (!child_ctx)
+ ctx = perf_pin_task_context(task);
+ if (!ctx)
return;
/*
@@ -12734,27 +14180,28 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
* without ctx::mutex (it cannot because of the move_group double mutex
* lock thing). See the comments in perf_install_in_context().
*/
- mutex_lock(&child_ctx->mutex);
+ mutex_lock(&ctx->mutex);
/*
* In a single ctx::lock section, de-schedule the events and detach the
* context from the task such that we cannot ever get it scheduled back
* in.
*/
- raw_spin_lock_irq(&child_ctx->lock);
- task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
+ raw_spin_lock_irq(&ctx->lock);
+ if (exit)
+ task_ctx_sched_out(ctx, NULL, EVENT_ALL);
/*
* Now that the context is inactive, destroy the task <-> ctx relation
* and mark the context dead.
*/
- RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
- put_ctx(child_ctx); /* cannot be last */
- WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
- put_task_struct(current); /* cannot be last */
+ RCU_INIT_POINTER(task->perf_event_ctxp, NULL);
+ put_ctx(ctx); /* cannot be last */
+ WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
+ put_task_struct(task); /* cannot be last */
- clone_ctx = unclone_ctx(child_ctx);
- raw_spin_unlock_irq(&child_ctx->lock);
+ clone_ctx = unclone_ctx(ctx);
+ raw_spin_unlock_irq(&ctx->lock);
if (clone_ctx)
put_ctx(clone_ctx);
@@ -12764,29 +14211,48 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
* won't get any samples after PERF_RECORD_EXIT. We can however still
* get a few PERF_RECORD_READ events.
*/
- perf_event_task(child, child_ctx, 0);
+ if (exit)
+ perf_event_task(task, ctx, 0);
- list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
- perf_event_exit_event(child_event, child_ctx);
+ list_for_each_entry_safe(child_event, next, &ctx->event_list, event_entry)
+ perf_event_exit_event(child_event, ctx, false);
- mutex_unlock(&child_ctx->mutex);
+ mutex_unlock(&ctx->mutex);
- put_ctx(child_ctx);
+ if (!exit) {
+ /*
+ * perf_event_release_kernel() could still have a reference on
+ * this context. In that case we must wait for these events to
+ * have been freed (in particular all their references to this
+ * task must've been dropped).
+ *
+ * Without this copy_process() will unconditionally free this
+ * task (irrespective of its reference count) and
+ * _free_event()'s put_task_struct(event->hw.target) will be a
+ * use-after-free.
+ *
+ * Wait for all events to drop their context reference.
+ */
+ wait_var_event(&ctx->refcount,
+ refcount_read(&ctx->refcount) == 1);
+ }
+ put_ctx(ctx);
}
/*
- * When a child task exits, feed back event values to parent events.
+ * When a task exits, feed back event values to parent events.
*
* Can be called with exec_update_lock held when called from
* setup_new_exec().
*/
-void perf_event_exit_task(struct task_struct *child)
+void perf_event_exit_task(struct task_struct *task)
{
struct perf_event *event, *tmp;
- int ctxn;
- mutex_lock(&child->perf_event_mutex);
- list_for_each_entry_safe(event, tmp, &child->perf_event_list,
+ WARN_ON_ONCE(task != current);
+
+ mutex_lock(&task->perf_event_mutex);
+ list_for_each_entry_safe(event, tmp, &task->perf_event_list,
owner_entry) {
list_del_init(&event->owner_entry);
@@ -12797,39 +14263,23 @@ void perf_event_exit_task(struct task_struct *child)
*/
smp_store_release(&event->owner, NULL);
}
- mutex_unlock(&child->perf_event_mutex);
+ mutex_unlock(&task->perf_event_mutex);
- for_each_task_context_nr(ctxn)
- perf_event_exit_task_context(child, ctxn);
+ perf_event_exit_task_context(task, true);
/*
* The perf_event_exit_task_context calls perf_event_task
- * with child's task_ctx, which generates EXIT events for
- * child contexts and sets child->perf_event_ctxp[] to NULL.
+ * with task's task_ctx, which generates EXIT events for
+ * task contexts and sets task->perf_event_ctxp[] to NULL.
* At this point we need to send EXIT events to cpu contexts.
*/
- perf_event_task(child, NULL, 0);
-}
+ perf_event_task(task, NULL, 0);
-static void perf_free_event(struct perf_event *event,
- struct perf_event_context *ctx)
-{
- struct perf_event *parent = event->parent;
-
- if (WARN_ON_ONCE(!parent))
- return;
-
- mutex_lock(&parent->child_mutex);
- list_del_init(&event->child_list);
- mutex_unlock(&parent->child_mutex);
-
- put_event(parent);
-
- raw_spin_lock_irq(&ctx->lock);
- perf_group_detach(event);
- list_del_event(event, ctx);
- raw_spin_unlock_irq(&ctx->lock);
- free_event(event);
+ /*
+ * Detach the perf_ctx_data for the system-wide event.
+ */
+ guard(percpu_read)(&global_ctx_data_rwsem);
+ detach_task_ctx_data(task);
}
/*
@@ -12841,58 +14291,12 @@ static void perf_free_event(struct perf_event *event,
*/
void perf_event_free_task(struct task_struct *task)
{
- struct perf_event_context *ctx;
- struct perf_event *event, *tmp;
- int ctxn;
-
- for_each_task_context_nr(ctxn) {
- ctx = task->perf_event_ctxp[ctxn];
- if (!ctx)
- continue;
-
- mutex_lock(&ctx->mutex);
- raw_spin_lock_irq(&ctx->lock);
- /*
- * Destroy the task <-> ctx relation and mark the context dead.
- *
- * This is important because even though the task hasn't been
- * exposed yet the context has been (through child_list).
- */
- RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
- WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
- put_task_struct(task); /* cannot be last */
- raw_spin_unlock_irq(&ctx->lock);
-
- list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
- perf_free_event(event, ctx);
-
- mutex_unlock(&ctx->mutex);
-
- /*
- * perf_event_release_kernel() could've stolen some of our
- * child events and still have them on its free_list. In that
- * case we must wait for these events to have been freed (in
- * particular all their references to this task must've been
- * dropped).
- *
- * Without this copy_process() will unconditionally free this
- * task (irrespective of its reference count) and
- * _free_event()'s put_task_struct(event->hw.target) will be a
- * use-after-free.
- *
- * Wait for all events to drop their context reference.
- */
- wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
- put_ctx(ctx); /* must be last */
- }
+ perf_event_exit_task_context(task, false);
}
void perf_event_delayed_put(struct task_struct *task)
{
- int ctxn;
-
- for_each_task_context_nr(ctxn)
- WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
+ WARN_ON_ONCE(task->perf_event_ctxp);
}
struct file *perf_event_get(unsigned int fd)
@@ -12925,6 +14329,15 @@ const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
return &event->attr;
}
+int perf_allow_kernel(void)
+{
+ if (sysctl_perf_event_paranoid > 1 && !perfmon_capable())
+ return -EACCES;
+
+ return security_perf_event_open(PERF_SECURITY_KERNEL);
+}
+EXPORT_SYMBOL_GPL(perf_allow_kernel);
+
/*
* Inherit an event from parent task to child task.
*
@@ -12942,6 +14355,7 @@ inherit_event(struct perf_event *parent_event,
struct perf_event_context *child_ctx)
{
enum perf_event_state parent_state = parent_event->state;
+ struct perf_event_pmu_context *pmu_ctx;
struct perf_event *child_event;
unsigned long flags;
@@ -12954,6 +14368,14 @@ inherit_event(struct perf_event *parent_event,
if (parent_event->parent)
parent_event = parent_event->parent;
+ if (parent_event->state <= PERF_EVENT_STATE_REVOKED)
+ return NULL;
+
+ /*
+ * Event creation should be under SRCU, see perf_pmu_unregister().
+ */
+ guard(srcu)(&pmus_srcu);
+
child_event = perf_event_alloc(&parent_event->attr,
parent_event->cpu,
child,
@@ -12962,17 +14384,15 @@ inherit_event(struct perf_event *parent_event,
if (IS_ERR(child_event))
return child_event;
+ get_ctx(child_ctx);
+ child_event->ctx = child_ctx;
- if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) &&
- !child_ctx->task_ctx_data) {
- struct pmu *pmu = child_event->pmu;
-
- child_ctx->task_ctx_data = alloc_task_ctx_data(pmu);
- if (!child_ctx->task_ctx_data) {
- free_event(child_event);
- return ERR_PTR(-ENOMEM);
- }
+ pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event);
+ if (IS_ERR(pmu_ctx)) {
+ free_event(child_event);
+ return ERR_CAST(pmu_ctx);
}
+ child_event->pmu_ctx = pmu_ctx;
/*
* is_orphaned_event() and list_add_tail(&parent_event->child_list)
@@ -12984,13 +14404,10 @@ inherit_event(struct perf_event *parent_event,
if (is_orphaned_event(parent_event) ||
!atomic_long_inc_not_zero(&parent_event->refcount)) {
mutex_unlock(&parent_event->child_mutex);
- /* task_ctx_data is freed with child_ctx */
free_event(child_event);
return NULL;
}
- get_ctx(child_ctx);
-
/*
* Make the child state follow the state of the parent event,
* not its attr.disabled bit. We hold the parent's mutex,
@@ -13011,7 +14428,6 @@ inherit_event(struct perf_event *parent_event,
local64_set(&hwc->period_left, sample_period);
}
- child_event->ctx = child_ctx;
child_event->overflow_handler = parent_event->overflow_handler;
child_event->overflow_handler_context
= parent_event->overflow_handler_context;
@@ -13078,6 +14494,8 @@ static int inherit_group(struct perf_event *parent_event,
!perf_get_aux_event(child_ctr, leader))
return -EINVAL;
}
+ if (leader)
+ leader->group_generation = parent_event->group_generation;
return 0;
}
@@ -13095,11 +14513,11 @@ static int inherit_group(struct perf_event *parent_event,
static int
inherit_task_group(struct perf_event *event, struct task_struct *parent,
struct perf_event_context *parent_ctx,
- struct task_struct *child, int ctxn,
+ struct task_struct *child,
u64 clone_flags, int *inherited_all)
{
- int ret;
struct perf_event_context *child_ctx;
+ int ret;
if (!event->attr.inherit ||
(event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) ||
@@ -13109,7 +14527,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
return 0;
}
- child_ctx = child->perf_event_ctxp[ctxn];
+ child_ctx = child->perf_event_ctxp;
if (!child_ctx) {
/*
* This is executed from the parent task context, so
@@ -13117,16 +14535,14 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
* First allocate and initialize a context for the
* child.
*/
- child_ctx = alloc_perf_context(parent_ctx->pmu, child);
+ child_ctx = alloc_perf_context(child);
if (!child_ctx)
return -ENOMEM;
- child->perf_event_ctxp[ctxn] = child_ctx;
+ child->perf_event_ctxp = child_ctx;
}
- ret = inherit_group(event, parent, parent_ctx,
- child, child_ctx);
-
+ ret = inherit_group(event, parent, parent_ctx, child, child_ctx);
if (ret)
*inherited_all = 0;
@@ -13136,8 +14552,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
/*
* Initialize the perf_event context in task_struct
*/
-static int perf_event_init_context(struct task_struct *child, int ctxn,
- u64 clone_flags)
+static int perf_event_init_context(struct task_struct *child, u64 clone_flags)
{
struct perf_event_context *child_ctx, *parent_ctx;
struct perf_event_context *cloned_ctx;
@@ -13147,14 +14562,14 @@ static int perf_event_init_context(struct task_struct *child, int ctxn,
unsigned long flags;
int ret = 0;
- if (likely(!parent->perf_event_ctxp[ctxn]))
+ if (likely(!parent->perf_event_ctxp))
return 0;
/*
* If the parent's context is a clone, pin it so it won't get
* swapped under us.
*/
- parent_ctx = perf_pin_task_context(parent, ctxn);
+ parent_ctx = perf_pin_task_context(parent);
if (!parent_ctx)
return 0;
@@ -13177,8 +14592,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn,
*/
perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
ret = inherit_task_group(event, parent, parent_ctx,
- child, ctxn, clone_flags,
- &inherited_all);
+ child, clone_flags, &inherited_all);
if (ret)
goto out_unlock;
}
@@ -13194,8 +14608,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn,
perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
ret = inherit_task_group(event, parent, parent_ctx,
- child, ctxn, clone_flags,
- &inherited_all);
+ child, clone_flags, &inherited_all);
if (ret)
goto out_unlock;
}
@@ -13203,7 +14616,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn,
raw_spin_lock_irqsave(&parent_ctx->lock, flags);
parent_ctx->rotate_disable = 0;
- child_ctx = child->perf_event_ctxp[ctxn];
+ child_ctx = child->perf_event_ctxp;
if (child_ctx && inherited_all) {
/*
@@ -13239,18 +14652,18 @@ out_unlock:
*/
int perf_event_init_task(struct task_struct *child, u64 clone_flags)
{
- int ctxn, ret;
+ int ret;
- memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
+ memset(child->perf_recursion, 0, sizeof(child->perf_recursion));
+ child->perf_event_ctxp = NULL;
mutex_init(&child->perf_event_mutex);
INIT_LIST_HEAD(&child->perf_event_list);
+ child->perf_ctx_data = NULL;
- for_each_task_context_nr(ctxn) {
- ret = perf_event_init_context(child, ctxn, clone_flags);
- if (ret) {
- perf_event_free_task(child);
- return ret;
- }
+ ret = perf_event_init_context(child, clone_flags);
+ if (ret) {
+ perf_event_free_task(child);
+ return ret;
}
return 0;
@@ -13259,22 +14672,33 @@ int perf_event_init_task(struct task_struct *child, u64 clone_flags)
static void __init perf_event_init_all_cpus(void)
{
struct swevent_htable *swhash;
+ struct perf_cpu_context *cpuctx;
int cpu;
zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_core_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_die_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_cluster_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_pkg_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_sys_mask, GFP_KERNEL);
+
for_each_possible_cpu(cpu) {
swhash = &per_cpu(swevent_htable, cpu);
mutex_init(&swhash->hlist_mutex);
- INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
-#ifdef CONFIG_CGROUP_PERF
- INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
-#endif
INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
+
+ cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
+ __perf_event_init_context(&cpuctx->ctx);
+ lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
+ lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
+ cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
+ cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
+ cpuctx->heap = cpuctx->heap_default;
}
}
@@ -13296,34 +14720,70 @@ static void perf_swevent_init_cpu(unsigned int cpu)
#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
static void __perf_event_exit_context(void *__info)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_context *ctx = __info;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
struct perf_event *event;
raw_spin_lock(&ctx->lock);
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, NULL, EVENT_TIME);
list_for_each_entry(event, &ctx->event_list, event_entry)
__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
raw_spin_unlock(&ctx->lock);
}
+static void perf_event_clear_cpumask(unsigned int cpu)
+{
+ int target[PERF_PMU_MAX_SCOPE];
+ unsigned int scope;
+ struct pmu *pmu;
+
+ cpumask_clear_cpu(cpu, perf_online_mask);
+
+ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);
+ struct cpumask *pmu_cpumask = perf_scope_cpumask(scope);
+
+ target[scope] = -1;
+ if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
+ continue;
+
+ if (!cpumask_test_and_clear_cpu(cpu, pmu_cpumask))
+ continue;
+ target[scope] = cpumask_any_but(cpumask, cpu);
+ if (target[scope] < nr_cpu_ids)
+ cpumask_set_cpu(target[scope], pmu_cpumask);
+ }
+
+ /* migrate */
+ list_for_each_entry(pmu, &pmus, entry) {
+ if (pmu->scope == PERF_PMU_SCOPE_NONE ||
+ WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE))
+ continue;
+
+ if (target[pmu->scope] >= 0 && target[pmu->scope] < nr_cpu_ids)
+ perf_pmu_migrate_context(pmu, cpu, target[pmu->scope]);
+ }
+}
+
static void perf_event_exit_cpu_context(int cpu)
{
struct perf_cpu_context *cpuctx;
struct perf_event_context *ctx;
- struct pmu *pmu;
+ // XXX simplify cpuctx->online
mutex_lock(&pmus_lock);
- list_for_each_entry(pmu, &pmus, entry) {
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- ctx = &cpuctx->ctx;
+ /*
+ * Clear the cpumasks, and migrate to other CPUs if possible.
+ * Must be invoked before the __perf_event_exit_context.
+ */
+ perf_event_clear_cpumask(cpu);
+ cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
+ ctx = &cpuctx->ctx;
- mutex_lock(&ctx->mutex);
- smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
- cpuctx->online = 0;
- mutex_unlock(&ctx->mutex);
- }
- cpumask_clear_cpu(cpu, perf_online_mask);
+ mutex_lock(&ctx->mutex);
+ smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
+ cpuctx->online = 0;
+ mutex_unlock(&ctx->mutex);
mutex_unlock(&pmus_lock);
}
#else
@@ -13332,24 +14792,57 @@ static void perf_event_exit_cpu_context(int cpu) { }
#endif
+static void perf_event_setup_cpumask(unsigned int cpu)
+{
+ struct cpumask *pmu_cpumask;
+ unsigned int scope;
+
+ /*
+ * Early boot stage, the cpumask hasn't been set yet.
+ * The perf_online_<domain>_masks includes the first CPU of each domain.
+ * Always unconditionally set the boot CPU for the perf_online_<domain>_masks.
+ */
+ if (cpumask_empty(perf_online_mask)) {
+ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
+ pmu_cpumask = perf_scope_cpumask(scope);
+ if (WARN_ON_ONCE(!pmu_cpumask))
+ continue;
+ cpumask_set_cpu(cpu, pmu_cpumask);
+ }
+ goto end;
+ }
+
+ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);
+
+ pmu_cpumask = perf_scope_cpumask(scope);
+
+ if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
+ continue;
+
+ if (!cpumask_empty(cpumask) &&
+ cpumask_any_and(pmu_cpumask, cpumask) >= nr_cpu_ids)
+ cpumask_set_cpu(cpu, pmu_cpumask);
+ }
+end:
+ cpumask_set_cpu(cpu, perf_online_mask);
+}
+
int perf_event_init_cpu(unsigned int cpu)
{
struct perf_cpu_context *cpuctx;
struct perf_event_context *ctx;
- struct pmu *pmu;
perf_swevent_init_cpu(cpu);
mutex_lock(&pmus_lock);
- cpumask_set_cpu(cpu, perf_online_mask);
- list_for_each_entry(pmu, &pmus, entry) {
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- ctx = &cpuctx->ctx;
+ perf_event_setup_cpumask(cpu);
+ cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
+ ctx = &cpuctx->ctx;
- mutex_lock(&ctx->mutex);
- cpuctx->online = 1;
- mutex_unlock(&ctx->mutex);
- }
+ mutex_lock(&ctx->mutex);
+ cpuctx->online = 1;
+ mutex_unlock(&ctx->mutex);
mutex_unlock(&pmus_lock);
return 0;
@@ -13387,11 +14880,14 @@ void __init perf_event_init(void)
idr_init(&pmu_idr);
+ unwind_deferred_init(&perf_unwind_work,
+ perf_unwind_deferred_callback);
+
perf_event_init_all_cpus();
init_srcu_struct(&pmus_srcu);
perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
- perf_pmu_register(&perf_cpu_clock, NULL, -1);
- perf_pmu_register(&perf_task_clock, NULL, -1);
+ perf_pmu_register(&perf_cpu_clock, "cpu_clock", -1);
+ perf_pmu_register(&perf_task_clock, "task_clock", -1);
perf_tp_register();
perf_event_init_cpu(smp_processor_id());
register_reboot_notifier(&perf_reboot_notifier);
@@ -13434,7 +14930,7 @@ static int __init perf_event_sysfs_init(void)
goto unlock;
list_for_each_entry(pmu, &pmus, entry) {
- if (!pmu->name || pmu->type < 0)
+ if (pmu->dev)
continue;
ret = pmu_dev_alloc(pmu);
@@ -13486,9 +14982,11 @@ static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
static int __perf_cgroup_move(void *info)
{
struct task_struct *task = info;
- rcu_read_lock();
- perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
- rcu_read_unlock();
+
+ preempt_disable();
+ perf_cgroup_switch(task);
+ preempt_enable();
+
return 0;
}
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index f32320ac02fd..8ec2cb688903 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -17,61 +17,276 @@
* This file contains the arch-independent routines.
*/
+#include <linux/hw_breakpoint.h>
+
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <linux/cpu.h>
+#include <linux/export.h>
+#include <linux/init.h>
#include <linux/irqflags.h>
-#include <linux/kallsyms.h>
-#include <linux/notifier.h>
-#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/percpu-rwsem.h>
#include <linux/percpu.h>
+#include <linux/rhashtable.h>
#include <linux/sched.h>
-#include <linux/init.h>
#include <linux/slab.h>
-#include <linux/list.h>
-#include <linux/cpu.h>
-#include <linux/smp.h>
-#include <linux/bug.h>
-#include <linux/hw_breakpoint.h>
/*
- * Constraints data
+ * Datastructure to track the total uses of N slots across tasks or CPUs;
+ * bp_slots_histogram::count[N] is the number of assigned N+1 breakpoint slots.
+ */
+struct bp_slots_histogram {
+#ifdef hw_breakpoint_slots
+ atomic_t count[hw_breakpoint_slots(0)];
+#else
+ atomic_t *count;
+#endif
+};
+
+/*
+ * Per-CPU constraints data.
*/
struct bp_cpuinfo {
- /* Number of pinned cpu breakpoints in a cpu */
- unsigned int cpu_pinned;
- /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */
- unsigned int *tsk_pinned;
- /* Number of non-pinned cpu/task breakpoints in a cpu */
- unsigned int flexible; /* XXX: placeholder, see fetch_this_slot() */
+ /* Number of pinned CPU breakpoints in a CPU. */
+ unsigned int cpu_pinned;
+ /* Histogram of pinned task breakpoints in a CPU. */
+ struct bp_slots_histogram tsk_pinned;
};
static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]);
-static int nr_slots[TYPE_MAX];
static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type)
{
return per_cpu_ptr(bp_cpuinfo + type, cpu);
}
+/* Number of pinned CPU breakpoints globally. */
+static struct bp_slots_histogram cpu_pinned[TYPE_MAX];
+/* Number of pinned CPU-independent task breakpoints. */
+static struct bp_slots_histogram tsk_pinned_all[TYPE_MAX];
+
/* Keep track of the breakpoints attached to tasks */
-static LIST_HEAD(bp_task_head);
+static struct rhltable task_bps_ht;
+static const struct rhashtable_params task_bps_ht_params = {
+ .head_offset = offsetof(struct hw_perf_event, bp_list),
+ .key_offset = offsetof(struct hw_perf_event, target),
+ .key_len = sizeof_field(struct hw_perf_event, target),
+ .automatic_shrinking = true,
+};
-static int constraints_initialized;
+static bool constraints_initialized __ro_after_init;
-/* Gather the number of total pinned and un-pinned bp in a cpuset */
-struct bp_busy_slots {
- unsigned int pinned;
- unsigned int flexible;
-};
+/*
+ * Synchronizes accesses to the per-CPU constraints; the locking rules are:
+ *
+ * 1. Atomic updates to bp_cpuinfo::tsk_pinned only require a held read-lock
+ * (due to bp_slots_histogram::count being atomic, no update are lost).
+ *
+ * 2. Holding a write-lock is required for computations that require a
+ * stable snapshot of all bp_cpuinfo::tsk_pinned.
+ *
+ * 3. In all other cases, non-atomic accesses require the appropriately held
+ * lock (read-lock for read-only accesses; write-lock for reads/writes).
+ */
+DEFINE_STATIC_PERCPU_RWSEM(bp_cpuinfo_sem);
+
+/*
+ * Return mutex to serialize accesses to per-task lists in task_bps_ht. Since
+ * rhltable synchronizes concurrent insertions/deletions, independent tasks may
+ * insert/delete concurrently; therefore, a mutex per task is sufficient.
+ *
+ * Uses task_struct::perf_event_mutex, to avoid extending task_struct with a
+ * hw_breakpoint-only mutex, which may be infrequently used. The caveat here is
+ * that hw_breakpoint may contend with per-task perf event list management. The
+ * assumption is that perf usecases involving hw_breakpoints are very unlikely
+ * to result in unnecessary contention.
+ */
+static inline struct mutex *get_task_bps_mutex(struct perf_event *bp)
+{
+ struct task_struct *tsk = bp->hw.target;
+
+ return tsk ? &tsk->perf_event_mutex : NULL;
+}
+
+static struct mutex *bp_constraints_lock(struct perf_event *bp)
+{
+ struct mutex *tsk_mtx = get_task_bps_mutex(bp);
+
+ if (tsk_mtx) {
+ /*
+ * Fully analogous to the perf_try_init_event() nesting
+ * argument in the comment near perf_event_ctx_lock_nested();
+ * this child->perf_event_mutex cannot ever deadlock against
+ * the parent->perf_event_mutex usage from
+ * perf_event_task_{en,dis}able().
+ *
+ * Specifically, inherited events will never occur on
+ * ->perf_event_list.
+ */
+ mutex_lock_nested(tsk_mtx, SINGLE_DEPTH_NESTING);
+ percpu_down_read(&bp_cpuinfo_sem);
+ } else {
+ percpu_down_write(&bp_cpuinfo_sem);
+ }
+
+ return tsk_mtx;
+}
+
+static void bp_constraints_unlock(struct mutex *tsk_mtx)
+{
+ if (tsk_mtx) {
+ percpu_up_read(&bp_cpuinfo_sem);
+ mutex_unlock(tsk_mtx);
+ } else {
+ percpu_up_write(&bp_cpuinfo_sem);
+ }
+}
+
+static bool bp_constraints_is_locked(struct perf_event *bp)
+{
+ struct mutex *tsk_mtx = get_task_bps_mutex(bp);
+
+ return percpu_is_write_locked(&bp_cpuinfo_sem) ||
+ (tsk_mtx ? mutex_is_locked(tsk_mtx) :
+ percpu_is_read_locked(&bp_cpuinfo_sem));
+}
+
+static inline void assert_bp_constraints_lock_held(struct perf_event *bp)
+{
+ struct mutex *tsk_mtx = get_task_bps_mutex(bp);
+
+ if (tsk_mtx)
+ lockdep_assert_held(tsk_mtx);
+ lockdep_assert_held(&bp_cpuinfo_sem);
+}
+
+#ifdef hw_breakpoint_slots
+/*
+ * Number of breakpoint slots is constant, and the same for all types.
+ */
+static_assert(hw_breakpoint_slots(TYPE_INST) == hw_breakpoint_slots(TYPE_DATA));
+static inline int hw_breakpoint_slots_cached(int type) { return hw_breakpoint_slots(type); }
+static inline int init_breakpoint_slots(void) { return 0; }
+#else
+/*
+ * Dynamic number of breakpoint slots.
+ */
+static int __nr_bp_slots[TYPE_MAX] __ro_after_init;
+
+static inline int hw_breakpoint_slots_cached(int type)
+{
+ return __nr_bp_slots[type];
+}
+
+static __init bool
+bp_slots_histogram_alloc(struct bp_slots_histogram *hist, enum bp_type_idx type)
+{
+ hist->count = kcalloc(hw_breakpoint_slots_cached(type), sizeof(*hist->count), GFP_KERNEL);
+ return hist->count;
+}
+
+static __init void bp_slots_histogram_free(struct bp_slots_histogram *hist)
+{
+ kfree(hist->count);
+}
+
+static __init int init_breakpoint_slots(void)
+{
+ int i, cpu, err_cpu;
+
+ for (i = 0; i < TYPE_MAX; i++)
+ __nr_bp_slots[i] = hw_breakpoint_slots(i);
+
+ for_each_possible_cpu(cpu) {
+ for (i = 0; i < TYPE_MAX; i++) {
+ struct bp_cpuinfo *info = get_bp_info(cpu, i);
-/* Serialize accesses to the above constraints */
-static DEFINE_MUTEX(nr_bp_mutex);
+ if (!bp_slots_histogram_alloc(&info->tsk_pinned, i))
+ goto err;
+ }
+ }
+ for (i = 0; i < TYPE_MAX; i++) {
+ if (!bp_slots_histogram_alloc(&cpu_pinned[i], i))
+ goto err;
+ if (!bp_slots_histogram_alloc(&tsk_pinned_all[i], i))
+ goto err;
+ }
-__weak int hw_breakpoint_weight(struct perf_event *bp)
+ return 0;
+err:
+ for_each_possible_cpu(err_cpu) {
+ for (i = 0; i < TYPE_MAX; i++)
+ bp_slots_histogram_free(&get_bp_info(err_cpu, i)->tsk_pinned);
+ if (err_cpu == cpu)
+ break;
+ }
+ for (i = 0; i < TYPE_MAX; i++) {
+ bp_slots_histogram_free(&cpu_pinned[i]);
+ bp_slots_histogram_free(&tsk_pinned_all[i]);
+ }
+
+ return -ENOMEM;
+}
+#endif
+
+static inline void
+bp_slots_histogram_add(struct bp_slots_histogram *hist, int old, int val)
+{
+ const int old_idx = old - 1;
+ const int new_idx = old_idx + val;
+
+ if (old_idx >= 0)
+ WARN_ON(atomic_dec_return_relaxed(&hist->count[old_idx]) < 0);
+ if (new_idx >= 0)
+ WARN_ON(atomic_inc_return_relaxed(&hist->count[new_idx]) < 0);
+}
+
+static int
+bp_slots_histogram_max(struct bp_slots_histogram *hist, enum bp_type_idx type)
+{
+ for (int i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) {
+ const int count = atomic_read(&hist->count[i]);
+
+ /* Catch unexpected writers; we want a stable snapshot. */
+ ASSERT_EXCLUSIVE_WRITER(hist->count[i]);
+ if (count > 0)
+ return i + 1;
+ WARN(count < 0, "inconsistent breakpoint slots histogram");
+ }
+
+ return 0;
+}
+
+static int
+bp_slots_histogram_max_merge(struct bp_slots_histogram *hist1, struct bp_slots_histogram *hist2,
+ enum bp_type_idx type)
+{
+ for (int i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) {
+ const int count1 = atomic_read(&hist1->count[i]);
+ const int count2 = atomic_read(&hist2->count[i]);
+
+ /* Catch unexpected writers; we want a stable snapshot. */
+ ASSERT_EXCLUSIVE_WRITER(hist1->count[i]);
+ ASSERT_EXCLUSIVE_WRITER(hist2->count[i]);
+ if (count1 + count2 > 0)
+ return i + 1;
+ WARN(count1 < 0, "inconsistent breakpoint slots histogram");
+ WARN(count2 < 0, "inconsistent breakpoint slots histogram");
+ }
+
+ return 0;
+}
+
+#ifndef hw_breakpoint_weight
+static inline int hw_breakpoint_weight(struct perf_event *bp)
{
return 1;
}
+#endif
static inline enum bp_type_idx find_slot_idx(u64 bp_type)
{
@@ -82,39 +297,61 @@ static inline enum bp_type_idx find_slot_idx(u64 bp_type)
}
/*
- * Report the maximum number of pinned breakpoints a task
- * have in this cpu
+ * Return the maximum number of pinned breakpoints a task has in this CPU.
*/
static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
{
- unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
- int i;
-
- for (i = nr_slots[type] - 1; i >= 0; i--) {
- if (tsk_pinned[i] > 0)
- return i + 1;
- }
+ struct bp_slots_histogram *tsk_pinned = &get_bp_info(cpu, type)->tsk_pinned;
- return 0;
+ /*
+ * At this point we want to have acquired the bp_cpuinfo_sem as a
+ * writer to ensure that there are no concurrent writers in
+ * toggle_bp_task_slot() to tsk_pinned, and we get a stable snapshot.
+ */
+ lockdep_assert_held_write(&bp_cpuinfo_sem);
+ return bp_slots_histogram_max_merge(tsk_pinned, &tsk_pinned_all[type], type);
}
/*
* Count the number of breakpoints of the same type and same task.
* The given event must be not on the list.
+ *
+ * If @cpu is -1, but the result of task_bp_pinned() is not CPU-independent,
+ * returns a negative value.
*/
static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
{
- struct task_struct *tsk = bp->hw.target;
+ struct rhlist_head *head, *pos;
struct perf_event *iter;
int count = 0;
- list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
- if (iter->hw.target == tsk &&
- find_slot_idx(iter->attr.bp_type) == type &&
- (iter->cpu < 0 || cpu == iter->cpu))
- count += hw_breakpoint_weight(iter);
+ /*
+ * We need a stable snapshot of the per-task breakpoint list.
+ */
+ assert_bp_constraints_lock_held(bp);
+
+ rcu_read_lock();
+ head = rhltable_lookup(&task_bps_ht, &bp->hw.target, task_bps_ht_params);
+ if (!head)
+ goto out;
+
+ rhl_for_each_entry_rcu(iter, pos, head, hw.bp_list) {
+ if (find_slot_idx(iter->attr.bp_type) != type)
+ continue;
+
+ if (iter->cpu >= 0) {
+ if (cpu == -1) {
+ count = -1;
+ goto out;
+ } else if (cpu != iter->cpu)
+ continue;
+ }
+
+ count += hw_breakpoint_weight(iter);
}
+out:
+ rcu_read_unlock();
return count;
}
@@ -126,16 +363,29 @@ static const struct cpumask *cpumask_of_bp(struct perf_event *bp)
}
/*
- * Report the number of pinned/un-pinned breakpoints we have in
- * a given cpu (cpu > -1) or in all of them (cpu = -1).
+ * Returns the max pinned breakpoint slots in a given
+ * CPU (cpu > -1) or across all of them (cpu = -1).
*/
-static void
-fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
- enum bp_type_idx type)
+static int
+max_bp_pinned_slots(struct perf_event *bp, enum bp_type_idx type)
{
const struct cpumask *cpumask = cpumask_of_bp(bp);
+ int pinned_slots = 0;
int cpu;
+ if (bp->hw.target && bp->cpu < 0) {
+ int max_pinned = task_bp_pinned(-1, bp, type);
+
+ if (max_pinned >= 0) {
+ /*
+ * Fast path: task_bp_pinned() is CPU-independent and
+ * returns the same value for any CPU.
+ */
+ max_pinned += bp_slots_histogram_max(&cpu_pinned[type], type);
+ return max_pinned;
+ }
+ }
+
for_each_cpu(cpu, cpumask) {
struct bp_cpuinfo *info = get_bp_info(cpu, type);
int nr;
@@ -146,95 +396,140 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
else
nr += task_bp_pinned(cpu, bp, type);
- if (nr > slots->pinned)
- slots->pinned = nr;
-
- nr = info->flexible;
- if (nr > slots->flexible)
- slots->flexible = nr;
+ pinned_slots = max(nr, pinned_slots);
}
-}
-
-/*
- * For now, continue to consider flexible as pinned, until we can
- * ensure no flexible event can ever be scheduled before a pinned event
- * in a same cpu.
- */
-static void
-fetch_this_slot(struct bp_busy_slots *slots, int weight)
-{
- slots->pinned += weight;
-}
-
-/*
- * Add a pinned breakpoint for the given task in our constraint table
- */
-static void toggle_bp_task_slot(struct perf_event *bp, int cpu,
- enum bp_type_idx type, int weight)
-{
- unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
- int old_idx, new_idx;
-
- old_idx = task_bp_pinned(cpu, bp, type) - 1;
- new_idx = old_idx + weight;
- if (old_idx >= 0)
- tsk_pinned[old_idx]--;
- if (new_idx >= 0)
- tsk_pinned[new_idx]++;
+ return pinned_slots;
}
/*
* Add/remove the given breakpoint in our constraint table
*/
-static void
-toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
- int weight)
+static int
+toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int weight)
{
- const struct cpumask *cpumask = cpumask_of_bp(bp);
- int cpu;
+ int cpu, next_tsk_pinned;
if (!enable)
weight = -weight;
- /* Pinned counter cpu profiling */
if (!bp->hw.target) {
- get_bp_info(bp->cpu, type)->cpu_pinned += weight;
- return;
+ /*
+ * Update the pinned CPU slots, in per-CPU bp_cpuinfo and in the
+ * global histogram.
+ */
+ struct bp_cpuinfo *info = get_bp_info(bp->cpu, type);
+
+ lockdep_assert_held_write(&bp_cpuinfo_sem);
+ bp_slots_histogram_add(&cpu_pinned[type], info->cpu_pinned, weight);
+ info->cpu_pinned += weight;
+ return 0;
}
- /* Pinned counter task profiling */
- for_each_cpu(cpu, cpumask)
- toggle_bp_task_slot(bp, cpu, type, weight);
+ /*
+ * If bp->hw.target, tsk_pinned is only modified, but not used
+ * otherwise. We can permit concurrent updates as long as there are no
+ * other uses: having acquired bp_cpuinfo_sem as a reader allows
+ * concurrent updates here. Uses of tsk_pinned will require acquiring
+ * bp_cpuinfo_sem as a writer to stabilize tsk_pinned's value.
+ */
+ lockdep_assert_held_read(&bp_cpuinfo_sem);
- if (enable)
- list_add_tail(&bp->hw.bp_list, &bp_task_head);
- else
- list_del(&bp->hw.bp_list);
-}
+ /*
+ * Update the pinned task slots, in per-CPU bp_cpuinfo and in the global
+ * histogram. We need to take care of 4 cases:
+ *
+ * 1. This breakpoint targets all CPUs (cpu < 0), and there may only
+ * exist other task breakpoints targeting all CPUs. In this case we
+ * can simply update the global slots histogram.
+ *
+ * 2. This breakpoint targets a specific CPU (cpu >= 0), but there may
+ * only exist other task breakpoints targeting all CPUs.
+ *
+ * a. On enable: remove the existing breakpoints from the global
+ * slots histogram and use the per-CPU histogram.
+ *
+ * b. On disable: re-insert the existing breakpoints into the global
+ * slots histogram and remove from per-CPU histogram.
+ *
+ * 3. Some other existing task breakpoints target specific CPUs. Only
+ * update the per-CPU slots histogram.
+ */
-__weak int arch_reserve_bp_slot(struct perf_event *bp)
-{
- return 0;
-}
+ if (!enable) {
+ /*
+ * Remove before updating histograms so we can determine if this
+ * was the last task breakpoint for a specific CPU.
+ */
+ int ret = rhltable_remove(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params);
-__weak void arch_release_bp_slot(struct perf_event *bp)
-{
-}
+ if (ret)
+ return ret;
+ }
+ /*
+ * Note: If !enable, next_tsk_pinned will not count the to-be-removed breakpoint.
+ */
+ next_tsk_pinned = task_bp_pinned(-1, bp, type);
+
+ if (next_tsk_pinned >= 0) {
+ if (bp->cpu < 0) { /* Case 1: fast path */
+ if (!enable)
+ next_tsk_pinned += hw_breakpoint_weight(bp);
+ bp_slots_histogram_add(&tsk_pinned_all[type], next_tsk_pinned, weight);
+ } else if (enable) { /* Case 2.a: slow path */
+ /* Add existing to per-CPU histograms. */
+ for_each_possible_cpu(cpu) {
+ bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned,
+ 0, next_tsk_pinned);
+ }
+ /* Add this first CPU-pinned task breakpoint. */
+ bp_slots_histogram_add(&get_bp_info(bp->cpu, type)->tsk_pinned,
+ next_tsk_pinned, weight);
+ /* Rebalance global task pinned histogram. */
+ bp_slots_histogram_add(&tsk_pinned_all[type], next_tsk_pinned,
+ -next_tsk_pinned);
+ } else { /* Case 2.b: slow path */
+ /* Remove this last CPU-pinned task breakpoint. */
+ bp_slots_histogram_add(&get_bp_info(bp->cpu, type)->tsk_pinned,
+ next_tsk_pinned + hw_breakpoint_weight(bp), weight);
+ /* Remove all from per-CPU histograms. */
+ for_each_possible_cpu(cpu) {
+ bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned,
+ next_tsk_pinned, -next_tsk_pinned);
+ }
+ /* Rebalance global task pinned histogram. */
+ bp_slots_histogram_add(&tsk_pinned_all[type], 0, next_tsk_pinned);
+ }
+ } else { /* Case 3: slow path */
+ const struct cpumask *cpumask = cpumask_of_bp(bp);
+
+ for_each_cpu(cpu, cpumask) {
+ next_tsk_pinned = task_bp_pinned(cpu, bp, type);
+ if (!enable)
+ next_tsk_pinned += hw_breakpoint_weight(bp);
+ bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned,
+ next_tsk_pinned, weight);
+ }
+ }
-/*
- * Function to perform processor-specific cleanup during unregistration
- */
-__weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
-{
/*
- * A weak stub function here for those archs that don't define
- * it inside arch/.../kernel/hw_breakpoint.c
+ * Readers want a stable snapshot of the per-task breakpoint list.
*/
+ assert_bp_constraints_lock_held(bp);
+
+ if (enable)
+ return rhltable_insert(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params);
+
+ return 0;
}
/*
- * Constraints to check before allowing this new breakpoint counter:
+ * Constraints to check before allowing this new breakpoint counter.
+ *
+ * Note: Flexible breakpoints are currently unimplemented, but outlined in the
+ * below algorithm for completeness. The implementation treats flexible as
+ * pinned due to no guarantee that we currently always schedule flexible events
+ * before a pinned event in a same CPU.
*
* == Non-pinned counter == (Considered as pinned for now)
*
@@ -276,10 +571,9 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
*/
static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type)
{
- struct bp_busy_slots slots = {0};
enum bp_type_idx type;
+ int max_pinned_slots;
int weight;
- int ret;
/* We couldn't initialize breakpoint constraints on boot */
if (!constraints_initialized)
@@ -293,36 +587,20 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type)
type = find_slot_idx(bp_type);
weight = hw_breakpoint_weight(bp);
- fetch_bp_busy_slots(&slots, bp, type);
- /*
- * Simulate the addition of this breakpoint to the constraints
- * and see the result.
- */
- fetch_this_slot(&slots, weight);
-
- /* Flexible counters need to keep at least one slot */
- if (slots.pinned + (!!slots.flexible) > nr_slots[type])
+ /* Check if this new breakpoint can be satisfied across all CPUs. */
+ max_pinned_slots = max_bp_pinned_slots(bp, type) + weight;
+ if (max_pinned_slots > hw_breakpoint_slots_cached(type))
return -ENOSPC;
- ret = arch_reserve_bp_slot(bp);
- if (ret)
- return ret;
-
- toggle_bp_slot(bp, true, type, weight);
-
- return 0;
+ return toggle_bp_slot(bp, true, type, weight);
}
int reserve_bp_slot(struct perf_event *bp)
{
- int ret;
-
- mutex_lock(&nr_bp_mutex);
-
- ret = __reserve_bp_slot(bp, bp->attr.bp_type);
-
- mutex_unlock(&nr_bp_mutex);
+ struct mutex *mtx = bp_constraints_lock(bp);
+ int ret = __reserve_bp_slot(bp, bp->attr.bp_type);
+ bp_constraints_unlock(mtx);
return ret;
}
@@ -331,21 +609,17 @@ static void __release_bp_slot(struct perf_event *bp, u64 bp_type)
enum bp_type_idx type;
int weight;
- arch_release_bp_slot(bp);
-
type = find_slot_idx(bp_type);
weight = hw_breakpoint_weight(bp);
- toggle_bp_slot(bp, false, type, weight);
+ WARN_ON(toggle_bp_slot(bp, false, type, weight));
}
void release_bp_slot(struct perf_event *bp)
{
- mutex_lock(&nr_bp_mutex);
+ struct mutex *mtx = bp_constraints_lock(bp);
- arch_unregister_hw_breakpoint(bp);
__release_bp_slot(bp, bp->attr.bp_type);
-
- mutex_unlock(&nr_bp_mutex);
+ bp_constraints_unlock(mtx);
}
static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
@@ -372,11 +646,10 @@ static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
{
- int ret;
+ struct mutex *mtx = bp_constraints_lock(bp);
+ int ret = __modify_bp_slot(bp, old_type, new_type);
- mutex_lock(&nr_bp_mutex);
- ret = __modify_bp_slot(bp, old_type, new_type);
- mutex_unlock(&nr_bp_mutex);
+ bp_constraints_unlock(mtx);
return ret;
}
@@ -387,18 +660,28 @@ static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
*/
int dbg_reserve_bp_slot(struct perf_event *bp)
{
- if (mutex_is_locked(&nr_bp_mutex))
+ int ret;
+
+ if (bp_constraints_is_locked(bp))
return -1;
- return __reserve_bp_slot(bp, bp->attr.bp_type);
+ /* Locks aren't held; disable lockdep assert checking. */
+ lockdep_off();
+ ret = __reserve_bp_slot(bp, bp->attr.bp_type);
+ lockdep_on();
+
+ return ret;
}
int dbg_release_bp_slot(struct perf_event *bp)
{
- if (mutex_is_locked(&nr_bp_mutex))
+ if (bp_constraints_is_locked(bp))
return -1;
+ /* Locks aren't held; disable lockdep assert checking. */
+ lockdep_off();
__release_bp_slot(bp, bp->attr.bp_type);
+ lockdep_on();
return 0;
}
@@ -566,7 +849,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
cpu_events = alloc_percpu(typeof(*cpu_events));
if (!cpu_events)
- return (void __percpu __force *)ERR_PTR(-ENOMEM);
+ return ERR_PTR_PCPU(-ENOMEM);
cpus_read_lock();
for_each_online_cpu(cpu) {
@@ -585,7 +868,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
return cpu_events;
unregister_wide_hw_breakpoint(cpu_events);
- return (void __percpu __force *)ERR_PTR(err);
+ return ERR_PTR_PCPU(err);
}
EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
@@ -604,6 +887,50 @@ void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events)
}
EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
+/**
+ * hw_breakpoint_is_used - check if breakpoints are currently used
+ *
+ * Returns: true if breakpoints are used, false otherwise.
+ */
+bool hw_breakpoint_is_used(void)
+{
+ int cpu;
+
+ if (!constraints_initialized)
+ return false;
+
+ for_each_possible_cpu(cpu) {
+ for (int type = 0; type < TYPE_MAX; ++type) {
+ struct bp_cpuinfo *info = get_bp_info(cpu, type);
+
+ if (info->cpu_pinned)
+ return true;
+
+ for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) {
+ if (atomic_read(&info->tsk_pinned.count[slot]))
+ return true;
+ }
+ }
+ }
+
+ for (int type = 0; type < TYPE_MAX; ++type) {
+ for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) {
+ /*
+ * Warn, because if there are CPU pinned counters,
+ * should never get here; bp_cpuinfo::cpu_pinned should
+ * be consistent with the global cpu_pinned histogram.
+ */
+ if (WARN_ON(atomic_read(&cpu_pinned[type].count[slot])))
+ return true;
+
+ if (atomic_read(&tsk_pinned_all[type].count[slot]))
+ return true;
+ }
+ }
+
+ return false;
+}
+
static struct notifier_block hw_breakpoint_exceptions_nb = {
.notifier_call = hw_breakpoint_exceptions_notify,
/* we need to be notified first */
@@ -623,9 +950,10 @@ static int hw_breakpoint_event_init(struct perf_event *bp)
return -ENOENT;
/*
- * no branch sampling for breakpoint events
+ * Check if breakpoint type is supported before proceeding.
+ * Also, no branch sampling for breakpoint events.
*/
- if (has_branch_stack(bp))
+ if (!hw_breakpoint_slots_cached(find_slot_idx(bp->attr.bp_type)) || has_branch_stack(bp))
return -EOPNOTSUPP;
err = register_perf_hw_breakpoint(bp);
@@ -678,38 +1006,19 @@ static struct pmu perf_breakpoint = {
int __init init_hw_breakpoint(void)
{
- int cpu, err_cpu;
- int i;
-
- for (i = 0; i < TYPE_MAX; i++)
- nr_slots[i] = hw_breakpoint_slots(i);
+ int ret;
- for_each_possible_cpu(cpu) {
- for (i = 0; i < TYPE_MAX; i++) {
- struct bp_cpuinfo *info = get_bp_info(cpu, i);
+ ret = rhltable_init(&task_bps_ht, &task_bps_ht_params);
+ if (ret)
+ return ret;
- info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int),
- GFP_KERNEL);
- if (!info->tsk_pinned)
- goto err_alloc;
- }
- }
+ ret = init_breakpoint_slots();
+ if (ret)
+ return ret;
- constraints_initialized = 1;
+ constraints_initialized = true;
perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT);
return register_die_notifier(&hw_breakpoint_exceptions_nb);
-
- err_alloc:
- for_each_possible_cpu(err_cpu) {
- for (i = 0; i < TYPE_MAX; i++)
- kfree(get_bp_info(err_cpu, i)->tsk_pinned);
- if (err_cpu == cpu)
- break;
- }
-
- return -ENOMEM;
}
-
-
diff --git a/kernel/events/hw_breakpoint_test.c b/kernel/events/hw_breakpoint_test.c
new file mode 100644
index 000000000000..2cfeeecf8de9
--- /dev/null
+++ b/kernel/events/hw_breakpoint_test.c
@@ -0,0 +1,332 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KUnit test for hw_breakpoint constraints accounting logic.
+ *
+ * Copyright (C) 2022, Google LLC.
+ */
+
+#include <kunit/test.h>
+#include <linux/cpumask.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/kthread.h>
+#include <linux/perf_event.h>
+#include <asm/hw_breakpoint.h>
+
+#define TEST_REQUIRES_BP_SLOTS(test, slots) \
+ do { \
+ if ((slots) > get_test_bp_slots()) { \
+ kunit_skip((test), "Requires breakpoint slots: %d > %d", slots, \
+ get_test_bp_slots()); \
+ } \
+ } while (0)
+
+#define TEST_EXPECT_NOSPC(expr) KUNIT_EXPECT_EQ(test, -ENOSPC, PTR_ERR(expr))
+
+#define MAX_TEST_BREAKPOINTS 512
+
+static char break_vars[MAX_TEST_BREAKPOINTS];
+static struct perf_event *test_bps[MAX_TEST_BREAKPOINTS];
+static struct task_struct *__other_task;
+
+static struct perf_event *register_test_bp(int cpu, struct task_struct *tsk, int idx)
+{
+ struct perf_event_attr attr = {};
+
+ if (WARN_ON(idx < 0 || idx >= MAX_TEST_BREAKPOINTS))
+ return NULL;
+
+ hw_breakpoint_init(&attr);
+ attr.bp_addr = (unsigned long)&break_vars[idx];
+ attr.bp_len = HW_BREAKPOINT_LEN_1;
+ attr.bp_type = HW_BREAKPOINT_RW;
+ return perf_event_create_kernel_counter(&attr, cpu, tsk, NULL, NULL);
+}
+
+static void unregister_test_bp(struct perf_event **bp)
+{
+ if (WARN_ON(IS_ERR(*bp)))
+ return;
+ if (WARN_ON(!*bp))
+ return;
+ unregister_hw_breakpoint(*bp);
+ *bp = NULL;
+}
+
+static int get_test_bp_slots(void)
+{
+ static int slots;
+
+ if (!slots)
+ slots = hw_breakpoint_slots(TYPE_DATA);
+
+ return slots;
+}
+
+static void fill_one_bp_slot(struct kunit *test, int *id, int cpu, struct task_struct *tsk)
+{
+ struct perf_event *bp = register_test_bp(cpu, tsk, *id);
+
+ KUNIT_ASSERT_NOT_NULL(test, bp);
+ KUNIT_ASSERT_FALSE(test, IS_ERR(bp));
+ KUNIT_ASSERT_NULL(test, test_bps[*id]);
+ test_bps[(*id)++] = bp;
+}
+
+/*
+ * Fills up the given @cpu/@tsk with breakpoints, only leaving @skip slots free.
+ *
+ * Returns true if this can be called again, continuing at @id.
+ */
+static bool fill_bp_slots(struct kunit *test, int *id, int cpu, struct task_struct *tsk, int skip)
+{
+ for (int i = 0; i < get_test_bp_slots() - skip; ++i)
+ fill_one_bp_slot(test, id, cpu, tsk);
+
+ return *id + get_test_bp_slots() <= MAX_TEST_BREAKPOINTS;
+}
+
+static int dummy_kthread(void *arg)
+{
+ return 0;
+}
+
+static struct task_struct *get_other_task(struct kunit *test)
+{
+ struct task_struct *tsk;
+
+ if (__other_task)
+ return __other_task;
+
+ tsk = kthread_create(dummy_kthread, NULL, "hw_breakpoint_dummy_task");
+ KUNIT_ASSERT_FALSE(test, IS_ERR(tsk));
+ __other_task = tsk;
+ return __other_task;
+}
+
+static int get_test_cpu(int num)
+{
+ int cpu;
+
+ WARN_ON(num < 0);
+
+ for_each_online_cpu(cpu) {
+ if (num-- <= 0)
+ break;
+ }
+
+ return cpu;
+}
+
+/* ===== Test cases ===== */
+
+static void test_one_cpu(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, get_test_cpu(0), NULL, 0);
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+}
+
+static void test_many_cpus(struct kunit *test)
+{
+ int idx = 0;
+ int cpu;
+
+ /* Test that CPUs are independent. */
+ for_each_online_cpu(cpu) {
+ bool do_continue = fill_bp_slots(test, &idx, cpu, NULL, 0);
+
+ TEST_EXPECT_NOSPC(register_test_bp(cpu, NULL, idx));
+ if (!do_continue)
+ break;
+ }
+}
+
+static void test_one_task_on_all_cpus(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, -1, current, 0);
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /* Remove one and adding back CPU-target should work. */
+ unregister_test_bp(&test_bps[0]);
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL);
+}
+
+static void test_two_tasks_on_all_cpus(struct kunit *test)
+{
+ int idx = 0;
+
+ /* Test that tasks are independent. */
+ fill_bp_slots(test, &idx, -1, current, 0);
+ fill_bp_slots(test, &idx, -1, get_other_task(test), 0);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /* Remove one from first task and adding back CPU-target should not work. */
+ unregister_test_bp(&test_bps[0]);
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+}
+
+static void test_one_task_on_one_cpu(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, get_test_cpu(0), current, 0);
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /*
+ * Remove one and adding back CPU-target should work; this case is
+ * special vs. above because the task's constraints are CPU-dependent.
+ */
+ unregister_test_bp(&test_bps[0]);
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL);
+}
+
+static void test_one_task_mixed(struct kunit *test)
+{
+ int idx = 0;
+
+ TEST_REQUIRES_BP_SLOTS(test, 3);
+
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), current);
+ fill_bp_slots(test, &idx, -1, current, 1);
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+
+ /* Transition from CPU-dependent pinned count to CPU-independent. */
+ unregister_test_bp(&test_bps[0]);
+ unregister_test_bp(&test_bps[1]);
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL);
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL);
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+}
+
+static void test_two_tasks_on_one_cpu(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, get_test_cpu(0), current, 0);
+ fill_bp_slots(test, &idx, get_test_cpu(0), get_other_task(test), 0);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /* Can still create breakpoints on some other CPU. */
+ fill_bp_slots(test, &idx, get_test_cpu(1), NULL, 0);
+}
+
+static void test_two_tasks_on_one_all_cpus(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, get_test_cpu(0), current, 0);
+ fill_bp_slots(test, &idx, -1, get_other_task(test), 0);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /* Cannot create breakpoints on some other CPU either. */
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx));
+}
+
+static void test_task_on_all_and_one_cpu(struct kunit *test)
+{
+ int tsk_on_cpu_idx, cpu_idx;
+ int idx = 0;
+
+ TEST_REQUIRES_BP_SLOTS(test, 3);
+
+ fill_bp_slots(test, &idx, -1, current, 2);
+ /* Transitioning from only all CPU breakpoints to mixed. */
+ tsk_on_cpu_idx = idx;
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), current);
+ fill_one_bp_slot(test, &idx, -1, current);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+
+ /* We should still be able to use up another CPU's slots. */
+ cpu_idx = idx;
+ fill_one_bp_slot(test, &idx, get_test_cpu(1), NULL);
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx));
+
+ /* Transitioning back to task target on all CPUs. */
+ unregister_test_bp(&test_bps[tsk_on_cpu_idx]);
+ /* Still have a CPU target breakpoint in get_test_cpu(1). */
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ /* Remove it and try again. */
+ unregister_test_bp(&test_bps[cpu_idx]);
+ fill_one_bp_slot(test, &idx, -1, current);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx));
+}
+
+static struct kunit_case hw_breakpoint_test_cases[] = {
+ KUNIT_CASE(test_one_cpu),
+ KUNIT_CASE(test_many_cpus),
+ KUNIT_CASE(test_one_task_on_all_cpus),
+ KUNIT_CASE(test_two_tasks_on_all_cpus),
+ KUNIT_CASE(test_one_task_on_one_cpu),
+ KUNIT_CASE(test_one_task_mixed),
+ KUNIT_CASE(test_two_tasks_on_one_cpu),
+ KUNIT_CASE(test_two_tasks_on_one_all_cpus),
+ KUNIT_CASE(test_task_on_all_and_one_cpu),
+ {},
+};
+
+static int test_init(struct kunit *test)
+{
+ /* Most test cases want 2 distinct CPUs. */
+ if (num_online_cpus() < 2)
+ kunit_skip(test, "not enough cpus");
+
+ /* Want the system to not use breakpoints elsewhere. */
+ if (hw_breakpoint_is_used())
+ kunit_skip(test, "hw breakpoint already in use");
+
+ return 0;
+}
+
+static void test_exit(struct kunit *test)
+{
+ for (int i = 0; i < MAX_TEST_BREAKPOINTS; ++i) {
+ if (test_bps[i])
+ unregister_test_bp(&test_bps[i]);
+ }
+
+ if (__other_task) {
+ kthread_stop(__other_task);
+ __other_task = NULL;
+ }
+
+ /* Verify that internal state agrees that no breakpoints are in use. */
+ KUNIT_EXPECT_FALSE(test, hw_breakpoint_is_used());
+}
+
+static struct kunit_suite hw_breakpoint_test_suite = {
+ .name = "hw_breakpoint",
+ .test_cases = hw_breakpoint_test_cases,
+ .init = test_init,
+ .exit = test_exit,
+};
+
+kunit_test_suites(&hw_breakpoint_test_suite);
+
+MODULE_AUTHOR("Marco Elver <elver@google.com>");
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 082832738c8f..d9cc57083091 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -35,22 +35,24 @@ struct perf_buffer {
spinlock_t event_lock;
struct list_head event_list;
- atomic_t mmap_count;
+ refcount_t mmap_count;
unsigned long mmap_locked;
struct user_struct *mmap_user;
/* AUX area */
+ struct mutex aux_mutex;
long aux_head;
unsigned int aux_nest;
long aux_wakeup; /* last aux_watermark boundary crossed by aux_head */
unsigned long aux_pgoff;
int aux_nr_pages;
int aux_overwrite;
- atomic_t aux_mmap_count;
+ refcount_t aux_mmap_count;
unsigned long aux_mmap_locked;
void (*free_aux)(void *);
refcount_t aux_refcount;
int aux_in_sampling;
+ int aux_in_pause_resume;
void **aux_pages;
void *aux_priv;
@@ -116,6 +118,11 @@ static inline int page_order(struct perf_buffer *rb)
}
#endif
+static inline int data_page_nr(struct perf_buffer *rb)
+{
+ return rb->nr_pages << page_order(rb);
+}
+
static inline unsigned long perf_data_size(struct perf_buffer *rb)
{
return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
@@ -123,7 +130,7 @@ static inline unsigned long perf_data_size(struct perf_buffer *rb)
static inline unsigned long perf_aux_size(struct perf_buffer *rb)
{
- return rb->aux_nr_pages << PAGE_SHIFT;
+ return (unsigned long)rb->aux_nr_pages << PAGE_SHIFT;
}
#define __DEFINE_OUTPUT_COPY_BODY(advance_buf, memcpy_func, ...) \
@@ -203,7 +210,7 @@ arch_perf_out_copy_user(void *dst, const void *src, unsigned long n)
DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
-static inline int get_recursion_context(int *recursion)
+static inline int get_recursion_context(u8 *recursion)
{
unsigned char rctx = interrupt_context_level();
@@ -216,7 +223,7 @@ static inline int get_recursion_context(int *recursion)
return rctx;
}
-static inline void put_recursion_context(int *recursion, int rctx)
+static inline void put_recursion_context(u8 *recursion, unsigned char rctx)
{
barrier();
recursion[rctx]--;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 52868716ec35..20a905023736 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -19,10 +19,14 @@
static void perf_output_wakeup(struct perf_output_handle *handle)
{
- atomic_set(&handle->rb->poll, EPOLLIN);
+ atomic_set(&handle->rb->poll, EPOLLIN | EPOLLRDNORM);
handle->event->pending_wakeup = 1;
- irq_work_queue(&handle->event->pending);
+
+ if (*perf_event_fasync(handle->event) && !handle->event->pending_kill)
+ handle->event->pending_kill = POLL_IN;
+
+ irq_work_queue(&handle->event->pending_irq);
}
/*
@@ -172,13 +176,16 @@ __perf_output_begin(struct perf_output_handle *handle,
goto out;
if (unlikely(rb->paused)) {
- if (rb->nr_pages)
+ if (rb->nr_pages) {
local_inc(&rb->lost);
+ atomic64_inc(&event->lost_samples);
+ }
goto out;
}
handle->rb = rb;
handle->event = event;
+ handle->flags = 0;
have_lost = local_read(&rb->lost);
if (unlikely(have_lost)) {
@@ -189,9 +196,10 @@ __perf_output_begin(struct perf_output_handle *handle,
perf_output_get_handle(handle);
+ offset = local_read(&rb->head);
do {
+ head = offset;
tail = READ_ONCE(rb->user_page->data_tail);
- offset = head = local_read(&rb->head);
if (!rb->overwrite) {
if (unlikely(!ring_buffer_has_space(head, tail,
perf_data_size(rb),
@@ -215,7 +223,7 @@ __perf_output_begin(struct perf_output_handle *handle,
head += size;
else
head -= size;
- } while (local_cmpxchg(&rb->head, offset, head) != offset);
+ } while (!local_try_cmpxchg(&rb->head, &offset, head));
if (backward) {
offset = head;
@@ -254,6 +262,7 @@ __perf_output_begin(struct perf_output_handle *handle,
fail:
local_inc(&rb->lost);
+ atomic64_inc(&event->lost_samples);
perf_output_put_handle(handle);
out:
rcu_read_unlock();
@@ -329,6 +338,8 @@ ring_buffer_init(struct perf_buffer *rb, long watermark, int flags)
*/
if (!rb->nr_pages)
rb->paused = 1;
+
+ mutex_init(&rb->aux_mutex);
}
void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags)
@@ -389,7 +400,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
* the same order, see perf_mmap_close. Otherwise we end up freeing
* aux pages in this path, which is a bug, because in_atomic().
*/
- if (!atomic_read(&rb->aux_mmap_count))
+ if (!refcount_read(&rb->aux_mmap_count))
goto err;
if (!refcount_inc_not_zero(&rb->aux_refcount))
@@ -430,7 +441,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
* store that will be enabled on successful return
*/
if (!handle->size) { /* A, matches D */
- event->pending_disable = smp_processor_id();
+ perf_event_disable_inatomic(handle->event);
perf_output_wakeup(handle);
WRITE_ONCE(rb->aux_nest, 0);
goto err_put;
@@ -515,7 +526,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
if (wakeup) {
if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
- handle->event->pending_disable = smp_processor_id();
+ perf_event_disable_inatomic(handle->event);
perf_output_wakeup(handle);
}
@@ -606,8 +617,8 @@ static struct page *rb_alloc_aux_page(int node, int order)
{
struct page *page;
- if (order > MAX_ORDER)
- order = MAX_ORDER;
+ if (order > MAX_PAGE_ORDER)
+ order = MAX_PAGE_ORDER;
do {
page = alloc_pages_node(node, PERF_AUX_GFP, order);
@@ -633,7 +644,6 @@ static void rb_free_aux_page(struct perf_buffer *rb, int idx)
struct page *page = virt_to_page(rb->aux_pages[idx]);
ClearPagePrivate(page);
- page->mapping = NULL;
__free_page(page);
}
@@ -669,33 +679,55 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
{
bool overwrite = !(flags & RING_BUFFER_WRITABLE);
int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
- int ret = -ENOMEM, max_order;
+ bool use_contiguous_pages = event->pmu->capabilities & (
+ PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_PREFER_LARGE);
+ /*
+ * Initialize max_order to 0 for page allocation. This allocates single
+ * pages to minimize memory fragmentation. This is overridden if the
+ * PMU needs or prefers contiguous pages (use_contiguous_pages = true).
+ */
+ int max_order = 0;
+ int ret = -ENOMEM;
if (!has_aux(event))
return -EOPNOTSUPP;
+ if (nr_pages <= 0)
+ return -EINVAL;
+
if (!overwrite) {
/*
- * Watermark defaults to half the buffer, and so does the
- * max_order, to aid PMU drivers in double buffering.
+ * Watermark defaults to half the buffer, to aid PMU drivers
+ * in double buffering.
*/
if (!watermark)
- watermark = nr_pages << (PAGE_SHIFT - 1);
+ watermark = min_t(unsigned long,
+ U32_MAX,
+ (unsigned long)nr_pages << (PAGE_SHIFT - 1));
/*
- * Use aux_watermark as the basis for chunking to
- * help PMU drivers honor the watermark.
+ * If using contiguous pages, use aux_watermark as the basis
+ * for chunking to help PMU drivers honor the watermark.
*/
- max_order = get_order(watermark);
+ if (use_contiguous_pages)
+ max_order = get_order(watermark);
} else {
/*
- * We need to start with the max_order that fits in nr_pages,
- * not the other way around, hence ilog2() and not get_order.
+ * If using contiguous pages, we need to start with the
+ * max_order that fits in nr_pages, not the other way around,
+ * hence ilog2() and not get_order.
*/
- max_order = ilog2(nr_pages);
+ if (use_contiguous_pages)
+ max_order = ilog2(nr_pages);
watermark = 0;
}
+ /*
+ * kcalloc_node() is unable to allocate buffer if the size is larger
+ * than: PAGE_SIZE << MAX_PAGE_ORDER; directly bail out in this case.
+ */
+ if (get_order((unsigned long)nr_pages * sizeof(void *)) > MAX_PAGE_ORDER)
+ return -ENOMEM;
rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL,
node);
if (!rb->aux_pages)
@@ -798,7 +830,6 @@ static void perf_mmap_free_page(void *addr)
{
struct page *page = virt_to_page(addr);
- page->mapping = NULL;
__free_page(page);
}
@@ -811,7 +842,7 @@ struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
size = sizeof(struct perf_buffer);
size += nr_pages * sizeof(void *);
- if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER)
+ if (order_base_2(size) > PAGE_SHIFT+MAX_PAGE_ORDER)
goto fail;
node = (cpu == -1) ? cpu : cpu_to_node(cpu);
@@ -859,11 +890,6 @@ void rb_free(struct perf_buffer *rb)
}
#else
-static int data_page_nr(struct perf_buffer *rb)
-{
- return rb->nr_pages << page_order(rb);
-}
-
static struct page *
__perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
{
@@ -874,28 +900,13 @@ __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
}
-static void perf_mmap_unmark_page(void *addr)
-{
- struct page *page = vmalloc_to_page(addr);
-
- page->mapping = NULL;
-}
-
static void rb_free_work(struct work_struct *work)
{
struct perf_buffer *rb;
- void *base;
- int i, nr;
rb = container_of(work, struct perf_buffer, work);
- nr = data_page_nr(rb);
-
- base = rb->user_page;
- /* The '<=' counts in the user page. */
- for (i = 0; i <= nr; i++)
- perf_mmap_unmark_page(base + (i * PAGE_SIZE));
- vfree(base);
+ vfree(rb->user_page);
kfree(rb);
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 6357c3580d07..f11ceb8be8c4 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -15,18 +15,21 @@
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
-#include <linux/sched/coredump.h>
#include <linux/export.h>
#include <linux/rmap.h> /* anon_vma_prepare */
-#include <linux/mmu_notifier.h> /* set_pte_at_notify */
-#include <linux/swap.h> /* try_to_free_swap */
+#include <linux/mmu_notifier.h>
+#include <linux/swap.h> /* folio_free_swap */
#include <linux/ptrace.h> /* user_enable_single_step */
#include <linux/kdebug.h> /* notifier mechanism */
-#include "../../mm/internal.h" /* munlock_vma_page */
#include <linux/percpu-rwsem.h>
#include <linux/task_work.h>
#include <linux/shmem_fs.h>
#include <linux/khugepaged.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/workqueue.h>
+#include <linux/srcu.h>
+#include <linux/oom.h> /* check_stable_address_space */
+#include <linux/pagewalk.h>
#include <linux/uprobes.h>
@@ -40,7 +43,8 @@ static struct rb_root uprobes_tree = RB_ROOT;
*/
#define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
-static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
+static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */
+static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock);
#define UPROBES_HASH_SZ 13
/* serialize uprobe->pending_list */
@@ -49,6 +53,9 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
+/* Covers return_instance's uprobe lifetime. */
+DEFINE_STATIC_SRCU(uretprobes_srcu);
+
/* Have a copy of original instruction */
#define UPROBE_COPY_INSN 0
@@ -58,11 +65,15 @@ struct uprobe {
struct rw_semaphore register_rwsem;
struct rw_semaphore consumer_rwsem;
struct list_head pending_list;
- struct uprobe_consumer *consumers;
+ struct list_head consumers;
struct inode *inode; /* Also hold a ref to inode */
+ union {
+ struct rcu_head rcu;
+ struct work_struct work;
+ };
loff_t offset;
loff_t ref_ctr_offset;
- unsigned long flags;
+ unsigned long flags; /* "unsigned long" so bitops work */
/*
* The generic code assumes that it has two members of unknown type
@@ -97,11 +108,9 @@ static LIST_HEAD(delayed_uprobe_list);
*/
struct xol_area {
wait_queue_head_t wq; /* if all slots are busy */
- atomic_t slot_count; /* number of in-use slots */
unsigned long *bitmap; /* 0 = free slot */
- struct vm_special_mapping xol_mapping;
- struct page *pages[2];
+ struct page *page;
/*
* We keep the vma's vm_start rather than a pointer to the vma
* itself. The probed process or a naughty kernel module could make
@@ -110,6 +119,11 @@ struct xol_area {
unsigned long vaddr; /* Page(s) of instruction slots */
};
+static void uprobe_warn(struct task_struct *t, const char *msg)
+{
+ pr_warn("uprobe: %s:%d failed to %s\n", t->comm, t->pid, msg);
+}
+
/*
* valid_vma: Verify if the specified vma is an executable vma
* Relax restrictions while unregistering: vm_flags might have
@@ -139,85 +153,6 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
}
/**
- * __replace_page - replace page in vma by new page.
- * based on replace_page in mm/ksm.c
- *
- * @vma: vma that holds the pte pointing to page
- * @addr: address the old @page is mapped at
- * @old_page: the page we are replacing by new_page
- * @new_page: the modified page we replace page by
- *
- * If @new_page is NULL, only unmap @old_page.
- *
- * Returns 0 on success, negative error code otherwise.
- */
-static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
- struct page *old_page, struct page *new_page)
-{
- struct mm_struct *mm = vma->vm_mm;
- struct page_vma_mapped_walk pvmw = {
- .page = compound_head(old_page),
- .vma = vma,
- .address = addr,
- };
- int err;
- struct mmu_notifier_range range;
-
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
- addr + PAGE_SIZE);
-
- if (new_page) {
- err = mem_cgroup_charge(page_folio(new_page), vma->vm_mm,
- GFP_KERNEL);
- if (err)
- return err;
- }
-
- /* For try_to_free_swap() and munlock_vma_page() below */
- lock_page(old_page);
-
- mmu_notifier_invalidate_range_start(&range);
- err = -EAGAIN;
- if (!page_vma_mapped_walk(&pvmw))
- goto unlock;
- VM_BUG_ON_PAGE(addr != pvmw.address, old_page);
-
- if (new_page) {
- get_page(new_page);
- page_add_new_anon_rmap(new_page, vma, addr, false);
- lru_cache_add_inactive_or_unevictable(new_page, vma);
- } else
- /* no new page, just dec_mm_counter for old_page */
- dec_mm_counter(mm, MM_ANONPAGES);
-
- if (!PageAnon(old_page)) {
- dec_mm_counter(mm, mm_counter_file(old_page));
- inc_mm_counter(mm, MM_ANONPAGES);
- }
-
- flush_cache_page(vma, addr, pte_pfn(*pvmw.pte));
- ptep_clear_flush_notify(vma, addr, pvmw.pte);
- if (new_page)
- set_pte_at_notify(mm, addr, pvmw.pte,
- mk_pte(new_page, vma->vm_page_prot));
-
- page_remove_rmap(old_page, false);
- if (!page_mapped(old_page))
- try_to_free_swap(old_page);
- page_vma_mapped_walk_done(&pvmw);
-
- if ((vma->vm_flags & VM_LOCKED) && !PageCompound(old_page))
- munlock_vma_page(old_page);
- put_page(old_page);
-
- err = 0;
- unlock:
- mmu_notifier_invalidate_range_end(&range);
- unlock_page(old_page);
- return err;
-}
-
-/**
* is_swbp_insn - check if instruction is breakpoint instruction.
* @insn: instruction to be checked.
* Default implementation of is_swbp_insn
@@ -242,7 +177,7 @@ bool __weak is_trap_insn(uprobe_opcode_t *insn)
return is_swbp_insn(insn);
}
-static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
+void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
{
void *kaddr = kmap_atomic(page);
memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
@@ -256,7 +191,8 @@ static void copy_to_page(struct page *page, unsigned long vaddr, const void *src
kunmap_atomic(kaddr);
}
-static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
+static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn,
+ int nbytes, void *data)
{
uprobe_opcode_t old_opcode;
bool is_swbp;
@@ -270,10 +206,10 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
* is a trap variant; uprobes always wins over any other (gdb)
* breakpoint.
*/
- copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
+ uprobe_copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
is_swbp = is_swbp_insn(&old_opcode);
- if (is_swbp_insn(new_opcode)) {
+ if (is_swbp_insn(insn)) {
if (is_swbp) /* register: already installed? */
return 0;
} else {
@@ -356,9 +292,10 @@ static bool valid_ref_ctr_vma(struct uprobe *uprobe,
static struct vm_area_struct *
find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *tmp;
- for (tmp = mm->mmap; tmp; tmp = tmp->vm_next)
+ for_each_vma(vmi, tmp)
if (valid_ref_ctr_vma(uprobe, tmp))
return tmp;
@@ -370,7 +307,6 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
{
void *kaddr;
struct page *page;
- struct vm_area_struct *vma;
int ret;
short *ptr;
@@ -378,7 +314,7 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
return -EINVAL;
ret = get_user_pages_remote(mm, vaddr, 1,
- FOLL_WRITE, &page, &vma, NULL);
+ FOLL_WRITE, &page, NULL);
if (unlikely(ret <= 0)) {
/*
* We are asking for 1 page. If get_user_pages_remote() fails,
@@ -409,7 +345,7 @@ static void update_ref_ctr_warn(struct uprobe *uprobe,
struct mm_struct *mm, short d)
{
pr_warn("ref_ctr %s failed for inode: 0x%lx offset: "
- "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%pK\n",
+ "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%p\n",
d > 0 ? "increment" : "decrement", uprobe->inode->i_ino,
(unsigned long long) uprobe->offset,
(unsigned long long) uprobe->ref_ctr_offset, mm);
@@ -444,6 +380,94 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
return ret;
}
+static bool orig_page_is_identical(struct vm_area_struct *vma,
+ unsigned long vaddr, struct page *page, bool *pmd_mappable)
+{
+ const pgoff_t index = vaddr_to_offset(vma, vaddr) >> PAGE_SHIFT;
+ struct folio *orig_folio = filemap_get_folio(vma->vm_file->f_mapping,
+ index);
+ struct page *orig_page;
+ bool identical;
+
+ if (IS_ERR(orig_folio))
+ return false;
+ orig_page = folio_file_page(orig_folio, index);
+
+ *pmd_mappable = folio_test_pmd_mappable(orig_folio);
+ identical = folio_test_uptodate(orig_folio) &&
+ pages_identical(page, orig_page);
+ folio_put(orig_folio);
+ return identical;
+}
+
+static int __uprobe_write(struct vm_area_struct *vma,
+ struct folio_walk *fw, struct folio *folio,
+ unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes,
+ bool is_register)
+{
+ const unsigned long vaddr = insn_vaddr & PAGE_MASK;
+ bool pmd_mappable;
+
+ /* For now, we'll only handle PTE-mapped folios. */
+ if (fw->level != FW_LEVEL_PTE)
+ return -EFAULT;
+
+ /*
+ * See can_follow_write_pte(): we'd actually prefer a writable PTE here,
+ * but the VMA might not be writable.
+ */
+ if (!pte_write(fw->pte)) {
+ if (!PageAnonExclusive(fw->page))
+ return -EFAULT;
+ if (unlikely(userfaultfd_pte_wp(vma, fw->pte)))
+ return -EFAULT;
+ /* SOFTDIRTY is handled via pte_mkdirty() below. */
+ }
+
+ /*
+ * We'll temporarily unmap the page and flush the TLB, such that we can
+ * modify the page atomically.
+ */
+ flush_cache_page(vma, vaddr, pte_pfn(fw->pte));
+ fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep);
+ copy_to_page(fw->page, insn_vaddr, insn, nbytes);
+
+ /*
+ * When unregistering, we may only zap a PTE if uffd is disabled and
+ * there are no unexpected folio references ...
+ */
+ if (is_register || userfaultfd_missing(vma) ||
+ (folio_ref_count(folio) != folio_expected_ref_count(folio) + 1))
+ goto remap;
+
+ /*
+ * ... and the mapped page is identical to the original page that
+ * would get faulted in on next access.
+ */
+ if (!orig_page_is_identical(vma, vaddr, fw->page, &pmd_mappable))
+ goto remap;
+
+ dec_mm_counter(vma->vm_mm, MM_ANONPAGES);
+ folio_remove_rmap_pte(folio, fw->page, vma);
+ if (!folio_mapped(folio) && folio_test_swapcache(folio) &&
+ folio_trylock(folio)) {
+ folio_free_swap(folio);
+ folio_unlock(folio);
+ }
+ folio_put(folio);
+
+ return pmd_mappable;
+remap:
+ /*
+ * Make sure that our copy_to_page() changes become visible before the
+ * set_pte_at() write.
+ */
+ smp_wmb();
+ /* We modified the page. Make sure to mark the PTE dirty. */
+ set_pte_at(vma->vm_mm, vaddr, fw->ptep, pte_mkdirty(fw->pte));
+ return 0;
+}
+
/*
* NOTE:
* Expect the breakpoint instruction to be the smallest size instruction for
@@ -455,163 +479,379 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
*
* uprobe_write_opcode - write the opcode at a given virtual address.
* @auprobe: arch specific probepoint information.
- * @mm: the probed process address space.
- * @vaddr: the virtual address to store the opcode.
- * @opcode: opcode to be written at @vaddr.
+ * @vma: the probed virtual memory area.
+ * @opcode_vaddr: the virtual address to store the opcode.
+ * @opcode: opcode to be written at @opcode_vaddr.
*
* Called with mm->mmap_lock held for write.
* Return 0 (success) or a negative errno.
*/
-int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
- unsigned long vaddr, uprobe_opcode_t opcode)
+int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ const unsigned long opcode_vaddr, uprobe_opcode_t opcode,
+ bool is_register)
{
+ return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE,
+ verify_opcode, is_register, true /* do_update_ref_ctr */, NULL);
+}
+
+int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes,
+ uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr,
+ void *data)
+{
+ const unsigned long vaddr = insn_vaddr & PAGE_MASK;
+ struct mm_struct *mm = vma->vm_mm;
struct uprobe *uprobe;
- struct page *old_page, *new_page;
- struct vm_area_struct *vma;
- int ret, is_register, ref_ctr_updated = 0;
- bool orig_page_huge = false;
+ int ret, ref_ctr_updated = 0;
unsigned int gup_flags = FOLL_FORCE;
+ struct mmu_notifier_range range;
+ struct folio_walk fw;
+ struct folio *folio;
+ struct page *page;
- is_register = is_swbp_insn(&opcode);
uprobe = container_of(auprobe, struct uprobe, arch);
-retry:
+ if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags)))
+ return -EINVAL;
+
+ /*
+ * When registering, we have to break COW to get an exclusive anonymous
+ * page that we can safely modify. Use FOLL_WRITE to trigger a write
+ * fault if required. When unregistering, we might be lucky and the
+ * anon page is already gone. So defer write faults until really
+ * required. Use FOLL_SPLIT_PMD, because __uprobe_write()
+ * cannot deal with PMDs yet.
+ */
if (is_register)
- gup_flags |= FOLL_SPLIT_PMD;
- /* Read the page with vaddr into memory */
- ret = get_user_pages_remote(mm, vaddr, 1, gup_flags,
- &old_page, &vma, NULL);
- if (ret <= 0)
- return ret;
+ gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD;
- ret = verify_opcode(old_page, vaddr, &opcode);
+retry:
+ ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &page, NULL);
if (ret <= 0)
- goto put_old;
+ goto out;
+ folio = page_folio(page);
- if (WARN(!is_register && PageCompound(old_page),
- "uprobe unregister should never work on compound page\n")) {
- ret = -EINVAL;
- goto put_old;
+ ret = verify(page, insn_vaddr, insn, nbytes, data);
+ if (ret <= 0) {
+ folio_put(folio);
+ goto out;
}
/* We are going to replace instruction, update ref_ctr. */
- if (!ref_ctr_updated && uprobe->ref_ctr_offset) {
+ if (do_update_ref_ctr && !ref_ctr_updated && uprobe->ref_ctr_offset) {
ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1);
- if (ret)
- goto put_old;
+ if (ret) {
+ folio_put(folio);
+ goto out;
+ }
ref_ctr_updated = 1;
}
ret = 0;
- if (!is_register && !PageAnon(old_page))
- goto put_old;
-
- ret = anon_vma_prepare(vma);
- if (ret)
- goto put_old;
-
- ret = -ENOMEM;
- new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
- if (!new_page)
- goto put_old;
-
- __SetPageUptodate(new_page);
- copy_highpage(new_page, old_page);
- copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+ if (unlikely(!folio_test_anon(folio) || folio_is_zone_device(folio))) {
+ VM_WARN_ON_ONCE(is_register);
+ folio_put(folio);
+ goto out;
+ }
if (!is_register) {
- struct page *orig_page;
- pgoff_t index;
-
- VM_BUG_ON_PAGE(!PageAnon(old_page), old_page);
-
- index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT;
- orig_page = find_get_page(vma->vm_file->f_inode->i_mapping,
- index);
-
- if (orig_page) {
- if (PageUptodate(orig_page) &&
- pages_identical(new_page, orig_page)) {
- /* let go new_page */
- put_page(new_page);
- new_page = NULL;
-
- if (PageCompound(orig_page))
- orig_page_huge = true;
- }
- put_page(orig_page);
- }
+ /*
+ * In the common case, we'll be able to zap the page when
+ * unregistering. So trigger MMU notifiers now, as we won't
+ * be able to do it under PTL.
+ */
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
+ vaddr, vaddr + PAGE_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
}
- ret = __replace_page(vma, vaddr, old_page, new_page);
- if (new_page)
- put_page(new_page);
-put_old:
- put_page(old_page);
+ ret = -EAGAIN;
+ /* Walk the page tables again, to perform the actual update. */
+ if (folio_walk_start(&fw, vma, vaddr, 0)) {
+ if (fw.page == page)
+ ret = __uprobe_write(vma, &fw, folio, insn_vaddr, insn, nbytes, is_register);
+ folio_walk_end(&fw, vma);
+ }
+
+ if (!is_register)
+ mmu_notifier_invalidate_range_end(&range);
- if (unlikely(ret == -EAGAIN))
+ folio_put(folio);
+ switch (ret) {
+ case -EFAULT:
+ gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD;
+ fallthrough;
+ case -EAGAIN:
goto retry;
+ default:
+ break;
+ }
+out:
/* Revert back reference counter if instruction update failed. */
- if (ret && is_register && ref_ctr_updated)
- update_ref_ctr(uprobe, mm, -1);
+ if (do_update_ref_ctr && ret < 0 && ref_ctr_updated)
+ update_ref_ctr(uprobe, mm, is_register ? -1 : 1);
/* try collapse pmd for compound page */
- if (!ret && orig_page_huge)
- collapse_pte_mapped_thp(mm, vaddr);
+ if (ret > 0)
+ collapse_pte_mapped_thp(mm, vaddr, false);
- return ret;
+ return ret < 0 ? ret : 0;
}
/**
* set_swbp - store breakpoint at a given address.
* @auprobe: arch specific probepoint information.
- * @mm: the probed process address space.
+ * @vma: the probed virtual memory area.
* @vaddr: the virtual address to insert the opcode.
*
* For mm @mm, store the breakpoint instruction at @vaddr.
* Return 0 (success) or a negative errno.
*/
-int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
+int __weak set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
{
- return uprobe_write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN);
+ return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN, true);
}
/**
* set_orig_insn - Restore the original instruction.
- * @mm: the probed process address space.
+ * @vma: the probed virtual memory area.
* @auprobe: arch specific probepoint information.
* @vaddr: the virtual address to insert the opcode.
*
* For mm @mm, restore the original opcode (opcode) at @vaddr.
* Return 0 (success) or a negative errno.
*/
-int __weak
-set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
+int __weak set_orig_insn(struct arch_uprobe *auprobe,
+ struct vm_area_struct *vma, unsigned long vaddr)
{
- return uprobe_write_opcode(auprobe, mm, vaddr,
- *(uprobe_opcode_t *)&auprobe->insn);
+ return uprobe_write_opcode(auprobe, vma, vaddr,
+ *(uprobe_opcode_t *)&auprobe->insn, false);
}
+/* uprobe should have guaranteed positive refcount */
static struct uprobe *get_uprobe(struct uprobe *uprobe)
{
refcount_inc(&uprobe->ref);
return uprobe;
}
+/*
+ * uprobe should have guaranteed lifetime, which can be either of:
+ * - caller already has refcount taken (and wants an extra one);
+ * - uprobe is RCU protected and won't be freed until after grace period;
+ * - we are holding uprobes_treelock (for read or write, doesn't matter).
+ */
+static struct uprobe *try_get_uprobe(struct uprobe *uprobe)
+{
+ if (refcount_inc_not_zero(&uprobe->ref))
+ return uprobe;
+ return NULL;
+}
+
+static inline bool uprobe_is_active(struct uprobe *uprobe)
+{
+ return !RB_EMPTY_NODE(&uprobe->rb_node);
+}
+
+static void uprobe_free_rcu_tasks_trace(struct rcu_head *rcu)
+{
+ struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
+
+ kfree(uprobe);
+}
+
+static void uprobe_free_srcu(struct rcu_head *rcu)
+{
+ struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
+
+ call_rcu_tasks_trace(&uprobe->rcu, uprobe_free_rcu_tasks_trace);
+}
+
+static void uprobe_free_deferred(struct work_struct *work)
+{
+ struct uprobe *uprobe = container_of(work, struct uprobe, work);
+
+ write_lock(&uprobes_treelock);
+
+ if (uprobe_is_active(uprobe)) {
+ write_seqcount_begin(&uprobes_seqcount);
+ rb_erase(&uprobe->rb_node, &uprobes_tree);
+ write_seqcount_end(&uprobes_seqcount);
+ }
+
+ write_unlock(&uprobes_treelock);
+
+ /*
+ * If application munmap(exec_vma) before uprobe_unregister()
+ * gets called, we don't get a chance to remove uprobe from
+ * delayed_uprobe_list from remove_breakpoint(). Do it here.
+ */
+ mutex_lock(&delayed_uprobe_lock);
+ delayed_uprobe_remove(uprobe, NULL);
+ mutex_unlock(&delayed_uprobe_lock);
+
+ /* start srcu -> rcu_tasks_trace -> kfree chain */
+ call_srcu(&uretprobes_srcu, &uprobe->rcu, uprobe_free_srcu);
+}
+
static void put_uprobe(struct uprobe *uprobe)
{
- if (refcount_dec_and_test(&uprobe->ref)) {
+ if (!refcount_dec_and_test(&uprobe->ref))
+ return;
+
+ INIT_WORK(&uprobe->work, uprobe_free_deferred);
+ schedule_work(&uprobe->work);
+}
+
+/* Initialize hprobe as SRCU-protected "leased" uprobe */
+static void hprobe_init_leased(struct hprobe *hprobe, struct uprobe *uprobe, int srcu_idx)
+{
+ WARN_ON(!uprobe);
+ hprobe->state = HPROBE_LEASED;
+ hprobe->uprobe = uprobe;
+ hprobe->srcu_idx = srcu_idx;
+}
+
+/* Initialize hprobe as refcounted ("stable") uprobe (uprobe can be NULL). */
+static void hprobe_init_stable(struct hprobe *hprobe, struct uprobe *uprobe)
+{
+ hprobe->state = uprobe ? HPROBE_STABLE : HPROBE_GONE;
+ hprobe->uprobe = uprobe;
+ hprobe->srcu_idx = -1;
+}
+
+/*
+ * hprobe_consume() fetches hprobe's underlying uprobe and detects whether
+ * uprobe is SRCU protected or is refcounted. hprobe_consume() can be
+ * used only once for a given hprobe.
+ *
+ * Caller has to call hprobe_finalize() and pass previous hprobe_state, so
+ * that hprobe_finalize() can perform SRCU unlock or put uprobe, whichever
+ * is appropriate.
+ */
+static inline struct uprobe *hprobe_consume(struct hprobe *hprobe, enum hprobe_state *hstate)
+{
+ *hstate = xchg(&hprobe->state, HPROBE_CONSUMED);
+ switch (*hstate) {
+ case HPROBE_LEASED:
+ case HPROBE_STABLE:
+ return hprobe->uprobe;
+ case HPROBE_GONE: /* uprobe is NULL, no SRCU */
+ case HPROBE_CONSUMED: /* uprobe was finalized already, do nothing */
+ return NULL;
+ default:
+ WARN(1, "hprobe invalid state %d", *hstate);
+ return NULL;
+ }
+}
+
+/*
+ * Reset hprobe state and, if hprobe was LEASED, release SRCU lock.
+ * hprobe_finalize() can only be used from current context after
+ * hprobe_consume() call (which determines uprobe and hstate value).
+ */
+static void hprobe_finalize(struct hprobe *hprobe, enum hprobe_state hstate)
+{
+ switch (hstate) {
+ case HPROBE_LEASED:
+ __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx);
+ break;
+ case HPROBE_STABLE:
+ put_uprobe(hprobe->uprobe);
+ break;
+ case HPROBE_GONE:
+ case HPROBE_CONSUMED:
+ break;
+ default:
+ WARN(1, "hprobe invalid state %d", hstate);
+ break;
+ }
+}
+
+/*
+ * Attempt to switch (atomically) uprobe from being SRCU protected (LEASED)
+ * to refcounted (STABLE) state. Competes with hprobe_consume(); only one of
+ * them can win the race to perform SRCU unlocking. Whoever wins must perform
+ * SRCU unlock.
+ *
+ * Returns underlying valid uprobe or NULL, if there was no underlying uprobe
+ * to begin with or we failed to bump its refcount and it's going away.
+ *
+ * Returned non-NULL uprobe can be still safely used within an ongoing SRCU
+ * locked region. If `get` is true, it's guaranteed that non-NULL uprobe has
+ * an extra refcount for caller to assume and use. Otherwise, it's not
+ * guaranteed that returned uprobe has a positive refcount, so caller has to
+ * attempt try_get_uprobe(), if it needs to preserve uprobe beyond current
+ * SRCU lock region. See dup_utask().
+ */
+static struct uprobe *hprobe_expire(struct hprobe *hprobe, bool get)
+{
+ enum hprobe_state hstate;
+
+ /*
+ * Caller should guarantee that return_instance is not going to be
+ * freed from under us. This can be achieved either through holding
+ * rcu_read_lock() or by owning return_instance in the first place.
+ *
+ * Underlying uprobe is itself protected from reuse by SRCU, so ensure
+ * SRCU lock is held properly.
+ */
+ lockdep_assert(srcu_read_lock_held(&uretprobes_srcu));
+
+ hstate = READ_ONCE(hprobe->state);
+ switch (hstate) {
+ case HPROBE_STABLE:
+ /* uprobe has positive refcount, bump refcount, if necessary */
+ return get ? get_uprobe(hprobe->uprobe) : hprobe->uprobe;
+ case HPROBE_GONE:
/*
- * If application munmap(exec_vma) before uprobe_unregister()
- * gets called, we don't get a chance to remove uprobe from
- * delayed_uprobe_list from remove_breakpoint(). Do it here.
+ * SRCU was unlocked earlier and we didn't manage to take
+ * uprobe refcnt, so it's effectively NULL
*/
- mutex_lock(&delayed_uprobe_lock);
- delayed_uprobe_remove(uprobe, NULL);
- mutex_unlock(&delayed_uprobe_lock);
- kfree(uprobe);
+ return NULL;
+ case HPROBE_CONSUMED:
+ /*
+ * uprobe was consumed, so it's effectively NULL as far as
+ * uretprobe processing logic is concerned
+ */
+ return NULL;
+ case HPROBE_LEASED: {
+ struct uprobe *uprobe = try_get_uprobe(hprobe->uprobe);
+ /*
+ * Try to switch hprobe state, guarding against
+ * hprobe_consume() or another hprobe_expire() racing with us.
+ * Note, if we failed to get uprobe refcount, we use special
+ * HPROBE_GONE state to signal that hprobe->uprobe shouldn't
+ * be used as it will be freed after SRCU is unlocked.
+ */
+ if (try_cmpxchg(&hprobe->state, &hstate, uprobe ? HPROBE_STABLE : HPROBE_GONE)) {
+ /* We won the race, we are the ones to unlock SRCU */
+ __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx);
+ return get ? get_uprobe(uprobe) : uprobe;
+ }
+
+ /*
+ * We lost the race, undo refcount bump (if it ever happened),
+ * unless caller would like an extra refcount anyways.
+ */
+ if (uprobe && !get)
+ put_uprobe(uprobe);
+ /*
+ * Even if hprobe_consume() or another hprobe_expire() wins
+ * the state update race and unlocks SRCU from under us, we
+ * still have a guarantee that underyling uprobe won't be
+ * freed due to ongoing caller's SRCU lock region, so we can
+ * return it regardless. Also, if `get` was true, we also have
+ * an extra ref for the caller to own. This is used in dup_utask().
+ */
+ return uprobe;
+ }
+ default:
+ WARN(1, "unknown hprobe state %d", hstate);
+ return NULL;
}
}
@@ -654,63 +894,87 @@ static inline int __uprobe_cmp(struct rb_node *a, const struct rb_node *b)
return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b));
}
-static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
+/*
+ * Assumes being inside RCU protected region.
+ * No refcount is taken on returned uprobe.
+ */
+static struct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset)
{
struct __uprobe_key key = {
.inode = inode,
.offset = offset,
};
- struct rb_node *node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key);
+ struct rb_node *node;
+ unsigned int seq;
- if (node)
- return get_uprobe(__node_2_uprobe(node));
+ lockdep_assert(rcu_read_lock_trace_held());
+
+ do {
+ seq = read_seqcount_begin(&uprobes_seqcount);
+ node = rb_find_rcu(&key, &uprobes_tree, __uprobe_cmp_key);
+ /*
+ * Lockless RB-tree lookups can result only in false negatives.
+ * If the element is found, it is correct and can be returned
+ * under RCU protection. If we find nothing, we need to
+ * validate that seqcount didn't change. If it did, we have to
+ * try again as we might have missed the element (false
+ * negative). If seqcount is unchanged, search truly failed.
+ */
+ if (node)
+ return __node_2_uprobe(node);
+ } while (read_seqcount_retry(&uprobes_seqcount, seq));
return NULL;
}
/*
- * Find a uprobe corresponding to a given inode:offset
- * Acquires uprobes_treelock
+ * Attempt to insert a new uprobe into uprobes_tree.
+ *
+ * If uprobe already exists (for given inode+offset), we just increment
+ * refcount of previously existing uprobe.
+ *
+ * If not, a provided new instance of uprobe is inserted into the tree (with
+ * assumed initial refcount == 1).
+ *
+ * In any case, we return a uprobe instance that ends up being in uprobes_tree.
+ * Caller has to clean up new uprobe instance, if it ended up not being
+ * inserted into the tree.
+ *
+ * We assume that uprobes_treelock is held for writing.
*/
-static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
-{
- struct uprobe *uprobe;
-
- spin_lock(&uprobes_treelock);
- uprobe = __find_uprobe(inode, offset);
- spin_unlock(&uprobes_treelock);
-
- return uprobe;
-}
-
static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
{
struct rb_node *node;
+again:
+ node = rb_find_add_rcu(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp);
+ if (node) {
+ struct uprobe *u = __node_2_uprobe(node);
+
+ if (!try_get_uprobe(u)) {
+ rb_erase(node, &uprobes_tree);
+ RB_CLEAR_NODE(&u->rb_node);
+ goto again;
+ }
- node = rb_find_add(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp);
- if (node)
- return get_uprobe(__node_2_uprobe(node));
+ return u;
+ }
- /* get access + creation ref */
- refcount_set(&uprobe->ref, 2);
- return NULL;
+ return uprobe;
}
/*
- * Acquire uprobes_treelock.
- * Matching uprobe already exists in rbtree;
- * increment (access refcount) and return the matching uprobe.
- *
- * No matching uprobe; insert the uprobe in rb_tree;
- * get a double refcount (access + creation) and return NULL.
+ * Acquire uprobes_treelock and insert uprobe into uprobes_tree
+ * (or reuse existing one, see __insert_uprobe() comments above).
*/
static struct uprobe *insert_uprobe(struct uprobe *uprobe)
{
struct uprobe *u;
- spin_lock(&uprobes_treelock);
+ write_lock(&uprobes_treelock);
+ write_seqcount_begin(&uprobes_seqcount);
u = __insert_uprobe(uprobe);
- spin_unlock(&uprobes_treelock);
+ write_seqcount_end(&uprobes_seqcount);
+ write_unlock(&uprobes_treelock);
return u;
}
@@ -732,18 +996,21 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL);
if (!uprobe)
- return NULL;
+ return ERR_PTR(-ENOMEM);
uprobe->inode = inode;
uprobe->offset = offset;
uprobe->ref_ctr_offset = ref_ctr_offset;
+ INIT_LIST_HEAD(&uprobe->consumers);
init_rwsem(&uprobe->register_rwsem);
init_rwsem(&uprobe->consumer_rwsem);
+ RB_CLEAR_NODE(&uprobe->rb_node);
+ refcount_set(&uprobe->ref, 1);
/* add to uprobes_tree, sorted on inode:offset */
cur_uprobe = insert_uprobe(uprobe);
/* a uprobe exists for this inode:offset combination */
- if (cur_uprobe) {
+ if (cur_uprobe != uprobe) {
if (cur_uprobe->ref_ctr_offset != uprobe->ref_ctr_offset) {
ref_ctr_mismatch_warn(cur_uprobe, uprobe);
put_uprobe(cur_uprobe);
@@ -759,33 +1026,23 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
+ static atomic64_t id;
+
down_write(&uprobe->consumer_rwsem);
- uc->next = uprobe->consumers;
- uprobe->consumers = uc;
+ list_add_rcu(&uc->cons_node, &uprobe->consumers);
+ uc->id = (__u64) atomic64_inc_return(&id);
up_write(&uprobe->consumer_rwsem);
}
/*
* For uprobe @uprobe, delete the consumer @uc.
- * Return true if the @uc is deleted successfully
- * or return false.
+ * Should never be called with consumer that's not part of @uprobe->consumers.
*/
-static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
+static void consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
- struct uprobe_consumer **con;
- bool ret = false;
-
down_write(&uprobe->consumer_rwsem);
- for (con = &uprobe->consumers; *con; con = &(*con)->next) {
- if (*con == uc) {
- *con = uc->next;
- ret = true;
- break;
- }
- }
+ list_del_rcu(&uc->cons_node);
up_write(&uprobe->consumer_rwsem);
-
- return ret;
}
static int __copy_insn(struct address_space *mapping, struct file *filp,
@@ -794,17 +1051,17 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
struct page *page;
/*
* Ensure that the page that has the original instruction is populated
- * and in page-cache. If ->readpage == NULL it must be shmem_mapping(),
+ * and in page-cache. If ->read_folio == NULL it must be shmem_mapping(),
* see uprobe_register().
*/
- if (mapping->a_ops->readpage)
+ if (mapping->a_ops->read_folio)
page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp);
else
page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
if (IS_ERR(page))
return PTR_ERR(page);
- copy_from_page(page, offset, insn, nbytes);
+ uprobe_copy_from_page(page, offset, insn, nbytes);
put_page(page);
return 0;
@@ -870,21 +1127,19 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
return ret;
}
-static inline bool consumer_filter(struct uprobe_consumer *uc,
- enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+static inline bool consumer_filter(struct uprobe_consumer *uc, struct mm_struct *mm)
{
- return !uc->filter || uc->filter(uc, ctx, mm);
+ return !uc->filter || uc->filter(uc, mm);
}
-static bool filter_chain(struct uprobe *uprobe,
- enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm)
{
struct uprobe_consumer *uc;
bool ret = false;
down_read(&uprobe->consumer_rwsem);
- for (uc = uprobe->consumers; uc; uc = uc->next) {
- ret = consumer_filter(uc, ctx, mm);
+ list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
+ ret = consumer_filter(uc, mm);
if (ret)
break;
}
@@ -893,10 +1148,10 @@ static bool filter_chain(struct uprobe *uprobe,
return ret;
}
-static int
-install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
- struct vm_area_struct *vma, unsigned long vaddr)
+static int install_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
{
+ struct mm_struct *mm = vma->vm_mm;
bool first_uprobe;
int ret;
@@ -908,45 +1163,26 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
* set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(),
* the task can hit this breakpoint right after __replace_page().
*/
- first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags);
+ first_uprobe = !mm_flags_test(MMF_HAS_UPROBES, mm);
if (first_uprobe)
- set_bit(MMF_HAS_UPROBES, &mm->flags);
+ mm_flags_set(MMF_HAS_UPROBES, mm);
- ret = set_swbp(&uprobe->arch, mm, vaddr);
+ ret = set_swbp(&uprobe->arch, vma, vaddr);
if (!ret)
- clear_bit(MMF_RECALC_UPROBES, &mm->flags);
+ mm_flags_clear(MMF_RECALC_UPROBES, mm);
else if (first_uprobe)
- clear_bit(MMF_HAS_UPROBES, &mm->flags);
+ mm_flags_clear(MMF_HAS_UPROBES, mm);
return ret;
}
-static int
-remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
+static int remove_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
{
- set_bit(MMF_RECALC_UPROBES, &mm->flags);
- return set_orig_insn(&uprobe->arch, mm, vaddr);
-}
-
-static inline bool uprobe_is_active(struct uprobe *uprobe)
-{
- return !RB_EMPTY_NODE(&uprobe->rb_node);
-}
-/*
- * There could be threads that have already hit the breakpoint. They
- * will recheck the current insn and restart if find_uprobe() fails.
- * See find_active_uprobe().
- */
-static void delete_uprobe(struct uprobe *uprobe)
-{
- if (WARN_ON(!uprobe_is_active(uprobe)))
- return;
+ struct mm_struct *mm = vma->vm_mm;
- spin_lock(&uprobes_treelock);
- rb_erase(&uprobe->rb_node, &uprobes_tree);
- spin_unlock(&uprobes_treelock);
- RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
- put_uprobe(uprobe);
+ mm_flags_set(MMF_RECALC_UPROBES, mm);
+ return set_orig_insn(&uprobe->arch, vma, vaddr);
}
struct map_info {
@@ -984,7 +1220,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
* reclaim. This is optimistic, no harm done if it fails.
*/
prev = kmalloc(sizeof(struct map_info),
- GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ GFP_NOWAIT | __GFP_NOMEMALLOC);
if (prev)
prev->next = NULL;
}
@@ -1053,8 +1289,17 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
if (err && is_register)
goto free;
-
+ /*
+ * We take mmap_lock for writing to avoid the race with
+ * find_active_uprobe_rcu() which takes mmap_lock for reading.
+ * Thus this install_breakpoint() can not make
+ * is_trap_at_addr() true right after find_uprobe_rcu()
+ * returns NULL in find_active_uprobe_rcu().
+ */
mmap_write_lock(mm);
+ if (check_stable_address_space(mm))
+ goto unlock;
+
vma = find_vma(mm, info->vaddr);
if (!vma || !valid_vma(vma, is_register) ||
file_inode(vma->vm_file) != uprobe->inode)
@@ -1066,13 +1311,11 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
if (is_register) {
/* consult only the "caller", new consumer. */
- if (consumer_filter(new,
- UPROBE_FILTER_REGISTER, mm))
- err = install_breakpoint(uprobe, mm, vma, info->vaddr);
- } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
- if (!filter_chain(uprobe,
- UPROBE_FILTER_UNREGISTER, mm))
- err |= remove_breakpoint(uprobe, mm, info->vaddr);
+ if (consumer_filter(new, mm))
+ err = install_breakpoint(uprobe, vma, info->vaddr);
+ } else if (mm_flags_test(MMF_HAS_UPROBES, mm)) {
+ if (!filter_chain(uprobe, mm))
+ err |= remove_breakpoint(uprobe, vma, info->vaddr);
}
unlock:
@@ -1086,162 +1329,152 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
return err;
}
-static void
-__uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
+/**
+ * uprobe_unregister_nosync - unregister an already registered probe.
+ * @uprobe: uprobe to remove
+ * @uc: identify which probe if multiple probes are colocated.
+ */
+void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
int err;
- if (WARN_ON(!consumer_del(uprobe, uc)))
- return;
-
+ down_write(&uprobe->register_rwsem);
+ consumer_del(uprobe, uc);
err = register_for_each_vma(uprobe, NULL);
- /* TODO : cant unregister? schedule a worker thread */
- if (!uprobe->consumers && !err)
- delete_uprobe(uprobe);
-}
-
-/*
- * uprobe_unregister - unregister an already registered probe.
- * @inode: the file in which the probe has to be removed.
- * @offset: offset from the start of the file.
- * @uc: identify which probe if multiple probes are colocated.
- */
-void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
-{
- struct uprobe *uprobe;
+ up_write(&uprobe->register_rwsem);
- uprobe = find_uprobe(inode, offset);
- if (WARN_ON(!uprobe))
+ /* TODO : cant unregister? schedule a worker thread */
+ if (unlikely(err)) {
+ uprobe_warn(current, "unregister, leaking uprobe");
return;
+ }
- down_write(&uprobe->register_rwsem);
- __uprobe_unregister(uprobe, uc);
- up_write(&uprobe->register_rwsem);
put_uprobe(uprobe);
}
-EXPORT_SYMBOL_GPL(uprobe_unregister);
+EXPORT_SYMBOL_GPL(uprobe_unregister_nosync);
-/*
- * __uprobe_register - register a probe
+void uprobe_unregister_sync(void)
+{
+ /*
+ * Now that handler_chain() and handle_uretprobe_chain() iterate over
+ * uprobe->consumers list under RCU protection without holding
+ * uprobe->register_rwsem, we need to wait for RCU grace period to
+ * make sure that we can't call into just unregistered
+ * uprobe_consumer's callbacks anymore. If we don't do that, fast and
+ * unlucky enough caller can free consumer's memory and cause
+ * handler_chain() or handle_uretprobe_chain() to do an use-after-free.
+ */
+ synchronize_rcu_tasks_trace();
+ synchronize_srcu(&uretprobes_srcu);
+}
+EXPORT_SYMBOL_GPL(uprobe_unregister_sync);
+
+/**
+ * uprobe_register - register a probe
* @inode: the file in which the probe has to be placed.
* @offset: offset from the start of the file.
+ * @ref_ctr_offset: offset of SDT marker / reference counter
* @uc: information on howto handle the probe..
*
- * Apart from the access refcount, __uprobe_register() takes a creation
+ * Apart from the access refcount, uprobe_register() takes a creation
* refcount (thro alloc_uprobe) if and only if this @uprobe is getting
* inserted into the rbtree (i.e first consumer for a @inode:@offset
* tuple). Creation refcount stops uprobe_unregister from freeing the
* @uprobe even before the register operation is complete. Creation
* refcount is released when the last @uc for the @uprobe
- * unregisters. Caller of __uprobe_register() is required to keep @inode
+ * unregisters. Caller of uprobe_register() is required to keep @inode
* (and the containing mount) referenced.
*
- * Return errno if it cannot successully install probes
- * else return 0 (success)
+ * Return: pointer to the new uprobe on success or an ERR_PTR on failure.
*/
-static int __uprobe_register(struct inode *inode, loff_t offset,
- loff_t ref_ctr_offset, struct uprobe_consumer *uc)
+struct uprobe *uprobe_register(struct inode *inode,
+ loff_t offset, loff_t ref_ctr_offset,
+ struct uprobe_consumer *uc)
{
struct uprobe *uprobe;
int ret;
/* Uprobe must have at least one set consumer */
if (!uc->handler && !uc->ret_handler)
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
/* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
- if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping))
- return -EIO;
+ if (!inode->i_mapping->a_ops->read_folio &&
+ !shmem_mapping(inode->i_mapping))
+ return ERR_PTR(-EIO);
/* Racy, just to catch the obvious mistakes */
if (offset > i_size_read(inode))
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
/*
- * This ensures that copy_from_page(), copy_to_page() and
+ * This ensures that uprobe_copy_from_page(), copy_to_page() and
* __update_ref_ctr() can't cross page boundary.
*/
if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE))
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
if (!IS_ALIGNED(ref_ctr_offset, sizeof(short)))
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
- retry:
uprobe = alloc_uprobe(inode, offset, ref_ctr_offset);
- if (!uprobe)
- return -ENOMEM;
if (IS_ERR(uprobe))
- return PTR_ERR(uprobe);
+ return uprobe;
- /*
- * We can race with uprobe_unregister()->delete_uprobe().
- * Check uprobe_is_active() and retry if it is false.
- */
down_write(&uprobe->register_rwsem);
- ret = -EAGAIN;
- if (likely(uprobe_is_active(uprobe))) {
- consumer_add(uprobe, uc);
- ret = register_for_each_vma(uprobe, uc);
- if (ret)
- __uprobe_unregister(uprobe, uc);
- }
+ consumer_add(uprobe, uc);
+ ret = register_for_each_vma(uprobe, uc);
up_write(&uprobe->register_rwsem);
- put_uprobe(uprobe);
- if (unlikely(ret == -EAGAIN))
- goto retry;
- return ret;
-}
+ if (ret) {
+ uprobe_unregister_nosync(uprobe, uc);
+ /*
+ * Registration might have partially succeeded, so we can have
+ * this consumer being called right at this time. We need to
+ * sync here. It's ok, it's unlikely slow path.
+ */
+ uprobe_unregister_sync();
+ return ERR_PTR(ret);
+ }
-int uprobe_register(struct inode *inode, loff_t offset,
- struct uprobe_consumer *uc)
-{
- return __uprobe_register(inode, offset, 0, uc);
+ return uprobe;
}
EXPORT_SYMBOL_GPL(uprobe_register);
-int uprobe_register_refctr(struct inode *inode, loff_t offset,
- loff_t ref_ctr_offset, struct uprobe_consumer *uc)
-{
- return __uprobe_register(inode, offset, ref_ctr_offset, uc);
-}
-EXPORT_SYMBOL_GPL(uprobe_register_refctr);
-
-/*
- * uprobe_apply - unregister an already registered probe.
- * @inode: the file in which the probe has to be removed.
- * @offset: offset from the start of the file.
+/**
+ * uprobe_apply - add or remove the breakpoints according to @uc->filter
+ * @uprobe: uprobe which "owns" the breakpoint
* @uc: consumer which wants to add more or remove some breakpoints
* @add: add or remove the breakpoints
+ * Return: 0 on success or negative error code.
*/
-int uprobe_apply(struct inode *inode, loff_t offset,
- struct uprobe_consumer *uc, bool add)
+int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add)
{
- struct uprobe *uprobe;
struct uprobe_consumer *con;
int ret = -ENOENT;
- uprobe = find_uprobe(inode, offset);
- if (WARN_ON(!uprobe))
- return ret;
-
down_write(&uprobe->register_rwsem);
- for (con = uprobe->consumers; con && con != uc ; con = con->next)
- ;
- if (con)
- ret = register_for_each_vma(uprobe, add ? uc : NULL);
+
+ rcu_read_lock_trace();
+ list_for_each_entry_rcu(con, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
+ if (con == uc) {
+ ret = register_for_each_vma(uprobe, add ? uc : NULL);
+ break;
+ }
+ }
+ rcu_read_unlock_trace();
+
up_write(&uprobe->register_rwsem);
- put_uprobe(uprobe);
return ret;
}
static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
int err = 0;
- mmap_read_lock(mm);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ mmap_write_lock(mm);
+ for_each_vma(vmi, vma) {
unsigned long vaddr;
loff_t offset;
@@ -1255,9 +1488,9 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
continue;
vaddr = offset_to_vaddr(vma, uprobe->offset);
- err |= remove_breakpoint(uprobe, mm, vaddr);
+ err |= remove_breakpoint(uprobe, vma, vaddr);
}
- mmap_read_unlock(mm);
+ mmap_write_unlock(mm);
return err;
}
@@ -1303,25 +1536,27 @@ static void build_probe_list(struct inode *inode,
min = vaddr_to_offset(vma, start);
max = min + (end - start) - 1;
- spin_lock(&uprobes_treelock);
+ read_lock(&uprobes_treelock);
n = find_node_in_range(inode, min, max);
if (n) {
for (t = n; t; t = rb_prev(t)) {
u = rb_entry(t, struct uprobe, rb_node);
if (u->inode != inode || u->offset < min)
break;
- list_add(&u->pending_list, head);
- get_uprobe(u);
+ /* if uprobe went away, it's safe to ignore it */
+ if (try_get_uprobe(u))
+ list_add(&u->pending_list, head);
}
for (t = n; (t = rb_next(t)); ) {
u = rb_entry(t, struct uprobe, rb_node);
if (u->inode != inode || u->offset > max)
break;
- list_add(&u->pending_list, head);
- get_uprobe(u);
+ /* if uprobe went away, it's safe to ignore it */
+ if (try_get_uprobe(u))
+ list_add(&u->pending_list, head);
}
}
- spin_unlock(&uprobes_treelock);
+ read_unlock(&uprobes_treelock);
}
/* @vma contains reference counter, not the probed instruction. */
@@ -1354,7 +1589,7 @@ static int delayed_ref_ctr_inc(struct vm_area_struct *vma)
}
/*
- * Called from mmap_region/vma_adjust with mm->mmap_lock acquired.
+ * Called from mmap_region/vma_merge with mm->mmap_lock acquired.
*
* Currently we ignore all errors and always return 0, the callers
* can't handle the failure anyway.
@@ -1370,7 +1605,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
if (vma->vm_file &&
(vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE &&
- test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags))
+ mm_flags_test(MMF_HAS_UPROBES, vma->vm_mm))
delayed_ref_ctr_inc(vma);
if (!valid_vma(vma, true))
@@ -1389,9 +1624,9 @@ int uprobe_mmap(struct vm_area_struct *vma)
*/
list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
if (!fatal_signal_pending(current) &&
- filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) {
+ filter_chain(uprobe, vma->vm_mm)) {
unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
- install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
+ install_breakpoint(uprobe, vma, vaddr);
}
put_uprobe(uprobe);
}
@@ -1412,9 +1647,9 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e
min = vaddr_to_offset(vma, start);
max = min + (end - start) - 1;
- spin_lock(&uprobes_treelock);
+ read_lock(&uprobes_treelock);
n = find_node_in_range(inode, min, max);
- spin_unlock(&uprobes_treelock);
+ read_unlock(&uprobes_treelock);
return !!n;
}
@@ -1430,14 +1665,35 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
return;
- if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) ||
- test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags))
+ if (!mm_flags_test(MMF_HAS_UPROBES, vma->vm_mm) ||
+ mm_flags_test(MMF_RECALC_UPROBES, vma->vm_mm))
return;
if (vma_has_uprobes(vma, start, end))
- set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags);
+ mm_flags_set(MMF_RECALC_UPROBES, vma->vm_mm);
}
+static vm_fault_t xol_fault(const struct vm_special_mapping *sm,
+ struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct xol_area *area = vma->vm_mm->uprobes_state.xol_area;
+
+ vmf->page = area->page;
+ get_page(vmf->page);
+ return 0;
+}
+
+static int xol_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma)
+{
+ return -EPERM;
+}
+
+static const struct vm_special_mapping xol_mapping = {
+ .name = "[uprobes]",
+ .fault = xol_fault,
+ .mremap = xol_mremap,
+};
+
/* Slot allocation for XOL */
static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
{
@@ -1463,8 +1719,9 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
}
vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE,
- VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO,
- &area->xol_mapping);
+ VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO|
+ VM_SEALED_SYSMAP,
+ &xol_mapping);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
goto fail;
@@ -1479,13 +1736,22 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
return ret;
}
+void * __weak arch_uretprobe_trampoline(unsigned long *psize)
+{
+ static uprobe_opcode_t insn = UPROBE_SWBP_INSN;
+
+ *psize = UPROBE_SWBP_INSN_SIZE;
+ return &insn;
+}
+
static struct xol_area *__create_xol_area(unsigned long vaddr)
{
struct mm_struct *mm = current->mm;
- uprobe_opcode_t insn = UPROBE_SWBP_INSN;
+ unsigned long insns_size;
struct xol_area *area;
+ void *insns;
- area = kmalloc(sizeof(*area), GFP_KERNEL);
+ area = kzalloc(sizeof(*area), GFP_KERNEL);
if (unlikely(!area))
goto out;
@@ -1494,25 +1760,21 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
if (!area->bitmap)
goto free_area;
- area->xol_mapping.name = "[uprobes]";
- area->xol_mapping.fault = NULL;
- area->xol_mapping.pages = area->pages;
- area->pages[0] = alloc_page(GFP_HIGHUSER);
- if (!area->pages[0])
+ area->page = alloc_page(GFP_HIGHUSER | __GFP_ZERO);
+ if (!area->page)
goto free_bitmap;
- area->pages[1] = NULL;
area->vaddr = vaddr;
init_waitqueue_head(&area->wq);
/* Reserve the 1st slot for get_trampoline_vaddr() */
set_bit(0, area->bitmap);
- atomic_set(&area->slot_count, 1);
- arch_uprobe_copy_ixol(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE);
+ insns = arch_uretprobe_trampoline(&insns_size);
+ arch_uprobe_copy_ixol(area->page, 0, insns, insns_size);
if (!xol_add_vma(mm, area))
return area;
- __free_page(area->pages[0]);
+ __free_page(area->page);
free_bitmap:
kfree(area->bitmap);
free_area:
@@ -1540,6 +1802,14 @@ static struct xol_area *get_xol_area(void)
return area;
}
+void __weak arch_uprobe_clear_state(struct mm_struct *mm)
+{
+}
+
+void __weak arch_uprobe_init_state(struct mm_struct *mm)
+{
+}
+
/*
* uprobe_clear_state - Free the area allocated for slots.
*/
@@ -1551,10 +1821,12 @@ void uprobe_clear_state(struct mm_struct *mm)
delayed_uprobe_remove(NULL, mm);
mutex_unlock(&delayed_uprobe_lock);
+ arch_uprobe_clear_state(mm);
+
if (!area)
return;
- put_page(area->pages[0]);
+ put_page(area->page);
kfree(area->bitmap);
kfree(area);
}
@@ -1571,99 +1843,64 @@ void uprobe_end_dup_mmap(void)
void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
{
- if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) {
- set_bit(MMF_HAS_UPROBES, &newmm->flags);
+ if (mm_flags_test(MMF_HAS_UPROBES, oldmm)) {
+ mm_flags_set(MMF_HAS_UPROBES, newmm);
/* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */
- set_bit(MMF_RECALC_UPROBES, &newmm->flags);
+ mm_flags_set(MMF_RECALC_UPROBES, newmm);
}
}
-/*
- * - search for a free slot.
- */
-static unsigned long xol_take_insn_slot(struct xol_area *area)
+static unsigned long xol_get_slot_nr(struct xol_area *area)
{
- unsigned long slot_addr;
- int slot_nr;
+ unsigned long slot_nr;
- do {
- slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE);
- if (slot_nr < UINSNS_PER_PAGE) {
- if (!test_and_set_bit(slot_nr, area->bitmap))
- break;
-
- slot_nr = UINSNS_PER_PAGE;
- continue;
- }
- wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE));
- } while (slot_nr >= UINSNS_PER_PAGE);
-
- slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES);
- atomic_inc(&area->slot_count);
+ slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE);
+ if (slot_nr < UINSNS_PER_PAGE) {
+ if (!test_and_set_bit(slot_nr, area->bitmap))
+ return slot_nr;
+ }
- return slot_addr;
+ return UINSNS_PER_PAGE;
}
/*
* xol_get_insn_slot - allocate a slot for xol.
- * Returns the allocated slot address or 0.
*/
-static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
+static bool xol_get_insn_slot(struct uprobe *uprobe, struct uprobe_task *utask)
{
- struct xol_area *area;
- unsigned long xol_vaddr;
+ struct xol_area *area = get_xol_area();
+ unsigned long slot_nr;
- area = get_xol_area();
if (!area)
- return 0;
+ return false;
- xol_vaddr = xol_take_insn_slot(area);
- if (unlikely(!xol_vaddr))
- return 0;
+ wait_event(area->wq, (slot_nr = xol_get_slot_nr(area)) < UINSNS_PER_PAGE);
- arch_uprobe_copy_ixol(area->pages[0], xol_vaddr,
+ utask->xol_vaddr = area->vaddr + slot_nr * UPROBE_XOL_SLOT_BYTES;
+ arch_uprobe_copy_ixol(area->page, utask->xol_vaddr,
&uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
-
- return xol_vaddr;
+ return true;
}
/*
- * xol_free_insn_slot - If slot was earlier allocated by
- * @xol_get_insn_slot(), make the slot available for
- * subsequent requests.
+ * xol_free_insn_slot - free the slot allocated by xol_get_insn_slot()
*/
-static void xol_free_insn_slot(struct task_struct *tsk)
+static void xol_free_insn_slot(struct uprobe_task *utask)
{
- struct xol_area *area;
- unsigned long vma_end;
- unsigned long slot_addr;
+ struct xol_area *area = current->mm->uprobes_state.xol_area;
+ unsigned long offset = utask->xol_vaddr - area->vaddr;
+ unsigned int slot_nr;
- if (!tsk->mm || !tsk->mm->uprobes_state.xol_area || !tsk->utask)
+ utask->xol_vaddr = 0;
+ /* xol_vaddr must fit into [area->vaddr, area->vaddr + PAGE_SIZE) */
+ if (WARN_ON_ONCE(offset >= PAGE_SIZE))
return;
- slot_addr = tsk->utask->xol_vaddr;
- if (unlikely(!slot_addr))
- return;
-
- area = tsk->mm->uprobes_state.xol_area;
- vma_end = area->vaddr + PAGE_SIZE;
- if (area->vaddr <= slot_addr && slot_addr < vma_end) {
- unsigned long offset;
- int slot_nr;
-
- offset = slot_addr - area->vaddr;
- slot_nr = offset / UPROBE_XOL_SLOT_BYTES;
- if (slot_nr >= UINSNS_PER_PAGE)
- return;
-
- clear_bit(slot_nr, area->bitmap);
- atomic_dec(&area->slot_count);
- smp_mb__after_atomic(); /* pairs with prepare_to_wait() */
- if (waitqueue_active(&area->wq))
- wake_up(&area->wq);
-
- tsk->utask->xol_vaddr = 0;
- }
+ slot_nr = offset / UPROBE_XOL_SLOT_BYTES;
+ clear_bit(slot_nr, area->bitmap);
+ smp_mb__after_atomic(); /* pairs with prepare_to_wait() */
+ if (waitqueue_active(&area->wq))
+ wake_up(&area->wq);
}
void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
@@ -1702,12 +1939,60 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
return instruction_pointer(regs);
}
-static struct return_instance *free_ret_instance(struct return_instance *ri)
+static void ri_pool_push(struct uprobe_task *utask, struct return_instance *ri)
{
- struct return_instance *next = ri->next;
- put_uprobe(ri->uprobe);
- kfree(ri);
- return next;
+ ri->cons_cnt = 0;
+ ri->next = utask->ri_pool;
+ utask->ri_pool = ri;
+}
+
+static struct return_instance *ri_pool_pop(struct uprobe_task *utask)
+{
+ struct return_instance *ri = utask->ri_pool;
+
+ if (likely(ri))
+ utask->ri_pool = ri->next;
+
+ return ri;
+}
+
+static void ri_free(struct return_instance *ri)
+{
+ kfree(ri->extra_consumers);
+ kfree_rcu(ri, rcu);
+}
+
+static void free_ret_instance(struct uprobe_task *utask,
+ struct return_instance *ri, bool cleanup_hprobe)
+{
+ unsigned seq;
+
+ if (cleanup_hprobe) {
+ enum hprobe_state hstate;
+
+ (void)hprobe_consume(&ri->hprobe, &hstate);
+ hprobe_finalize(&ri->hprobe, hstate);
+ }
+
+ /*
+ * At this point return_instance is unlinked from utask's
+ * return_instances list and this has become visible to ri_timer().
+ * If seqcount now indicates that ri_timer's return instance
+ * processing loop isn't active, we can return ri into the pool of
+ * to-be-reused return instances for future uretprobes. If ri_timer()
+ * happens to be running right now, though, we fallback to safety and
+ * just perform RCU-delated freeing of ri.
+ * Admittedly, this is a rather simple use of seqcount, but it nicely
+ * abstracts away all the necessary memory barriers, so we use
+ * a well-supported kernel primitive here.
+ */
+ if (raw_seqcount_try_begin(&utask->ri_seqcount, seq)) {
+ /* immediate reuse of ri without RCU GP is OK */
+ ri_pool_push(utask, ri);
+ } else {
+ /* we might be racing with ri_timer(), so play it safe */
+ ri_free(ri);
+ }
}
/*
@@ -1717,21 +2002,77 @@ static struct return_instance *free_ret_instance(struct return_instance *ri)
void uprobe_free_utask(struct task_struct *t)
{
struct uprobe_task *utask = t->utask;
- struct return_instance *ri;
+ struct return_instance *ri, *ri_next;
if (!utask)
return;
- if (utask->active_uprobe)
- put_uprobe(utask->active_uprobe);
+ t->utask = NULL;
+ WARN_ON_ONCE(utask->active_uprobe || utask->xol_vaddr);
+
+ timer_delete_sync(&utask->ri_timer);
ri = utask->return_instances;
- while (ri)
- ri = free_ret_instance(ri);
+ while (ri) {
+ ri_next = ri->next;
+ free_ret_instance(utask, ri, true /* cleanup_hprobe */);
+ ri = ri_next;
+ }
+
+ /* free_ret_instance() above might add to ri_pool, so this loop should come last */
+ ri = utask->ri_pool;
+ while (ri) {
+ ri_next = ri->next;
+ ri_free(ri);
+ ri = ri_next;
+ }
- xol_free_insn_slot(t);
kfree(utask);
- t->utask = NULL;
+}
+
+#define RI_TIMER_PERIOD (HZ / 10) /* 100 ms */
+
+#define for_each_ret_instance_rcu(pos, head) \
+ for (pos = rcu_dereference_raw(head); pos; pos = rcu_dereference_raw(pos->next))
+
+static void ri_timer(struct timer_list *timer)
+{
+ struct uprobe_task *utask = container_of(timer, struct uprobe_task, ri_timer);
+ struct return_instance *ri;
+
+ /* SRCU protects uprobe from reuse for the cmpxchg() inside hprobe_expire(). */
+ guard(srcu)(&uretprobes_srcu);
+ /* RCU protects return_instance from freeing. */
+ guard(rcu)();
+
+ /*
+ * See free_ret_instance() for notes on seqcount use.
+ * We also employ raw API variants to avoid lockdep false-positive
+ * warning complaining about enabled preemption. The timer can only be
+ * invoked once for a uprobe_task. Therefore there can only be one
+ * writer. The reader does not require an even sequence count to make
+ * progress, so it is OK to remain preemptible on PREEMPT_RT.
+ */
+ raw_write_seqcount_begin(&utask->ri_seqcount);
+
+ for_each_ret_instance_rcu(ri, utask->return_instances)
+ hprobe_expire(&ri->hprobe, false);
+
+ raw_write_seqcount_end(&utask->ri_seqcount);
+}
+
+static struct uprobe_task *alloc_utask(void)
+{
+ struct uprobe_task *utask;
+
+ utask = kzalloc(sizeof(*utask), GFP_KERNEL);
+ if (!utask)
+ return NULL;
+
+ timer_setup(&utask->ri_timer, ri_timer, 0);
+ seqcount_init(&utask->ri_seqcount);
+
+ return utask;
}
/*
@@ -1745,44 +2086,87 @@ void uprobe_free_utask(struct task_struct *t)
static struct uprobe_task *get_utask(void)
{
if (!current->utask)
- current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+ current->utask = alloc_utask();
return current->utask;
}
+static struct return_instance *alloc_return_instance(struct uprobe_task *utask)
+{
+ struct return_instance *ri;
+
+ ri = ri_pool_pop(utask);
+ if (ri)
+ return ri;
+
+ ri = kzalloc(sizeof(*ri), GFP_KERNEL);
+ if (!ri)
+ return ZERO_SIZE_PTR;
+
+ return ri;
+}
+
+static struct return_instance *dup_return_instance(struct return_instance *old)
+{
+ struct return_instance *ri;
+
+ ri = kmemdup(old, sizeof(*ri), GFP_KERNEL);
+ if (!ri)
+ return NULL;
+
+ if (unlikely(old->cons_cnt > 1)) {
+ ri->extra_consumers = kmemdup(old->extra_consumers,
+ sizeof(ri->extra_consumers[0]) * (old->cons_cnt - 1),
+ GFP_KERNEL);
+ if (!ri->extra_consumers) {
+ kfree(ri);
+ return NULL;
+ }
+ }
+
+ return ri;
+}
+
static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
{
struct uprobe_task *n_utask;
struct return_instance **p, *o, *n;
+ struct uprobe *uprobe;
- n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+ n_utask = alloc_utask();
if (!n_utask)
return -ENOMEM;
t->utask = n_utask;
+ /* protect uprobes from freeing, we'll need try_get_uprobe() them */
+ guard(srcu)(&uretprobes_srcu);
+
p = &n_utask->return_instances;
for (o = o_utask->return_instances; o; o = o->next) {
- n = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
+ n = dup_return_instance(o);
if (!n)
return -ENOMEM;
- *n = *o;
- get_uprobe(n->uprobe);
- n->next = NULL;
+ /* if uprobe is non-NULL, we'll have an extra refcount for uprobe */
+ uprobe = hprobe_expire(&o->hprobe, true);
- *p = n;
+ /*
+ * New utask will have stable properly refcounted uprobe or
+ * NULL. Even if we failed to get refcounted uprobe, we still
+ * need to preserve full set of return_instances for proper
+ * uretprobe handling and nesting in forked task.
+ */
+ hprobe_init_stable(&n->hprobe, uprobe);
+
+ n->next = NULL;
+ rcu_assign_pointer(*p, n);
p = &n->next;
+
n_utask->depth++;
}
return 0;
}
-static void uprobe_warn(struct task_struct *t, const char *msg)
-{
- pr_warn("uprobe: %s:%d failed to %s\n",
- current->comm, current->pid, msg);
-}
-
static void dup_xol_work(struct callback_head *work)
{
if (current->flags & PF_EXITING)
@@ -1796,7 +2180,7 @@ static void dup_xol_work(struct callback_head *work)
/*
* Called in context of a new clone/fork from copy_process.
*/
-void uprobe_copy_process(struct task_struct *t, unsigned long flags)
+void uprobe_copy_process(struct task_struct *t, u64 flags)
{
struct uprobe_task *utask = current->utask;
struct mm_struct *mm = current->mm;
@@ -1832,10 +2216,10 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags)
*
* Returns -1 in case the xol_area is not allocated.
*/
-static unsigned long get_trampoline_vaddr(void)
+unsigned long uprobe_get_trampoline_vaddr(void)
{
+ unsigned long trampoline_vaddr = UPROBE_NO_TRAMPOLINE_VADDR;
struct xol_area *area;
- unsigned long trampoline_vaddr = -1;
/* Pairs with xol_add_vma() smp_store_release() */
area = READ_ONCE(current->mm->uprobes_state.xol_area); /* ^^^ */
@@ -1848,45 +2232,41 @@ static unsigned long get_trampoline_vaddr(void)
static void cleanup_return_instances(struct uprobe_task *utask, bool chained,
struct pt_regs *regs)
{
- struct return_instance *ri = utask->return_instances;
+ struct return_instance *ri = utask->return_instances, *ri_next;
enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL;
while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) {
- ri = free_ret_instance(ri);
+ ri_next = ri->next;
+ rcu_assign_pointer(utask->return_instances, ri_next);
utask->depth--;
+
+ free_ret_instance(utask, ri, true /* cleanup_hprobe */);
+ ri = ri_next;
}
- utask->return_instances = ri;
}
-static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
+static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs,
+ struct return_instance *ri)
{
- struct return_instance *ri;
- struct uprobe_task *utask;
+ struct uprobe_task *utask = current->utask;
unsigned long orig_ret_vaddr, trampoline_vaddr;
bool chained;
+ int srcu_idx;
if (!get_xol_area())
- return;
-
- utask = get_utask();
- if (!utask)
- return;
+ goto free;
if (utask->depth >= MAX_URETPROBE_DEPTH) {
printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to"
" nestedness limit pid/tgid=%d/%d\n",
current->pid, current->tgid);
- return;
+ goto free;
}
- ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
- if (!ri)
- return;
-
- trampoline_vaddr = get_trampoline_vaddr();
+ trampoline_vaddr = uprobe_get_trampoline_vaddr();
orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
if (orig_ret_vaddr == -1)
- goto fail;
+ goto free;
/* drop the entries invalidated by longjmp() */
chained = (orig_ret_vaddr == trampoline_vaddr);
@@ -1904,54 +2284,60 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
* attack from user-space.
*/
uprobe_warn(current, "handle tail call");
- goto fail;
+ goto free;
}
orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
}
- ri->uprobe = get_uprobe(uprobe);
+ /* __srcu_read_lock() because SRCU lock survives switch to user space */
+ srcu_idx = __srcu_read_lock(&uretprobes_srcu);
+
ri->func = instruction_pointer(regs);
ri->stack = user_stack_pointer(regs);
ri->orig_ret_vaddr = orig_ret_vaddr;
ri->chained = chained;
utask->depth++;
+
+ hprobe_init_leased(&ri->hprobe, uprobe, srcu_idx);
ri->next = utask->return_instances;
- utask->return_instances = ri;
+ rcu_assign_pointer(utask->return_instances, ri);
+
+ mod_timer(&utask->ri_timer, jiffies + RI_TIMER_PERIOD);
return;
- fail:
- kfree(ri);
+free:
+ ri_free(ri);
}
/* Prepare to single-step probed instruction out of line. */
static int
pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
{
- struct uprobe_task *utask;
- unsigned long xol_vaddr;
+ struct uprobe_task *utask = current->utask;
int err;
- utask = get_utask();
- if (!utask)
- return -ENOMEM;
+ if (!try_get_uprobe(uprobe))
+ return -EINVAL;
- xol_vaddr = xol_get_insn_slot(uprobe);
- if (!xol_vaddr)
- return -ENOMEM;
+ if (!xol_get_insn_slot(uprobe, utask)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
- utask->xol_vaddr = xol_vaddr;
utask->vaddr = bp_vaddr;
-
err = arch_uprobe_pre_xol(&uprobe->arch, regs);
if (unlikely(err)) {
- xol_free_insn_slot(current);
- return err;
+ xol_free_insn_slot(utask);
+ goto err_out;
}
utask->active_uprobe = uprobe;
utask->state = UTASK_SSTEP;
return 0;
+err_out:
+ put_uprobe(uprobe);
+ return err;
}
/*
@@ -1974,9 +2360,8 @@ bool uprobe_deny_signal(void)
WARN_ON_ONCE(utask->state != UTASK_SSTEP);
if (task_sigpending(t)) {
- spin_lock_irq(&t->sighand->siglock);
+ utask->signal_denied = true;
clear_tsk_thread_flag(t, TIF_SIGPENDING);
- spin_unlock_irq(&t->sighand->siglock);
if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) {
utask->state = UTASK_SSTEP_TRAPPED;
@@ -1989,9 +2374,10 @@ bool uprobe_deny_signal(void)
static void mmf_recalc_uprobes(struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (!valid_vma(vma, false))
continue;
/*
@@ -2004,7 +2390,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm)
return;
}
- clear_bit(MMF_HAS_UPROBES, &mm->flags);
+ mm_flags_clear(MMF_HAS_UPROBES, mm);
}
static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
@@ -2023,38 +2409,77 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
if (likely(result == 0))
goto out;
- /*
- * The NULL 'tsk' here ensures that any faults that occur here
- * will not be accounted to the task. 'mm' *is* current->mm,
- * but we treat this as a 'remote' access since it is
- * essentially a kernel access to the memory.
- */
- result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page,
- NULL, NULL);
+ result = get_user_pages(vaddr, 1, FOLL_FORCE, &page);
if (result < 0)
return result;
- copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+ uprobe_copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
put_page(page);
out:
/* This needs to return true for any variant of the trap insn */
return is_trap_insn(&opcode);
}
-static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
+static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
+{
+ struct mm_struct *mm = current->mm;
+ struct uprobe *uprobe = NULL;
+ struct vm_area_struct *vma;
+ struct file *vm_file;
+ loff_t offset;
+ unsigned int seq;
+
+ guard(rcu)();
+
+ if (!mmap_lock_speculate_try_begin(mm, &seq))
+ return NULL;
+
+ vma = vma_lookup(mm, bp_vaddr);
+ if (!vma)
+ return NULL;
+
+ /*
+ * vm_file memory can be reused for another instance of struct file,
+ * but can't be freed from under us, so it's safe to read fields from
+ * it, even if the values are some garbage values; ultimately
+ * find_uprobe_rcu() + mmap_lock_speculation_end() check will ensure
+ * that whatever we speculatively found is correct
+ */
+ vm_file = READ_ONCE(vma->vm_file);
+ if (!vm_file)
+ return NULL;
+
+ offset = (loff_t)(vma->vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vma->vm_start);
+ uprobe = find_uprobe_rcu(vm_file->f_inode, offset);
+ if (!uprobe)
+ return NULL;
+
+ /* now double check that nothing about MM changed */
+ if (mmap_lock_speculate_retry(mm, seq))
+ return NULL;
+
+ return uprobe;
+}
+
+/* assumes being inside RCU protected region */
+static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp)
{
struct mm_struct *mm = current->mm;
struct uprobe *uprobe = NULL;
struct vm_area_struct *vma;
+ uprobe = find_active_uprobe_speculative(bp_vaddr);
+ if (uprobe)
+ return uprobe;
+
mmap_read_lock(mm);
vma = vma_lookup(mm, bp_vaddr);
if (vma) {
- if (valid_vma(vma, false)) {
+ if (vma->vm_file) {
struct inode *inode = file_inode(vma->vm_file);
loff_t offset = vaddr_to_offset(vma, bp_vaddr);
- uprobe = find_uprobe(inode, offset);
+ uprobe = find_uprobe_rcu(inode, offset);
}
if (!uprobe)
@@ -2063,57 +2488,132 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
*is_swbp = -EFAULT;
}
- if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags))
+ if (!uprobe && mm_flags_test_and_clear(MMF_RECALC_UPROBES, mm))
mmf_recalc_uprobes(mm);
mmap_read_unlock(mm);
return uprobe;
}
+static struct return_instance *push_consumer(struct return_instance *ri, __u64 id, __u64 cookie)
+{
+ struct return_consumer *ric;
+
+ if (unlikely(ri == ZERO_SIZE_PTR))
+ return ri;
+
+ if (unlikely(ri->cons_cnt > 0)) {
+ ric = krealloc(ri->extra_consumers, sizeof(*ric) * ri->cons_cnt, GFP_KERNEL);
+ if (!ric) {
+ ri_free(ri);
+ return ZERO_SIZE_PTR;
+ }
+ ri->extra_consumers = ric;
+ }
+
+ ric = likely(ri->cons_cnt == 0) ? &ri->consumer : &ri->extra_consumers[ri->cons_cnt - 1];
+ ric->id = id;
+ ric->cookie = cookie;
+
+ ri->cons_cnt++;
+ return ri;
+}
+
+static struct return_consumer *
+return_consumer_find(struct return_instance *ri, int *iter, int id)
+{
+ struct return_consumer *ric;
+ int idx;
+
+ for (idx = *iter; idx < ri->cons_cnt; idx++)
+ {
+ ric = likely(idx == 0) ? &ri->consumer : &ri->extra_consumers[idx - 1];
+ if (ric->id == id) {
+ *iter = idx + 1;
+ return ric;
+ }
+ }
+
+ return NULL;
+}
+
+static bool ignore_ret_handler(int rc)
+{
+ return rc == UPROBE_HANDLER_REMOVE || rc == UPROBE_HANDLER_IGNORE;
+}
+
static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
{
struct uprobe_consumer *uc;
- int remove = UPROBE_HANDLER_REMOVE;
- bool need_prep = false; /* prepare return uprobe, when needed */
+ bool has_consumers = false, remove = true;
+ struct return_instance *ri = NULL;
+ struct uprobe_task *utask = current->utask;
+
+ utask->auprobe = &uprobe->arch;
- down_read(&uprobe->register_rwsem);
- for (uc = uprobe->consumers; uc; uc = uc->next) {
+ list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
+ bool session = uc->handler && uc->ret_handler;
+ __u64 cookie = 0;
int rc = 0;
if (uc->handler) {
- rc = uc->handler(uc, regs);
- WARN(rc & ~UPROBE_HANDLER_MASK,
+ rc = uc->handler(uc, regs, &cookie);
+ WARN(rc < 0 || rc > 2,
"bad rc=0x%x from %ps()\n", rc, uc->handler);
}
- if (uc->ret_handler)
- need_prep = true;
+ remove &= rc == UPROBE_HANDLER_REMOVE;
+ has_consumers = true;
+
+ if (!uc->ret_handler || ignore_ret_handler(rc))
+ continue;
- remove &= rc;
+ if (!ri)
+ ri = alloc_return_instance(utask);
+
+ if (session)
+ ri = push_consumer(ri, uc->id, cookie);
}
+ utask->auprobe = NULL;
+
+ if (!ZERO_OR_NULL_PTR(ri))
+ prepare_uretprobe(uprobe, regs, ri);
- if (need_prep && !remove)
- prepare_uretprobe(uprobe, regs); /* put bp at return */
+ if (remove && has_consumers) {
+ down_read(&uprobe->register_rwsem);
+
+ /* re-check that removal is still required, this time under lock */
+ if (!filter_chain(uprobe, current->mm)) {
+ WARN_ON(!uprobe_is_active(uprobe));
+ unapply_uprobe(uprobe, current->mm);
+ }
- if (remove && uprobe->consumers) {
- WARN_ON(!uprobe_is_active(uprobe));
- unapply_uprobe(uprobe, current->mm);
+ up_read(&uprobe->register_rwsem);
}
- up_read(&uprobe->register_rwsem);
}
static void
-handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
+handle_uretprobe_chain(struct return_instance *ri, struct uprobe *uprobe, struct pt_regs *regs)
{
- struct uprobe *uprobe = ri->uprobe;
+ struct return_consumer *ric;
struct uprobe_consumer *uc;
+ int ric_idx = 0;
- down_read(&uprobe->register_rwsem);
- for (uc = uprobe->consumers; uc; uc = uc->next) {
- if (uc->ret_handler)
- uc->ret_handler(uc, ri->func, regs);
+ /* all consumers unsubscribed meanwhile */
+ if (unlikely(!uprobe))
+ return;
+
+ rcu_read_lock_trace();
+ list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
+ bool session = uc->handler && uc->ret_handler;
+
+ if (uc->ret_handler) {
+ ric = return_consumer_find(ri, &ric_idx, uc->id);
+ if (!session || ric)
+ uc->ret_handler(uc, ri->func, regs, ric ? &ric->cookie : NULL);
+ }
}
- up_read(&uprobe->register_rwsem);
+ rcu_read_unlock_trace();
}
static struct return_instance *find_next_ret_chain(struct return_instance *ri)
@@ -2128,10 +2628,12 @@ static struct return_instance *find_next_ret_chain(struct return_instance *ri)
return ri;
}
-static void handle_trampoline(struct pt_regs *regs)
+void uprobe_handle_trampoline(struct pt_regs *regs)
{
struct uprobe_task *utask;
- struct return_instance *ri, *next;
+ struct return_instance *ri, *ri_next, *next_chain;
+ struct uprobe *uprobe;
+ enum hprobe_state hstate;
bool valid;
utask = current->utask;
@@ -2149,25 +2651,39 @@ static void handle_trampoline(struct pt_regs *regs)
* or NULL; the latter case means that nobody but ri->func
* could hit this trampoline on return. TODO: sigaltstack().
*/
- next = find_next_ret_chain(ri);
- valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs);
+ next_chain = find_next_ret_chain(ri);
+ valid = !next_chain || arch_uretprobe_is_alive(next_chain, RP_CHECK_RET, regs);
instruction_pointer_set(regs, ri->orig_ret_vaddr);
do {
- if (valid)
- handle_uretprobe_chain(ri, regs);
- ri = free_ret_instance(ri);
+ /* pop current instance from the stack of pending return instances,
+ * as it's not pending anymore: we just fixed up original
+ * instruction pointer in regs and are about to call handlers;
+ * this allows fixup_uretprobe_trampoline_entries() to properly fix up
+ * captured stack traces from uretprobe handlers, in which pending
+ * trampoline addresses on the stack are replaced with correct
+ * original return addresses
+ */
+ ri_next = ri->next;
+ rcu_assign_pointer(utask->return_instances, ri_next);
utask->depth--;
- } while (ri != next);
+
+ uprobe = hprobe_consume(&ri->hprobe, &hstate);
+ if (valid)
+ handle_uretprobe_chain(ri, uprobe, regs);
+ hprobe_finalize(&ri->hprobe, hstate);
+
+ /* We already took care of hprobe, no need to waste more time on that. */
+ free_ret_instance(utask, ri, false /* !cleanup_hprobe */);
+ ri = ri_next;
+ } while (ri != next_chain);
} while (!valid);
- utask->return_instances = ri;
return;
- sigill:
+sigill:
uprobe_warn(current, "handle uretprobe, sending SIGILL.");
force_sig(SIGILL);
-
}
bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
@@ -2181,6 +2697,10 @@ bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check c
return true;
}
+void __weak arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
+{
+}
+
/*
* Run handler and ask thread to singlestep.
* Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
@@ -2192,10 +2712,12 @@ static void handle_swbp(struct pt_regs *regs)
int is_swbp;
bp_vaddr = uprobe_get_swbp_addr(regs);
- if (bp_vaddr == get_trampoline_vaddr())
- return handle_trampoline(regs);
+ if (bp_vaddr == uprobe_get_trampoline_vaddr())
+ return uprobe_handle_trampoline(regs);
+
+ rcu_read_lock_trace();
- uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
+ uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
if (!uprobe) {
if (is_swbp > 0) {
/* No matching uprobe; signal SIGTRAP. */
@@ -2211,7 +2733,7 @@ static void handle_swbp(struct pt_regs *regs)
*/
instruction_pointer_set(regs, bp_vaddr);
}
- return;
+ goto out;
}
/* change it in advance for ->handler() and restart */
@@ -2243,15 +2765,42 @@ static void handle_swbp(struct pt_regs *regs)
handler_chain(uprobe, regs);
+ /* Try to optimize after first hit. */
+ arch_uprobe_optimize(&uprobe->arch, bp_vaddr);
+
+ /*
+ * If user decided to take execution elsewhere, it makes little sense
+ * to execute the original instruction, so let's skip it.
+ */
+ if (instruction_pointer(regs) != bp_vaddr)
+ goto out;
+
if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
goto out;
- if (!pre_ssout(uprobe, regs, bp_vaddr))
- return;
+ if (pre_ssout(uprobe, regs, bp_vaddr))
+ goto out;
- /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
out:
- put_uprobe(uprobe);
+ /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
+ rcu_read_unlock_trace();
+}
+
+void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr)
+{
+ struct uprobe *uprobe;
+ int is_swbp;
+
+ guard(rcu_tasks_trace)();
+
+ uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
+ if (!uprobe)
+ return;
+ if (!get_utask())
+ return;
+ if (arch_uprobe_ignore(&uprobe->arch, regs))
+ return;
+ handler_chain(uprobe, regs);
}
/*
@@ -2274,11 +2823,12 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
put_uprobe(uprobe);
utask->active_uprobe = NULL;
utask->state = UTASK_RUNNING;
- xol_free_insn_slot(current);
+ xol_free_insn_slot(utask);
- spin_lock_irq(&current->sighand->siglock);
- recalc_sigpending(); /* see uprobe_deny_signal() */
- spin_unlock_irq(&current->sighand->siglock);
+ if (utask->signal_denied) {
+ set_thread_flag(TIF_SIGPENDING);
+ utask->signal_denied = false;
+ }
if (unlikely(err)) {
uprobe_warn(current, "execute the probed insn, sending SIGILL.");
@@ -2319,7 +2869,7 @@ int uprobe_pre_sstep_notifier(struct pt_regs *regs)
if (!current->mm)
return 0;
- if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) &&
+ if (!mm_flags_test(MMF_HAS_UPROBES, current->mm) &&
(!current->utask || !current->utask->return_instances))
return 0;
diff --git a/kernel/exit.c b/kernel/exit.c
index b00a25bb4ab9..8a87021211ae 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -25,7 +25,6 @@
#include <linux/acct.h>
#include <linux/tsacct_kern.h>
#include <linux/file.h>
-#include <linux/fdtable.h>
#include <linux/freezer.h>
#include <linux/binfmts.h>
#include <linux/nsproxy.h>
@@ -49,7 +48,8 @@
#include <linux/audit.h> /* for audit_free() */
#include <linux/resource.h>
#include <linux/task_io_accounting_ops.h>
-#include <linux/tracehook.h>
+#include <linux/blkdev.h>
+#include <linux/task_work.h>
#include <linux/fs_struct.h>
#include <linux/init_task.h>
#include <linux/perf_event.h>
@@ -59,37 +59,104 @@
#include <linux/writeback.h>
#include <linux/shm.h>
#include <linux/kcov.h>
+#include <linux/kmsan.h>
#include <linux/random.h>
#include <linux/rcuwait.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/kprobes.h>
-
+#include <linux/rethook.h>
+#include <linux/sysfs.h>
+#include <linux/user_events.h>
+#include <linux/unwind_deferred.h>
#include <linux/uaccess.h>
+#include <linux/pidfs.h>
+
+#include <uapi/linux/wait.h>
+
#include <asm/unistd.h>
#include <asm/mmu_context.h>
-static void __unhash_process(struct task_struct *p, bool group_dead)
+#include "exit.h"
+
+/*
+ * The default value should be high enough to not crash a system that randomly
+ * crashes its kernel from time to time, but low enough to at least not permit
+ * overflowing 32-bit refcounts or the ldsem writer count.
+ */
+static unsigned int oops_limit = 10000;
+
+#ifdef CONFIG_SYSCTL
+static const struct ctl_table kern_exit_table[] = {
+ {
+ .procname = "oops_limit",
+ .data = &oops_limit,
+ .maxlen = sizeof(oops_limit),
+ .mode = 0644,
+ .proc_handler = proc_douintvec,
+ },
+};
+
+static __init int kernel_exit_sysctls_init(void)
{
+ register_sysctl_init("kernel", kern_exit_table);
+ return 0;
+}
+late_initcall(kernel_exit_sysctls_init);
+#endif
+
+static atomic_t oops_count = ATOMIC_INIT(0);
+
+#ifdef CONFIG_SYSFS
+static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *page)
+{
+ return sysfs_emit(page, "%d\n", atomic_read(&oops_count));
+}
+
+static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count);
+
+static __init int kernel_exit_sysfs_init(void)
+{
+ sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL);
+ return 0;
+}
+late_initcall(kernel_exit_sysfs_init);
+#endif
+
+/*
+ * For things release_task() would like to do *after* tasklist_lock is released.
+ */
+struct release_task_post {
+ struct pid *pids[PIDTYPE_MAX];
+};
+
+static void __unhash_process(struct release_task_post *post, struct task_struct *p,
+ bool group_dead)
+{
+ struct pid *pid = task_pid(p);
+
nr_threads--;
- detach_pid(p, PIDTYPE_PID);
+
+ detach_pid(post->pids, p, PIDTYPE_PID);
+ wake_up_all(&pid->wait_pidfd);
+
if (group_dead) {
- detach_pid(p, PIDTYPE_TGID);
- detach_pid(p, PIDTYPE_PGID);
- detach_pid(p, PIDTYPE_SID);
+ detach_pid(post->pids, p, PIDTYPE_TGID);
+ detach_pid(post->pids, p, PIDTYPE_PGID);
+ detach_pid(post->pids, p, PIDTYPE_SID);
list_del_rcu(&p->tasks);
list_del_init(&p->sibling);
__this_cpu_dec(process_counts);
}
- list_del_rcu(&p->thread_group);
list_del_rcu(&p->thread_node);
}
/*
* This function expects the tasklist_lock write-locked.
*/
-static void __exit_signal(struct task_struct *tsk)
+static void __exit_signal(struct release_task_post *post, struct task_struct *tsk)
{
struct signal_struct *sig = tsk->signal;
bool group_dead = thread_group_leader(tsk);
@@ -122,9 +189,6 @@ static void __exit_signal(struct task_struct *tsk)
sig->curr_target = next_thread(tsk);
}
- add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
- sizeof(unsigned long long));
-
/*
* Accumulate here the counters for all threads as they die. We could
* skip the group leader because it is the last user of signal_struct,
@@ -145,23 +209,15 @@ static void __exit_signal(struct task_struct *tsk)
task_io_accounting_add(&sig->ioac, &tsk->ioac);
sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
sig->nr_threads--;
- __unhash_process(tsk, group_dead);
+ __unhash_process(post, tsk, group_dead);
write_sequnlock(&sig->stats_lock);
- /*
- * Do this under ->siglock, we can race with another thread
- * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
- */
- flush_sigqueue(&tsk->pending);
tsk->sighand = NULL;
spin_unlock(&sighand->siglock);
__cleanup_sighand(sighand);
- clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
- if (group_dead) {
- flush_sigqueue(&sig->shared_pending);
+ if (group_dead)
tty_kref_put(tty);
- }
}
static void delayed_put_task_struct(struct rcu_head *rhp)
@@ -169,6 +225,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
kprobe_flush_task(tsk);
+ rethook_flush_task(tsk);
perf_event_delayed_put(tsk);
trace_sched_process_free(tsk);
put_task_struct(tsk);
@@ -180,24 +237,32 @@ void put_task_struct_rcu_user(struct task_struct *task)
call_rcu(&task->rcu, delayed_put_task_struct);
}
+void __weak release_thread(struct task_struct *dead_task)
+{
+}
+
void release_task(struct task_struct *p)
{
+ struct release_task_post post;
struct task_struct *leader;
struct pid *thread_pid;
int zap_leader;
repeat:
+ memset(&post, 0, sizeof(post));
+
/* don't need to get the RCU readlock here - the process is dead and
- * can't be modifying its own credentials. But shut RCU-lockdep up */
- rcu_read_lock();
+ * can't be modifying its own credentials. */
dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
- rcu_read_unlock();
- cgroup_release(p);
+ pidfs_exit(p);
+ cgroup_task_release(p);
+
+ /* Retrieve @thread_pid before __unhash_process() may set it to NULL. */
+ thread_pid = task_pid(p);
write_lock_irq(&tasklist_lock);
ptrace_release_task(p);
- thread_pid = get_pid(p->thread_pid);
- __exit_signal(p);
+ __exit_signal(&post, p);
/*
* If we are the last non-leader member of the thread
@@ -208,6 +273,9 @@ repeat:
leader = p->group_leader;
if (leader != p && thread_group_empty(leader)
&& leader->exit_state == EXIT_ZOMBIE) {
+ /* for pidfs_exit() and do_notify_parent() */
+ if (leader->signal->flags & SIGNAL_GROUP_EXIT)
+ leader->exit_code = leader->signal->group_exit_code;
/*
* If we were the last child thread and the leader has
* exited already, and the leader's parent ignores SIGCHLD,
@@ -219,10 +287,23 @@ repeat:
}
write_unlock_irq(&tasklist_lock);
- seccomp_filter_release(p);
+ /* @thread_pid can't go away until free_pids() below */
proc_flush_pid(thread_pid);
- put_pid(thread_pid);
+ exit_cred_namespaces(p);
+ add_device_randomness(&p->se.sum_exec_runtime,
+ sizeof(p->se.sum_exec_runtime));
+ free_pids(post.pids);
release_thread(p);
+ /*
+ * This task was already removed from the process/thread/pid lists
+ * and lock_task_sighand(p) can't succeed. Nobody else can touch
+ * ->pending or, if group dead, signal->shared_pending. We can call
+ * flush_sigqueue() lockless.
+ */
+ flush_sigqueue(&p->pending);
+ if (thread_group_leader(p))
+ flush_sigqueue(&p->signal->shared_pending);
+
put_task_struct_rcu_user(p);
p = leader;
@@ -340,55 +421,73 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
}
}
-static void coredump_task_exit(struct task_struct *tsk)
+static void coredump_task_exit(struct task_struct *tsk,
+ struct core_state *core_state)
{
- struct core_state *core_state;
+ struct core_thread self;
+ self.task = tsk;
+ if (self.task->flags & PF_SIGNALED)
+ self.next = xchg(&core_state->dumper.next, &self);
+ else
+ self.task = NULL;
/*
- * Serialize with any possible pending coredump.
- * We must hold siglock around checking core_state
- * and setting PF_POSTCOREDUMP. The core-inducing thread
- * will increment ->nr_threads for each thread in the
- * group without PF_POSTCOREDUMP set.
+ * Implies mb(), the result of xchg() must be visible
+ * to core_state->dumper.
*/
- spin_lock_irq(&tsk->sighand->siglock);
- tsk->flags |= PF_POSTCOREDUMP;
- core_state = tsk->signal->core_state;
- spin_unlock_irq(&tsk->sighand->siglock);
- if (core_state) {
- struct core_thread self;
-
- self.task = current;
- if (self.task->flags & PF_SIGNALED)
- self.next = xchg(&core_state->dumper.next, &self);
- else
- self.task = NULL;
- /*
- * Implies mb(), the result of xchg() must be visible
- * to core_state->dumper.
- */
- if (atomic_dec_and_test(&core_state->nr_threads))
- complete(&core_state->startup);
-
- for (;;) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- if (!self.task) /* see coredump_finish() */
- break;
- freezable_schedule();
- }
- __set_current_state(TASK_RUNNING);
+ if (atomic_dec_and_test(&core_state->nr_threads))
+ complete(&core_state->startup);
+
+ for (;;) {
+ set_current_state(TASK_IDLE|TASK_FREEZABLE);
+ if (!self.task) /* see coredump_finish() */
+ break;
+ schedule();
}
+ __set_current_state(TASK_RUNNING);
}
#ifdef CONFIG_MEMCG
+/* drops tasklist_lock if succeeds */
+static bool __try_to_set_owner(struct task_struct *tsk, struct mm_struct *mm)
+{
+ bool ret = false;
+
+ task_lock(tsk);
+ if (likely(tsk->mm == mm)) {
+ /* tsk can't pass exit_mm/exec_mmap and exit */
+ read_unlock(&tasklist_lock);
+ WRITE_ONCE(mm->owner, tsk);
+ lru_gen_migrate_mm(mm);
+ ret = true;
+ }
+ task_unlock(tsk);
+ return ret;
+}
+
+static bool try_to_set_owner(struct task_struct *g, struct mm_struct *mm)
+{
+ struct task_struct *t;
+
+ for_each_thread(g, t) {
+ struct mm_struct *t_mm = READ_ONCE(t->mm);
+ if (t_mm == mm) {
+ if (__try_to_set_owner(t, mm))
+ return true;
+ } else if (t_mm)
+ break;
+ }
+
+ return false;
+}
+
/*
* A task is exiting. If it owned this mm, find a new owner for the mm.
*/
void mm_update_next_owner(struct mm_struct *mm)
{
- struct task_struct *c, *g, *p = current;
+ struct task_struct *g, *p = current;
-retry:
/*
* If the exiting or execing task is not the owner, it's
* someone else's problem.
@@ -409,31 +508,27 @@ retry:
/*
* Search in the children
*/
- list_for_each_entry(c, &p->children, sibling) {
- if (c->mm == mm)
- goto assign_new_owner;
+ list_for_each_entry(g, &p->children, sibling) {
+ if (try_to_set_owner(g, mm))
+ goto ret;
}
-
/*
* Search in the siblings
*/
- list_for_each_entry(c, &p->real_parent->children, sibling) {
- if (c->mm == mm)
- goto assign_new_owner;
+ list_for_each_entry(g, &p->real_parent->children, sibling) {
+ if (try_to_set_owner(g, mm))
+ goto ret;
}
-
/*
* Search through everything else, we should not get here often.
*/
for_each_process(g) {
+ if (atomic_read(&mm->mm_users) <= 1)
+ break;
if (g->flags & PF_KTHREAD)
continue;
- for_each_thread(g, c) {
- if (c->mm == mm)
- goto assign_new_owner;
- if (c->mm)
- break;
- }
+ if (try_to_set_owner(g, mm))
+ goto ret;
}
read_unlock(&tasklist_lock);
/*
@@ -442,29 +537,9 @@ retry:
* ptrace or page migration (get_task_mm()). Mark owner as NULL.
*/
WRITE_ONCE(mm->owner, NULL);
+ ret:
return;
-assign_new_owner:
- BUG_ON(c == p);
- get_task_struct(c);
- /*
- * The task_lock protects c->mm from changing.
- * We always want mm->owner->mm == mm
- */
- task_lock(c);
- /*
- * Delay read_unlock() till we have the task_lock()
- * to ensure that c does not slip away underneath us
- */
- read_unlock(&tasklist_lock);
- if (c->mm != mm) {
- task_unlock(c);
- put_task_struct(c);
- goto retry;
- }
- WRITE_ONCE(mm->owner, c);
- task_unlock(c);
- put_task_struct(c);
}
#endif /* CONFIG_MEMCG */
@@ -479,9 +554,8 @@ static void exit_mm(void)
exit_mm_release(current, mm);
if (!mm)
return;
- sync_mm_rss(mm);
mmap_read_lock(mm);
- mmgrab(mm);
+ mmgrab_lazy_tlb(mm);
BUG_ON(mm != current->active_mm);
/* more a memory barrier than a real lock */
task_lock(current);
@@ -618,12 +692,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
}
/*
- * This does two things:
- *
- * A. Make init inherit all the child processes
- * B. Check to see if any process groups have become orphaned
- * as a result of our exiting, and if they have any stopped
- * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
+ * Make init inherit all the child processes
*/
static void forget_original_parent(struct task_struct *father,
struct list_head *dead)
@@ -677,6 +746,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
kill_orphaned_pgrp(tsk->group_leader, NULL);
tsk->exit_state = EXIT_ZOMBIE;
+
if (unlikely(tsk->ptrace)) {
int sig = thread_group_leader(tsk) &&
thread_group_empty(tsk) &&
@@ -688,6 +758,8 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
do_notify_parent(tsk, tsk->exit_signal);
} else {
autoreap = true;
+ /* untraced sub-thread */
+ do_notify_pidfd(tsk);
}
if (autoreap) {
@@ -707,6 +779,67 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
}
#ifdef CONFIG_DEBUG_STACK_USAGE
+#ifdef CONFIG_STACK_GROWSUP
+unsigned long stack_not_used(struct task_struct *p)
+{
+ unsigned long *n = end_of_stack(p);
+
+ do { /* Skip over canary */
+ n--;
+ } while (!*n);
+
+ return (unsigned long)end_of_stack(p) - (unsigned long)n;
+}
+#else /* !CONFIG_STACK_GROWSUP */
+unsigned long stack_not_used(struct task_struct *p)
+{
+ unsigned long *n = end_of_stack(p);
+
+ do { /* Skip over canary */
+ n++;
+ } while (!*n);
+
+ return (unsigned long)n - (unsigned long)end_of_stack(p);
+}
+#endif /* CONFIG_STACK_GROWSUP */
+
+/* Count the maximum pages reached in kernel stacks */
+static inline void kstack_histogram(unsigned long used_stack)
+{
+#ifdef CONFIG_VM_EVENT_COUNTERS
+ if (used_stack <= 1024)
+ count_vm_event(KSTACK_1K);
+#if THREAD_SIZE > 1024
+ else if (used_stack <= 2048)
+ count_vm_event(KSTACK_2K);
+#endif
+#if THREAD_SIZE > 2048
+ else if (used_stack <= 4096)
+ count_vm_event(KSTACK_4K);
+#endif
+#if THREAD_SIZE > 4096
+ else if (used_stack <= 8192)
+ count_vm_event(KSTACK_8K);
+#endif
+#if THREAD_SIZE > 8192
+ else if (used_stack <= 16384)
+ count_vm_event(KSTACK_16K);
+#endif
+#if THREAD_SIZE > 16384
+ else if (used_stack <= 32768)
+ count_vm_event(KSTACK_32K);
+#endif
+#if THREAD_SIZE > 32768
+ else if (used_stack <= 65536)
+ count_vm_event(KSTACK_64K);
+#endif
+#if THREAD_SIZE > 65536
+ else
+ count_vm_event(KSTACK_REST);
+#endif
+#endif /* CONFIG_VM_EVENT_COUNTERS */
+}
+
static void check_stack_usage(void)
{
static DEFINE_SPINLOCK(low_water_lock);
@@ -714,6 +847,7 @@ static void check_stack_usage(void)
unsigned long free;
free = stack_not_used(current);
+ kstack_histogram(THREAD_SIZE - free);
if (free >= lowest_to_date)
return;
@@ -726,44 +860,60 @@ static void check_stack_usage(void)
}
spin_unlock(&low_water_lock);
}
-#else
+#else /* !CONFIG_DEBUG_STACK_USAGE */
static inline void check_stack_usage(void) {}
-#endif
+#endif /* CONFIG_DEBUG_STACK_USAGE */
+
+static void synchronize_group_exit(struct task_struct *tsk, long code)
+{
+ struct sighand_struct *sighand = tsk->sighand;
+ struct signal_struct *signal = tsk->signal;
+ struct core_state *core_state;
+
+ spin_lock_irq(&sighand->siglock);
+ signal->quick_threads--;
+ if ((signal->quick_threads == 0) &&
+ !(signal->flags & SIGNAL_GROUP_EXIT)) {
+ signal->flags = SIGNAL_GROUP_EXIT;
+ signal->group_exit_code = code;
+ signal->group_stop_count = 0;
+ }
+ /*
+ * Serialize with any possible pending coredump.
+ * We must hold siglock around checking core_state
+ * and setting PF_POSTCOREDUMP. The core-inducing thread
+ * will increment ->nr_threads for each thread in the
+ * group without PF_POSTCOREDUMP set.
+ */
+ tsk->flags |= PF_POSTCOREDUMP;
+ core_state = signal->core_state;
+ spin_unlock_irq(&sighand->siglock);
+
+ if (unlikely(core_state))
+ coredump_task_exit(tsk, core_state);
+}
void __noreturn do_exit(long code)
{
struct task_struct *tsk = current;
int group_dead;
- WARN_ON(blk_needs_flush_plug(tsk));
-
- /*
- * If do_dead is called because this processes oopsed, it's possible
- * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
- * continuing. Amongst other possible reasons, this is to prevent
- * mm_release()->clear_child_tid() from writing to a user-controlled
- * kernel address.
- *
- * On uptodate architectures force_uaccess_begin is a noop. On
- * architectures that still have set_fs/get_fs in addition to handling
- * oopses handles kernel threads that run as set_fs(KERNEL_DS) by
- * default.
- */
- force_uaccess_begin();
+ WARN_ON(irqs_disabled());
+ WARN_ON(tsk->plug);
kcov_task_exit(tsk);
+ kmsan_task_exit(tsk);
- coredump_task_exit(tsk);
+ synchronize_group_exit(tsk, code);
ptrace_event(PTRACE_EVENT_EXIT, code);
-
- validate_creds_for_do_exit(tsk);
+ user_events_exit(tsk);
io_uring_files_cancel();
+ sched_mm_cid_exit(tsk);
exit_signals(tsk); /* sets PF_EXITING */
- /* sync mm's RSS info before statistics gathering */
- if (tsk->mm)
- sync_mm_rss(tsk->mm);
+ seccomp_filter_release(tsk);
+
acct_update_integrals(tsk);
group_dead = atomic_dec_and_test(&tsk->signal->live);
if (group_dead) {
@@ -777,7 +927,7 @@ void __noreturn do_exit(long code)
#ifdef CONFIG_POSIX_TIMERS
hrtimer_cancel(&tsk->signal->real_timer);
- exit_itimers(tsk->signal);
+ exit_itimers(tsk);
#endif
if (tsk->mm)
setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
@@ -789,12 +939,27 @@ void __noreturn do_exit(long code)
tsk->exit_code = code;
taskstats_exit(tsk, group_dead);
+ trace_sched_process_exit(tsk, group_dead);
+
+ /*
+ * Since sampling can touch ->mm, make sure to stop everything before we
+ * tear it down.
+ *
+ * Also flushes inherited counters to the parent - before the parent
+ * gets woken up by child-exit notifications.
+ */
+ perf_event_exit_task(tsk);
+ /*
+ * PF_EXITING (above) ensures unwind_deferred_request() will no
+ * longer add new unwinds. While exit_mm() (below) will destroy the
+ * abaility to do unwinds. So flush any pending unwinds here.
+ */
+ unwind_deferred_task_exit(tsk);
exit_mm();
if (group_dead)
acct_process();
- trace_sched_process_exit(tsk);
exit_sem(tsk);
exit_shm(tsk);
@@ -802,20 +967,12 @@ void __noreturn do_exit(long code)
exit_fs(tsk);
if (group_dead)
disassociate_ctty(1);
- exit_task_namespaces(tsk);
+ exit_nsproxy_namespaces(tsk);
exit_task_work(tsk);
exit_thread(tsk);
- /*
- * Flush inherited counters to the parent - before the parent
- * gets woken up by child-exit notifications.
- *
- * because of cgroup mode, must be called before cgroup_exit()
- */
- perf_event_exit_task(tsk);
-
sched_autogroup_exit_task(tsk);
- cgroup_exit(tsk);
+ cgroup_task_exit(tsk);
/*
* FIXME: do that only when needed, using sched_exit tracepoint
@@ -844,7 +1001,7 @@ void __noreturn do_exit(long code)
if (tsk->task_frag.page)
put_page(tsk->task_frag.page);
- validate_creds_for_do_exit(tsk);
+ exit_task_stack_account(tsk);
check_stack_usage();
preempt_disable();
@@ -869,12 +1026,18 @@ void __noreturn make_task_dead(int signr)
* Then do everything else.
*/
struct task_struct *tsk = current;
+ unsigned int limit;
if (unlikely(in_interrupt()))
panic("Aiee, killing interrupt handler!");
if (unlikely(!tsk->pid))
panic("Attempted to kill the idle task!");
+ if (unlikely(irqs_disabled())) {
+ pr_info("note: %s[%d] exited with irqs disabled\n",
+ current->comm, task_pid_nr(current));
+ local_irq_enable();
+ }
if (unlikely(in_atomic())) {
pr_info("note: %s[%d] exited with preempt_count %d\n",
current->comm, task_pid_nr(current),
@@ -883,6 +1046,20 @@ void __noreturn make_task_dead(int signr)
}
/*
+ * Every time the system oopses, if the oops happens while a reference
+ * to an object was held, the reference leaks.
+ * If the oops doesn't also leak memory, repeated oopsing can cause
+ * reference counters to wrap around (if they're not using refcount_t).
+ * This means that repeated oopsing can make unexploitable-looking bugs
+ * exploitable through repeated oopsing.
+ * To make sure this can't happen, place an upper bound on how often the
+ * kernel may oops without panic().
+ */
+ limit = READ_ONCE(oops_limit);
+ if (atomic_inc_return(&oops_count) >= limit && limit)
+ panic("Oopsed too often (kernel.oops_limit is %d)", limit);
+
+ /*
* We're taking recursive faults here in make_task_dead. Safest is to just
* leave this task alone and wait for reboot.
*/
@@ -906,7 +1083,7 @@ SYSCALL_DEFINE1(exit, int, error_code)
* Take down every thread in the group. This is called by fatal signals
* as well as by sys_exit_group (below).
*/
-void
+void __noreturn
do_group_exit(int exit_code)
{
struct signal_struct *sig = current->signal;
@@ -915,7 +1092,7 @@ do_group_exit(int exit_code)
exit_code = sig->group_exit_code;
else if (sig->group_exec_task)
exit_code = 0;
- else if (!thread_group_empty(current)) {
+ else {
struct sighand_struct *const sighand = current->sighand;
spin_lock_irq(&sighand->siglock);
@@ -948,26 +1125,6 @@ SYSCALL_DEFINE1(exit_group, int, error_code)
return 0;
}
-struct waitid_info {
- pid_t pid;
- uid_t uid;
- int status;
- int cause;
-};
-
-struct wait_opts {
- enum pid_type wo_type;
- int wo_flags;
- struct pid *wo_pid;
-
- struct waitid_info *wo_info;
- int wo_stat;
- struct rusage *wo_rusage;
-
- wait_queue_entry_t child_wait;
- int notask_error;
-};
-
static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
{
return wo->wo_type == PIDTYPE_MAX ||
@@ -1061,18 +1218,15 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
* p->signal fields because the whole thread group is dead
* and nobody can change them.
*
- * psig->stats_lock also protects us from our sub-theads
- * which can reap other children at the same time. Until
- * we change k_getrusage()-like users to rely on this lock
- * we have to take ->siglock as well.
+ * psig->stats_lock also protects us from our sub-threads
+ * which can reap other children at the same time.
*
* We use thread_group_cputime_adjusted() to get times for
* the thread group, which consolidates times for all threads
* in the group including the group leader.
*/
thread_group_cputime_adjusted(p, &tgutime, &tgstime);
- spin_lock_irq(&current->sighand->siglock);
- write_seqlock(&psig->stats_lock);
+ write_seqlock_irq(&psig->stats_lock);
psig->cutime += tgutime + sig->cutime;
psig->cstime += tgstime + sig->cstime;
psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
@@ -1095,8 +1249,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
psig->cmaxrss = maxrss;
task_io_accounting_add(&psig->ioac, &p->ioac);
task_io_accounting_add(&psig->ioac, &sig->ioac);
- write_sequnlock(&psig->stats_lock);
- spin_unlock_irq(&current->sighand->siglock);
+ write_sequnlock_irq(&psig->stats_lock);
}
if (wo->wo_rusage)
@@ -1431,6 +1584,17 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
return 0;
}
+bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p)
+{
+ if (!eligible_pid(wo, p))
+ return false;
+
+ if ((wo->wo_flags & __WNOTHREAD) && wo->child_wait.private != p->parent)
+ return false;
+
+ return true;
+}
+
static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
int sync, void *key)
{
@@ -1438,13 +1602,10 @@ static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
child_wait);
struct task_struct *p = key;
- if (!eligible_pid(wo, p))
- return 0;
-
- if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
- return 0;
+ if (pid_child_should_wake(wo, p))
+ return default_wake_function(wait, mode, sync, key);
- return default_wake_function(wait, mode, sync, key);
+ return 0;
}
void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
@@ -1493,16 +1654,10 @@ static int do_wait_pid(struct wait_opts *wo)
return 0;
}
-static long do_wait(struct wait_opts *wo)
+long __do_wait(struct wait_opts *wo)
{
- int retval;
-
- trace_sched_process_wait(wo->wo_pid);
+ long retval;
- init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
- wo->child_wait.private = current;
- add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
-repeat:
/*
* If there is nothing that can match our criteria, just get out.
* We will clear ->notask_error to zero if we see any child that
@@ -1514,24 +1669,23 @@ repeat:
(!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type)))
goto notask;
- set_current_state(TASK_INTERRUPTIBLE);
read_lock(&tasklist_lock);
if (wo->wo_type == PIDTYPE_PID) {
retval = do_wait_pid(wo);
if (retval)
- goto end;
+ return retval;
} else {
struct task_struct *tsk = current;
do {
retval = do_wait_thread(wo, tsk);
if (retval)
- goto end;
+ return retval;
retval = ptrace_do_wait(wo, tsk);
if (retval)
- goto end;
+ return retval;
if (wo->wo_flags & __WNOTHREAD)
break;
@@ -1541,27 +1695,44 @@ repeat:
notask:
retval = wo->notask_error;
- if (!retval && !(wo->wo_flags & WNOHANG)) {
- retval = -ERESTARTSYS;
- if (!signal_pending(current)) {
- schedule();
- goto repeat;
- }
- }
-end:
+ if (!retval && !(wo->wo_flags & WNOHANG))
+ return -ERESTARTSYS;
+
+ return retval;
+}
+
+static long do_wait(struct wait_opts *wo)
+{
+ int retval;
+
+ trace_sched_process_wait(wo->wo_pid);
+
+ init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
+ wo->child_wait.private = current;
+ add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
+
+ do {
+ set_current_state(TASK_INTERRUPTIBLE);
+ retval = __do_wait(wo);
+ if (retval != -ERESTARTSYS)
+ break;
+ if (signal_pending(current))
+ break;
+ schedule();
+ } while (1);
+
__set_current_state(TASK_RUNNING);
remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
return retval;
}
-static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
- int options, struct rusage *ru)
+int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid,
+ struct waitid_info *infop, int options,
+ struct rusage *ru)
{
- struct wait_opts wo;
+ unsigned int f_flags = 0;
struct pid *pid = NULL;
enum pid_type type;
- long ret;
- unsigned int f_flags = 0;
if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
__WNOTHREAD|__WCLONE|__WALL))
@@ -1604,19 +1775,32 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
return -EINVAL;
}
- wo.wo_type = type;
- wo.wo_pid = pid;
- wo.wo_flags = options;
- wo.wo_info = infop;
- wo.wo_rusage = ru;
+ wo->wo_type = type;
+ wo->wo_pid = pid;
+ wo->wo_flags = options;
+ wo->wo_info = infop;
+ wo->wo_rusage = ru;
if (f_flags & O_NONBLOCK)
- wo.wo_flags |= WNOHANG;
+ wo->wo_flags |= WNOHANG;
+
+ return 0;
+}
+
+static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
+ int options, struct rusage *ru)
+{
+ struct wait_opts wo;
+ long ret;
+
+ ret = kernel_waitid_prepare(&wo, which, upid, infop, options, ru);
+ if (ret)
+ return ret;
ret = do_wait(&wo);
- if (!ret && !(options & WNOHANG) && (f_flags & O_NONBLOCK))
+ if (!ret && !(options & WNOHANG) && (wo.wo_flags & WNOHANG))
ret = -EAGAIN;
- put_pid(pid);
+ put_pid(wo.wo_pid);
return ret;
}
@@ -1797,31 +1981,14 @@ Efault:
}
#endif
-/**
- * thread_group_exited - check that a thread group has exited
- * @pid: tgid of thread group to be checked.
- *
- * Test if the thread group represented by tgid has exited (all
- * threads are zombies, dead or completely gone).
+/*
+ * This needs to be __function_aligned as GCC implicitly makes any
+ * implementation of abort() cold and drops alignment specified by
+ * -falign-functions=N.
*
- * Return: true if the thread group has exited. false otherwise.
+ * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88345#c11
*/
-bool thread_group_exited(struct pid *pid)
-{
- struct task_struct *task;
- bool exited;
-
- rcu_read_lock();
- task = pid_task(pid, PIDTYPE_PID);
- exited = !task ||
- (READ_ONCE(task->exit_state) && thread_group_empty(task));
- rcu_read_unlock();
-
- return exited;
-}
-EXPORT_SYMBOL(thread_group_exited);
-
-__weak void abort(void)
+__weak __function_aligned void abort(void)
{
BUG();
diff --git a/kernel/exit.h b/kernel/exit.h
new file mode 100644
index 000000000000..278faa26a653
--- /dev/null
+++ b/kernel/exit.h
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#ifndef LINUX_WAITID_H
+#define LINUX_WAITID_H
+
+struct waitid_info {
+ pid_t pid;
+ uid_t uid;
+ int status;
+ int cause;
+};
+
+struct wait_opts {
+ enum pid_type wo_type;
+ int wo_flags;
+ struct pid *wo_pid;
+
+ struct waitid_info *wo_info;
+ int wo_stat;
+ struct rusage *wo_rusage;
+
+ wait_queue_entry_t child_wait;
+ int notask_error;
+};
+
+bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p);
+long __do_wait(struct wait_opts *wo);
+int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid,
+ struct waitid_info *infop, int options,
+ struct rusage *ru);
+#endif
diff --git a/kernel/extable.c b/kernel/extable.c
index b6f330f0fe74..71f482581cab 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -3,6 +3,7 @@
Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
*/
+#include <linux/elf.h>
#include <linux/ftrace.h>
#include <linux/memory.h>
#include <linux/extable.h>
@@ -113,7 +114,7 @@ int kernel_text_address(unsigned long addr)
/* Treat this like an NMI as it can happen anywhere */
if (no_rcu)
- rcu_nmi_enter();
+ ct_nmi_enter();
if (is_module_text_address(addr))
goto out;
@@ -126,18 +127,39 @@ int kernel_text_address(unsigned long addr)
ret = 0;
out:
if (no_rcu)
- rcu_nmi_exit();
+ ct_nmi_exit();
return ret;
}
/*
- * On some architectures (PPC64, IA64) function pointers
+ * On some architectures (PPC64, IA64, PARISC) function pointers
* are actually only tokens to some data that then holds the
* real function address. As a result, to find if a function
* pointer is part of the kernel text, we need to do some
* special dereferencing first.
*/
+#ifdef CONFIG_HAVE_FUNCTION_DESCRIPTORS
+void *dereference_function_descriptor(void *ptr)
+{
+ func_desc_t *desc = ptr;
+ void *p;
+
+ if (!get_kernel_nofault(p, (void *)&desc->addr))
+ ptr = p;
+ return ptr;
+}
+EXPORT_SYMBOL_GPL(dereference_function_descriptor);
+
+void *dereference_kernel_function_descriptor(void *ptr)
+{
+ if (ptr < (void *)__start_opd || ptr >= (void *)__end_opd)
+ return ptr;
+
+ return dereference_function_descriptor(ptr);
+}
+#endif
+
int func_ptr_is_kernel_text(void *ptr)
{
unsigned long addr;
diff --git a/kernel/fail_function.c b/kernel/fail_function.c
index 60dc825ecc2b..d971a0189319 100644
--- a/kernel/fail_function.c
+++ b/kernel/fail_function.c
@@ -163,10 +163,7 @@ static void fei_debugfs_add_attr(struct fei_attr *attr)
static void fei_debugfs_remove_attr(struct fei_attr *attr)
{
- struct dentry *dir;
-
- dir = debugfs_lookup(attr->kp.symbol_name, fei_debugfs_dir);
- debugfs_remove_recursive(dir);
+ debugfs_lookup_and_remove(attr->kp.symbol_name, fei_debugfs_dir);
}
static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs)
@@ -247,15 +244,11 @@ static ssize_t fei_write(struct file *file, const char __user *buffer,
/* cut off if it is too long */
if (count > KSYM_NAME_LEN)
count = KSYM_NAME_LEN;
- buf = kmalloc(count + 1, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
- if (copy_from_user(buf, buffer, count)) {
- ret = -EFAULT;
- goto out_free;
- }
- buf[count] = '\0';
+ buf = memdup_user_nul(buffer, count);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
+
sym = strstrip(buf);
mutex_lock(&fei_lock);
@@ -298,17 +291,15 @@ static ssize_t fei_write(struct file *file, const char __user *buffer,
}
ret = register_kprobe(&attr->kp);
- if (!ret)
- fei_debugfs_add_attr(attr);
- if (ret < 0)
- fei_attr_remove(attr);
- else {
- list_add_tail(&attr->list, &fei_attr_list);
- ret = count;
+ if (ret) {
+ fei_attr_free(attr);
+ goto out;
}
+ fei_debugfs_add_attr(attr);
+ list_add_tail(&attr->list, &fei_attr_list);
+ ret = count;
out:
mutex_unlock(&fei_lock);
-out_free:
kfree(buf);
return ret;
}
diff --git a/kernel/fork.c b/kernel/fork.c
index d75a528f7b21..b1f3915d5f8e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -16,13 +16,13 @@
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/mm.h>
-#include <linux/sched/coredump.h>
#include <linux/sched/user.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/stat.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
+#include <linux/sched/ext.h>
#include <linux/seq_file.h>
#include <linux/rtmutex.h>
#include <linux/init.h>
@@ -37,13 +37,14 @@
#include <linux/fdtable.h>
#include <linux/iocontext.h>
#include <linux/key.h>
+#include <linux/kmsan.h>
#include <linux/binfmts.h>
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
-#include <linux/vmacache.h>
+#include <linux/memblock.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
#include <linux/cpu.h>
@@ -53,6 +54,7 @@
#include <linux/seccomp.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
+#include <linux/syscall_user_dispatch.h>
#include <linux/jiffies.h>
#include <linux/futex.h>
#include <linux/compat.h>
@@ -75,7 +77,6 @@
#include <linux/freezer.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
-#include <linux/random.h>
#include <linux/tty.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
@@ -92,23 +93,36 @@
#include <linux/kcov.h>
#include <linux/livepatch.h>
#include <linux/thread_info.h>
-#include <linux/stackleak.h>
+#include <linux/kstack_erase.h>
#include <linux/kasan.h>
#include <linux/scs.h>
#include <linux/io_uring.h>
#include <linux/bpf.h>
-
-#include <asm/pgalloc.h>
+#include <linux/stackprotector.h>
+#include <linux/user_events.h>
+#include <linux/iommu.h>
+#include <linux/rseq.h>
+#include <uapi/linux/pidfd.h>
+#include <linux/pidfs.h>
+#include <linux/tick.h>
+#include <linux/unwind_deferred.h>
+#include <linux/pgalloc.h>
#include <linux/uaccess.h>
+
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
+/* For dup_mmap(). */
+#include "../mm/internal.h"
+
#include <trace/events/sched.h>
#define CREATE_TRACE_POINTS
#include <trace/events/task.h>
+#include <kunit/visibility.h>
+
/*
* Minimum number of threads to boot the kernel
*/
@@ -163,7 +177,6 @@ void __weak arch_release_task_struct(struct task_struct *tsk)
{
}
-#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static struct kmem_cache *task_struct_cachep;
static inline struct task_struct *alloc_task_struct_node(int node)
@@ -175,15 +188,6 @@ static inline void free_task_struct(struct task_struct *tsk)
{
kmem_cache_free(task_struct_cachep, tsk);
}
-#endif
-
-#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR
-
-/*
- * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
- * kmemcache based allocator.
- */
-# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
#ifdef CONFIG_VMAP_STACK
/*
@@ -192,126 +196,259 @@ static inline void free_task_struct(struct task_struct *tsk)
*/
#define NR_CACHED_STACKS 2
static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
+/*
+ * Allocated stacks are cached and later reused by new threads, so memcg
+ * accounting is performed by the code assigning/releasing stacks to tasks.
+ * We need a zeroed memory without __GFP_ACCOUNT.
+ */
+#define GFP_VMAP_STACK (GFP_KERNEL | __GFP_ZERO)
+
+struct vm_stack {
+ struct rcu_head rcu;
+ struct vm_struct *stack_vm_area;
+};
+
+static struct vm_struct *alloc_thread_stack_node_from_cache(struct task_struct *tsk, int node)
+{
+ struct vm_struct *vm_area;
+ unsigned int i;
+
+ /*
+ * If the node has memory, we are guaranteed the stacks are backed by local pages.
+ * Otherwise the pages are arbitrary.
+ *
+ * Note that depending on cpuset it is possible we will get migrated to a different
+ * node immediately after allocating here, so this does *not* guarantee locality for
+ * arbitrary callers.
+ */
+ scoped_guard(preempt) {
+ if (node != NUMA_NO_NODE && numa_node_id() != node)
+ return NULL;
+
+ for (i = 0; i < NR_CACHED_STACKS; i++) {
+ vm_area = this_cpu_xchg(cached_stacks[i], NULL);
+ if (vm_area)
+ return vm_area;
+ }
+ }
+
+ return NULL;
+}
+
+static bool try_release_thread_stack_to_cache(struct vm_struct *vm_area)
+{
+ unsigned int i;
+ int nid;
+
+ /*
+ * Don't cache stacks if any of the pages don't match the local domain, unless
+ * there is no local memory to begin with.
+ *
+ * Note that lack of local memory does not automatically mean it makes no difference
+ * performance-wise which other domain backs the stack. In this case we are merely
+ * trying to avoid constantly going to vmalloc.
+ */
+ scoped_guard(preempt) {
+ nid = numa_node_id();
+ if (node_state(nid, N_MEMORY)) {
+ for (i = 0; i < vm_area->nr_pages; i++) {
+ struct page *page = vm_area->pages[i];
+ if (page_to_nid(page) != nid)
+ return false;
+ }
+ }
+
+ for (i = 0; i < NR_CACHED_STACKS; i++) {
+ struct vm_struct *tmp = NULL;
+
+ if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area))
+ return true;
+ }
+ }
+ return false;
+}
+
+static void thread_stack_free_rcu(struct rcu_head *rh)
+{
+ struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu);
+ struct vm_struct *vm_area = vm_stack->stack_vm_area;
+
+ if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area))
+ return;
+
+ vfree(vm_area->addr);
+}
+
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+ struct vm_stack *vm_stack = tsk->stack;
+
+ vm_stack->stack_vm_area = tsk->stack_vm_area;
+ call_rcu(&vm_stack->rcu, thread_stack_free_rcu);
+}
static int free_vm_stack_cache(unsigned int cpu)
{
- struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
+ struct vm_struct **cached_vm_stack_areas = per_cpu_ptr(cached_stacks, cpu);
int i;
for (i = 0; i < NR_CACHED_STACKS; i++) {
- struct vm_struct *vm_stack = cached_vm_stacks[i];
+ struct vm_struct *vm_area = cached_vm_stack_areas[i];
- if (!vm_stack)
+ if (!vm_area)
continue;
- vfree(vm_stack->addr);
- cached_vm_stacks[i] = NULL;
+ vfree(vm_area->addr);
+ cached_vm_stack_areas[i] = NULL;
}
return 0;
}
-#endif
-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
+static int memcg_charge_kernel_stack(struct vm_struct *vm_area)
{
-#ifdef CONFIG_VMAP_STACK
- void *stack;
int i;
+ int ret;
+ int nr_charged = 0;
- for (i = 0; i < NR_CACHED_STACKS; i++) {
- struct vm_struct *s;
+ BUG_ON(vm_area->nr_pages != THREAD_SIZE / PAGE_SIZE);
- s = this_cpu_xchg(cached_stacks[i], NULL);
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
+ ret = memcg_kmem_charge_page(vm_area->pages[i], GFP_KERNEL, 0);
+ if (ret)
+ goto err;
+ nr_charged++;
+ }
+ return 0;
+err:
+ for (i = 0; i < nr_charged; i++)
+ memcg_kmem_uncharge_page(vm_area->pages[i], 0);
+ return ret;
+}
- if (!s)
- continue;
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
+{
+ struct vm_struct *vm_area;
+ void *stack;
+
+ vm_area = alloc_thread_stack_node_from_cache(tsk, node);
+ if (vm_area) {
+ if (memcg_charge_kernel_stack(vm_area)) {
+ vfree(vm_area->addr);
+ return -ENOMEM;
+ }
- /* Mark stack accessible for KASAN. */
- kasan_unpoison_range(s->addr, THREAD_SIZE);
+ /* Reset stack metadata. */
+ kasan_unpoison_range(vm_area->addr, THREAD_SIZE);
+
+ stack = kasan_reset_tag(vm_area->addr);
/* Clear stale pointers from reused stack. */
- memset(s->addr, 0, THREAD_SIZE);
+ memset(stack, 0, THREAD_SIZE);
- tsk->stack_vm_area = s;
- tsk->stack = s->addr;
- return s->addr;
+ tsk->stack_vm_area = vm_area;
+ tsk->stack = stack;
+ return 0;
}
- /*
- * Allocated stacks are cached and later reused by new threads,
- * so memcg accounting is performed manually on assigning/releasing
- * stacks to tasks. Drop __GFP_ACCOUNT.
- */
- stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
- VMALLOC_START, VMALLOC_END,
- THREADINFO_GFP & ~__GFP_ACCOUNT,
- PAGE_KERNEL,
- 0, node, __builtin_return_address(0));
+ stack = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN,
+ GFP_VMAP_STACK,
+ node, __builtin_return_address(0));
+ if (!stack)
+ return -ENOMEM;
+ vm_area = find_vm_area(stack);
+ if (memcg_charge_kernel_stack(vm_area)) {
+ vfree(stack);
+ return -ENOMEM;
+ }
/*
* We can't call find_vm_area() in interrupt context, and
* free_thread_stack() can be called in interrupt context,
* so cache the vm_struct.
*/
- if (stack) {
- tsk->stack_vm_area = find_vm_area(stack);
- tsk->stack = stack;
- }
- return stack;
-#else
+ tsk->stack_vm_area = vm_area;
+ stack = kasan_reset_tag(stack);
+ tsk->stack = stack;
+ return 0;
+}
+
+static void free_thread_stack(struct task_struct *tsk)
+{
+ if (!try_release_thread_stack_to_cache(tsk->stack_vm_area))
+ thread_stack_delayed_free(tsk);
+
+ tsk->stack = NULL;
+ tsk->stack_vm_area = NULL;
+}
+
+#else /* !CONFIG_VMAP_STACK */
+
+/*
+ * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
+ * kmemcache based allocator.
+ */
+#if THREAD_SIZE >= PAGE_SIZE
+
+static void thread_stack_free_rcu(struct rcu_head *rh)
+{
+ __free_pages(virt_to_page(rh), THREAD_SIZE_ORDER);
+}
+
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+ struct rcu_head *rh = tsk->stack;
+
+ call_rcu(rh, thread_stack_free_rcu);
+}
+
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
+{
struct page *page = alloc_pages_node(node, THREADINFO_GFP,
THREAD_SIZE_ORDER);
if (likely(page)) {
tsk->stack = kasan_reset_tag(page_address(page));
- return tsk->stack;
+ return 0;
}
- return NULL;
-#endif
+ return -ENOMEM;
}
-static inline void free_thread_stack(struct task_struct *tsk)
+static void free_thread_stack(struct task_struct *tsk)
{
-#ifdef CONFIG_VMAP_STACK
- struct vm_struct *vm = task_stack_vm_area(tsk);
-
- if (vm) {
- int i;
+ thread_stack_delayed_free(tsk);
+ tsk->stack = NULL;
+}
- for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
- memcg_kmem_uncharge_page(vm->pages[i], 0);
+#else /* !(THREAD_SIZE >= PAGE_SIZE) */
- for (i = 0; i < NR_CACHED_STACKS; i++) {
- if (this_cpu_cmpxchg(cached_stacks[i],
- NULL, tsk->stack_vm_area) != NULL)
- continue;
+static struct kmem_cache *thread_stack_cache;
- return;
- }
+static void thread_stack_free_rcu(struct rcu_head *rh)
+{
+ kmem_cache_free(thread_stack_cache, rh);
+}
- vfree_atomic(tsk->stack);
- return;
- }
-#endif
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+ struct rcu_head *rh = tsk->stack;
- __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
+ call_rcu(rh, thread_stack_free_rcu);
}
-# else
-static struct kmem_cache *thread_stack_cache;
-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
- int node)
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
unsigned long *stack;
stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
stack = kasan_reset_tag(stack);
tsk->stack = stack;
- return stack;
+ return stack ? 0 : -ENOMEM;
}
static void free_thread_stack(struct task_struct *tsk)
{
- kmem_cache_free(thread_stack_cache, tsk->stack);
+ thread_stack_delayed_free(tsk);
+ tsk->stack = NULL;
}
void thread_stack_cache_init(void)
@@ -321,8 +458,9 @@ void thread_stack_cache_init(void)
THREAD_SIZE, NULL);
BUG_ON(thread_stack_cache == NULL);
}
-# endif
-#endif
+
+#endif /* THREAD_SIZE >= PAGE_SIZE */
+#endif /* CONFIG_VMAP_STACK */
/* SLAB cache for signal_struct structures (tsk->signal) */
static struct kmem_cache *signal_cachep;
@@ -336,93 +474,39 @@ struct kmem_cache *files_cachep;
/* SLAB cache for fs_struct structures (tsk->fs) */
struct kmem_cache *fs_cachep;
-/* SLAB cache for vm_area_struct structures */
-static struct kmem_cache *vm_area_cachep;
-
/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;
-struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
-{
- struct vm_area_struct *vma;
-
- vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
- if (vma)
- vma_init(vma, mm);
- return vma;
-}
-
-struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
-{
- struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
-
- if (new) {
- ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
- ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
- /*
- * orig->shared.rb may be modified concurrently, but the clone
- * will be reinitialized.
- */
- *new = data_race(*orig);
- INIT_LIST_HEAD(&new->anon_vma_chain);
- new->vm_next = new->vm_prev = NULL;
- dup_vma_anon_name(orig, new);
- }
- return new;
-}
-
-void vm_area_free(struct vm_area_struct *vma)
-{
- free_vma_anon_name(vma);
- kmem_cache_free(vm_area_cachep, vma);
-}
-
static void account_kernel_stack(struct task_struct *tsk, int account)
{
- void *stack = task_stack_page(tsk);
- struct vm_struct *vm = task_stack_vm_area(tsk);
-
- if (vm) {
+ if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+ struct vm_struct *vm_area = task_stack_vm_area(tsk);
int i;
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
- mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB,
+ mod_lruvec_page_state(vm_area->pages[i], NR_KERNEL_STACK_KB,
account * (PAGE_SIZE / 1024));
} else {
+ void *stack = task_stack_page(tsk);
+
/* All stack pages are in the same node. */
mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB,
account * (THREAD_SIZE / 1024));
}
}
-static int memcg_charge_kernel_stack(struct task_struct *tsk)
+void exit_task_stack_account(struct task_struct *tsk)
{
-#ifdef CONFIG_VMAP_STACK
- struct vm_struct *vm = task_stack_vm_area(tsk);
- int ret;
-
- BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+ account_kernel_stack(tsk, -1);
- if (vm) {
+ if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+ struct vm_struct *vm_area;
int i;
- BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
-
- for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
- /*
- * If memcg_kmem_charge_page() fails, page's
- * memory cgroup pointer is NULL, and
- * memcg_kmem_uncharge_page() in free_thread_stack()
- * will ignore this page.
- */
- ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL,
- 0);
- if (ret)
- return ret;
- }
+ vm_area = task_stack_vm_area(tsk);
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
+ memcg_kmem_uncharge_page(vm_area->pages[i], 0);
}
-#endif
- return 0;
}
static void release_task_stack(struct task_struct *tsk)
@@ -430,12 +514,7 @@ static void release_task_stack(struct task_struct *tsk)
if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD))
return; /* Better to leak the stack than to free prematurely */
- account_kernel_stack(tsk, -1);
free_thread_stack(tsk);
- tsk->stack = NULL;
-#ifdef CONFIG_VMAP_STACK
- tsk->stack_vm_area = NULL;
-#endif
}
#ifdef CONFIG_THREAD_INFO_IN_TASK
@@ -448,6 +527,9 @@ void put_task_stack(struct task_struct *tsk)
void free_task(struct task_struct *tsk)
{
+#ifdef CONFIG_SECCOMP
+ WARN_ON_ONCE(tsk->seccomp.filter);
+#endif
release_user_cpus_ptr(tsk);
scs_release(tsk);
@@ -469,11 +551,12 @@ void free_task(struct task_struct *tsk)
arch_release_task_struct(tsk);
if (tsk->flags & PF_KTHREAD)
free_kthread_struct(tsk);
+ bpf_task_storage_free(tsk);
free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);
-static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
+void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
{
struct file *exe_file;
@@ -483,160 +566,11 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
* We depend on the oldmm having properly denied write access to the
* exe_file already.
*/
- if (exe_file && deny_write_access(exe_file))
- pr_warn_once("deny_write_access() failed in %s\n", __func__);
+ if (exe_file && exe_file_deny_write_access(exe_file))
+ pr_warn_once("exe_file_deny_write_access() failed in %s\n", __func__);
}
#ifdef CONFIG_MMU
-static __latent_entropy int dup_mmap(struct mm_struct *mm,
- struct mm_struct *oldmm)
-{
- struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
- struct rb_node **rb_link, *rb_parent;
- int retval;
- unsigned long charge;
- LIST_HEAD(uf);
-
- uprobe_start_dup_mmap();
- if (mmap_write_lock_killable(oldmm)) {
- retval = -EINTR;
- goto fail_uprobe_end;
- }
- flush_cache_dup_mm(oldmm);
- uprobe_dup_mmap(oldmm, mm);
- /*
- * Not linked in yet - no deadlock potential:
- */
- mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
-
- /* No ordering required: file already has been exposed. */
- dup_mm_exe_file(mm, oldmm);
-
- mm->total_vm = oldmm->total_vm;
- mm->data_vm = oldmm->data_vm;
- mm->exec_vm = oldmm->exec_vm;
- mm->stack_vm = oldmm->stack_vm;
-
- rb_link = &mm->mm_rb.rb_node;
- rb_parent = NULL;
- pprev = &mm->mmap;
- retval = ksm_fork(mm, oldmm);
- if (retval)
- goto out;
- retval = khugepaged_fork(mm, oldmm);
- if (retval)
- goto out;
-
- prev = NULL;
- for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
- struct file *file;
-
- if (mpnt->vm_flags & VM_DONTCOPY) {
- vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
- continue;
- }
- charge = 0;
- /*
- * Don't duplicate many vmas if we've been oom-killed (for
- * example)
- */
- if (fatal_signal_pending(current)) {
- retval = -EINTR;
- goto out;
- }
- if (mpnt->vm_flags & VM_ACCOUNT) {
- unsigned long len = vma_pages(mpnt);
-
- if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
- goto fail_nomem;
- charge = len;
- }
- tmp = vm_area_dup(mpnt);
- if (!tmp)
- goto fail_nomem;
- retval = vma_dup_policy(mpnt, tmp);
- if (retval)
- goto fail_nomem_policy;
- tmp->vm_mm = mm;
- retval = dup_userfaultfd(tmp, &uf);
- if (retval)
- goto fail_nomem_anon_vma_fork;
- if (tmp->vm_flags & VM_WIPEONFORK) {
- /*
- * VM_WIPEONFORK gets a clean slate in the child.
- * Don't prepare anon_vma until fault since we don't
- * copy page for current vma.
- */
- tmp->anon_vma = NULL;
- } else if (anon_vma_fork(tmp, mpnt))
- goto fail_nomem_anon_vma_fork;
- tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
- file = tmp->vm_file;
- if (file) {
- struct address_space *mapping = file->f_mapping;
-
- get_file(file);
- i_mmap_lock_write(mapping);
- if (tmp->vm_flags & VM_SHARED)
- mapping_allow_writable(mapping);
- flush_dcache_mmap_lock(mapping);
- /* insert tmp into the share list, just after mpnt */
- vma_interval_tree_insert_after(tmp, mpnt,
- &mapping->i_mmap);
- flush_dcache_mmap_unlock(mapping);
- i_mmap_unlock_write(mapping);
- }
-
- /*
- * Clear hugetlb-related page reserves for children. This only
- * affects MAP_PRIVATE mappings. Faults generated by the child
- * are not guaranteed to succeed, even if read-only
- */
- if (is_vm_hugetlb_page(tmp))
- reset_vma_resv_huge_pages(tmp);
-
- /*
- * Link in the new vma and copy the page table entries.
- */
- *pprev = tmp;
- pprev = &tmp->vm_next;
- tmp->vm_prev = prev;
- prev = tmp;
-
- __vma_link_rb(mm, tmp, rb_link, rb_parent);
- rb_link = &tmp->vm_rb.rb_right;
- rb_parent = &tmp->vm_rb;
-
- mm->map_count++;
- if (!(tmp->vm_flags & VM_WIPEONFORK))
- retval = copy_page_range(tmp, mpnt);
-
- if (tmp->vm_ops && tmp->vm_ops->open)
- tmp->vm_ops->open(tmp);
-
- if (retval)
- goto out;
- }
- /* a new mm has just been created */
- retval = arch_dup_mmap(oldmm, mm);
-out:
- mmap_write_unlock(mm);
- flush_tlb_mm(oldmm);
- mmap_write_unlock(oldmm);
- dup_userfaultfd_complete(&uf);
-fail_uprobe_end:
- uprobe_end_dup_mmap();
- return retval;
-fail_nomem_anon_vma_fork:
- mpol_put(vma_policy(tmp));
-fail_nomem_policy:
- vm_area_free(tmp);
-fail_nomem:
- retval = -ENOMEM;
- vm_unacct_memory(charge);
- goto out;
-}
-
static inline int mm_alloc_pgd(struct mm_struct *mm)
{
mm->pgd = pgd_alloc(mm);
@@ -650,17 +584,40 @@ static inline void mm_free_pgd(struct mm_struct *mm)
pgd_free(mm, mm->pgd);
}
#else
-static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
-{
- mmap_write_lock(oldmm);
- dup_mm_exe_file(mm, oldmm);
- mmap_write_unlock(oldmm);
- return 0;
-}
#define mm_alloc_pgd(mm) (0)
#define mm_free_pgd(mm)
#endif /* CONFIG_MMU */
+#ifdef CONFIG_MM_ID
+static DEFINE_IDA(mm_ida);
+
+static inline int mm_alloc_id(struct mm_struct *mm)
+{
+ int ret;
+
+ ret = ida_alloc_range(&mm_ida, MM_ID_MIN, MM_ID_MAX, GFP_KERNEL);
+ if (ret < 0)
+ return ret;
+ mm->mm_id = ret;
+ return 0;
+}
+
+static inline void mm_free_id(struct mm_struct *mm)
+{
+ const mm_id_t id = mm->mm_id;
+
+ mm->mm_id = MM_ID_DUMMY;
+ if (id == MM_ID_DUMMY)
+ return;
+ if (WARN_ON_ONCE(id < MM_ID_MIN || id > MM_ID_MAX))
+ return;
+ ida_free(&mm_ida, id);
+}
+#else /* !CONFIG_MM_ID */
+static inline int mm_alloc_id(struct mm_struct *mm) { return 0; }
+static inline void mm_free_id(struct mm_struct *mm) {}
+#endif /* CONFIG_MM_ID */
+
static void check_mm(struct mm_struct *mm)
{
int i;
@@ -669,18 +626,21 @@ static void check_mm(struct mm_struct *mm)
"Please make sure 'struct resident_page_types[]' is updated as well");
for (i = 0; i < NR_MM_COUNTERS; i++) {
- long x = atomic_long_read(&mm->rss_stat.count[i]);
+ long x = percpu_counter_sum(&mm->rss_stat[i]);
- if (unlikely(x))
- pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
- mm, resident_page_types[i], x);
+ if (unlikely(x)) {
+ pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld Comm:%s Pid:%d\n",
+ mm, resident_page_types[i], x,
+ current->comm,
+ task_pid_nr(current));
+ }
}
if (mm_pgtables_bytes(mm))
pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
mm_pgtables_bytes(mm));
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
#endif
}
@@ -688,6 +648,67 @@ static void check_mm(struct mm_struct *mm)
#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
+static void do_check_lazy_tlb(void *arg)
+{
+ struct mm_struct *mm = arg;
+
+ WARN_ON_ONCE(current->active_mm == mm);
+}
+
+static void do_shoot_lazy_tlb(void *arg)
+{
+ struct mm_struct *mm = arg;
+
+ if (current->active_mm == mm) {
+ WARN_ON_ONCE(current->mm);
+ current->active_mm = &init_mm;
+ switch_mm(mm, &init_mm, current);
+ }
+}
+
+static void cleanup_lazy_tlbs(struct mm_struct *mm)
+{
+ if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
+ /*
+ * In this case, lazy tlb mms are refounted and would not reach
+ * __mmdrop until all CPUs have switched away and mmdrop()ed.
+ */
+ return;
+ }
+
+ /*
+ * Lazy mm shootdown does not refcount "lazy tlb mm" usage, rather it
+ * requires lazy mm users to switch to another mm when the refcount
+ * drops to zero, before the mm is freed. This requires IPIs here to
+ * switch kernel threads to init_mm.
+ *
+ * archs that use IPIs to flush TLBs can piggy-back that lazy tlb mm
+ * switch with the final userspace teardown TLB flush which leaves the
+ * mm lazy on this CPU but no others, reducing the need for additional
+ * IPIs here. There are cases where a final IPI is still required here,
+ * such as the final mmdrop being performed on a different CPU than the
+ * one exiting, or kernel threads using the mm when userspace exits.
+ *
+ * IPI overheads have not found to be expensive, but they could be
+ * reduced in a number of possible ways, for example (roughly
+ * increasing order of complexity):
+ * - The last lazy reference created by exit_mm() could instead switch
+ * to init_mm, however it's probable this will run on the same CPU
+ * immediately afterwards, so this may not reduce IPIs much.
+ * - A batch of mms requiring IPIs could be gathered and freed at once.
+ * - CPUs store active_mm where it can be remotely checked without a
+ * lock, to filter out false-positives in the cpumask.
+ * - After mm_users or mm_count reaches zero, switching away from the
+ * mm could clear mm_cpumask to reduce some IPIs, perhaps together
+ * with some batching or delaying of the final IPIs.
+ * - A delayed freeing and RCU-like quiescing sequence based on mm
+ * switching to avoid IPIs completely.
+ */
+ on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
+ if (IS_ENABLED(CONFIG_DEBUG_VM_SHOOT_LAZIES))
+ on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);
+}
+
/*
* Called when the last reference to the mm
* is dropped: either by a lazy thread or by
@@ -697,12 +718,21 @@ void __mmdrop(struct mm_struct *mm)
{
BUG_ON(mm == &init_mm);
WARN_ON_ONCE(mm == current->mm);
+
+ /* Ensure no CPUs are using this as their lazy tlb mm */
+ cleanup_lazy_tlbs(mm);
+
WARN_ON_ONCE(mm == current->active_mm);
mm_free_pgd(mm);
+ mm_free_id(mm);
destroy_context(mm);
mmu_notifier_subscriptions_destroy(mm);
check_mm(mm);
put_user_ns(mm->user_ns);
+ mm_pasid_drop(mm);
+ mm_destroy_cid(mm);
+ percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
+
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
@@ -748,11 +778,11 @@ void __put_task_struct(struct task_struct *tsk)
WARN_ON(refcount_read(&tsk->usage));
WARN_ON(tsk == current);
+ unwind_task_free(tsk);
io_uring_free(tsk);
- cgroup_free(tsk);
+ cgroup_task_free(tsk);
task_numa_free(tsk, true);
security_task_free(tsk);
- bpf_task_storage_free(tsk);
exit_creds(tsk);
delayacct_tsk_free(tsk);
put_signal_struct(tsk->signal);
@@ -761,15 +791,23 @@ void __put_task_struct(struct task_struct *tsk)
}
EXPORT_SYMBOL_GPL(__put_task_struct);
+void __put_task_struct_rcu_cb(struct rcu_head *rhp)
+{
+ struct task_struct *task = container_of(rhp, struct task_struct, rcu);
+
+ __put_task_struct(task);
+}
+EXPORT_SYMBOL_GPL(__put_task_struct_rcu_cb);
+
void __init __weak arch_task_cache_init(void) { }
/*
* set_max_threads
*/
-static void set_max_threads(unsigned int max_threads_suggested)
+static void __init set_max_threads(unsigned int max_threads_suggested)
{
u64 threads;
- unsigned long nr_pages = totalram_pages();
+ unsigned long nr_pages = memblock_estimated_nr_free_pages();
/*
* The number of threads shall be limited such that the thread
@@ -792,8 +830,7 @@ static void set_max_threads(unsigned int max_threads_suggested)
int arch_task_struct_size __read_mostly;
#endif
-#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
-static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
+static void __init task_struct_whitelist(unsigned long *offset, unsigned long *size)
{
/* Fetch thread_struct whitelist for the architecture. */
arch_thread_struct_whitelist(offset, size);
@@ -807,12 +844,10 @@ static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
else
*offset += offsetof(struct task_struct, thread);
}
-#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */
void __init fork_init(void)
{
int i;
-#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN 0
#endif
@@ -825,7 +860,6 @@ void __init fork_init(void)
arch_task_struct_size, align,
SLAB_PANIC|SLAB_ACCOUNT,
useroffset, usersize, NULL);
-#endif
/* do the arch specific task caches init */
arch_task_cache_init();
@@ -837,13 +871,13 @@ void __init fork_init(void)
init_task.signal->rlim[RLIMIT_SIGPENDING] =
init_task.signal->rlim[RLIMIT_NPROC];
- for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++)
+ for (i = 0; i < UCOUNT_COUNTS; i++)
init_user_ns.ucount_max[i] = max_threads/2;
- set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, RLIM_INFINITY);
- set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, RLIM_INFINITY);
- set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY);
- set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, RLIM_INFINITY);
+ set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, RLIM_INFINITY);
+ set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, RLIM_INFINITY);
+ set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY);
+ set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, RLIM_INFINITY);
#ifdef CONFIG_VMAP_STACK
cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
@@ -874,8 +908,6 @@ void set_task_stack_end_magic(struct task_struct *tsk)
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
struct task_struct *tsk;
- unsigned long *stack;
- struct vm_struct *stack_vm_area __maybe_unused;
int err;
if (node == NUMA_NO_NODE)
@@ -884,32 +916,18 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
if (!tsk)
return NULL;
- stack = alloc_thread_stack_node(tsk, node);
- if (!stack)
+ err = arch_dup_task_struct(tsk, orig);
+ if (err)
goto free_tsk;
- if (memcg_charge_kernel_stack(tsk))
- goto free_stack;
-
- stack_vm_area = task_stack_vm_area(tsk);
-
- err = arch_dup_task_struct(tsk, orig);
+ err = alloc_thread_stack_node(tsk, node);
+ if (err)
+ goto free_tsk;
- /*
- * arch_dup_task_struct() clobbers the stack-related fields. Make
- * sure they're properly initialized before using any stack-related
- * functions again.
- */
- tsk->stack = stack;
-#ifdef CONFIG_VMAP_STACK
- tsk->stack_vm_area = stack_vm_area;
-#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
refcount_set(&tsk->stack_refcount, 1);
#endif
-
- if (err)
- goto free_stack;
+ account_kernel_stack(tsk, 1);
err = scs_prepare(tsk, node);
if (err)
@@ -953,9 +971,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->wake_q.next = NULL;
tsk->worker_private = NULL;
- account_kernel_stack(tsk, 1);
-
kcov_task_init(tsk);
+ kmsan_task_create(tsk);
kmap_local_fork(tsk);
#ifdef CONFIG_FAULT_INJECTION
@@ -963,16 +980,30 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#endif
#ifdef CONFIG_BLK_CGROUP
- tsk->throttle_queue = NULL;
+ tsk->throttle_disk = NULL;
tsk->use_memdelay = 0;
#endif
+#ifdef CONFIG_ARCH_HAS_CPU_PASID
+ tsk->pasid_activated = 0;
+#endif
+
#ifdef CONFIG_MEMCG
tsk->active_memcg = NULL;
#endif
+
+#ifdef CONFIG_X86_BUS_LOCK_DETECT
+ tsk->reported_split_lock = 0;
+#endif
+
+#ifdef CONFIG_SCHED_MM_CID
+ tsk->mm_cid.cid = MM_CID_UNSET;
+ tsk->mm_cid.active = 0;
+#endif
return tsk;
free_stack:
+ exit_task_stack_account(tsk);
free_thread_stack(tsk);
free_tsk:
free_task_struct(tsk);
@@ -1019,26 +1050,28 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
#endif
}
-static void mm_init_pasid(struct mm_struct *mm)
+static void mm_init_uprobes_state(struct mm_struct *mm)
{
-#ifdef CONFIG_IOMMU_SUPPORT
- mm->pasid = INIT_PASID;
+#ifdef CONFIG_UPROBES
+ mm->uprobes_state.xol_area = NULL;
+ arch_uprobe_init_state(mm);
#endif
}
-static void mm_init_uprobes_state(struct mm_struct *mm)
+static void mmap_init_lock(struct mm_struct *mm)
{
-#ifdef CONFIG_UPROBES
- mm->uprobes_state.xol_area = NULL;
+ init_rwsem(&mm->mmap_lock);
+ mm_lock_seqcount_init(mm);
+#ifdef CONFIG_PER_VMA_LOCK
+ rcuwait_init(&mm->vma_writer_wait);
#endif
}
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
struct user_namespace *user_ns)
{
- mm->mmap = NULL;
- mm->mm_rb = RB_ROOT;
- mm->vmacache_seqnum = 0;
+ mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
+ mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
seqcount_init(&mm->write_protect_seq);
@@ -1054,36 +1087,61 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_init_cpumask(mm);
mm_init_aio(mm);
mm_init_owner(mm, p);
- mm_init_pasid(mm);
+ mm_pasid_init(mm);
RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_subscriptions_init(mm);
init_tlb_flush_pending(mm);
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
mm->pmd_huge_pte = NULL;
#endif
mm_init_uprobes_state(mm);
hugetlb_count_init(mm);
+ mm_flags_clear_all(mm);
if (current->mm) {
- mm->flags = current->mm->flags & MMF_INIT_MASK;
+ unsigned long flags = __mm_flags_get_word(current->mm);
+
+ __mm_flags_overwrite_word(mm, mmf_init_legacy_flags(flags));
mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
} else {
- mm->flags = default_dump_filter;
+ __mm_flags_overwrite_word(mm, default_dump_filter);
mm->def_flags = 0;
}
+ if (futex_mm_init(mm))
+ goto fail_mm_init;
+
if (mm_alloc_pgd(mm))
goto fail_nopgd;
+ if (mm_alloc_id(mm))
+ goto fail_noid;
+
if (init_new_context(p, mm))
goto fail_nocontext;
+ if (mm_alloc_cid(mm, p))
+ goto fail_cid;
+
+ if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
+ NR_MM_COUNTERS))
+ goto fail_pcpu;
+
mm->user_ns = get_user_ns(user_ns);
+ lru_gen_init_mm(mm);
return mm;
+fail_pcpu:
+ mm_destroy_cid(mm);
+fail_cid:
+ destroy_context(mm);
fail_nocontext:
+ mm_free_id(mm);
+fail_noid:
mm_free_pgd(mm);
fail_nopgd:
+ futex_hash_free(mm);
+fail_mm_init:
free_mm(mm);
return NULL;
}
@@ -1102,6 +1160,7 @@ struct mm_struct *mm_alloc(void)
memset(mm, 0, sizeof(*mm));
return mm_init(mm, current, current_user_ns());
}
+EXPORT_SYMBOL_IF_KUNIT(mm_alloc);
static inline void __mmput(struct mm_struct *mm)
{
@@ -1112,7 +1171,7 @@ static inline void __mmput(struct mm_struct *mm)
ksm_exit(mm);
khugepaged_exit(mm); /* must run before exit_mmap */
exit_mmap(mm);
- mm_put_huge_zero_page(mm);
+ mm_put_huge_zero_folio(mm);
set_mm_exe_file(mm, NULL);
if (!list_empty(&mm->mmlist)) {
spin_lock(&mmlist_lock);
@@ -1121,6 +1180,8 @@ static inline void __mmput(struct mm_struct *mm)
}
if (mm->binfmt)
module_put(mm->binfmt->module);
+ lru_gen_del_mm(mm);
+ futex_hash_free(mm);
mmdrop(mm);
}
@@ -1136,7 +1197,7 @@ void mmput(struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(mmput);
-#ifdef CONFIG_MMU
+#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH)
static void mmput_async_fn(struct work_struct *work)
{
struct mm_struct *mm = container_of(work, struct mm_struct,
@@ -1152,16 +1213,19 @@ void mmput_async(struct mm_struct *mm)
schedule_work(&mm->async_put_work);
}
}
+EXPORT_SYMBOL_GPL(mmput_async);
#endif
/**
* set_mm_exe_file - change a reference to the mm's executable file
+ * @mm: The mm to change.
+ * @new_exe_file: The new file to use.
*
* This changes mm's executable file (shown as symlink /proc/[pid]/exe).
*
* Main users are mmput() and sys_execve(). Callers prevent concurrent
- * invocations: in mmput() nobody alive left, in execve task is single
- * threaded.
+ * invocations: in mmput() nobody alive left, in execve it happens before
+ * the new mm is made visible to anyone.
*
* Can only fail if new_exe_file != NULL.
*/
@@ -1181,13 +1245,13 @@ int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
* We expect the caller (i.e., sys_execve) to already denied
* write access, so this is unlikely to fail.
*/
- if (unlikely(deny_write_access(new_exe_file)))
+ if (unlikely(exe_file_deny_write_access(new_exe_file)))
return -EACCES;
get_file(new_exe_file);
}
rcu_assign_pointer(mm->exe_file, new_exe_file);
if (old_exe_file) {
- allow_write_access(old_exe_file);
+ exe_file_allow_write_access(old_exe_file);
fput(old_exe_file);
}
return 0;
@@ -1195,10 +1259,10 @@ int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
/**
* replace_mm_exe_file - replace a reference to the mm's executable file
+ * @mm: The mm to change.
+ * @new_exe_file: The new file to use.
*
- * This changes mm's executable file (shown as symlink /proc/[pid]/exe),
- * dealing with concurrent invocation and without grabbing the mmap lock in
- * write mode.
+ * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
*
* Main user is sys_prctl(PR_SET_MM_MAP/EXE_FILE).
*/
@@ -1211,13 +1275,16 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
/* Forbid mm->exe_file change if old file still mapped. */
old_exe_file = get_mm_exe_file(mm);
if (old_exe_file) {
+ VMA_ITERATOR(vmi, mm, 0);
mmap_read_lock(mm);
- for (vma = mm->mmap; vma && !ret; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (!vma->vm_file)
continue;
if (path_equal(&vma->vm_file->f_path,
- &old_exe_file->f_path))
+ &old_exe_file->f_path)) {
ret = -EBUSY;
+ break;
+ }
}
mmap_read_unlock(mm);
fput(old_exe_file);
@@ -1225,28 +1292,27 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
return ret;
}
- /* set the new file, lockless */
- ret = deny_write_access(new_exe_file);
+ ret = exe_file_deny_write_access(new_exe_file);
if (ret)
return -EACCES;
get_file(new_exe_file);
- old_exe_file = xchg(&mm->exe_file, new_exe_file);
+ /* set the new file */
+ mmap_write_lock(mm);
+ old_exe_file = rcu_dereference_raw(mm->exe_file);
+ rcu_assign_pointer(mm->exe_file, new_exe_file);
+ mmap_write_unlock(mm);
+
if (old_exe_file) {
- /*
- * Don't race with dup_mmap() getting the file and disallowing
- * write access while someone might open the file writable.
- */
- mmap_read_lock(mm);
- allow_write_access(old_exe_file);
+ exe_file_allow_write_access(old_exe_file);
fput(old_exe_file);
- mmap_read_unlock(mm);
}
return 0;
}
/**
* get_mm_exe_file - acquire a reference to the mm's executable file
+ * @mm: The mm of interest.
*
* Returns %NULL if mm has no associated executable file.
* User must release file via fput().
@@ -1256,15 +1322,14 @@ struct file *get_mm_exe_file(struct mm_struct *mm)
struct file *exe_file;
rcu_read_lock();
- exe_file = rcu_dereference(mm->exe_file);
- if (exe_file && !get_file_rcu(exe_file))
- exe_file = NULL;
+ exe_file = get_file_rcu(&mm->exe_file);
rcu_read_unlock();
return exe_file;
}
/**
* get_task_exe_file - acquire a reference to the task's executable file
+ * @task: The task.
*
* Returns %NULL if task's mm (if any) has no associated executable file or
* this is a kernel thread with borrowed mm (see the comment above get_task_mm).
@@ -1275,18 +1340,20 @@ struct file *get_task_exe_file(struct task_struct *task)
struct file *exe_file = NULL;
struct mm_struct *mm;
+ if (task->flags & PF_KTHREAD)
+ return NULL;
+
task_lock(task);
mm = task->mm;
- if (mm) {
- if (!(task->flags & PF_KTHREAD))
- exe_file = get_mm_exe_file(mm);
- }
+ if (mm)
+ exe_file = get_mm_exe_file(mm);
task_unlock(task);
return exe_file;
}
/**
* get_task_mm - acquire a reference to the task's mm
+ * @task: The task.
*
* Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning
* this kernel workthread has transiently adopted a user mm with use_mm,
@@ -1298,19 +1365,29 @@ struct mm_struct *get_task_mm(struct task_struct *task)
{
struct mm_struct *mm;
+ if (task->flags & PF_KTHREAD)
+ return NULL;
+
task_lock(task);
mm = task->mm;
- if (mm) {
- if (task->flags & PF_KTHREAD)
- mm = NULL;
- else
- mmget(mm);
- }
+ if (mm)
+ mmget(mm);
task_unlock(task);
return mm;
}
EXPORT_SYMBOL_GPL(get_task_mm);
+static bool may_access_mm(struct mm_struct *mm, struct task_struct *task, unsigned int mode)
+{
+ if (mm == current->mm)
+ return true;
+ if (ptrace_may_access(task, mode))
+ return true;
+ if ((mode & PTRACE_MODE_READ) && perfmon_capable())
+ return true;
+ return false;
+}
+
struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
{
struct mm_struct *mm;
@@ -1321,8 +1398,9 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
return ERR_PTR(err);
mm = get_task_mm(task);
- if (mm && mm != current->mm &&
- !ptrace_may_access(task, mode)) {
+ if (!mm) {
+ mm = ERR_PTR(-ESRCH);
+ } else if (!may_access_mm(mm, task, mode)) {
mmput(mm);
mm = ERR_PTR(-EACCES);
}
@@ -1347,13 +1425,12 @@ static void complete_vfork_done(struct task_struct *tsk)
static int wait_for_vfork_done(struct task_struct *child,
struct completion *vfork)
{
+ unsigned int state = TASK_KILLABLE|TASK_FREEZABLE;
int killed;
- freezer_do_not_count();
cgroup_enter_frozen();
- killed = wait_for_completion_killable(vfork);
+ killed = wait_for_completion_state(vfork, state);
cgroup_leave_frozen(false);
- freezer_count();
if (killed) {
task_lock(child);
@@ -1448,9 +1525,11 @@ static struct mm_struct *dup_mm(struct task_struct *tsk,
if (!mm_init(mm, tsk, mm->user_ns))
goto fail_nomem;
+ uprobe_start_dup_mmap();
err = dup_mmap(mm, oldmm);
if (err)
goto free_pt;
+ uprobe_end_dup_mmap();
mm->hiwater_rss = get_mm_rss(mm);
mm->hiwater_vm = mm->total_vm;
@@ -1465,12 +1544,14 @@ free_pt:
mm->binfmt = NULL;
mm_init_owner(mm, NULL);
mmput(mm);
+ if (err)
+ uprobe_end_dup_mmap();
fail_nomem:
return NULL;
}
-static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_mm(u64 clone_flags, struct task_struct *tsk)
{
struct mm_struct *mm, *oldmm;
@@ -1493,9 +1574,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
if (!oldmm)
return 0;
- /* initialize the new vmacache entries */
- vmacache_flush(tsk);
-
if (clone_flags & CLONE_VM) {
mmget(oldmm);
mm = oldmm;
@@ -1507,21 +1585,23 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
tsk->mm = mm;
tsk->active_mm = mm;
+ sched_mm_cid_fork(tsk);
return 0;
}
-static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_fs(u64 clone_flags, struct task_struct *tsk)
{
struct fs_struct *fs = current->fs;
if (clone_flags & CLONE_FS) {
/* tsk->fs is already what we want */
- spin_lock(&fs->lock);
+ read_seqlock_excl(&fs->seq);
+ /* "users" and "in_exec" locked for check_unsafe_exec() */
if (fs->in_exec) {
- spin_unlock(&fs->lock);
+ read_sequnlock_excl(&fs->seq);
return -EAGAIN;
}
fs->users++;
- spin_unlock(&fs->lock);
+ read_sequnlock_excl(&fs->seq);
return 0;
}
tsk->fs = copy_fs_struct(fs);
@@ -1530,34 +1610,37 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
return 0;
}
-static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_files(u64 clone_flags, struct task_struct *tsk,
+ int no_files)
{
struct files_struct *oldf, *newf;
- int error = 0;
/*
* A background process may not have any files ...
*/
oldf = current->files;
if (!oldf)
- goto out;
+ return 0;
+
+ if (no_files) {
+ tsk->files = NULL;
+ return 0;
+ }
if (clone_flags & CLONE_FILES) {
atomic_inc(&oldf->count);
- goto out;
+ return 0;
}
- newf = dup_fd(oldf, NR_OPEN_MAX, &error);
- if (!newf)
- goto out;
+ newf = dup_fd(oldf, NULL);
+ if (IS_ERR(newf))
+ return PTR_ERR(newf);
tsk->files = newf;
- error = 0;
-out:
- return error;
+ return 0;
}
-static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_sighand(u64 clone_flags, struct task_struct *tsk)
{
struct sighand_struct *sig;
@@ -1606,7 +1689,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
posix_cputimers_group_init(pct, cpu_limit);
}
-static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_signal(u64 clone_flags, struct task_struct *tsk)
{
struct signal_struct *sig;
@@ -1619,6 +1702,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
return -ENOMEM;
sig->nr_threads = 1;
+ sig->quick_threads = 1;
atomic_set(&sig->live, 1);
refcount_set(&sig->sigcnt, 1);
@@ -1634,9 +1718,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
prev_cputime_init(&sig->prev_cputime);
#ifdef CONFIG_POSIX_TIMERS
- INIT_LIST_HEAD(&sig->posix_timers);
- hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- sig->real_timer.function = it_real_fn;
+ INIT_HLIST_HEAD(&sig->posix_timers);
+ INIT_HLIST_HEAD(&sig->ignored_posix_timers);
+ hrtimer_setup(&sig->real_timer, it_real_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
#endif
task_lock(current->group_leader);
@@ -1648,6 +1732,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
tty_audit_fork(sig);
sched_autogroup_fork(sig);
+#ifdef CONFIG_CGROUPS
+ init_rwsem(&sig->cgroup_threadgroup_rwsem);
+#endif
+
sig->oom_score_adj = current->signal->oom_score_adj;
sig->oom_score_adj_min = current->signal->oom_score_adj_min;
@@ -1736,127 +1824,87 @@ static inline void rcu_copy_process(struct task_struct *p)
p->rcu_tasks_holdout = false;
INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
p->rcu_tasks_idle_cpu = -1;
+ INIT_LIST_HEAD(&p->rcu_tasks_exit_list);
#endif /* #ifdef CONFIG_TASKS_RCU */
#ifdef CONFIG_TASKS_TRACE_RCU
p->trc_reader_nesting = 0;
p->trc_reader_special.s = 0;
INIT_LIST_HEAD(&p->trc_holdout_list);
+ INIT_LIST_HEAD(&p->trc_blkd_node);
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}
-struct pid *pidfd_pid(const struct file *file)
-{
- if (file->f_op == &pidfd_fops)
- return file->private_data;
-
- return ERR_PTR(-EBADF);
-}
-
-static int pidfd_release(struct inode *inode, struct file *file)
-{
- struct pid *pid = file->private_data;
-
- file->private_data = NULL;
- put_pid(pid);
- return 0;
-}
-
-#ifdef CONFIG_PROC_FS
/**
- * pidfd_show_fdinfo - print information about a pidfd
- * @m: proc fdinfo file
- * @f: file referencing a pidfd
+ * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
+ * @pid: the struct pid for which to create a pidfd
+ * @flags: flags of the new @pidfd
+ * @ret_file: return the new pidfs file
+ *
+ * Allocate a new file that stashes @pid and reserve a new pidfd number in the
+ * caller's file descriptor table. The pidfd is reserved but not installed yet.
+ *
+ * The helper verifies that @pid is still in use, without PIDFD_THREAD the
+ * task identified by @pid must be a thread-group leader.
*
- * Pid:
- * This function will print the pid that a given pidfd refers to in the
- * pid namespace of the procfs instance.
- * If the pid namespace of the process is not a descendant of the pid
- * namespace of the procfs instance 0 will be shown as its pid. This is
- * similar to calling getppid() on a process whose parent is outside of
- * its pid namespace.
+ * If this function returns successfully the caller is responsible to either
+ * call fd_install() passing the returned pidfd and pidfd file as arguments in
+ * order to install the pidfd into its file descriptor table or they must use
+ * put_unused_fd() and fput() on the returned pidfd and pidfd file
+ * respectively.
*
- * NSpid:
- * If pid namespaces are supported then this function will also print
- * the pid of a given pidfd refers to for all descendant pid namespaces
- * starting from the current pid namespace of the instance, i.e. the
- * Pid field and the first entry in the NSpid field will be identical.
- * If the pid namespace of the process is not a descendant of the pid
- * namespace of the procfs instance 0 will be shown as its first NSpid
- * entry and no others will be shown.
- * Note that this differs from the Pid and NSpid fields in
- * /proc/<pid>/status where Pid and NSpid are always shown relative to
- * the pid namespace of the procfs instance. The difference becomes
- * obvious when sending around a pidfd between pid namespaces from a
- * different branch of the tree, i.e. where no ancestral relation is
- * present between the pid namespaces:
- * - create two new pid namespaces ns1 and ns2 in the initial pid
- * namespace (also take care to create new mount namespaces in the
- * new pid namespace and mount procfs)
- * - create a process with a pidfd in ns1
- * - send pidfd from ns1 to ns2
- * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
- * have exactly one entry, which is 0
+ * This function is useful when a pidfd must already be reserved but there
+ * might still be points of failure afterwards and the caller wants to ensure
+ * that no pidfd is leaked into its file descriptor table.
+ *
+ * Return: On success, a reserved pidfd is returned from the function and a new
+ * pidfd file is returned in the last argument to the function. On
+ * error, a negative error code is returned from the function and the
+ * last argument remains unchanged.
*/
-static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
+int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret_file)
{
- struct pid *pid = f->private_data;
- struct pid_namespace *ns;
- pid_t nr = -1;
-
- if (likely(pid_has_task(pid, PIDTYPE_PID))) {
- ns = proc_pid_ns(file_inode(m->file)->i_sb);
- nr = pid_nr_ns(pid, ns);
- }
-
- seq_put_decimal_ll(m, "Pid:\t", nr);
+ struct file *pidfs_file;
-#ifdef CONFIG_PID_NS
- seq_put_decimal_ll(m, "\nNSpid:\t", nr);
- if (nr > 0) {
- int i;
+ /*
+ * PIDFD_STALE is only allowed to be passed if the caller knows
+ * that @pid is already registered in pidfs and thus
+ * PIDFD_INFO_EXIT information is guaranteed to be available.
+ */
+ if (!(flags & PIDFD_STALE)) {
+ /*
+ * While holding the pidfd waitqueue lock removing the
+ * task linkage for the thread-group leader pid
+ * (PIDTYPE_TGID) isn't possible. Thus, if there's still
+ * task linkage for PIDTYPE_PID not having thread-group
+ * leader linkage for the pid means it wasn't a
+ * thread-group leader in the first place.
+ */
+ guard(spinlock_irq)(&pid->wait_pidfd.lock);
- /* If nr is non-zero it means that 'pid' is valid and that
- * ns, i.e. the pid namespace associated with the procfs
- * instance, is in the pid namespace hierarchy of pid.
- * Start at one below the already printed level.
+ /* Task has already been reaped. */
+ if (!pid_has_task(pid, PIDTYPE_PID))
+ return -ESRCH;
+ /*
+ * If this struct pid isn't used as a thread-group
+ * leader but the caller requested to create a
+ * thread-group leader pidfd then report ENOENT.
*/
- for (i = ns->level + 1; i <= pid->level; i++)
- seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
+ if (!(flags & PIDFD_THREAD) && !pid_has_task(pid, PIDTYPE_TGID))
+ return -ENOENT;
}
-#endif
- seq_putc(m, '\n');
-}
-#endif
-/*
- * Poll support for process exit notification.
- */
-static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
-{
- struct pid *pid = file->private_data;
- __poll_t poll_flags = 0;
+ CLASS(get_unused_fd, pidfd)(O_CLOEXEC);
+ if (pidfd < 0)
+ return pidfd;
- poll_wait(file, &pid->wait_pidfd, pts);
+ pidfs_file = pidfs_alloc_file(pid, flags | O_RDWR);
+ if (IS_ERR(pidfs_file))
+ return PTR_ERR(pidfs_file);
- /*
- * Inform pollers only when the whole thread group exits.
- * If the thread group leader exits before all other threads in the
- * group, then poll(2) should block, similar to the wait(2) family.
- */
- if (thread_group_exited(pid))
- poll_flags = EPOLLIN | EPOLLRDNORM;
-
- return poll_flags;
+ *ret_file = pidfs_file;
+ return take_fd(pidfd);
}
-const struct file_operations pidfd_fops = {
- .release = pidfd_release,
- .poll = pidfd_poll,
-#ifdef CONFIG_PROC_FS
- .show_fdinfo = pidfd_show_fdinfo,
-#endif
-};
-
static void __delayed_free_task(struct rcu_head *rhp)
{
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
@@ -1884,13 +1932,29 @@ static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
/* We need to synchronize with __set_oom_adj */
mutex_lock(&oom_adj_mutex);
- set_bit(MMF_MULTIPROCESS, &tsk->mm->flags);
+ mm_flags_set(MMF_MULTIPROCESS, tsk->mm);
/* Update the values in case they were changed after copy_signal */
tsk->signal->oom_score_adj = current->signal->oom_score_adj;
tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min;
mutex_unlock(&oom_adj_mutex);
}
+#ifdef CONFIG_RV
+static void rv_task_fork(struct task_struct *p)
+{
+ memset(&p->rv, 0, sizeof(p->rv));
+}
+#else
+#define rv_task_fork(p) do {} while (0)
+#endif
+
+static bool need_futex_hash_allocate_default(u64 clone_flags)
+{
+ if ((clone_flags & (CLONE_THREAD | CLONE_VM)) != (CLONE_THREAD | CLONE_VM))
+ return false;
+ return true;
+}
+
/*
* This creates a new process as a copy of the old one,
* but does not actually start it yet.
@@ -1899,7 +1963,7 @@ static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
* parts of the process environment (as per the clone
* flags). The actual kick-off is left to the caller.
*/
-static __latent_entropy struct task_struct *copy_process(
+__latent_entropy struct task_struct *copy_process(
struct pid *pid,
int trace,
int node,
@@ -1909,7 +1973,7 @@ static __latent_entropy struct task_struct *copy_process(
struct task_struct *p;
struct multiprocess_signals delayed;
struct file *pidfile = NULL;
- u64 clone_flags = args->flags;
+ const u64 clone_flags = args->flags;
struct nsproxy *nsp = current->nsproxy;
/*
@@ -1957,22 +2021,12 @@ static __latent_entropy struct task_struct *copy_process(
return ERR_PTR(-EINVAL);
}
- /*
- * If the new process will be in a different time namespace
- * do not allow it to share VM or a thread group with the forking task.
- */
- if (clone_flags & (CLONE_THREAD | CLONE_VM)) {
- if (nsp->time_ns != nsp->time_ns_for_children)
- return ERR_PTR(-EINVAL);
- }
-
if (clone_flags & CLONE_PIDFD) {
/*
* - CLONE_DETACHED is blocked so that we can potentially
* reuse it later for CLONE_PIDFD.
- * - CLONE_THREAD is blocked until someone really needs it.
*/
- if (clone_flags & (CLONE_DETACHED | CLONE_THREAD))
+ if (clone_flags & CLONE_DETACHED)
return ERR_PTR(-EINVAL);
}
@@ -1998,14 +2052,22 @@ static __latent_entropy struct task_struct *copy_process(
p = dup_task_struct(current, node);
if (!p)
goto fork_out;
- if (args->io_thread) {
+ p->flags &= ~PF_KTHREAD;
+ if (args->kthread)
+ p->flags |= PF_KTHREAD;
+ if (args->user_worker) {
/*
- * Mark us an IO worker, and block any signal that isn't
+ * Mark us a user worker, and block any signal that isn't
* fatal or STOP
*/
- p->flags |= PF_IO_WORKER;
+ p->flags |= PF_USER_WORKER;
siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
}
+ if (args->io_thread)
+ p->flags |= PF_IO_WORKER;
+
+ if (args->name)
+ strscpy_pad(p->comm, args->name, sizeof(p->comm));
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
/*
@@ -2021,18 +2083,18 @@ static __latent_entropy struct task_struct *copy_process(
#ifdef CONFIG_PROVE_LOCKING
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
+ retval = copy_creds(p, clone_flags);
+ if (retval < 0)
+ goto bad_fork_free;
+
retval = -EAGAIN;
- if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
+ if (is_rlimit_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
if (p->real_cred->user != INIT_USER &&
!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
- goto bad_fork_free;
+ goto bad_fork_cleanup_count;
}
current->flags &= ~PF_NPROC_EXCEEDED;
- retval = copy_creds(p, clone_flags);
- if (retval < 0)
- goto bad_fork_free;
-
/*
* If multiple threads are within copy_process(), then this check
* triggers too late. This doesn't hurt, the check is only there
@@ -2069,10 +2131,6 @@ static __latent_entropy struct task_struct *copy_process(
p->io_uring = NULL;
#endif
-#if defined(SPLIT_RSS_COUNTING)
- memset(&p->rss_stat, 0, sizeof(p->rss_stat));
-#endif
-
p->default_timer_slack_ns = current->timer_slack_ns;
#ifdef CONFIG_PSI
@@ -2083,11 +2141,12 @@ static __latent_entropy struct task_struct *copy_process(
acct_clear_integrals(p);
posix_cputimers_init(&p->posix_cputimers);
+ tick_dep_init_task(p);
p->io_context = NULL;
audit_set_context(p, NULL);
cgroup_fork(p);
- if (p->flags & PF_KTHREAD) {
+ if (args->kthread) {
if (!set_kthread_struct(p))
goto bad_fork_cleanup_delayacct;
}
@@ -2101,7 +2160,6 @@ static __latent_entropy struct task_struct *copy_process(
#endif
#ifdef CONFIG_CPUSETS
p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
- p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
@@ -2114,13 +2172,10 @@ static __latent_entropy struct task_struct *copy_process(
p->pagefault_disabled = 0;
-#ifdef CONFIG_LOCKDEP
lockdep_init_task(p);
-#endif
-#ifdef CONFIG_DEBUG_MUTEXES
p->blocked_on = NULL; /* not blocked yet */
-#endif
+
#ifdef CONFIG_BCACHE
p->sequential_io = 0;
p->sequential_io_avg = 0;
@@ -2130,6 +2185,8 @@ static __latent_entropy struct task_struct *copy_process(
p->bpf_ctx = NULL;
#endif
+ unwind_task_init(p);
+
/* Perform scheduler related setup. Assign this task to a CPU. */
retval = sched_fork(clone_flags, p);
if (retval)
@@ -2137,7 +2194,7 @@ static __latent_entropy struct task_struct *copy_process(
retval = perf_event_init_task(p, clone_flags);
if (retval)
- goto bad_fork_cleanup_policy;
+ goto bad_fork_sched_cancel_fork;
retval = audit_alloc(p);
if (retval)
goto bad_fork_cleanup_perf;
@@ -2149,7 +2206,7 @@ static __latent_entropy struct task_struct *copy_process(
retval = copy_semundo(clone_flags, p);
if (retval)
goto bad_fork_cleanup_security;
- retval = copy_files(clone_flags, p);
+ retval = copy_files(clone_flags, p, args->no_files);
if (retval)
goto bad_fork_cleanup_semundo;
retval = copy_fs(clone_flags, p);
@@ -2170,7 +2227,7 @@ static __latent_entropy struct task_struct *copy_process(
retval = copy_io(clone_flags, p);
if (retval)
goto bad_fork_cleanup_namespaces;
- retval = copy_thread(clone_flags, args->stack, args->stack_size, p, args->tls);
+ retval = copy_thread(p, args);
if (retval)
goto bad_fork_cleanup_io;
@@ -2191,21 +2248,17 @@ static __latent_entropy struct task_struct *copy_process(
* if the fd table isn't shared).
*/
if (clone_flags & CLONE_PIDFD) {
- retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+ int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0;
+
+ /*
+ * Note that no task has been attached to @pid yet indicate
+ * that via CLONE_PIDFD.
+ */
+ retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile);
if (retval < 0)
goto bad_fork_free_pid;
-
pidfd = retval;
- pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
- O_RDWR | O_CLOEXEC);
- if (IS_ERR(pidfile)) {
- put_unused_fd(pidfd);
- retval = PTR_ERR(pidfile);
- goto bad_fork_free_pid;
- }
- get_pid(pid); /* held by pidfile now */
-
retval = put_user(pidfd, args->pidfd);
if (retval)
goto bad_fork_put_pidfd;
@@ -2248,13 +2301,15 @@ static __latent_entropy struct task_struct *copy_process(
p->dirty_paused_when = 0;
p->pdeath_signal = 0;
- INIT_LIST_HEAD(&p->thread_group);
p->task_works = NULL;
clear_posix_cputimers_work(p);
#ifdef CONFIG_KRETPROBES
p->kretprobe_instances.first = NULL;
#endif
+#ifdef CONFIG_RETHOOK
+ p->rethooks.first = NULL;
+#endif
/*
* Ensure that the cgroup subsystem policies allow the new process to be
@@ -2267,6 +2322,34 @@ static __latent_entropy struct task_struct *copy_process(
goto bad_fork_put_pidfd;
/*
+ * Now that the cgroups are pinned, re-clone the parent cgroup and put
+ * the new task on the correct runqueue. All this *before* the task
+ * becomes visible.
+ *
+ * This isn't part of ->can_fork() because while the re-cloning is
+ * cgroup specific, it unconditionally needs to place the task on a
+ * runqueue.
+ */
+ retval = sched_cgroup_fork(p, args);
+ if (retval)
+ goto bad_fork_cancel_cgroup;
+
+ /*
+ * Allocate a default futex hash for the user process once the first
+ * thread spawns.
+ */
+ if (need_futex_hash_allocate_default(clone_flags)) {
+ retval = futex_hash_allocate_default();
+ if (retval)
+ goto bad_fork_cancel_cgroup;
+ /*
+ * If we fail beyond this point we don't free the allocated
+ * futex hash map. We assume that another thread will be created
+ * and makes use of it. The hash map will be freed once the main
+ * thread terminates.
+ */
+ }
+ /*
* From this point on we must avoid any synchronous user-space
* communication until we take the tasklist-lock. In particular, we do
* not want user-space to be able to predict the process start-time by
@@ -2303,29 +2386,29 @@ static __latent_entropy struct task_struct *copy_process(
spin_lock(&current->sighand->siglock);
- /*
- * Copy seccomp details explicitly here, in case they were changed
- * before holding sighand lock.
- */
- copy_seccomp(p);
+ rv_task_fork(p);
rseq_fork(p, clone_flags);
/* Don't start children in a dying pid namespace */
if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
retval = -ENOMEM;
- goto bad_fork_cancel_cgroup;
+ goto bad_fork_core_free;
}
/* Let kill terminate clone/fork in the middle */
if (fatal_signal_pending(current)) {
retval = -EINTR;
- goto bad_fork_cancel_cgroup;
+ goto bad_fork_core_free;
}
- /* past the last point of failure */
- if (pidfile)
- fd_install(pidfd, pidfile);
+ /* No more failure paths after this point. */
+
+ /*
+ * Copy seccomp details explicitly here, in case they were changed
+ * before holding sighand lock.
+ */
+ copy_seccomp(p);
init_task_pid_links(p);
if (likely(p->pid)) {
@@ -2358,11 +2441,10 @@ static __latent_entropy struct task_struct *copy_process(
__this_cpu_inc(process_counts);
} else {
current->signal->nr_threads++;
+ current->signal->quick_threads++;
atomic_inc(&current->signal->live);
refcount_inc(&current->signal->sigcnt);
task_join_group_stop(p);
- list_add_tail_rcu(&p->thread_group,
- &p->group_leader->thread_group);
list_add_tail_rcu(&p->thread_node,
&p->signal->thread_head);
}
@@ -2375,22 +2457,27 @@ static __latent_entropy struct task_struct *copy_process(
syscall_tracepoint_update(p);
write_unlock_irq(&tasklist_lock);
+ if (pidfile)
+ fd_install(pidfd, pidfile);
+
proc_fork_connector(p);
- sched_post_fork(p, args);
+ sched_post_fork(p);
cgroup_post_fork(p, args);
perf_event_fork(p);
trace_task_newtask(p, clone_flags);
uprobe_copy_process(p, clone_flags);
+ user_events_fork(p, clone_flags);
copy_oom_score_adj(clone_flags, p);
return p;
-bad_fork_cancel_cgroup:
+bad_fork_core_free:
sched_core_free(p);
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
+bad_fork_cancel_cgroup:
cgroup_cancel_fork(p, args);
bad_fork_put_pidfd:
if (clone_flags & CLONE_PIDFD) {
@@ -2406,9 +2493,10 @@ bad_fork_cleanup_io:
if (p->io_context)
exit_io_context(p);
bad_fork_cleanup_namespaces:
- exit_task_namespaces(p);
+ exit_nsproxy_namespaces(p);
bad_fork_cleanup_mm:
if (p->mm) {
+ sched_mm_cid_exit(p);
mm_clear_owner(p->mm, p);
mmput(p->mm);
}
@@ -2429,6 +2517,8 @@ bad_fork_cleanup_audit:
audit_free(p);
bad_fork_cleanup_perf:
perf_event_free_task(p);
+bad_fork_sched_cancel_fork:
+ sched_cancel_fork(p);
bad_fork_cleanup_policy:
lockdep_free_task(p);
#ifdef CONFIG_NUMA
@@ -2438,9 +2528,11 @@ bad_fork_cleanup_delayacct:
delayacct_tsk_free(p);
bad_fork_cleanup_count:
dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
+ exit_cred_namespaces(p);
exit_creds(p);
bad_fork_free:
WRITE_ONCE(p->__state, TASK_DEAD);
+ exit_task_stack_account(p);
put_task_stack(p);
delayed_free_task(p);
fork_out:
@@ -2460,11 +2552,21 @@ static inline void init_idle_pids(struct task_struct *idle)
}
}
+static int idle_dummy(void *dummy)
+{
+ /* This function is never called */
+ return 0;
+}
+
struct task_struct * __init fork_idle(int cpu)
{
struct task_struct *task;
struct kernel_clone_args args = {
- .flags = CLONE_VM,
+ .flags = CLONE_VM,
+ .fn = &idle_dummy,
+ .fn_arg = NULL,
+ .kthread = 1,
+ .idle = 1,
};
task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
@@ -2476,11 +2578,6 @@ struct task_struct * __init fork_idle(int cpu)
return task;
}
-struct mm_struct *copy_init_mm(void)
-{
- return dup_mm(NULL, &init_mm);
-}
-
/*
* This is like kernel_clone(), but shaved down and tailored to just
* creating io_uring workers. It returns a created task, or an error pointer.
@@ -2490,14 +2587,13 @@ struct mm_struct *copy_init_mm(void)
struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
{
unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
- CLONE_IO;
+ CLONE_IO|CLONE_VM|CLONE_UNTRACED;
struct kernel_clone_args args = {
- .flags = ((lower_32_bits(flags) | CLONE_VM |
- CLONE_UNTRACED) & ~CSIGNAL),
- .exit_signal = (lower_32_bits(flags) & CSIGNAL),
- .stack = (unsigned long)fn,
- .stack_size = (unsigned long)arg,
+ .flags = flags,
+ .fn = fn,
+ .fn_arg = arg,
.io_thread = 1,
+ .user_worker = 1,
};
return copy_process(NULL, 0, node, &args);
@@ -2529,8 +2625,8 @@ pid_t kernel_clone(struct kernel_clone_args *args)
* here has the advantage that we don't need to have a separate helper
* to check for legacy clone().
*/
- if ((args->flags & CLONE_PIDFD) &&
- (args->flags & CLONE_PARENT_SETTID) &&
+ if ((clone_flags & CLONE_PIDFD) &&
+ (clone_flags & CLONE_PARENT_SETTID) &&
(args->pidfd == args->parent_tid))
return -EINVAL;
@@ -2576,6 +2672,13 @@ pid_t kernel_clone(struct kernel_clone_args *args)
get_task_struct(p);
}
+ if (IS_ENABLED(CONFIG_LRU_GEN_WALKS_MMU) && !(clone_flags & CLONE_VM)) {
+ /* lock the task to synchronize with memcg migration */
+ task_lock(p);
+ lru_gen_add_mm(p->mm);
+ task_unlock(p);
+ }
+
wake_up_new_task(p);
/* forking complete and child started to run, tell ptracer */
@@ -2594,14 +2697,31 @@ pid_t kernel_clone(struct kernel_clone_args *args)
/*
* Create a kernel thread.
*/
-pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
+pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
+ unsigned long flags)
{
struct kernel_clone_args args = {
- .flags = ((lower_32_bits(flags) | CLONE_VM |
- CLONE_UNTRACED) & ~CSIGNAL),
- .exit_signal = (lower_32_bits(flags) & CSIGNAL),
- .stack = (unsigned long)fn,
- .stack_size = (unsigned long)arg,
+ .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
+ .exit_signal = (flags & CSIGNAL),
+ .fn = fn,
+ .fn_arg = arg,
+ .name = name,
+ .kthread = 1,
+ };
+
+ return kernel_clone(&args);
+}
+
+/*
+ * Create a user mode thread.
+ */
+pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags)
+{
+ struct kernel_clone_args args = {
+ .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
+ .exit_signal = (flags & CSIGNAL),
+ .fn = fn,
+ .fn_arg = arg,
};
return kernel_clone(&args);
@@ -2673,9 +2793,7 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
}
#endif
-#ifdef __ARCH_WANT_SYS_CLONE3
-
-noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
+static noinline int copy_clone_args_from_user(struct kernel_clone_args *kargs,
struct clone_args __user *uargs,
size_t usize)
{
@@ -2764,7 +2882,7 @@ static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
if (!access_ok((void __user *)kargs->stack, kargs->stack_size))
return false;
-#if !defined(CONFIG_STACK_GROWSUP) && !defined(CONFIG_IA64)
+#if !defined(CONFIG_STACK_GROWSUP)
kargs->stack += kargs->stack_size;
#endif
}
@@ -2783,7 +2901,7 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
* - make the CLONE_DETACHED bit reusable for clone3
* - make the CSIGNAL bits reusable for clone3
*/
- if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
+ if (kargs->flags & (CLONE_DETACHED | (CSIGNAL & (~CLONE_NEWTIME))))
return false;
if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) ==
@@ -2801,7 +2919,7 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
}
/**
- * clone3 - create a new process with specific properties
+ * sys_clone3 - create a new process with specific properties
* @uargs: argument structure
* @size: size of @uargs
*
@@ -2818,6 +2936,11 @@ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
struct kernel_clone_args kargs;
pid_t set_tid[MAX_PID_NS_LEVEL];
+#ifdef __ARCH_BROKEN_SYS_CLONE3
+#warning clone3() entry point is missing, please fix
+ return -ENOSYS;
+#endif
+
kargs.set_tid = set_tid;
err = copy_clone_args_from_user(&kargs, uargs, size);
@@ -2829,7 +2952,6 @@ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
return kernel_clone(&kargs);
}
-#endif
void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data)
{
@@ -2875,10 +2997,27 @@ static void sighand_ctor(void *data)
init_waitqueue_head(&sighand->signalfd_wqh);
}
-void __init proc_caches_init(void)
+void __init mm_cache_init(void)
{
unsigned int mm_size;
+ /*
+ * The mm_cpumask is located at the end of mm_struct, and is
+ * dynamically sized based on the maximum CPU number this system
+ * can have, taking hotplug into account (nr_cpu_ids).
+ */
+ mm_size = sizeof(struct mm_struct) + cpumask_size() + mm_cid_size();
+
+ mm_cachep = kmem_cache_create_usercopy("mm_struct",
+ mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
+ offsetof(struct mm_struct, saved_auxv),
+ sizeof_field(struct mm_struct, saved_auxv),
+ NULL);
+}
+
+void __init proc_caches_init(void)
+{
sighand_cachep = kmem_cache_create("sighand_cache",
sizeof(struct sighand_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
@@ -2895,21 +3034,6 @@ void __init proc_caches_init(void)
sizeof(struct fs_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
NULL);
-
- /*
- * The mm_cpumask is located at the end of mm_struct, and is
- * dynamically sized based on the maximum CPU number this system
- * can have, taking hotplug into account (nr_cpu_ids).
- */
- mm_size = sizeof(struct mm_struct) + cpumask_size();
-
- mm_cachep = kmem_cache_create_usercopy("mm_struct",
- mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
- offsetof(struct mm_struct, saved_auxv),
- sizeof_field(struct mm_struct, saved_auxv),
- NULL);
- vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
mmap_init();
nsproxy_cache_init();
}
@@ -2971,17 +3095,16 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
/*
* Unshare file descriptor table if it is being shared
*/
-int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
- struct files_struct **new_fdp)
+static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
{
struct files_struct *fd = current->files;
- int error = 0;
if ((unshare_flags & CLONE_FILES) &&
(fd && atomic_read(&fd->count) > 1)) {
- *new_fdp = dup_fd(fd, max_fds, &error);
- if (!*new_fdp)
- return error;
+ fd = dup_fd(fd, NULL);
+ if (IS_ERR(fd))
+ return PTR_ERR(fd);
+ *new_fdp = fd;
}
return 0;
@@ -3039,7 +3162,7 @@ int ksys_unshare(unsigned long unshare_flags)
err = unshare_fs(unshare_flags, &new_fs);
if (err)
goto bad_unshare_out;
- err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd);
+ err = unshare_fd(unshare_flags, &new_fd);
if (err)
goto bad_unshare_cleanup_fs;
err = unshare_userns(unshare_flags, &new_cred);
@@ -3076,13 +3199,13 @@ int ksys_unshare(unsigned long unshare_flags)
if (new_fs) {
fs = current->fs;
- spin_lock(&fs->lock);
+ read_seqlock_excl(&fs->seq);
current->fs = new_fs;
if (--fs->users)
new_fs = NULL;
else
new_fs = fs;
- spin_unlock(&fs->lock);
+ read_sequnlock_excl(&fs->seq);
}
if (new_fd)
@@ -3131,7 +3254,7 @@ int unshare_files(void)
struct files_struct *old, *copy = NULL;
int error;
- error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &copy);
+ error = unshare_fd(CLONE_FILES, &copy);
if (error || !copy)
return error;
@@ -3143,7 +3266,7 @@ int unshare_files(void)
return 0;
}
-int sysctl_max_threads(struct ctl_table *table, int write,
+static int sysctl_max_threads(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
@@ -3165,3 +3288,21 @@ int sysctl_max_threads(struct ctl_table *table, int write,
return 0;
}
+
+static const struct ctl_table fork_sysctl_table[] = {
+ {
+ .procname = "threads-max",
+ .data = NULL,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sysctl_max_threads,
+ },
+};
+
+static int __init init_fork_sysctl(void)
+{
+ register_sysctl_init("kernel", fork_sysctl_table);
+ return 0;
+}
+
+subsys_initcall(init_fork_sysctl);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 45ab36ffd0e7..a76bf957fb32 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -10,13 +10,15 @@
#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/freezer.h>
+#include <linux/oom.h>
#include <linux/kthread.h>
/* total number of freezing conditions in effect */
-atomic_t system_freezing_cnt = ATOMIC_INIT(0);
-EXPORT_SYMBOL(system_freezing_cnt);
+DEFINE_STATIC_KEY_FALSE(freezer_active);
+EXPORT_SYMBOL(freezer_active);
-/* indicate whether PM freezing is in effect, protected by
+/*
+ * indicate whether PM freezing is in effect, protected by
* system_transition_mutex
*/
bool pm_freezing;
@@ -29,7 +31,7 @@ static DEFINE_SPINLOCK(freezer_lock);
* freezing_slow_path - slow path for testing whether a task needs to be frozen
* @p: task to be tested
*
- * This function is called by freezing() if system_freezing_cnt isn't zero
+ * This function is called by freezing() if freezer_active isn't zero
* and tests whether @p needs to enter and stay in frozen state. Can be
* called under any context. The freezers are responsible for ensuring the
* target tasks see the updated state.
@@ -39,10 +41,10 @@ bool freezing_slow_path(struct task_struct *p)
if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))
return false;
- if (test_tsk_thread_flag(p, TIF_MEMDIE))
+ if (tsk_is_oom_victim(p))
return false;
- if (pm_nosig_freezing || cgroup_freezing(p))
+ if (pm_nosig_freezing || cgroup1_freezing(p))
return true;
if (pm_freezing && !(p->flags & PF_KTHREAD))
@@ -52,41 +54,44 @@ bool freezing_slow_path(struct task_struct *p)
}
EXPORT_SYMBOL(freezing_slow_path);
+bool frozen(struct task_struct *p)
+{
+ return READ_ONCE(p->__state) & TASK_FROZEN;
+}
+
/* Refrigerator is place where frozen processes are stored :-). */
bool __refrigerator(bool check_kthr_stop)
{
- /* Hmm, should we be allowed to suspend when there are realtime
- processes around? */
+ unsigned int state = get_current_state();
bool was_frozen = false;
- unsigned int save = get_current_state();
pr_debug("%s entered refrigerator\n", current->comm);
+ WARN_ON_ONCE(state && !(state & TASK_NORMAL));
+
for (;;) {
- set_current_state(TASK_UNINTERRUPTIBLE);
+ bool freeze;
+
+ raw_spin_lock_irq(&current->pi_lock);
+ WRITE_ONCE(current->__state, TASK_FROZEN);
+ /* unstale saved_state so that __thaw_task() will wake us up */
+ current->saved_state = TASK_RUNNING;
+ raw_spin_unlock_irq(&current->pi_lock);
spin_lock_irq(&freezer_lock);
- current->flags |= PF_FROZEN;
- if (!freezing(current) ||
- (check_kthr_stop && kthread_should_stop()))
- current->flags &= ~PF_FROZEN;
+ freeze = freezing(current) && !(check_kthr_stop && kthread_should_stop());
spin_unlock_irq(&freezer_lock);
- if (!(current->flags & PF_FROZEN))
+ if (!freeze)
break;
+
was_frozen = true;
schedule();
}
+ __set_current_state(TASK_RUNNING);
pr_debug("%s left refrigerator\n", current->comm);
- /*
- * Restore saved task state before returning. The mb'd version
- * needs to be used; otherwise, it might silently break
- * synchronization which depends on ordered task state change.
- */
- set_current_state(save);
-
return was_frozen;
}
EXPORT_SYMBOL(__refrigerator);
@@ -101,6 +106,50 @@ static void fake_signal_wake_up(struct task_struct *p)
}
}
+static int __set_task_frozen(struct task_struct *p, void *arg)
+{
+ unsigned int state = READ_ONCE(p->__state);
+
+ /*
+ * Allow freezing the sched_delayed tasks; they will not execute until
+ * ttwu() fixes them up, so it is safe to swap their state now, instead
+ * of waiting for them to get fully dequeued.
+ */
+ if (task_is_runnable(p))
+ return 0;
+
+ if (p != current && task_curr(p))
+ return 0;
+
+ if (!(state & (TASK_FREEZABLE | __TASK_STOPPED | __TASK_TRACED)))
+ return 0;
+
+ /*
+ * Only TASK_NORMAL can be augmented with TASK_FREEZABLE, since they
+ * can suffer spurious wakeups.
+ */
+ if (state & TASK_FREEZABLE)
+ WARN_ON_ONCE(!(state & TASK_NORMAL));
+
+#ifdef CONFIG_LOCKDEP
+ /*
+ * It's dangerous to freeze with locks held; there be dragons there.
+ */
+ if (!(state & __TASK_FREEZABLE_UNSAFE))
+ WARN_ON_ONCE(debug_locks && p->lockdep_depth);
+#endif
+
+ p->saved_state = p->__state;
+ WRITE_ONCE(p->__state, TASK_FROZEN);
+ return TASK_FROZEN;
+}
+
+static bool __freeze_task(struct task_struct *p)
+{
+ /* TASK_FREEZABLE|TASK_STOPPED|TASK_TRACED -> TASK_FROZEN */
+ return task_call_func(p, __set_task_frozen, NULL);
+}
+
/**
* freeze_task - send a freeze request to given task
* @p: task to send the request to
@@ -116,20 +165,8 @@ bool freeze_task(struct task_struct *p)
{
unsigned long flags;
- /*
- * This check can race with freezer_do_not_count, but worst case that
- * will result in an extra wakeup being sent to the task. It does not
- * race with freezer_count(), the barriers in freezer_count() and
- * freezer_should_skip() ensure that either freezer_count() sees
- * freezing == true in try_to_freeze() and freezes, or
- * freezer_should_skip() sees !PF_FREEZE_SKIP and freezes the task
- * normally.
- */
- if (freezer_should_skip(p))
- return false;
-
spin_lock_irqsave(&freezer_lock, flags);
- if (!freezing(p) || frozen(p)) {
+ if (!freezing(p) || frozen(p) || __freeze_task(p)) {
spin_unlock_irqrestore(&freezer_lock, flags);
return false;
}
@@ -137,20 +174,54 @@ bool freeze_task(struct task_struct *p)
if (!(p->flags & PF_KTHREAD))
fake_signal_wake_up(p);
else
- wake_up_state(p, TASK_INTERRUPTIBLE);
+ wake_up_state(p, TASK_NORMAL);
spin_unlock_irqrestore(&freezer_lock, flags);
return true;
}
+/*
+ * Restore the saved_state before the task entered freezer. For typical task
+ * in the __refrigerator(), saved_state == TASK_RUNNING so nothing happens
+ * here. For tasks which were TASK_NORMAL | TASK_FREEZABLE, their initial state
+ * is restored unless they got an expected wakeup (see ttwu_state_match()).
+ * Returns 1 if the task state was restored.
+ */
+static int __restore_freezer_state(struct task_struct *p, void *arg)
+{
+ unsigned int state = p->saved_state;
+
+ if (state != TASK_RUNNING) {
+ WRITE_ONCE(p->__state, state);
+ p->saved_state = TASK_RUNNING;
+ return 1;
+ }
+
+ return 0;
+}
+
void __thaw_task(struct task_struct *p)
{
- unsigned long flags;
+ guard(spinlock_irqsave)(&freezer_lock);
+ if (frozen(p) && !task_call_func(p, __restore_freezer_state, NULL))
+ wake_up_state(p, TASK_FROZEN);
+}
- spin_lock_irqsave(&freezer_lock, flags);
- if (frozen(p))
- wake_up_process(p);
- spin_unlock_irqrestore(&freezer_lock, flags);
+/*
+ * thaw_process - Thaw a frozen process
+ * @p: the process to be thawed
+ *
+ * Iterate over all threads of @p and call __thaw_task() on each.
+ */
+void thaw_process(struct task_struct *p)
+{
+ struct task_struct *t;
+
+ rcu_read_lock();
+ for_each_thread(p, t) {
+ __thaw_task(t);
+ }
+ rcu_read_unlock();
}
/**
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 51dd822a8060..cf7e610eac42 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -34,9 +34,16 @@
#include <linux/compat.h>
#include <linux/jhash.h>
#include <linux/pagemap.h>
+#include <linux/debugfs.h>
+#include <linux/plist.h>
+#include <linux/gfp.h>
+#include <linux/vmalloc.h>
#include <linux/memblock.h>
#include <linux/fault-inject.h>
#include <linux/slab.h>
+#include <linux/prctl.h>
+#include <linux/mempolicy.h>
+#include <linux/mmap_lock.h>
#include "futex.h"
#include "../locking/rtmutex_common.h"
@@ -47,12 +54,23 @@
* reside in the same cacheline.
*/
static struct {
- struct futex_hash_bucket *queues;
- unsigned long hashsize;
+ unsigned long hashmask;
+ unsigned int hashshift;
+ struct futex_hash_bucket *queues[MAX_NUMNODES];
} __futex_data __read_mostly __aligned(2*sizeof(long));
-#define futex_queues (__futex_data.queues)
-#define futex_hashsize (__futex_data.hashsize)
+#define futex_hashmask (__futex_data.hashmask)
+#define futex_hashshift (__futex_data.hashshift)
+#define futex_queues (__futex_data.queues)
+
+struct futex_private_hash {
+ int state;
+ unsigned int hash_mask;
+ struct rcu_head rcu;
+ void *mm;
+ bool custom;
+ struct futex_hash_bucket queues[];
+};
/*
* Fault injections for futexes.
@@ -105,21 +123,331 @@ late_initcall(fail_futex_debugfs);
#endif /* CONFIG_FAIL_FUTEX */
+static struct futex_hash_bucket *
+__futex_hash(union futex_key *key, struct futex_private_hash *fph);
+
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+static bool futex_ref_get(struct futex_private_hash *fph);
+static bool futex_ref_put(struct futex_private_hash *fph);
+static bool futex_ref_is_dead(struct futex_private_hash *fph);
+
+enum { FR_PERCPU = 0, FR_ATOMIC };
+
+static inline bool futex_key_is_private(union futex_key *key)
+{
+ /*
+ * Relies on get_futex_key() to set either bit for shared
+ * futexes -- see comment with union futex_key.
+ */
+ return !(key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED));
+}
+
+static bool futex_private_hash_get(struct futex_private_hash *fph)
+{
+ return futex_ref_get(fph);
+}
+
+void futex_private_hash_put(struct futex_private_hash *fph)
+{
+ if (futex_ref_put(fph))
+ wake_up_var(fph->mm);
+}
+
/**
- * futex_hash - Return the hash bucket in the global hash
- * @key: Pointer to the futex key for which the hash is calculated
+ * futex_hash_get - Get an additional reference for the local hash.
+ * @hb: ptr to the private local hash.
*
- * We hash on the keys returned from get_futex_key (see below) and return the
- * corresponding hash bucket in the global hash.
+ * Obtain an additional reference for the already obtained hash bucket. The
+ * caller must already own an reference.
*/
+void futex_hash_get(struct futex_hash_bucket *hb)
+{
+ struct futex_private_hash *fph = hb->priv;
+
+ if (!fph)
+ return;
+ WARN_ON_ONCE(!futex_private_hash_get(fph));
+}
+
+void futex_hash_put(struct futex_hash_bucket *hb)
+{
+ struct futex_private_hash *fph = hb->priv;
+
+ if (!fph)
+ return;
+ futex_private_hash_put(fph);
+}
+
+static struct futex_hash_bucket *
+__futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
+{
+ u32 hash;
+
+ if (!futex_key_is_private(key))
+ return NULL;
+
+ if (!fph)
+ fph = rcu_dereference(key->private.mm->futex_phash);
+ if (!fph || !fph->hash_mask)
+ return NULL;
+
+ hash = jhash2((void *)&key->private.address,
+ sizeof(key->private.address) / 4,
+ key->both.offset);
+ return &fph->queues[hash & fph->hash_mask];
+}
+
+static void futex_rehash_private(struct futex_private_hash *old,
+ struct futex_private_hash *new)
+{
+ struct futex_hash_bucket *hb_old, *hb_new;
+ unsigned int slots = old->hash_mask + 1;
+ unsigned int i;
+
+ for (i = 0; i < slots; i++) {
+ struct futex_q *this, *tmp;
+
+ hb_old = &old->queues[i];
+
+ spin_lock(&hb_old->lock);
+ plist_for_each_entry_safe(this, tmp, &hb_old->chain, list) {
+
+ plist_del(&this->list, &hb_old->chain);
+ futex_hb_waiters_dec(hb_old);
+
+ WARN_ON_ONCE(this->lock_ptr != &hb_old->lock);
+
+ hb_new = __futex_hash(&this->key, new);
+ futex_hb_waiters_inc(hb_new);
+ /*
+ * The new pointer isn't published yet but an already
+ * moved user can be unqueued due to timeout or signal.
+ */
+ spin_lock_nested(&hb_new->lock, SINGLE_DEPTH_NESTING);
+ plist_add(&this->list, &hb_new->chain);
+ this->lock_ptr = &hb_new->lock;
+ spin_unlock(&hb_new->lock);
+ }
+ spin_unlock(&hb_old->lock);
+ }
+}
+
+static bool __futex_pivot_hash(struct mm_struct *mm,
+ struct futex_private_hash *new)
+{
+ struct futex_private_hash *fph;
+
+ WARN_ON_ONCE(mm->futex_phash_new);
+
+ fph = rcu_dereference_protected(mm->futex_phash,
+ lockdep_is_held(&mm->futex_hash_lock));
+ if (fph) {
+ if (!futex_ref_is_dead(fph)) {
+ mm->futex_phash_new = new;
+ return false;
+ }
+
+ futex_rehash_private(fph, new);
+ }
+ new->state = FR_PERCPU;
+ scoped_guard(rcu) {
+ mm->futex_batches = get_state_synchronize_rcu();
+ rcu_assign_pointer(mm->futex_phash, new);
+ }
+ kvfree_rcu(fph, rcu);
+ return true;
+}
+
+static void futex_pivot_hash(struct mm_struct *mm)
+{
+ scoped_guard(mutex, &mm->futex_hash_lock) {
+ struct futex_private_hash *fph;
+
+ fph = mm->futex_phash_new;
+ if (fph) {
+ mm->futex_phash_new = NULL;
+ __futex_pivot_hash(mm, fph);
+ }
+ }
+}
+
+struct futex_private_hash *futex_private_hash(void)
+{
+ struct mm_struct *mm = current->mm;
+ /*
+ * Ideally we don't loop. If there is a replacement in progress
+ * then a new private hash is already prepared and a reference can't be
+ * obtained once the last user dropped it's.
+ * In that case we block on mm_struct::futex_hash_lock and either have
+ * to perform the replacement or wait while someone else is doing the
+ * job. Eitherway, on the second iteration we acquire a reference on the
+ * new private hash or loop again because a new replacement has been
+ * requested.
+ */
+again:
+ scoped_guard(rcu) {
+ struct futex_private_hash *fph;
+
+ fph = rcu_dereference(mm->futex_phash);
+ if (!fph)
+ return NULL;
+
+ if (futex_private_hash_get(fph))
+ return fph;
+ }
+ futex_pivot_hash(mm);
+ goto again;
+}
+
+struct futex_hash_bucket *futex_hash(union futex_key *key)
+{
+ struct futex_private_hash *fph;
+ struct futex_hash_bucket *hb;
+
+again:
+ scoped_guard(rcu) {
+ hb = __futex_hash(key, NULL);
+ fph = hb->priv;
+
+ if (!fph || futex_private_hash_get(fph))
+ return hb;
+ }
+ futex_pivot_hash(key->private.mm);
+ goto again;
+}
+
+#else /* !CONFIG_FUTEX_PRIVATE_HASH */
+
+static struct futex_hash_bucket *
+__futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
+{
+ return NULL;
+}
+
struct futex_hash_bucket *futex_hash(union futex_key *key)
{
- u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
- key->both.offset);
+ return __futex_hash(key, NULL);
+}
+
+#endif /* CONFIG_FUTEX_PRIVATE_HASH */
+
+#ifdef CONFIG_FUTEX_MPOL
+
+static int __futex_key_to_node(struct mm_struct *mm, unsigned long addr)
+{
+ struct vm_area_struct *vma = vma_lookup(mm, addr);
+ struct mempolicy *mpol;
+ int node = FUTEX_NO_NODE;
+
+ if (!vma)
+ return FUTEX_NO_NODE;
+
+ mpol = vma_policy(vma);
+ if (!mpol)
+ return FUTEX_NO_NODE;
+
+ switch (mpol->mode) {
+ case MPOL_PREFERRED:
+ node = first_node(mpol->nodes);
+ break;
+ case MPOL_PREFERRED_MANY:
+ case MPOL_BIND:
+ if (mpol->home_node != NUMA_NO_NODE)
+ node = mpol->home_node;
+ break;
+ default:
+ break;
+ }
+
+ return node;
+}
+
+static int futex_key_to_node_opt(struct mm_struct *mm, unsigned long addr)
+{
+ int seq, node;
+
+ guard(rcu)();
+
+ if (!mmap_lock_speculate_try_begin(mm, &seq))
+ return -EBUSY;
+
+ node = __futex_key_to_node(mm, addr);
+
+ if (mmap_lock_speculate_retry(mm, seq))
+ return -EAGAIN;
+
+ return node;
+}
+
+static int futex_mpol(struct mm_struct *mm, unsigned long addr)
+{
+ int node;
+
+ node = futex_key_to_node_opt(mm, addr);
+ if (node >= FUTEX_NO_NODE)
+ return node;
- return &futex_queues[hash & (futex_hashsize - 1)];
+ guard(mmap_read_lock)(mm);
+ return __futex_key_to_node(mm, addr);
}
+#else /* !CONFIG_FUTEX_MPOL */
+
+static int futex_mpol(struct mm_struct *mm, unsigned long addr)
+{
+ return FUTEX_NO_NODE;
+}
+
+#endif /* CONFIG_FUTEX_MPOL */
+
+/**
+ * __futex_hash - Return the hash bucket
+ * @key: Pointer to the futex key for which the hash is calculated
+ * @fph: Pointer to private hash if known
+ *
+ * We hash on the keys returned from get_futex_key (see below) and return the
+ * corresponding hash bucket.
+ * If the FUTEX is PROCESS_PRIVATE then a per-process hash bucket (from the
+ * private hash) is returned if existing. Otherwise a hash bucket from the
+ * global hash is returned.
+ */
+static struct futex_hash_bucket *
+__futex_hash(union futex_key *key, struct futex_private_hash *fph)
+{
+ int node = key->both.node;
+ u32 hash;
+
+ if (node == FUTEX_NO_NODE) {
+ struct futex_hash_bucket *hb;
+
+ hb = __futex_hash_private(key, fph);
+ if (hb)
+ return hb;
+ }
+
+ hash = jhash2((u32 *)key,
+ offsetof(typeof(*key), both.offset) / sizeof(u32),
+ key->both.offset);
+
+ if (node == FUTEX_NO_NODE) {
+ /*
+ * In case of !FLAGS_NUMA, use some unused hash bits to pick a
+ * node -- this ensures regular futexes are interleaved across
+ * the nodes and avoids having to allocate multiple
+ * hash-tables.
+ *
+ * NOTE: this isn't perfectly uniform, but it is fast and
+ * handles sparse node masks.
+ */
+ node = (hash >> futex_hashshift) % nr_node_ids;
+ if (!node_possible(node)) {
+ node = find_next_bit_wrap(node_possible_map.bits,
+ nr_node_ids, node);
+ }
+ }
+
+ return &futex_queues[node][hash & futex_hashmask];
+}
/**
* futex_setup_timer - set up the sleeping hrtimer.
@@ -138,9 +466,9 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
if (!time)
return NULL;
- hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ?
- CLOCK_REALTIME : CLOCK_MONOTONIC,
- HRTIMER_MODE_ABS);
+ hrtimer_setup_sleeper_on_stack(timeout,
+ (flags & FLAGS_CLOCKRT) ? CLOCK_REALTIME : CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS);
/*
* If range_ns is 0, calling hrtimer_set_expires_range_ns() is
* effectively the same as calling hrtimer_set_expires().
@@ -179,12 +507,12 @@ static u64 get_inode_sequence_number(struct inode *inode)
return old;
for (;;) {
- u64 new = atomic64_add_return(1, &i_seq);
+ u64 new = atomic64_inc_return(&i_seq);
if (WARN_ON_ONCE(!new))
continue;
- old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
- if (old)
+ old = 0;
+ if (!atomic64_try_cmpxchg_relaxed(&inode->i_sequence, &old, new))
return old;
return new;
}
@@ -193,7 +521,7 @@ static u64 get_inode_sequence_number(struct inode *inode)
/**
* get_futex_key() - Get parameters which are the keys for a futex
* @uaddr: virtual address of the futex
- * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED
+ * @flags: FLAGS_*
* @key: address where result is stored.
* @rw: mapping needs to be read/write (values: FUTEX_READ,
* FUTEX_WRITE)
@@ -204,7 +532,7 @@ static u64 get_inode_sequence_number(struct inode *inode)
*
* For shared mappings (when @fshared), the key is:
*
- * ( inode->i_sequence, page->index, offset_within_page )
+ * ( inode->i_sequence, page offset within mapping, offset_within_page )
*
* [ also see get_inode_sequence_number() ]
*
@@ -217,29 +545,68 @@ static u64 get_inode_sequence_number(struct inode *inode)
*
* lock_page() might sleep, the caller should not hold a spinlock.
*/
-int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
+int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
enum futex_access rw)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
- struct page *page, *tail;
+ struct page *page;
+ struct folio *folio;
struct address_space *mapping;
- int err, ro = 0;
+ int node, err, size, ro = 0;
+ bool node_updated = false;
+ bool fshared;
+
+ fshared = flags & FLAGS_SHARED;
+ size = futex_size(flags);
+ if (flags & FLAGS_NUMA)
+ size *= 2;
/*
* The futex address must be "naturally" aligned.
*/
key->both.offset = address % PAGE_SIZE;
- if (unlikely((address % sizeof(u32)) != 0))
+ if (unlikely((address % size) != 0))
return -EINVAL;
address -= key->both.offset;
- if (unlikely(!access_ok(uaddr, sizeof(u32))))
+ if (unlikely(!access_ok(uaddr, size)))
return -EFAULT;
if (unlikely(should_fail_futex(fshared)))
return -EFAULT;
+ node = FUTEX_NO_NODE;
+
+ if (flags & FLAGS_NUMA) {
+ u32 __user *naddr = (void *)uaddr + size / 2;
+
+ if (get_user_inline(node, naddr))
+ return -EFAULT;
+
+ if ((node != FUTEX_NO_NODE) &&
+ ((unsigned int)node >= MAX_NUMNODES || !node_possible(node)))
+ return -EINVAL;
+ }
+
+ if (node == FUTEX_NO_NODE && (flags & FLAGS_MPOL)) {
+ node = futex_mpol(mm, address);
+ node_updated = true;
+ }
+
+ if (flags & FLAGS_NUMA) {
+ u32 __user *naddr = (void *)uaddr + size / 2;
+
+ if (node == FUTEX_NO_NODE) {
+ node = numa_node_id();
+ node_updated = true;
+ }
+ if (node_updated && put_user_inline(node, naddr))
+ return -EFAULT;
+ }
+
+ key->both.node = node;
+
/*
* PROCESS_PRIVATE futexes are fast.
* As the mm cannot disappear under us and the 'key' only needs
@@ -248,7 +615,17 @@ int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
* but access_ok() should be faster than find_vma()
*/
if (!fshared) {
- key->private.mm = mm;
+ /*
+ * On no-MMU, shared futexes are treated as private, therefore
+ * we must not include the current process in the key. Since
+ * there is only one address space, the address is a unique key
+ * on its own.
+ */
+ if (IS_ENABLED(CONFIG_MMU))
+ key->private.mm = mm;
+ else
+ key->private.mm = NULL;
+
key->private.address = address;
return 0;
}
@@ -273,54 +650,52 @@ again:
err = 0;
/*
- * The treatment of mapping from this point on is critical. The page
- * lock protects many things but in this context the page lock
+ * The treatment of mapping from this point on is critical. The folio
+ * lock protects many things but in this context the folio lock
* stabilizes mapping, prevents inode freeing in the shared
* file-backed region case and guards against movement to swap cache.
*
- * Strictly speaking the page lock is not needed in all cases being
- * considered here and page lock forces unnecessarily serialization
+ * Strictly speaking the folio lock is not needed in all cases being
+ * considered here and folio lock forces unnecessarily serialization.
* From this point on, mapping will be re-verified if necessary and
- * page lock will be acquired only if it is unavoidable
+ * folio lock will be acquired only if it is unavoidable
*
- * Mapping checks require the head page for any compound page so the
- * head page and mapping is looked up now. For anonymous pages, it
- * does not matter if the page splits in the future as the key is
- * based on the address. For filesystem-backed pages, the tail is
- * required as the index of the page determines the key. For
- * base pages, there is no tail page and tail == page.
+ * Mapping checks require the folio so it is looked up now. For
+ * anonymous pages, it does not matter if the folio is split
+ * in the future as the key is based on the address. For
+ * filesystem-backed pages, the precise page is required as the
+ * index of the page determines the key.
*/
- tail = page;
- page = compound_head(page);
- mapping = READ_ONCE(page->mapping);
+ folio = page_folio(page);
+ mapping = READ_ONCE(folio->mapping);
/*
- * If page->mapping is NULL, then it cannot be a PageAnon
+ * If folio->mapping is NULL, then it cannot be an anonymous
* page; but it might be the ZERO_PAGE or in the gate area or
* in a special mapping (all cases which we are happy to fail);
* or it may have been a good file page when get_user_pages_fast
* found it, but truncated or holepunched or subjected to
- * invalidate_complete_page2 before we got the page lock (also
+ * invalidate_complete_page2 before we got the folio lock (also
* cases which we are happy to fail). And we hold a reference,
- * so refcount care in invalidate_complete_page's remove_mapping
+ * so refcount care in invalidate_inode_page's remove_mapping
* prevents drop_caches from setting mapping to NULL beneath us.
*
* The case we do have to guard against is when memory pressure made
* shmem_writepage move it from filecache to swapcache beneath us:
- * an unlikely race, but we do need to retry for page->mapping.
+ * an unlikely race, but we do need to retry for folio->mapping.
*/
if (unlikely(!mapping)) {
int shmem_swizzled;
/*
- * Page lock is required to identify which special case above
- * applies. If this is really a shmem page then the page lock
+ * Folio lock is required to identify which special case above
+ * applies. If this is really a shmem page then the folio lock
* will prevent unexpected transitions.
*/
- lock_page(page);
- shmem_swizzled = PageSwapCache(page) || page->mapping;
- unlock_page(page);
- put_page(page);
+ folio_lock(folio);
+ shmem_swizzled = folio_test_swapcache(folio) || folio->mapping;
+ folio_unlock(folio);
+ folio_put(folio);
if (shmem_swizzled)
goto again;
@@ -331,14 +706,14 @@ again:
/*
* Private mappings are handled in a simple way.
*
- * If the futex key is stored on an anonymous page, then the associated
+ * If the futex key is stored in anonymous memory, then the associated
* object is the mm which is implicitly pinned by the calling process.
*
* NOTE: When userspace waits on a MAP_SHARED mapping, even if
* it's a read-only handle, it's expected that futexes attach to
* the object not the particular process.
*/
- if (PageAnon(page)) {
+ if (folio_test_anon(folio)) {
/*
* A RO anonymous page will never change and thus doesn't make
* sense for futex operations.
@@ -357,10 +732,10 @@ again:
/*
* The associated futex object in this case is the inode and
- * the page->mapping must be traversed. Ordinarily this should
- * be stabilised under page lock but it's not strictly
+ * the folio->mapping must be traversed. Ordinarily this should
+ * be stabilised under folio lock but it's not strictly
* necessary in this case as we just want to pin the inode, not
- * update the radix tree or anything like that.
+ * update i_pages or anything like that.
*
* The RCU read lock is taken as the inode is finally freed
* under RCU. If the mapping still matches expectations then the
@@ -368,9 +743,9 @@ again:
*/
rcu_read_lock();
- if (READ_ONCE(page->mapping) != mapping) {
+ if (READ_ONCE(folio->mapping) != mapping) {
rcu_read_unlock();
- put_page(page);
+ folio_put(folio);
goto again;
}
@@ -378,19 +753,19 @@ again:
inode = READ_ONCE(mapping->host);
if (!inode) {
rcu_read_unlock();
- put_page(page);
+ folio_put(folio);
goto again;
}
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
key->shared.i_seq = get_inode_sequence_number(inode);
- key->shared.pgoff = page_to_pgoff(tail);
+ key->shared.pgoff = page_pgoff(folio, page);
rcu_read_unlock();
}
out:
- put_page(page);
+ folio_put(folio);
return err;
}
@@ -437,28 +812,6 @@ struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *
return NULL;
}
-int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval)
-{
- int ret;
-
- pagefault_disable();
- ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
- pagefault_enable();
-
- return ret;
-}
-
-int futex_get_value_locked(u32 *dest, u32 __user *from)
-{
- int ret;
-
- pagefault_disable();
- ret = __get_user(*dest, from);
- pagefault_enable();
-
- return ret ? -EFAULT : 0;
-}
-
/**
* wait_for_owner_exiting - Block until the owner has exited
* @ret: owner's current futex lock status
@@ -510,13 +863,9 @@ void __futex_unqueue(struct futex_q *q)
}
/* The key must be already stored in q->key. */
-struct futex_hash_bucket *futex_q_lock(struct futex_q *q)
+void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb)
__acquires(&hb->lock)
{
- struct futex_hash_bucket *hb;
-
- hb = futex_hash(&q->key);
-
/*
* Increment the counter before taking the lock so that
* a potential waker won't miss a to-be-slept task that is
@@ -530,17 +879,17 @@ struct futex_hash_bucket *futex_q_lock(struct futex_q *q)
q->lock_ptr = &hb->lock;
spin_lock(&hb->lock);
- return hb;
}
void futex_q_unlock(struct futex_hash_bucket *hb)
__releases(&hb->lock)
{
- spin_unlock(&hb->lock);
futex_hb_waiters_dec(hb);
+ spin_unlock(&hb->lock);
}
-void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb)
+void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb,
+ struct task_struct *task)
{
int prio;
@@ -556,7 +905,7 @@ void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb)
plist_node_init(&q->list, prio);
plist_add(&q->list, &hb->chain);
- q->task = current;
+ q->task = task;
}
/**
@@ -575,6 +924,8 @@ int futex_unqueue(struct futex_q *q)
spinlock_t *lock_ptr;
int ret = 0;
+ /* RCU so lock_ptr is not going away during locking. */
+ guard(rcu)();
/* In the common case we don't take the spinlock, which is nice. */
retry:
/*
@@ -613,13 +964,40 @@ retry:
return ret;
}
+void futex_q_lockptr_lock(struct futex_q *q)
+{
+ spinlock_t *lock_ptr;
+
+ /*
+ * See futex_unqueue() why lock_ptr can change.
+ */
+ guard(rcu)();
+retry:
+ lock_ptr = READ_ONCE(q->lock_ptr);
+ spin_lock(lock_ptr);
+
+ if (unlikely(lock_ptr != q->lock_ptr)) {
+ spin_unlock(lock_ptr);
+ goto retry;
+ }
+}
+
/*
- * PI futexes can not be requeued and must remove themselves from the
- * hash bucket. The hash bucket lock (i.e. lock_ptr) is held.
+ * PI futexes can not be requeued and must remove themselves from the hash
+ * bucket. The hash bucket lock (i.e. lock_ptr) is held.
*/
void futex_unqueue_pi(struct futex_q *q)
{
- __futex_unqueue(q);
+ /*
+ * If the lock was not acquired (due to timeout or signal) then the
+ * rt_waiter is removed before futex_q is. If this is observed by
+ * an unlocker after dropping the rtmutex wait lock and before
+ * acquiring the hash bucket lock, then the unlocker dequeues the
+ * futex_q from the hash bucket list to guarantee consistent state
+ * vs. userspace. Therefore the dequeue here must be conditional.
+ */
+ if (!plist_node_empty(&q->list))
+ __futex_unqueue(q);
BUG_ON(!q->pi_state);
put_pi_state(q->pi_state);
@@ -638,6 +1016,7 @@ static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
bool pi, bool pending_op)
{
u32 uval, nval, mval;
+ pid_t owner;
int err;
/* Futex address must be 32bit aligned */
@@ -659,6 +1038,10 @@ retry:
* 2. A woken up waiter is killed before it can acquire the
* futex in user space.
*
+ * In the second case, the wake up notification could be generated
+ * by the unlock path in user space after setting the futex value
+ * to zero or by the kernel after setting the OWNER_DIED bit below.
+ *
* In both cases the TID validation below prevents a wakeup of
* potential waiters which can cause these waiters to block
* forever.
@@ -667,24 +1050,28 @@ retry:
*
* 1) task->robust_list->list_op_pending != NULL
* @pending_op == true
- * 2) User space futex value == 0
+ * 2) The owner part of user space futex value == 0
* 3) Regular futex: @pi == false
*
* If these conditions are met, it is safe to attempt waking up a
* potential waiter without touching the user space futex value and
- * trying to set the OWNER_DIED bit. The user space futex value is
- * uncontended and the rest of the user space mutex state is
- * consistent, so a woken waiter will just take over the
- * uncontended futex. Setting the OWNER_DIED bit would create
- * inconsistent state and malfunction of the user space owner died
- * handling.
+ * trying to set the OWNER_DIED bit. If the futex value is zero,
+ * the rest of the user space mutex state is consistent, so a woken
+ * waiter will just take over the uncontended futex. Setting the
+ * OWNER_DIED bit would create inconsistent state and malfunction
+ * of the user space owner died handling. Otherwise, the OWNER_DIED
+ * bit is already set, and the woken waiter is expected to deal with
+ * this.
*/
- if (pending_op && !pi && !uval) {
- futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
+ owner = uval & FUTEX_TID_MASK;
+
+ if (pending_op && !pi && !owner) {
+ futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
+ FUTEX_BITSET_MATCH_ANY);
return 0;
}
- if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
+ if (owner != task_pid_vnr(curr))
return 0;
/*
@@ -732,8 +1119,10 @@ retry:
* Wake robust non-PI futexes here. The wakeup of
* PI futexes happens in exit_pi_state():
*/
- if (!pi && (uval & FUTEX_WAITERS))
- futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
+ if (!pi && (uval & FUTEX_WAITERS)) {
+ futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
+ FUTEX_BITSET_MATCH_ANY);
+ }
return 0;
}
@@ -936,10 +1325,20 @@ static void exit_pi_state_list(struct task_struct *curr)
{
struct list_head *next, *head = &curr->pi_state_list;
struct futex_pi_state *pi_state;
- struct futex_hash_bucket *hb;
union futex_key key = FUTEX_KEY_INIT;
/*
+ * The mutex mm_struct::futex_hash_lock might be acquired.
+ */
+ might_sleep();
+ /*
+ * Ensure the hash remains stable (no resize) during the while loop
+ * below. The hb pointer is acquired under the pi_lock so we can't block
+ * on the mutex.
+ */
+ WARN_ON(curr != current);
+ guard(private_hash)();
+ /*
* We are a ZOMBIE and nobody can enqueue itself on
* pi_state_list anymore, but we have to be careful
* versus waiters unqueueing themselves:
@@ -949,50 +1348,52 @@ static void exit_pi_state_list(struct task_struct *curr)
next = head->next;
pi_state = list_entry(next, struct futex_pi_state, list);
key = pi_state->key;
- hb = futex_hash(&key);
-
- /*
- * We can race against put_pi_state() removing itself from the
- * list (a waiter going away). put_pi_state() will first
- * decrement the reference count and then modify the list, so
- * its possible to see the list entry but fail this reference
- * acquire.
- *
- * In that case; drop the locks to let put_pi_state() make
- * progress and retry the loop.
- */
- if (!refcount_inc_not_zero(&pi_state->refcount)) {
+ if (1) {
+ CLASS(hb, hb)(&key);
+
+ /*
+ * We can race against put_pi_state() removing itself from the
+ * list (a waiter going away). put_pi_state() will first
+ * decrement the reference count and then modify the list, so
+ * its possible to see the list entry but fail this reference
+ * acquire.
+ *
+ * In that case; drop the locks to let put_pi_state() make
+ * progress and retry the loop.
+ */
+ if (!refcount_inc_not_zero(&pi_state->refcount)) {
+ raw_spin_unlock_irq(&curr->pi_lock);
+ cpu_relax();
+ raw_spin_lock_irq(&curr->pi_lock);
+ continue;
+ }
raw_spin_unlock_irq(&curr->pi_lock);
- cpu_relax();
- raw_spin_lock_irq(&curr->pi_lock);
- continue;
- }
- raw_spin_unlock_irq(&curr->pi_lock);
- spin_lock(&hb->lock);
- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
- raw_spin_lock(&curr->pi_lock);
- /*
- * We dropped the pi-lock, so re-check whether this
- * task still owns the PI-state:
- */
- if (head->next != next) {
- /* retain curr->pi_lock for the loop invariant */
- raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+ spin_lock(&hb->lock);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ raw_spin_lock(&curr->pi_lock);
+ /*
+ * We dropped the pi-lock, so re-check whether this
+ * task still owns the PI-state:
+ */
+ if (head->next != next) {
+ /* retain curr->pi_lock for the loop invariant */
+ raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(&hb->lock);
+ put_pi_state(pi_state);
+ continue;
+ }
+
+ WARN_ON(pi_state->owner != curr);
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+ pi_state->owner = NULL;
+
+ raw_spin_unlock(&curr->pi_lock);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
- put_pi_state(pi_state);
- continue;
}
- WARN_ON(pi_state->owner != curr);
- WARN_ON(list_empty(&pi_state->list));
- list_del_init(&pi_state->list);
- pi_state->owner = NULL;
-
- raw_spin_unlock(&curr->pi_lock);
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- spin_unlock(&hb->lock);
-
rt_mutex_futex_unlock(&pi_state->pi_mutex);
put_pi_state(pi_state);
@@ -1112,30 +1513,500 @@ void futex_exit_release(struct task_struct *tsk)
futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
}
+static void futex_hash_bucket_init(struct futex_hash_bucket *fhb,
+ struct futex_private_hash *fph)
+{
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+ fhb->priv = fph;
+#endif
+ atomic_set(&fhb->waiters, 0);
+ plist_head_init(&fhb->chain);
+ spin_lock_init(&fhb->lock);
+}
+
+#define FH_CUSTOM 0x01
+
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+
+/*
+ * futex-ref
+ *
+ * Heavily inspired by percpu-rwsem/percpu-refcount; not reusing any of that
+ * code because it just doesn't fit right.
+ *
+ * Dual counter, per-cpu / atomic approach like percpu-refcount, except it
+ * re-initializes the state automatically, such that the fph swizzle is also a
+ * transition back to per-cpu.
+ */
+
+static void futex_ref_rcu(struct rcu_head *head);
+
+static void __futex_ref_atomic_begin(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+
+ /*
+ * The counter we're about to switch to must have fully switched;
+ * otherwise it would be impossible for it to have reported success
+ * from futex_ref_is_dead().
+ */
+ WARN_ON_ONCE(atomic_long_read(&mm->futex_atomic) != 0);
+
+ /*
+ * Set the atomic to the bias value such that futex_ref_{get,put}()
+ * will never observe 0. Will be fixed up in __futex_ref_atomic_end()
+ * when folding in the percpu count.
+ */
+ atomic_long_set(&mm->futex_atomic, LONG_MAX);
+ smp_store_release(&fph->state, FR_ATOMIC);
+
+ call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu);
+}
+
+static void __futex_ref_atomic_end(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+ unsigned int count = 0;
+ long ret;
+ int cpu;
+
+ /*
+ * Per __futex_ref_atomic_begin() the state of the fph must be ATOMIC
+ * and per this RCU callback, everybody must now observe this state and
+ * use the atomic variable.
+ */
+ WARN_ON_ONCE(fph->state != FR_ATOMIC);
+
+ /*
+ * Therefore the per-cpu counter is now stable, sum and reset.
+ */
+ for_each_possible_cpu(cpu) {
+ unsigned int *ptr = per_cpu_ptr(mm->futex_ref, cpu);
+ count += *ptr;
+ *ptr = 0;
+ }
+
+ /*
+ * Re-init for the next cycle.
+ */
+ this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */
+
+ /*
+ * Add actual count, subtract bias and initial refcount.
+ *
+ * The moment this atomic operation happens, futex_ref_is_dead() can
+ * become true.
+ */
+ ret = atomic_long_add_return(count - LONG_MAX - 1, &mm->futex_atomic);
+ if (!ret)
+ wake_up_var(mm);
+
+ WARN_ON_ONCE(ret < 0);
+ mmput_async(mm);
+}
+
+static void futex_ref_rcu(struct rcu_head *head)
+{
+ struct mm_struct *mm = container_of(head, struct mm_struct, futex_rcu);
+ struct futex_private_hash *fph = rcu_dereference_raw(mm->futex_phash);
+
+ if (fph->state == FR_PERCPU) {
+ /*
+ * Per this extra grace-period, everybody must now observe
+ * fph as the current fph and no previously observed fph's
+ * are in-flight.
+ *
+ * Notably, nobody will now rely on the atomic
+ * futex_ref_is_dead() state anymore so we can begin the
+ * migration of the per-cpu counter into the atomic.
+ */
+ __futex_ref_atomic_begin(fph);
+ return;
+ }
+
+ __futex_ref_atomic_end(fph);
+}
+
+/*
+ * Drop the initial refcount and transition to atomics.
+ */
+static void futex_ref_drop(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+
+ /*
+ * Can only transition the current fph;
+ */
+ WARN_ON_ONCE(rcu_dereference_raw(mm->futex_phash) != fph);
+ /*
+ * We enqueue at least one RCU callback. Ensure mm stays if the task
+ * exits before the transition is completed.
+ */
+ mmget(mm);
+
+ /*
+ * In order to avoid the following scenario:
+ *
+ * futex_hash() __futex_pivot_hash()
+ * guard(rcu); guard(mm->futex_hash_lock);
+ * fph = mm->futex_phash;
+ * rcu_assign_pointer(&mm->futex_phash, new);
+ * futex_hash_allocate()
+ * futex_ref_drop()
+ * fph->state = FR_ATOMIC;
+ * atomic_set(, BIAS);
+ *
+ * futex_private_hash_get(fph); // OOPS
+ *
+ * Where an old fph (which is FR_ATOMIC) and should fail on
+ * inc_not_zero, will succeed because a new transition is started and
+ * the atomic is bias'ed away from 0.
+ *
+ * There must be at least one full grace-period between publishing a
+ * new fph and trying to replace it.
+ */
+ if (poll_state_synchronize_rcu(mm->futex_batches)) {
+ /*
+ * There was a grace-period, we can begin now.
+ */
+ __futex_ref_atomic_begin(fph);
+ return;
+ }
+
+ call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu);
+}
+
+static bool futex_ref_get(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+
+ guard(preempt)();
+
+ if (READ_ONCE(fph->state) == FR_PERCPU) {
+ __this_cpu_inc(*mm->futex_ref);
+ return true;
+ }
+
+ return atomic_long_inc_not_zero(&mm->futex_atomic);
+}
+
+static bool futex_ref_put(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+
+ guard(preempt)();
+
+ if (READ_ONCE(fph->state) == FR_PERCPU) {
+ __this_cpu_dec(*mm->futex_ref);
+ return false;
+ }
+
+ return atomic_long_dec_and_test(&mm->futex_atomic);
+}
+
+static bool futex_ref_is_dead(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+
+ guard(rcu)();
+
+ if (smp_load_acquire(&fph->state) == FR_PERCPU)
+ return false;
+
+ return atomic_long_read(&mm->futex_atomic) == 0;
+}
+
+int futex_mm_init(struct mm_struct *mm)
+{
+ mutex_init(&mm->futex_hash_lock);
+ RCU_INIT_POINTER(mm->futex_phash, NULL);
+ mm->futex_phash_new = NULL;
+ /* futex-ref */
+ mm->futex_ref = NULL;
+ atomic_long_set(&mm->futex_atomic, 0);
+ mm->futex_batches = get_state_synchronize_rcu();
+ return 0;
+}
+
+void futex_hash_free(struct mm_struct *mm)
+{
+ struct futex_private_hash *fph;
+
+ free_percpu(mm->futex_ref);
+ kvfree(mm->futex_phash_new);
+ fph = rcu_dereference_raw(mm->futex_phash);
+ if (fph)
+ kvfree(fph);
+}
+
+static bool futex_pivot_pending(struct mm_struct *mm)
+{
+ struct futex_private_hash *fph;
+
+ guard(rcu)();
+
+ if (!mm->futex_phash_new)
+ return true;
+
+ fph = rcu_dereference(mm->futex_phash);
+ return futex_ref_is_dead(fph);
+}
+
+static bool futex_hash_less(struct futex_private_hash *a,
+ struct futex_private_hash *b)
+{
+ /* user provided always wins */
+ if (!a->custom && b->custom)
+ return true;
+ if (a->custom && !b->custom)
+ return false;
+
+ /* zero-sized hash wins */
+ if (!b->hash_mask)
+ return true;
+ if (!a->hash_mask)
+ return false;
+
+ /* keep the biggest */
+ if (a->hash_mask < b->hash_mask)
+ return true;
+ if (a->hash_mask > b->hash_mask)
+ return false;
+
+ return false; /* equal */
+}
+
+static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
+{
+ struct mm_struct *mm = current->mm;
+ struct futex_private_hash *fph;
+ bool custom = flags & FH_CUSTOM;
+ int i;
+
+ if (hash_slots && (hash_slots == 1 || !is_power_of_2(hash_slots)))
+ return -EINVAL;
+
+ /*
+ * Once we've disabled the global hash there is no way back.
+ */
+ scoped_guard(rcu) {
+ fph = rcu_dereference(mm->futex_phash);
+ if (fph && !fph->hash_mask) {
+ if (custom)
+ return -EBUSY;
+ return 0;
+ }
+ }
+
+ if (!mm->futex_ref) {
+ /*
+ * This will always be allocated by the first thread and
+ * therefore requires no locking.
+ */
+ mm->futex_ref = alloc_percpu(unsigned int);
+ if (!mm->futex_ref)
+ return -ENOMEM;
+ this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */
+ }
+
+ fph = kvzalloc(struct_size(fph, queues, hash_slots),
+ GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+ if (!fph)
+ return -ENOMEM;
+
+ fph->hash_mask = hash_slots ? hash_slots - 1 : 0;
+ fph->custom = custom;
+ fph->mm = mm;
+
+ for (i = 0; i < hash_slots; i++)
+ futex_hash_bucket_init(&fph->queues[i], fph);
+
+ if (custom) {
+ /*
+ * Only let prctl() wait / retry; don't unduly delay clone().
+ */
+again:
+ wait_var_event(mm, futex_pivot_pending(mm));
+ }
+
+ scoped_guard(mutex, &mm->futex_hash_lock) {
+ struct futex_private_hash *free __free(kvfree) = NULL;
+ struct futex_private_hash *cur, *new;
+
+ cur = rcu_dereference_protected(mm->futex_phash,
+ lockdep_is_held(&mm->futex_hash_lock));
+ new = mm->futex_phash_new;
+ mm->futex_phash_new = NULL;
+
+ if (fph) {
+ if (cur && !cur->hash_mask) {
+ /*
+ * If two threads simultaneously request the global
+ * hash then the first one performs the switch,
+ * the second one returns here.
+ */
+ free = fph;
+ mm->futex_phash_new = new;
+ return -EBUSY;
+ }
+ if (cur && !new) {
+ /*
+ * If we have an existing hash, but do not yet have
+ * allocated a replacement hash, drop the initial
+ * reference on the existing hash.
+ */
+ futex_ref_drop(cur);
+ }
+
+ if (new) {
+ /*
+ * Two updates raced; throw out the lesser one.
+ */
+ if (futex_hash_less(new, fph)) {
+ free = new;
+ new = fph;
+ } else {
+ free = fph;
+ }
+ } else {
+ new = fph;
+ }
+ fph = NULL;
+ }
+
+ if (new) {
+ /*
+ * Will set mm->futex_phash_new on failure;
+ * futex_private_hash_get() will try again.
+ */
+ if (!__futex_pivot_hash(mm, new) && custom)
+ goto again;
+ }
+ }
+ return 0;
+}
+
+int futex_hash_allocate_default(void)
+{
+ unsigned int threads, buckets, current_buckets = 0;
+ struct futex_private_hash *fph;
+
+ if (!current->mm)
+ return 0;
+
+ scoped_guard(rcu) {
+ threads = min_t(unsigned int,
+ get_nr_threads(current),
+ num_online_cpus());
+
+ fph = rcu_dereference(current->mm->futex_phash);
+ if (fph) {
+ if (fph->custom)
+ return 0;
+
+ current_buckets = fph->hash_mask + 1;
+ }
+ }
+
+ /*
+ * The default allocation will remain within
+ * 16 <= threads * 4 <= global hash size
+ */
+ buckets = roundup_pow_of_two(4 * threads);
+ buckets = clamp(buckets, 16, futex_hashmask + 1);
+
+ if (current_buckets >= buckets)
+ return 0;
+
+ return futex_hash_allocate(buckets, 0);
+}
+
+static int futex_hash_get_slots(void)
+{
+ struct futex_private_hash *fph;
+
+ guard(rcu)();
+ fph = rcu_dereference(current->mm->futex_phash);
+ if (fph && fph->hash_mask)
+ return fph->hash_mask + 1;
+ return 0;
+}
+
+#else
+
+static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
+{
+ return -EINVAL;
+}
+
+static int futex_hash_get_slots(void)
+{
+ return 0;
+}
+
+#endif
+
+int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4)
+{
+ unsigned int flags = FH_CUSTOM;
+ int ret;
+
+ switch (arg2) {
+ case PR_FUTEX_HASH_SET_SLOTS:
+ if (arg4)
+ return -EINVAL;
+ ret = futex_hash_allocate(arg3, flags);
+ break;
+
+ case PR_FUTEX_HASH_GET_SLOTS:
+ ret = futex_hash_get_slots();
+ break;
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ return ret;
+}
+
static int __init futex_init(void)
{
- unsigned int futex_shift;
- unsigned long i;
+ unsigned long hashsize, i;
+ unsigned int order, n;
+ unsigned long size;
-#if CONFIG_BASE_SMALL
- futex_hashsize = 16;
+#ifdef CONFIG_BASE_SMALL
+ hashsize = 16;
#else
- futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
+ hashsize = 256 * num_possible_cpus();
+ hashsize /= num_possible_nodes();
+ hashsize = max(4, hashsize);
+ hashsize = roundup_pow_of_two(hashsize);
#endif
+ futex_hashshift = ilog2(hashsize);
+ size = sizeof(struct futex_hash_bucket) * hashsize;
+ order = get_order(size);
+
+ for_each_node(n) {
+ struct futex_hash_bucket *table;
+
+ if (order > MAX_PAGE_ORDER)
+ table = vmalloc_huge_node(size, GFP_KERNEL, n);
+ else
+ table = alloc_pages_exact_nid(n, size, GFP_KERNEL);
+
+ BUG_ON(!table);
+
+ for (i = 0; i < hashsize; i++)
+ futex_hash_bucket_init(&table[i], NULL);
- futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
- futex_hashsize, 0,
- futex_hashsize < 256 ? HASH_SMALL : 0,
- &futex_shift, NULL,
- futex_hashsize, futex_hashsize);
- futex_hashsize = 1UL << futex_shift;
-
- for (i = 0; i < futex_hashsize; i++) {
- atomic_set(&futex_queues[i].waiters, 0);
- plist_head_init(&futex_queues[i].chain);
- spin_lock_init(&futex_queues[i].lock);
+ futex_queues[n] = table;
}
+ futex_hashmask = hashsize - 1;
+ pr_info("futex hash table entries: %lu (%lu bytes on %d NUMA nodes, total %lu KiB, %s).\n",
+ hashsize, size, num_possible_nodes(), size * num_possible_nodes() / 1024,
+ order > MAX_PAGE_ORDER ? "vmalloc" : "linear");
return 0;
}
core_initcall(futex_init);
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index c264cbeab71c..30c2afa03889 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -3,7 +3,11 @@
#define _FUTEX_H
#include <linux/futex.h>
+#include <linux/rtmutex.h>
#include <linux/sched/wake_q.h>
+#include <linux/compat.h>
+#include <linux/uaccess.h>
+#include <linux/cleanup.h>
#ifdef CONFIG_PREEMPT_RT
#include <linux/rcuwait.h>
@@ -15,17 +19,103 @@
* Futex flags used to encode options to functions and preserve them across
* restarts.
*/
+#define FLAGS_SIZE_8 0x0000
+#define FLAGS_SIZE_16 0x0001
+#define FLAGS_SIZE_32 0x0002
+#define FLAGS_SIZE_64 0x0003
+
+#define FLAGS_SIZE_MASK 0x0003
+
#ifdef CONFIG_MMU
-# define FLAGS_SHARED 0x01
+# define FLAGS_SHARED 0x0010
#else
/*
* NOMMU does not have per process address space. Let the compiler optimize
* code away.
*/
-# define FLAGS_SHARED 0x00
+# define FLAGS_SHARED 0x0000
#endif
-#define FLAGS_CLOCKRT 0x02
-#define FLAGS_HAS_TIMEOUT 0x04
+#define FLAGS_CLOCKRT 0x0020
+#define FLAGS_HAS_TIMEOUT 0x0040
+#define FLAGS_NUMA 0x0080
+#define FLAGS_STRICT 0x0100
+#define FLAGS_MPOL 0x0200
+
+/* FUTEX_ to FLAGS_ */
+static inline unsigned int futex_to_flags(unsigned int op)
+{
+ unsigned int flags = FLAGS_SIZE_32;
+
+ if (!(op & FUTEX_PRIVATE_FLAG))
+ flags |= FLAGS_SHARED;
+
+ if (op & FUTEX_CLOCK_REALTIME)
+ flags |= FLAGS_CLOCKRT;
+
+ return flags;
+}
+
+#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_NUMA | FUTEX2_MPOL | FUTEX2_PRIVATE)
+
+/* FUTEX2_ to FLAGS_ */
+static inline unsigned int futex2_to_flags(unsigned int flags2)
+{
+ unsigned int flags = flags2 & FUTEX2_SIZE_MASK;
+
+ if (!(flags2 & FUTEX2_PRIVATE))
+ flags |= FLAGS_SHARED;
+
+ if (flags2 & FUTEX2_NUMA)
+ flags |= FLAGS_NUMA;
+
+ if (flags2 & FUTEX2_MPOL)
+ flags |= FLAGS_MPOL;
+
+ return flags;
+}
+
+static inline unsigned int futex_size(unsigned int flags)
+{
+ return 1 << (flags & FLAGS_SIZE_MASK);
+}
+
+static inline bool futex_flags_valid(unsigned int flags)
+{
+ /* Only 64bit futexes for 64bit code */
+ if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()) {
+ if ((flags & FLAGS_SIZE_MASK) == FLAGS_SIZE_64)
+ return false;
+ }
+
+ /* Only 32bit futexes are implemented -- for now */
+ if ((flags & FLAGS_SIZE_MASK) != FLAGS_SIZE_32)
+ return false;
+
+ /*
+ * Must be able to represent both FUTEX_NO_NODE and every valid nodeid
+ * in a futex word.
+ */
+ if (flags & FLAGS_NUMA) {
+ int bits = 8 * futex_size(flags);
+ u64 max = ~0ULL;
+
+ max >>= 64 - bits;
+ if (nr_node_ids >= max)
+ return false;
+ }
+
+ return true;
+}
+
+static inline bool futex_validate_input(unsigned int flags, u64 val)
+{
+ int bits = 8 * futex_size(flags);
+
+ if (bits < 64 && (val >> bits))
+ return false;
+
+ return true;
+}
#ifdef CONFIG_FAIL_FUTEX
extern bool should_fail_futex(bool fshared);
@@ -45,6 +135,7 @@ struct futex_hash_bucket {
atomic_t waiters;
spinlock_t lock;
struct plist_head chain;
+ struct futex_private_hash *priv;
} ____cacheline_aligned_in_smp;
/*
@@ -68,17 +159,23 @@ struct futex_pi_state {
union futex_key key;
} __randomize_layout;
+struct futex_q;
+typedef void (futex_wake_fn)(struct wake_q_head *wake_q, struct futex_q *q);
+
/**
* struct futex_q - The hashed futex queue entry, one per waiting task
* @list: priority-sorted list of tasks waiting on this futex
* @task: the task waiting on the futex
* @lock_ptr: the hash bucket lock
+ * @wake: the wake handler for this queue
+ * @wake_data: data associated with the wake handler
* @key: the key the futex is hashed on
* @pi_state: optional priority inheritance state
* @rt_waiter: rt_waiter storage for use with requeue_pi
* @requeue_pi_key: the requeue_pi target futex key
* @bitset: bitset for the optional bitmasked wakeup
* @requeue_state: State field for futex_requeue_pi()
+ * @drop_hb_ref: Waiter should drop the extra hash bucket reference if true
* @requeue_wait: RCU wait for futex_requeue_pi() (RT only)
*
* We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
@@ -97,12 +194,15 @@ struct futex_q {
struct task_struct *task;
spinlock_t *lock_ptr;
+ futex_wake_fn *wake;
+ void *wake_data;
union futex_key key;
struct futex_pi_state *pi_state;
struct rt_mutex_waiter *rt_waiter;
union futex_key *requeue_pi_key;
u32 bitset;
atomic_t requeue_state;
+ bool drop_hb_ref;
#ifdef CONFIG_PREEMPT_RT
struct rcuwait requeue_wait;
#endif
@@ -115,14 +215,35 @@ enum futex_access {
FUTEX_WRITE
};
-extern int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
+extern int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
enum futex_access rw);
-
+extern void futex_q_lockptr_lock(struct futex_q *q);
extern struct hrtimer_sleeper *
futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
int flags, u64 range_ns);
extern struct futex_hash_bucket *futex_hash(union futex_key *key);
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+extern void futex_hash_get(struct futex_hash_bucket *hb);
+extern void futex_hash_put(struct futex_hash_bucket *hb);
+
+extern struct futex_private_hash *futex_private_hash(void);
+extern void futex_private_hash_put(struct futex_private_hash *fph);
+
+#else /* !CONFIG_FUTEX_PRIVATE_HASH */
+static inline void futex_hash_get(struct futex_hash_bucket *hb) { }
+static inline void futex_hash_put(struct futex_hash_bucket *hb) { }
+static inline struct futex_private_hash *futex_private_hash(void) { return NULL; }
+static inline void futex_private_hash_put(struct futex_private_hash *fph) { }
+#endif
+
+DEFINE_CLASS(hb, struct futex_hash_bucket *,
+ if (_T) futex_hash_put(_T),
+ futex_hash(key), union futex_key *key);
+
+DEFINE_CLASS(private_hash, struct futex_private_hash *,
+ if (_T) futex_private_hash_put(_T),
+ futex_private_hash(), void);
/**
* futex_match - Check whether two futex keys are equal
@@ -140,24 +261,43 @@ static inline int futex_match(union futex_key *key1, union futex_key *key2)
}
extern int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
- struct futex_q *q, struct futex_hash_bucket **hb);
-extern void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
- struct hrtimer_sleeper *timeout);
+ struct futex_q *q, union futex_key *key2,
+ struct task_struct *task);
+extern void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout);
+extern bool __futex_wake_mark(struct futex_q *q);
extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q);
extern int fault_in_user_writeable(u32 __user *uaddr);
-extern int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval);
-extern int futex_get_value_locked(u32 *dest, u32 __user *from);
extern struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key);
+static inline int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval)
+{
+ int ret;
+
+ pagefault_disable();
+ ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
+ pagefault_enable();
+
+ return ret;
+}
+
+/* Read from user memory with pagefaults disabled */
+static inline int futex_get_value_locked(u32 *dest, u32 __user *from)
+{
+ guard(pagefault)();
+ return get_user_inline(*dest, from);
+}
+
extern void __futex_unqueue(struct futex_q *q);
-extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb);
+extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb,
+ struct task_struct *task);
extern int futex_unqueue(struct futex_q *q);
/**
* futex_queue() - Enqueue the futex_q on the futex_hash_bucket
* @q: The futex_q to enqueue
* @hb: The destination hash bucket
+ * @task: Task queueing this futex
*
* The hb->lock must be held by the caller, and is released here. A call to
* futex_queue() is typically paired with exactly one call to futex_unqueue(). The
@@ -165,11 +305,14 @@ extern int futex_unqueue(struct futex_q *q);
* or nothing if the unqueue is done as part of the wake process and the unqueue
* state is implicit in the state of woken task (see futex_wait_requeue_pi() for
* an example).
+ *
+ * Note that @task may be NULL, for async usage of futexes.
*/
-static inline void futex_queue(struct futex_q *q, struct futex_hash_bucket *hb)
+static inline void futex_queue(struct futex_q *q, struct futex_hash_bucket *hb,
+ struct task_struct *task)
__releases(&hb->lock)
{
- __futex_queue(q, hb);
+ __futex_queue(q, hb, task);
spin_unlock(&hb->lock);
}
@@ -215,7 +358,7 @@ static inline int futex_hb_waiters_pending(struct futex_hash_bucket *hb)
#endif
}
-extern struct futex_hash_bucket *futex_q_lock(struct futex_q *q);
+extern void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb);
extern void futex_q_unlock(struct futex_hash_bucket *hb);
@@ -259,10 +402,14 @@ extern int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32
val, ktime_t *abs_time, u32 bitset, u32 __user
*uaddr2);
-extern int futex_requeue(u32 __user *uaddr1, unsigned int flags,
- u32 __user *uaddr2, int nr_wake, int nr_requeue,
+extern int futex_requeue(u32 __user *uaddr1, unsigned int flags1,
+ u32 __user *uaddr2, unsigned int flags2,
+ int nr_wake, int nr_requeue,
u32 *cmpval, int requeue_pi);
+extern int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
+ struct hrtimer_sleeper *to, u32 bitset);
+
extern int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
ktime_t *abs_time, u32 bitset);
@@ -278,6 +425,16 @@ struct futex_vector {
struct futex_q q;
};
+extern int futex_parse_waitv(struct futex_vector *futexv,
+ struct futex_waitv __user *uwaitv,
+ unsigned int nr_futexes, futex_wake_fn *wake,
+ void *wake_data);
+
+extern int futex_wait_multiple_setup(struct futex_vector *vs, int count,
+ int *woken);
+
+extern int futex_unqueue_multiple(struct futex_vector *v, int count);
+
extern int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
struct hrtimer_sleeper *to);
diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
index 183b28c32c83..dacb2330f1fb 100644
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include <linux/slab.h>
+#include <linux/sched/rt.h>
#include <linux/sched/task.h>
#include "futex.h"
@@ -610,29 +611,16 @@ int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
/*
* Caller must hold a reference on @pi_state.
*/
-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
+static int wake_futex_pi(u32 __user *uaddr, u32 uval,
+ struct futex_pi_state *pi_state,
+ struct rt_mutex_waiter *top_waiter)
{
- struct rt_mutex_waiter *top_waiter;
struct task_struct *new_owner;
bool postunlock = false;
DEFINE_RT_WAKE_Q(wqh);
u32 curval, newval;
int ret = 0;
- top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
- if (WARN_ON_ONCE(!top_waiter)) {
- /*
- * As per the comment in futex_unlock_pi() this should not happen.
- *
- * When this happens, give up our locks and try again, giving
- * the futex_lock_pi() instance time to complete, either by
- * waiting on the rtmutex or removing itself from the futex
- * queue.
- */
- ret = -EAGAIN;
- goto out_unlock;
- }
-
new_owner = top_waiter->task;
/*
@@ -818,7 +806,7 @@ handle_err:
break;
}
- spin_lock(q->lock_ptr);
+ futex_q_lockptr_lock(q);
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
/*
@@ -932,8 +920,8 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
struct hrtimer_sleeper timeout, *to;
struct task_struct *exiting = NULL;
struct rt_mutex_waiter rt_waiter;
- struct futex_hash_bucket *hb;
struct futex_q q = futex_q_init;
+ DEFINE_WAKE_Q(wake_q);
int res, ret;
if (!IS_ENABLED(CONFIG_FUTEX_PI))
@@ -945,132 +933,188 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
to = futex_setup_timer(time, &timeout, flags, 0);
retry:
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
+ ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE);
if (unlikely(ret != 0))
goto out;
retry_private:
- hb = futex_q_lock(&q);
+ if (1) {
+ CLASS(hb, hb)(&q.key);
- ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
- &exiting, 0);
- if (unlikely(ret)) {
- /*
- * Atomic work succeeded and we got the lock,
- * or failed. Either way, we do _not_ block.
- */
- switch (ret) {
- case 1:
- /* We got the lock. */
- ret = 0;
- goto out_unlock_put_key;
- case -EFAULT:
- goto uaddr_faulted;
- case -EBUSY:
- case -EAGAIN:
- /*
- * Two reasons for this:
- * - EBUSY: Task is exiting and we just wait for the
- * exit to complete.
- * - EAGAIN: The user space value changed.
- */
- futex_q_unlock(hb);
+ futex_q_lock(&q, hb);
+
+ ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
+ &exiting, 0);
+ if (unlikely(ret)) {
/*
- * Handle the case where the owner is in the middle of
- * exiting. Wait for the exit to complete otherwise
- * this task might loop forever, aka. live lock.
+ * Atomic work succeeded and we got the lock,
+ * or failed. Either way, we do _not_ block.
*/
- wait_for_owner_exiting(ret, exiting);
- cond_resched();
- goto retry;
- default:
- goto out_unlock_put_key;
+ switch (ret) {
+ case 1:
+ /* We got the lock. */
+ ret = 0;
+ goto out_unlock_put_key;
+ case -EFAULT:
+ goto uaddr_faulted;
+ case -EBUSY:
+ case -EAGAIN:
+ /*
+ * Two reasons for this:
+ * - EBUSY: Task is exiting and we just wait for the
+ * exit to complete.
+ * - EAGAIN: The user space value changed.
+ */
+ futex_q_unlock(hb);
+ /*
+ * Handle the case where the owner is in the middle of
+ * exiting. Wait for the exit to complete otherwise
+ * this task might loop forever, aka. live lock.
+ */
+ wait_for_owner_exiting(ret, exiting);
+ cond_resched();
+ goto retry;
+ default:
+ goto out_unlock_put_key;
+ }
}
- }
- WARN_ON(!q.pi_state);
+ WARN_ON(!q.pi_state);
- /*
- * Only actually queue now that the atomic ops are done:
- */
- __futex_queue(&q, hb);
+ /*
+ * Only actually queue now that the atomic ops are done:
+ */
+ __futex_queue(&q, hb, current);
- if (trylock) {
- ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
- /* Fixup the trylock return value: */
- ret = ret ? 0 : -EWOULDBLOCK;
- goto no_block;
- }
+ if (trylock) {
+ ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
+ /* Fixup the trylock return value: */
+ ret = ret ? 0 : -EWOULDBLOCK;
+ goto no_block;
+ }
- rt_mutex_init_waiter(&rt_waiter);
+ /*
+ * Caution; releasing @hb in-scope. The hb->lock is still locked
+ * while the reference is dropped. The reference can not be dropped
+ * after the unlock because if a user initiated resize is in progress
+ * then we might need to wake him. This can not be done after the
+ * rt_mutex_pre_schedule() invocation. The hb will remain valid because
+ * the thread, performing resize, will block on hb->lock during
+ * the requeue.
+ */
+ futex_hash_put(no_free_ptr(hb));
+ /*
+ * Must be done before we enqueue the waiter, here is unfortunately
+ * under the hb lock, but that *should* work because it does nothing.
+ */
+ rt_mutex_pre_schedule();
- /*
- * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
- * hold it while doing rt_mutex_start_proxy(), because then it will
- * include hb->lock in the blocking chain, even through we'll not in
- * fact hold it while blocking. This will lead it to report -EDEADLK
- * and BUG when futex_unlock_pi() interleaves with this.
- *
- * Therefore acquire wait_lock while holding hb->lock, but drop the
- * latter before calling __rt_mutex_start_proxy_lock(). This
- * interleaves with futex_unlock_pi() -- which does a similar lock
- * handoff -- such that the latter can observe the futex_q::pi_state
- * before __rt_mutex_start_proxy_lock() is done.
- */
- raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
- spin_unlock(q.lock_ptr);
- /*
- * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
- * such that futex_unlock_pi() is guaranteed to observe the waiter when
- * it sees the futex_q::pi_state.
- */
- ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
- raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
+ rt_mutex_init_waiter(&rt_waiter);
- if (ret) {
- if (ret == 1)
- ret = 0;
- goto cleanup;
- }
+ /*
+ * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
+ * hold it while doing rt_mutex_start_proxy(), because then it will
+ * include hb->lock in the blocking chain, even through we'll not in
+ * fact hold it while blocking. This will lead it to report -EDEADLK
+ * and BUG when futex_unlock_pi() interleaves with this.
+ *
+ * Therefore acquire wait_lock while holding hb->lock, but drop the
+ * latter before calling __rt_mutex_start_proxy_lock(). This
+ * interleaves with futex_unlock_pi() -- which does a similar lock
+ * handoff -- such that the latter can observe the futex_q::pi_state
+ * before __rt_mutex_start_proxy_lock() is done.
+ */
+ raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
+ spin_unlock(q.lock_ptr);
+ /*
+ * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
+ * such that futex_unlock_pi() is guaranteed to observe the waiter when
+ * it sees the futex_q::pi_state.
+ */
+ ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q);
+ raw_spin_unlock_irq_wake(&q.pi_state->pi_mutex.wait_lock, &wake_q);
- if (unlikely(to))
- hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
+ if (ret) {
+ if (ret == 1)
+ ret = 0;
+ goto cleanup;
+ }
+
+ if (unlikely(to))
+ hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
- ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
+ ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
cleanup:
- spin_lock(q.lock_ptr);
- /*
- * If we failed to acquire the lock (deadlock/signal/timeout), we must
- * first acquire the hb->lock before removing the lock from the
- * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
- * lists consistent.
- *
- * In particular; it is important that futex_unlock_pi() can not
- * observe this inconsistency.
- */
- if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
- ret = 0;
+ /*
+ * If we failed to acquire the lock (deadlock/signal/timeout), we must
+ * unwind the above, however we canont lock hb->lock because
+ * rt_mutex already has a waiter enqueued and hb->lock can itself try
+ * and enqueue an rt_waiter through rtlock.
+ *
+ * Doing the cleanup without holding hb->lock can cause inconsistent
+ * state between hb and pi_state, but only in the direction of not
+ * seeing a waiter that is leaving.
+ *
+ * See futex_unlock_pi(), it deals with this inconsistency.
+ *
+ * There be dragons here, since we must deal with the inconsistency on
+ * the way out (here), it is impossible to detect/warn about the race
+ * the other way around (missing an incoming waiter).
+ *
+ * What could possibly go wrong...
+ */
+ if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
+ ret = 0;
+ /*
+ * Now that the rt_waiter has been dequeued, it is safe to use
+ * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up
+ * the
+ */
+ futex_q_lockptr_lock(&q);
+ /*
+ * Waiter is unqueued.
+ */
+ rt_mutex_post_schedule();
no_block:
- /*
- * Fixup the pi_state owner and possibly acquire the lock if we
- * haven't already.
- */
- res = fixup_pi_owner(uaddr, &q, !ret);
- /*
- * If fixup_pi_owner() returned an error, propagate that. If it acquired
- * the lock, clear our -ETIMEDOUT or -EINTR.
- */
- if (res)
- ret = (res < 0) ? res : 0;
-
- futex_unqueue_pi(&q);
- spin_unlock(q.lock_ptr);
- goto out;
+ /*
+ * Fixup the pi_state owner and possibly acquire the lock if we
+ * haven't already.
+ */
+ res = fixup_pi_owner(uaddr, &q, !ret);
+ /*
+ * If fixup_pi_owner() returned an error, propagate that. If it acquired
+ * the lock, clear our -ETIMEDOUT or -EINTR.
+ */
+ if (res)
+ ret = (res < 0) ? res : 0;
+
+ futex_unqueue_pi(&q);
+ spin_unlock(q.lock_ptr);
+ if (q.drop_hb_ref) {
+ CLASS(hb, hb)(&q.key);
+ /* Additional reference from futex_unlock_pi() */
+ futex_hash_put(hb);
+ }
+ goto out;
out_unlock_put_key:
- futex_q_unlock(hb);
+ futex_q_unlock(hb);
+ goto out;
+
+uaddr_faulted:
+ futex_q_unlock(hb);
+
+ ret = fault_in_user_writeable(uaddr);
+ if (ret)
+ goto out;
+
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
+
+ goto retry;
+ }
out:
if (to) {
@@ -1078,18 +1122,6 @@ out:
destroy_hrtimer_on_stack(&to->timer);
}
return ret != -EINTR ? ret : -ERESTARTNOINTR;
-
-uaddr_faulted:
- futex_q_unlock(hb);
-
- ret = fault_in_user_writeable(uaddr);
- if (ret)
- goto out;
-
- if (!(flags & FLAGS_SHARED))
- goto retry_private;
-
- goto retry;
}
/*
@@ -1101,7 +1133,6 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
u32 curval, uval, vpid = task_pid_vnr(current);
union futex_key key = FUTEX_KEY_INIT;
- struct futex_hash_bucket *hb;
struct futex_q *top_waiter;
int ret;
@@ -1117,12 +1148,13 @@ retry:
if ((uval & FUTEX_TID_MASK) != vpid)
return -EPERM;
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
+ ret = get_futex_key(uaddr, flags, &key, FUTEX_WRITE);
if (ret)
return ret;
- hb = futex_hash(&key);
+ CLASS(hb, hb)(&key);
spin_lock(&hb->lock);
+retry_hb:
/*
* Check waiters first. We do not trust user space values at
@@ -1132,6 +1164,7 @@ retry:
top_waiter = futex_top_waiter(hb, &key);
if (top_waiter) {
struct futex_pi_state *pi_state = top_waiter->pi_state;
+ struct rt_mutex_waiter *rt_waiter;
ret = -EINVAL;
if (!pi_state)
@@ -1144,22 +1177,50 @@ retry:
if (pi_state->owner != current)
goto out_unlock;
- get_pi_state(pi_state);
/*
* By taking wait_lock while still holding hb->lock, we ensure
- * there is no point where we hold neither; and therefore
- * wake_futex_p() must observe a state consistent with what we
- * observed.
+ * there is no point where we hold neither; and thereby
+ * wake_futex_pi() must observe any new waiters.
+ *
+ * Since the cleanup: case in futex_lock_pi() removes the
+ * rt_waiter without holding hb->lock, it is possible for
+ * wake_futex_pi() to not find a waiter while the above does,
+ * in this case the waiter is on the way out and it can be
+ * ignored.
*
* In particular; this forces __rt_mutex_start_proxy() to
* complete such that we're guaranteed to observe the
- * rt_waiter. Also see the WARN in wake_futex_pi().
+ * rt_waiter.
*/
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+ /*
+ * Futex vs rt_mutex waiter state -- if there are no rt_mutex
+ * waiters even though futex thinks there are, then the waiter
+ * is leaving. The entry needs to be removed from the list so a
+ * new futex_lock_pi() is not using this stale PI-state while
+ * the futex is available in user space again.
+ * There can be more than one task on its way out so it needs
+ * to retry.
+ */
+ rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
+ if (!rt_waiter) {
+ /*
+ * Acquire a reference for the leaving waiter to ensure
+ * valid futex_q::lock_ptr.
+ */
+ futex_hash_get(hb);
+ top_waiter->drop_hb_ref = true;
+ __futex_unqueue(top_waiter);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ goto retry_hb;
+ }
+
+ get_pi_state(pi_state);
spin_unlock(&hb->lock);
/* drops pi_state->pi_mutex.wait_lock */
- ret = wake_futex_pi(uaddr, uval, pi_state);
+ ret = wake_futex_pi(uaddr, uval, pi_state, rt_waiter);
put_pi_state(pi_state);
diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
index cba8b1a6a4cc..d818b4d47f1b 100644
--- a/kernel/futex/requeue.c
+++ b/kernel/futex/requeue.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/plist.h>
#include <linux/sched/signal.h>
#include "futex.h"
@@ -58,6 +59,7 @@ enum {
const struct futex_q futex_q_init = {
/* list gets initialized in futex_queue()*/
+ .wake = futex_wake_mark,
.key = FUTEX_KEY_INIT,
.bitset = FUTEX_BITSET_MATCH_ANY,
.requeue_state = ATOMIC_INIT(Q_REQUEUE_PI_NONE),
@@ -85,6 +87,11 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
futex_hb_waiters_inc(hb2);
plist_add(&q->list, &hb2->chain);
q->lock_ptr = &hb2->lock;
+ /*
+ * hb1 and hb2 belong to the same futex_hash_bucket_private
+ * because if we managed get a reference on hb1 then it can't be
+ * replaced. Therefore we avoid put(hb1)+get(hb2) here.
+ */
}
q->key = *key2;
}
@@ -223,18 +230,25 @@ static inline
void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
struct futex_hash_bucket *hb)
{
- q->key = *key;
+ struct task_struct *task;
+ q->key = *key;
__futex_unqueue(q);
WARN_ON(!q->rt_waiter);
q->rt_waiter = NULL;
-
+ /*
+ * Acquire a reference for the waiter to ensure valid
+ * futex_q::lock_ptr.
+ */
+ futex_hash_get(hb);
+ q->drop_hb_ref = true;
q->lock_ptr = &hb->lock;
+ task = READ_ONCE(q->task);
/* Signal locked state to the waiter */
futex_requeue_pi_complete(q, 1);
- wake_up_state(q->task, TASK_NORMAL);
+ wake_up_state(task, TASK_NORMAL);
}
/**
@@ -269,7 +283,7 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
union futex_key *key2, struct futex_pi_state **ps,
struct task_struct **exiting, int set_waiters)
{
- struct futex_q *top_waiter = NULL;
+ struct futex_q *top_waiter;
u32 curval;
int ret;
@@ -346,8 +360,9 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
/**
* futex_requeue() - Requeue waiters from uaddr1 to uaddr2
* @uaddr1: source futex user address
- * @flags: futex flags (FLAGS_SHARED, etc.)
+ * @flags1: futex flags (FLAGS_SHARED, etc.)
* @uaddr2: target futex user address
+ * @flags2: futex flags (FLAGS_SHARED, etc.)
* @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
* @nr_requeue: number of waiters to requeue (0-INT_MAX)
* @cmpval: @uaddr1 expected value (or %NULL)
@@ -361,13 +376,13 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
* - >=0 - on success, the number of tasks requeued or woken;
* - <0 - on error
*/
-int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
+int futex_requeue(u32 __user *uaddr1, unsigned int flags1,
+ u32 __user *uaddr2, unsigned int flags2,
int nr_wake, int nr_requeue, u32 *cmpval, int requeue_pi)
{
union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
int task_count = 0, ret;
struct futex_pi_state *pi_state = NULL;
- struct futex_hash_bucket *hb1, *hb2;
struct futex_q *this, *next;
DEFINE_WAKE_Q(wake_q);
@@ -424,10 +439,10 @@ int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
}
retry:
- ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
+ ret = get_futex_key(uaddr1, flags1, &key1, FUTEX_READ);
if (unlikely(ret != 0))
return ret;
- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
+ ret = get_futex_key(uaddr2, flags2, &key2,
requeue_pi ? FUTEX_WRITE : FUTEX_READ);
if (unlikely(ret != 0))
return ret;
@@ -439,240 +454,242 @@ retry:
if (requeue_pi && futex_match(&key1, &key2))
return -EINVAL;
- hb1 = futex_hash(&key1);
- hb2 = futex_hash(&key2);
-
retry_private:
- futex_hb_waiters_inc(hb2);
- double_lock_hb(hb1, hb2);
+ if (1) {
+ CLASS(hb, hb1)(&key1);
+ CLASS(hb, hb2)(&key2);
- if (likely(cmpval != NULL)) {
- u32 curval;
+ futex_hb_waiters_inc(hb2);
+ double_lock_hb(hb1, hb2);
- ret = futex_get_value_locked(&curval, uaddr1);
+ if (likely(cmpval != NULL)) {
+ u32 curval;
- if (unlikely(ret)) {
- double_unlock_hb(hb1, hb2);
- futex_hb_waiters_dec(hb2);
+ ret = futex_get_value_locked(&curval, uaddr1);
- ret = get_user(curval, uaddr1);
- if (ret)
- return ret;
+ if (unlikely(ret)) {
+ futex_hb_waiters_dec(hb2);
+ double_unlock_hb(hb1, hb2);
- if (!(flags & FLAGS_SHARED))
- goto retry_private;
+ ret = get_user(curval, uaddr1);
+ if (ret)
+ return ret;
- goto retry;
- }
- if (curval != *cmpval) {
- ret = -EAGAIN;
- goto out_unlock;
- }
- }
+ if (!(flags1 & FLAGS_SHARED))
+ goto retry_private;
- if (requeue_pi) {
- struct task_struct *exiting = NULL;
+ goto retry;
+ }
+ if (curval != *cmpval) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+ }
- /*
- * Attempt to acquire uaddr2 and wake the top waiter. If we
- * intend to requeue waiters, force setting the FUTEX_WAITERS
- * bit. We force this here where we are able to easily handle
- * faults rather in the requeue loop below.
- *
- * Updates topwaiter::requeue_state if a top waiter exists.
- */
- ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
- &key2, &pi_state,
- &exiting, nr_requeue);
+ if (requeue_pi) {
+ struct task_struct *exiting = NULL;
- /*
- * At this point the top_waiter has either taken uaddr2 or
- * is waiting on it. In both cases pi_state has been
- * established and an initial refcount on it. In case of an
- * error there's nothing.
- *
- * The top waiter's requeue_state is up to date:
- *
- * - If the lock was acquired atomically (ret == 1), then
- * the state is Q_REQUEUE_PI_LOCKED.
- *
- * The top waiter has been dequeued and woken up and can
- * return to user space immediately. The kernel/user
- * space state is consistent. In case that there must be
- * more waiters requeued the WAITERS bit in the user
- * space futex is set so the top waiter task has to go
- * into the syscall slowpath to unlock the futex. This
- * will block until this requeue operation has been
- * completed and the hash bucket locks have been
- * dropped.
- *
- * - If the trylock failed with an error (ret < 0) then
- * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing
- * happened", or Q_REQUEUE_PI_IGNORE when there was an
- * interleaved early wakeup.
- *
- * - If the trylock did not succeed (ret == 0) then the
- * state is either Q_REQUEUE_PI_IN_PROGRESS or
- * Q_REQUEUE_PI_WAIT if an early wakeup interleaved.
- * This will be cleaned up in the loop below, which
- * cannot fail because futex_proxy_trylock_atomic() did
- * the same sanity checks for requeue_pi as the loop
- * below does.
- */
- switch (ret) {
- case 0:
- /* We hold a reference on the pi state. */
- break;
-
- case 1:
/*
- * futex_proxy_trylock_atomic() acquired the user space
- * futex. Adjust task_count.
+ * Attempt to acquire uaddr2 and wake the top waiter. If we
+ * intend to requeue waiters, force setting the FUTEX_WAITERS
+ * bit. We force this here where we are able to easily handle
+ * faults rather in the requeue loop below.
+ *
+ * Updates topwaiter::requeue_state if a top waiter exists.
*/
- task_count++;
- ret = 0;
- break;
+ ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
+ &key2, &pi_state,
+ &exiting, nr_requeue);
- /*
- * If the above failed, then pi_state is NULL and
- * waiter::requeue_state is correct.
- */
- case -EFAULT:
- double_unlock_hb(hb1, hb2);
- futex_hb_waiters_dec(hb2);
- ret = fault_in_user_writeable(uaddr2);
- if (!ret)
- goto retry;
- return ret;
- case -EBUSY:
- case -EAGAIN:
/*
- * Two reasons for this:
- * - EBUSY: Owner is exiting and we just wait for the
- * exit to complete.
- * - EAGAIN: The user space value changed.
+ * At this point the top_waiter has either taken uaddr2 or
+ * is waiting on it. In both cases pi_state has been
+ * established and an initial refcount on it. In case of an
+ * error there's nothing.
+ *
+ * The top waiter's requeue_state is up to date:
+ *
+ * - If the lock was acquired atomically (ret == 1), then
+ * the state is Q_REQUEUE_PI_LOCKED.
+ *
+ * The top waiter has been dequeued and woken up and can
+ * return to user space immediately. The kernel/user
+ * space state is consistent. In case that there must be
+ * more waiters requeued the WAITERS bit in the user
+ * space futex is set so the top waiter task has to go
+ * into the syscall slowpath to unlock the futex. This
+ * will block until this requeue operation has been
+ * completed and the hash bucket locks have been
+ * dropped.
+ *
+ * - If the trylock failed with an error (ret < 0) then
+ * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing
+ * happened", or Q_REQUEUE_PI_IGNORE when there was an
+ * interleaved early wakeup.
+ *
+ * - If the trylock did not succeed (ret == 0) then the
+ * state is either Q_REQUEUE_PI_IN_PROGRESS or
+ * Q_REQUEUE_PI_WAIT if an early wakeup interleaved.
+ * This will be cleaned up in the loop below, which
+ * cannot fail because futex_proxy_trylock_atomic() did
+ * the same sanity checks for requeue_pi as the loop
+ * below does.
*/
- double_unlock_hb(hb1, hb2);
- futex_hb_waiters_dec(hb2);
- /*
- * Handle the case where the owner is in the middle of
- * exiting. Wait for the exit to complete otherwise
- * this task might loop forever, aka. live lock.
- */
- wait_for_owner_exiting(ret, exiting);
- cond_resched();
- goto retry;
- default:
- goto out_unlock;
- }
- }
-
- plist_for_each_entry_safe(this, next, &hb1->chain, list) {
- if (task_count - nr_wake >= nr_requeue)
- break;
-
- if (!futex_match(&this->key, &key1))
- continue;
-
- /*
- * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
- * be paired with each other and no other futex ops.
- *
- * We should never be requeueing a futex_q with a pi_state,
- * which is awaiting a futex_unlock_pi().
- */
- if ((requeue_pi && !this->rt_waiter) ||
- (!requeue_pi && this->rt_waiter) ||
- this->pi_state) {
- ret = -EINVAL;
- break;
- }
-
- /* Plain futexes just wake or requeue and are done */
- if (!requeue_pi) {
- if (++task_count <= nr_wake)
- futex_wake_mark(&wake_q, this);
- else
- requeue_futex(this, hb1, hb2, &key2);
- continue;
+ switch (ret) {
+ case 0:
+ /* We hold a reference on the pi state. */
+ break;
+
+ case 1:
+ /*
+ * futex_proxy_trylock_atomic() acquired the user space
+ * futex. Adjust task_count.
+ */
+ task_count++;
+ ret = 0;
+ break;
+
+ /*
+ * If the above failed, then pi_state is NULL and
+ * waiter::requeue_state is correct.
+ */
+ case -EFAULT:
+ futex_hb_waiters_dec(hb2);
+ double_unlock_hb(hb1, hb2);
+ ret = fault_in_user_writeable(uaddr2);
+ if (!ret)
+ goto retry;
+ return ret;
+ case -EBUSY:
+ case -EAGAIN:
+ /*
+ * Two reasons for this:
+ * - EBUSY: Owner is exiting and we just wait for the
+ * exit to complete.
+ * - EAGAIN: The user space value changed.
+ */
+ futex_hb_waiters_dec(hb2);
+ double_unlock_hb(hb1, hb2);
+ /*
+ * Handle the case where the owner is in the middle of
+ * exiting. Wait for the exit to complete otherwise
+ * this task might loop forever, aka. live lock.
+ */
+ wait_for_owner_exiting(ret, exiting);
+ cond_resched();
+ goto retry;
+ default:
+ goto out_unlock;
+ }
}
- /* Ensure we requeue to the expected futex for requeue_pi. */
- if (!futex_match(this->requeue_pi_key, &key2)) {
- ret = -EINVAL;
- break;
- }
+ plist_for_each_entry_safe(this, next, &hb1->chain, list) {
+ if (task_count - nr_wake >= nr_requeue)
+ break;
- /*
- * Requeue nr_requeue waiters and possibly one more in the case
- * of requeue_pi if we couldn't acquire the lock atomically.
- *
- * Prepare the waiter to take the rt_mutex. Take a refcount
- * on the pi_state and store the pointer in the futex_q
- * object of the waiter.
- */
- get_pi_state(pi_state);
+ if (!futex_match(&this->key, &key1))
+ continue;
- /* Don't requeue when the waiter is already on the way out. */
- if (!futex_requeue_pi_prepare(this, pi_state)) {
/*
- * Early woken waiter signaled that it is on the
- * way out. Drop the pi_state reference and try the
- * next waiter. @this->pi_state is still NULL.
+ * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
+ * be paired with each other and no other futex ops.
+ *
+ * We should never be requeueing a futex_q with a pi_state,
+ * which is awaiting a futex_unlock_pi().
*/
- put_pi_state(pi_state);
- continue;
- }
+ if ((requeue_pi && !this->rt_waiter) ||
+ (!requeue_pi && this->rt_waiter) ||
+ this->pi_state) {
+ ret = -EINVAL;
+ break;
+ }
+
+ /* Plain futexes just wake or requeue and are done */
+ if (!requeue_pi) {
+ if (++task_count <= nr_wake)
+ this->wake(&wake_q, this);
+ else
+ requeue_futex(this, hb1, hb2, &key2);
+ continue;
+ }
+
+ /* Ensure we requeue to the expected futex for requeue_pi. */
+ if (!futex_match(this->requeue_pi_key, &key2)) {
+ ret = -EINVAL;
+ break;
+ }
- ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
- this->rt_waiter,
- this->task);
-
- if (ret == 1) {
- /*
- * We got the lock. We do neither drop the refcount
- * on pi_state nor clear this->pi_state because the
- * waiter needs the pi_state for cleaning up the
- * user space value. It will drop the refcount
- * after doing so. this::requeue_state is updated
- * in the wakeup as well.
- */
- requeue_pi_wake_futex(this, &key2, hb2);
- task_count++;
- } else if (!ret) {
- /* Waiter is queued, move it to hb2 */
- requeue_futex(this, hb1, hb2, &key2);
- futex_requeue_pi_complete(this, 0);
- task_count++;
- } else {
- /*
- * rt_mutex_start_proxy_lock() detected a potential
- * deadlock when we tried to queue that waiter.
- * Drop the pi_state reference which we took above
- * and remove the pointer to the state from the
- * waiters futex_q object.
- */
- this->pi_state = NULL;
- put_pi_state(pi_state);
- futex_requeue_pi_complete(this, ret);
/*
- * We stop queueing more waiters and let user space
- * deal with the mess.
+ * Requeue nr_requeue waiters and possibly one more in the case
+ * of requeue_pi if we couldn't acquire the lock atomically.
+ *
+ * Prepare the waiter to take the rt_mutex. Take a refcount
+ * on the pi_state and store the pointer in the futex_q
+ * object of the waiter.
*/
- break;
+ get_pi_state(pi_state);
+
+ /* Don't requeue when the waiter is already on the way out. */
+ if (!futex_requeue_pi_prepare(this, pi_state)) {
+ /*
+ * Early woken waiter signaled that it is on the
+ * way out. Drop the pi_state reference and try the
+ * next waiter. @this->pi_state is still NULL.
+ */
+ put_pi_state(pi_state);
+ continue;
+ }
+
+ ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
+ this->rt_waiter,
+ this->task);
+
+ if (ret == 1) {
+ /*
+ * We got the lock. We do neither drop the refcount
+ * on pi_state nor clear this->pi_state because the
+ * waiter needs the pi_state for cleaning up the
+ * user space value. It will drop the refcount
+ * after doing so. this::requeue_state is updated
+ * in the wakeup as well.
+ */
+ requeue_pi_wake_futex(this, &key2, hb2);
+ task_count++;
+ } else if (!ret) {
+ /* Waiter is queued, move it to hb2 */
+ requeue_futex(this, hb1, hb2, &key2);
+ futex_requeue_pi_complete(this, 0);
+ task_count++;
+ } else {
+ /*
+ * rt_mutex_start_proxy_lock() detected a potential
+ * deadlock when we tried to queue that waiter.
+ * Drop the pi_state reference which we took above
+ * and remove the pointer to the state from the
+ * waiters futex_q object.
+ */
+ this->pi_state = NULL;
+ put_pi_state(pi_state);
+ futex_requeue_pi_complete(this, ret);
+ /*
+ * We stop queueing more waiters and let user space
+ * deal with the mess.
+ */
+ break;
+ }
}
- }
- /*
- * We took an extra initial reference to the pi_state in
- * futex_proxy_trylock_atomic(). We need to drop it here again.
- */
- put_pi_state(pi_state);
+ /*
+ * We took an extra initial reference to the pi_state in
+ * futex_proxy_trylock_atomic(). We need to drop it here again.
+ */
+ put_pi_state(pi_state);
out_unlock:
- double_unlock_hb(hb1, hb2);
+ futex_hb_waiters_dec(hb2);
+ double_unlock_hb(hb1, hb2);
+ }
wake_up_q(&wake_q);
- futex_hb_waiters_dec(hb2);
return ret ? ret : task_count;
}
@@ -765,7 +782,6 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
{
struct hrtimer_sleeper timeout, *to;
struct rt_mutex_waiter rt_waiter;
- struct futex_hash_bucket *hb;
union futex_key key2 = FUTEX_KEY_INIT;
struct futex_q q = futex_q_init;
struct rt_mutex_base *pi_mutex;
@@ -789,7 +805,7 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
*/
rt_mutex_init_waiter(&rt_waiter);
- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
+ ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE);
if (unlikely(ret != 0))
goto out;
@@ -801,35 +817,28 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* Prepare to wait on uaddr. On success, it holds hb->lock and q
* is initialized.
*/
- ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
+ ret = futex_wait_setup(uaddr, val, flags, &q, &key2, current);
if (ret)
goto out;
- /*
- * The check above which compares uaddrs is not sufficient for
- * shared futexes. We need to compare the keys:
- */
- if (futex_match(&q.key, &key2)) {
- futex_q_unlock(hb);
- ret = -EINVAL;
- goto out;
- }
-
/* Queue the futex_q, drop the hb lock, wait for wakeup. */
- futex_wait_queue(hb, &q, to);
+ futex_do_wait(&q, to);
switch (futex_requeue_pi_wakeup_sync(&q)) {
case Q_REQUEUE_PI_IGNORE:
- /* The waiter is still on uaddr1 */
- spin_lock(&hb->lock);
- ret = handle_early_requeue_pi_wakeup(hb, &q, to);
- spin_unlock(&hb->lock);
+ {
+ CLASS(hb, hb)(&q.key);
+ /* The waiter is still on uaddr1 */
+ spin_lock(&hb->lock);
+ ret = handle_early_requeue_pi_wakeup(hb, &q, to);
+ spin_unlock(&hb->lock);
+ }
break;
case Q_REQUEUE_PI_LOCKED:
/* The requeue acquired the lock */
if (q.pi_state && (q.pi_state->owner != current)) {
- spin_lock(q.lock_ptr);
+ futex_q_lockptr_lock(&q);
ret = fixup_pi_owner(uaddr2, &q, true);
/*
* Drop the reference to the pi state which the
@@ -850,11 +859,13 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
pi_mutex = &q.pi_state->pi_mutex;
ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
- /* Current is not longer pi_blocked_on */
- spin_lock(q.lock_ptr);
+ /*
+ * See futex_unlock_pi()'s cleanup: comment.
+ */
if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
ret = 0;
+ futex_q_lockptr_lock(&q);
debug_rt_mutex_free_waiter(&rt_waiter);
/*
* Fixup the pi_state owner and possibly acquire the lock if we
@@ -886,6 +897,11 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
default:
BUG();
}
+ if (q.drop_hb_ref) {
+ CLASS(hb, hb)(&q.key);
+ /* Additional reference from requeue_pi_wake_futex() */
+ futex_hash_put(hb);
+ }
out:
if (to) {
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index 086a22d1adb7..880c9bf2f315 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-#include <linux/compat.h>
#include <linux/syscalls.h>
#include <linux/time_namespace.h>
@@ -40,6 +39,56 @@ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
return 0;
}
+static inline void __user *futex_task_robust_list(struct task_struct *p, bool compat)
+{
+#ifdef CONFIG_COMPAT
+ if (compat)
+ return p->compat_robust_list;
+#endif
+ return p->robust_list;
+}
+
+static void __user *futex_get_robust_list_common(int pid, bool compat)
+{
+ struct task_struct *p = current;
+ void __user *head;
+ int ret;
+
+ scoped_guard(rcu) {
+ if (pid) {
+ p = find_task_by_vpid(pid);
+ if (!p)
+ return (void __user *)ERR_PTR(-ESRCH);
+ }
+ get_task_struct(p);
+ }
+
+ /*
+ * Hold exec_update_lock to serialize with concurrent exec()
+ * so ptrace_may_access() is checked against stable credentials
+ */
+ ret = down_read_killable(&p->signal->exec_update_lock);
+ if (ret)
+ goto err_put;
+
+ ret = -EPERM;
+ if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
+ goto err_unlock;
+
+ head = futex_task_robust_list(p, compat);
+
+ up_read(&p->signal->exec_update_lock);
+ put_task_struct(p);
+
+ return head;
+
+err_unlock:
+ up_read(&p->signal->exec_update_lock);
+err_put:
+ put_task_struct(p);
+ return (void __user *)ERR_PTR(ret);
+}
+
/**
* sys_get_robust_list() - Get the robust-futex list head of a task
* @pid: pid of the process [zero for current task]
@@ -50,50 +99,25 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
struct robust_list_head __user * __user *, head_ptr,
size_t __user *, len_ptr)
{
- struct robust_list_head __user *head;
- unsigned long ret;
- struct task_struct *p;
-
- rcu_read_lock();
-
- ret = -ESRCH;
- if (!pid)
- p = current;
- else {
- p = find_task_by_vpid(pid);
- if (!p)
- goto err_unlock;
- }
-
- ret = -EPERM;
- if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
- goto err_unlock;
+ struct robust_list_head __user *head = futex_get_robust_list_common(pid, false);
- head = p->robust_list;
- rcu_read_unlock();
+ if (IS_ERR(head))
+ return PTR_ERR(head);
if (put_user(sizeof(*head), len_ptr))
return -EFAULT;
return put_user(head, head_ptr);
-
-err_unlock:
- rcu_read_unlock();
-
- return ret;
}
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
u32 __user *uaddr2, u32 val2, u32 val3)
{
+ unsigned int flags = futex_to_flags(op);
int cmd = op & FUTEX_CMD_MASK;
- unsigned int flags = 0;
-
- if (!(op & FUTEX_PRIVATE_FLAG))
- flags |= FLAGS_SHARED;
- if (op & FUTEX_CLOCK_REALTIME) {
- flags |= FLAGS_CLOCKRT;
- if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI &&
+ if (flags & FLAGS_CLOCKRT) {
+ if (cmd != FUTEX_WAIT_BITSET &&
+ cmd != FUTEX_WAIT_REQUEUE_PI &&
cmd != FUTEX_LOCK_PI2)
return -ENOSYS;
}
@@ -110,9 +134,9 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
case FUTEX_WAKE_BITSET:
return futex_wake(uaddr, flags, val, val3);
case FUTEX_REQUEUE:
- return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
+ return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, NULL, 0);
case FUTEX_CMP_REQUEUE:
- return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
+ return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 0);
case FUTEX_WAKE_OP:
return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
case FUTEX_LOCK_PI:
@@ -129,7 +153,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
uaddr2);
case FUTEX_CMP_REQUEUE_PI:
- return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
+ return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 1);
}
return -ENOSYS;
}
@@ -183,43 +207,91 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
}
-/* Mask of available flags for each futex in futex_waitv list */
-#define FUTEXV_WAITER_MASK (FUTEX_32 | FUTEX_PRIVATE_FLAG)
-
/**
* futex_parse_waitv - Parse a waitv array from userspace
* @futexv: Kernel side list of waiters to be filled
* @uwaitv: Userspace list to be parsed
* @nr_futexes: Length of futexv
+ * @wake: Wake to call when futex is woken
+ * @wake_data: Data for the wake handler
*
* Return: Error code on failure, 0 on success
*/
-static int futex_parse_waitv(struct futex_vector *futexv,
- struct futex_waitv __user *uwaitv,
- unsigned int nr_futexes)
+int futex_parse_waitv(struct futex_vector *futexv,
+ struct futex_waitv __user *uwaitv,
+ unsigned int nr_futexes, futex_wake_fn *wake,
+ void *wake_data)
{
struct futex_waitv aux;
unsigned int i;
for (i = 0; i < nr_futexes; i++) {
+ unsigned int flags;
+
if (copy_from_user(&aux, &uwaitv[i], sizeof(aux)))
return -EFAULT;
- if ((aux.flags & ~FUTEXV_WAITER_MASK) || aux.__reserved)
+ if ((aux.flags & ~FUTEX2_VALID_MASK) || aux.__reserved)
+ return -EINVAL;
+
+ flags = futex2_to_flags(aux.flags);
+ if (!futex_flags_valid(flags))
return -EINVAL;
- if (!(aux.flags & FUTEX_32))
+ if (!futex_validate_input(flags, aux.val))
return -EINVAL;
- futexv[i].w.flags = aux.flags;
+ futexv[i].w.flags = flags;
futexv[i].w.val = aux.val;
futexv[i].w.uaddr = aux.uaddr;
futexv[i].q = futex_q_init;
+ futexv[i].q.wake = wake;
+ futexv[i].q.wake_data = wake_data;
+ }
+
+ return 0;
+}
+
+static int futex2_setup_timeout(struct __kernel_timespec __user *timeout,
+ clockid_t clockid, struct hrtimer_sleeper *to)
+{
+ int flag_clkid = 0, flag_init = 0;
+ struct timespec64 ts;
+ ktime_t time;
+ int ret;
+
+ if (!timeout)
+ return 0;
+
+ if (clockid == CLOCK_REALTIME) {
+ flag_clkid = FLAGS_CLOCKRT;
+ flag_init = FUTEX_CLOCK_REALTIME;
}
+ if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
+ return -EINVAL;
+
+ if (get_timespec64(&ts, timeout))
+ return -EFAULT;
+
+ /*
+ * Since there's no opcode for futex_waitv, use
+ * FUTEX_WAIT_BITSET that uses absolute timeout as well
+ */
+ ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time);
+ if (ret)
+ return ret;
+
+ futex_setup_timer(&time, to, flag_clkid, 0);
return 0;
}
+static inline void futex2_destroy_timeout(struct hrtimer_sleeper *to)
+{
+ hrtimer_cancel(&to->timer);
+ destroy_hrtimer_on_stack(&to->timer);
+}
+
/**
* sys_futex_waitv - Wait on a list of futexes
* @waiters: List of futexes to wait on
@@ -249,8 +321,6 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
{
struct hrtimer_sleeper to;
struct futex_vector *futexv;
- struct timespec64 ts;
- ktime_t time;
int ret;
/* This syscall supports no flags for now */
@@ -260,48 +330,142 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters)
return -EINVAL;
- if (timeout) {
- int flag_clkid = 0, flag_init = 0;
+ if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to)))
+ return ret;
- if (clockid == CLOCK_REALTIME) {
- flag_clkid = FLAGS_CLOCKRT;
- flag_init = FUTEX_CLOCK_REALTIME;
- }
+ futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL);
+ if (!futexv) {
+ ret = -ENOMEM;
+ goto destroy_timer;
+ }
- if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
- return -EINVAL;
+ ret = futex_parse_waitv(futexv, waiters, nr_futexes, futex_wake_mark,
+ NULL);
+ if (!ret)
+ ret = futex_wait_multiple(futexv, nr_futexes, timeout ? &to : NULL);
- if (get_timespec64(&ts, timeout))
- return -EFAULT;
+ kfree(futexv);
- /*
- * Since there's no opcode for futex_waitv, use
- * FUTEX_WAIT_BITSET that uses absolute timeout as well
- */
- ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time);
- if (ret)
- return ret;
+destroy_timer:
+ if (timeout)
+ futex2_destroy_timeout(&to);
+ return ret;
+}
- futex_setup_timer(&time, &to, flag_clkid, 0);
- }
+/*
+ * sys_futex_wake - Wake a number of futexes
+ * @uaddr: Address of the futex(es) to wake
+ * @mask: bitmask
+ * @nr: Number of the futexes to wake
+ * @flags: FUTEX2 flags
+ *
+ * Identical to the traditional FUTEX_WAKE_BITSET op, except it is part of the
+ * futex2 family of calls.
+ */
- futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL);
- if (!futexv)
- return -ENOMEM;
+SYSCALL_DEFINE4(futex_wake,
+ void __user *, uaddr,
+ unsigned long, mask,
+ int, nr,
+ unsigned int, flags)
+{
+ if (flags & ~FUTEX2_VALID_MASK)
+ return -EINVAL;
- ret = futex_parse_waitv(futexv, waiters, nr_futexes);
- if (!ret)
- ret = futex_wait_multiple(futexv, nr_futexes, timeout ? &to : NULL);
+ flags = futex2_to_flags(flags);
+ if (!futex_flags_valid(flags))
+ return -EINVAL;
- if (timeout) {
- hrtimer_cancel(&to.timer);
- destroy_hrtimer_on_stack(&to.timer);
- }
+ if (!futex_validate_input(flags, mask))
+ return -EINVAL;
+
+ return futex_wake(uaddr, FLAGS_STRICT | flags, nr, mask);
+}
+
+/*
+ * sys_futex_wait - Wait on a futex
+ * @uaddr: Address of the futex to wait on
+ * @val: Value of @uaddr
+ * @mask: bitmask
+ * @flags: FUTEX2 flags
+ * @timeout: Optional absolute timeout
+ * @clockid: Clock to be used for the timeout, realtime or monotonic
+ *
+ * Identical to the traditional FUTEX_WAIT_BITSET op, except it is part of the
+ * futex2 familiy of calls.
+ */
+
+SYSCALL_DEFINE6(futex_wait,
+ void __user *, uaddr,
+ unsigned long, val,
+ unsigned long, mask,
+ unsigned int, flags,
+ struct __kernel_timespec __user *, timeout,
+ clockid_t, clockid)
+{
+ struct hrtimer_sleeper to;
+ int ret;
+
+ if (flags & ~FUTEX2_VALID_MASK)
+ return -EINVAL;
+
+ flags = futex2_to_flags(flags);
+ if (!futex_flags_valid(flags))
+ return -EINVAL;
+
+ if (!futex_validate_input(flags, val) ||
+ !futex_validate_input(flags, mask))
+ return -EINVAL;
+
+ if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to)))
+ return ret;
+
+ ret = __futex_wait(uaddr, flags, val, timeout ? &to : NULL, mask);
+
+ if (timeout)
+ futex2_destroy_timeout(&to);
- kfree(futexv);
return ret;
}
+/*
+ * sys_futex_requeue - Requeue a waiter from one futex to another
+ * @waiters: array describing the source and destination futex
+ * @flags: unused
+ * @nr_wake: number of futexes to wake
+ * @nr_requeue: number of futexes to requeue
+ *
+ * Identical to the traditional FUTEX_CMP_REQUEUE op, except it is part of the
+ * futex2 family of calls.
+ */
+
+SYSCALL_DEFINE4(futex_requeue,
+ struct futex_waitv __user *, waiters,
+ unsigned int, flags,
+ int, nr_wake,
+ int, nr_requeue)
+{
+ struct futex_vector futexes[2];
+ u32 cmpval;
+ int ret;
+
+ if (flags)
+ return -EINVAL;
+
+ if (!waiters)
+ return -EINVAL;
+
+ ret = futex_parse_waitv(futexes, waiters, 2, futex_wake_mark, NULL);
+ if (ret)
+ return ret;
+
+ cmpval = futexes[0].w.val;
+
+ return futex_requeue(u64_to_user_ptr(futexes[0].w.uaddr), futexes[0].w.flags,
+ u64_to_user_ptr(futexes[1].w.uaddr), futexes[1].w.flags,
+ nr_wake, nr_requeue, &cmpval, 0);
+}
+
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE2(set_robust_list,
struct compat_robust_list_head __user *, head,
@@ -319,36 +483,14 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
compat_uptr_t __user *, head_ptr,
compat_size_t __user *, len_ptr)
{
- struct compat_robust_list_head __user *head;
- unsigned long ret;
- struct task_struct *p;
-
- rcu_read_lock();
-
- ret = -ESRCH;
- if (!pid)
- p = current;
- else {
- p = find_task_by_vpid(pid);
- if (!p)
- goto err_unlock;
- }
+ struct compat_robust_list_head __user *head = futex_get_robust_list_common(pid, true);
- ret = -EPERM;
- if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
- goto err_unlock;
-
- head = p->compat_robust_list;
- rcu_read_unlock();
+ if (IS_ERR(head))
+ return PTR_ERR(head);
if (put_user(sizeof(*head), len_ptr))
return -EFAULT;
return put_user(ptr_to_compat(head), head_ptr);
-
-err_unlock:
- rcu_read_unlock();
-
- return ret;
}
#endif /* CONFIG_COMPAT */
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index 4ce0923f1ce3..e2bbe5509ec2 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/plist.h>
#include <linux/sched/task.h>
#include <linux/sched/signal.h>
#include <linux/freezer.h>
@@ -106,20 +107,11 @@
* double_lock_hb() and double_unlock_hb(), respectively.
*/
-/*
- * The hash bucket lock must be held when this is called.
- * Afterwards, the futex_q must not be accessed. Callers
- * must ensure to later call wake_up_q() for the actual
- * wakeups to occur.
- */
-void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q)
+bool __futex_wake_mark(struct futex_q *q)
{
- struct task_struct *p = q->task;
-
if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
- return;
+ return false;
- get_task_struct(p);
__futex_unqueue(q);
/*
* The waiting task can free the futex_q as soon as q->lock_ptr = NULL
@@ -130,6 +122,26 @@ void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q)
*/
smp_store_release(&q->lock_ptr, NULL);
+ return true;
+}
+
+/*
+ * The hash bucket lock must be held when this is called.
+ * Afterwards, the futex_q must not be accessed. Callers
+ * must ensure to later call wake_up_q() for the actual
+ * wakeups to occur.
+ */
+void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q)
+{
+ struct task_struct *p = q->task;
+
+ get_task_struct(p);
+
+ if (!__futex_wake_mark(q)) {
+ put_task_struct(p);
+ return;
+ }
+
/*
* Queue the task for later wakeup for after we've released
* the hb->lock.
@@ -142,20 +154,22 @@ void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q)
*/
int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
{
- struct futex_hash_bucket *hb;
struct futex_q *this, *next;
union futex_key key = FUTEX_KEY_INIT;
- int ret;
DEFINE_WAKE_Q(wake_q);
+ int ret;
if (!bitset)
return -EINVAL;
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ);
+ ret = get_futex_key(uaddr, flags, &key, FUTEX_READ);
if (unlikely(ret != 0))
return ret;
- hb = futex_hash(&key);
+ if ((flags & FLAGS_STRICT) && !nr_wake)
+ return 0;
+
+ CLASS(hb, hb)(&key);
/* Make sure we really have tasks to wakeup */
if (!futex_hb_waiters_pending(hb))
@@ -174,7 +188,7 @@ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
if (!(this->bitset & bitset))
continue;
- futex_wake_mark(&wake_q, this);
+ this->wake(&wake_q, this);
if (++ret >= nr_wake)
break;
}
@@ -195,13 +209,12 @@ static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
if (oparg < 0 || oparg > 31) {
- char comm[sizeof(current->comm)];
/*
* kill this print and return -EINVAL when userspace
* is sane again
*/
pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n",
- get_task_comm(comm, current), oparg);
+ current->comm, oparg);
oparg &= 31;
}
oparg = 1 << oparg;
@@ -239,80 +252,81 @@ int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
int nr_wake, int nr_wake2, int op)
{
union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
- struct futex_hash_bucket *hb1, *hb2;
struct futex_q *this, *next;
int ret, op_ret;
DEFINE_WAKE_Q(wake_q);
retry:
- ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
+ ret = get_futex_key(uaddr1, flags, &key1, FUTEX_READ);
if (unlikely(ret != 0))
return ret;
- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
+ ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE);
if (unlikely(ret != 0))
return ret;
- hb1 = futex_hash(&key1);
- hb2 = futex_hash(&key2);
-
retry_private:
- double_lock_hb(hb1, hb2);
- op_ret = futex_atomic_op_inuser(op, uaddr2);
- if (unlikely(op_ret < 0)) {
- double_unlock_hb(hb1, hb2);
-
- if (!IS_ENABLED(CONFIG_MMU) ||
- unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
- /*
- * we don't get EFAULT from MMU faults if we don't have
- * an MMU, but we might get them from range checking
- */
- ret = op_ret;
- return ret;
- }
-
- if (op_ret == -EFAULT) {
- ret = fault_in_user_writeable(uaddr2);
- if (ret)
+ if (1) {
+ CLASS(hb, hb1)(&key1);
+ CLASS(hb, hb2)(&key2);
+
+ double_lock_hb(hb1, hb2);
+ op_ret = futex_atomic_op_inuser(op, uaddr2);
+ if (unlikely(op_ret < 0)) {
+ double_unlock_hb(hb1, hb2);
+
+ if (!IS_ENABLED(CONFIG_MMU) ||
+ unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
+ /*
+ * we don't get EFAULT from MMU faults if we don't have
+ * an MMU, but we might get them from range checking
+ */
+ ret = op_ret;
return ret;
- }
-
- cond_resched();
- if (!(flags & FLAGS_SHARED))
- goto retry_private;
- goto retry;
- }
+ }
- plist_for_each_entry_safe(this, next, &hb1->chain, list) {
- if (futex_match (&this->key, &key1)) {
- if (this->pi_state || this->rt_waiter) {
- ret = -EINVAL;
- goto out_unlock;
+ if (op_ret == -EFAULT) {
+ ret = fault_in_user_writeable(uaddr2);
+ if (ret)
+ return ret;
}
- futex_wake_mark(&wake_q, this);
- if (++ret >= nr_wake)
- break;
+
+ cond_resched();
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
+ goto retry;
}
- }
- if (op_ret > 0) {
- op_ret = 0;
- plist_for_each_entry_safe(this, next, &hb2->chain, list) {
- if (futex_match (&this->key, &key2)) {
+ plist_for_each_entry_safe(this, next, &hb1->chain, list) {
+ if (futex_match(&this->key, &key1)) {
if (this->pi_state || this->rt_waiter) {
ret = -EINVAL;
goto out_unlock;
}
- futex_wake_mark(&wake_q, this);
- if (++op_ret >= nr_wake2)
+ this->wake(&wake_q, this);
+ if (++ret >= nr_wake)
break;
}
}
- ret += op_ret;
- }
+
+ if (op_ret > 0) {
+ op_ret = 0;
+ plist_for_each_entry_safe(this, next, &hb2->chain, list) {
+ if (futex_match(&this->key, &key2)) {
+ if (this->pi_state || this->rt_waiter) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ this->wake(&wake_q, this);
+ if (++op_ret >= nr_wake2)
+ break;
+ }
+ }
+ ret += op_ret;
+ }
out_unlock:
- double_unlock_hb(hb1, hb2);
+ double_unlock_hb(hb1, hb2);
+ }
wake_up_q(&wake_q);
return ret;
}
@@ -320,23 +334,12 @@ out_unlock:
static long futex_wait_restart(struct restart_block *restart);
/**
- * futex_wait_queue() - futex_queue() and wait for wakeup, timeout, or signal
- * @hb: the futex hash bucket, must be locked by the caller
+ * futex_do_wait() - wait for wakeup, timeout, or signal
* @q: the futex_q to queue up on
* @timeout: the prepared hrtimer_sleeper, or null for no timeout
*/
-void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
- struct hrtimer_sleeper *timeout)
+void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout)
{
- /*
- * The task state is guaranteed to be set before another task can
- * wake it. set_current_state() is implemented using smp_store_mb() and
- * futex_queue() calls spin_unlock() upon completion, both serializing
- * access to the hash list and forcing another memory barrier.
- */
- set_current_state(TASK_INTERRUPTIBLE);
- futex_queue(q, hb);
-
/* Arm the timer */
if (timeout)
hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS);
@@ -352,13 +355,13 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
* is no timeout, or if it has yet to expire.
*/
if (!timeout || timeout->task)
- freezable_schedule();
+ schedule();
}
__set_current_state(TASK_RUNNING);
}
/**
- * unqueue_multiple - Remove various futexes from their hash bucket
+ * futex_unqueue_multiple - Remove various futexes from their hash bucket
* @v: The list of futexes to unqueue
* @count: Number of futexes in the list
*
@@ -368,7 +371,7 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
* - >=0 - Index of the last futex that was awoken;
* - -1 - No futex was awoken
*/
-static int unqueue_multiple(struct futex_vector *v, int count)
+int futex_unqueue_multiple(struct futex_vector *v, int count)
{
int ret = -1, i;
@@ -396,14 +399,19 @@ static int unqueue_multiple(struct futex_vector *v, int count)
* - 0 - Success
* - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL
*/
-static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken)
+int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken)
{
- struct futex_hash_bucket *hb;
bool retry = false;
int ret, i;
u32 uval;
/*
+ * Make sure to have a reference on the private_hash such that we
+ * don't block on rehash after changing the task state below.
+ */
+ guard(private_hash)();
+
+ /*
* Enqueuing multiple futexes is tricky, because we need to enqueue
* each futex on the list before dealing with the next one to avoid
* deadlocking on the hash bucket. But, before enqueuing, we need to
@@ -419,38 +427,42 @@ static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *wo
*/
retry:
for (i = 0; i < count; i++) {
- if ((vs[i].w.flags & FUTEX_PRIVATE_FLAG) && retry)
+ if (!(vs[i].w.flags & FLAGS_SHARED) && retry)
continue;
ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr),
- !(vs[i].w.flags & FUTEX_PRIVATE_FLAG),
+ vs[i].w.flags,
&vs[i].q.key, FUTEX_READ);
if (unlikely(ret))
return ret;
}
- set_current_state(TASK_INTERRUPTIBLE);
+ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
for (i = 0; i < count; i++) {
u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr;
struct futex_q *q = &vs[i].q;
- u32 val = (u32)vs[i].w.val;
+ u32 val = vs[i].w.val;
- hb = futex_q_lock(q);
- ret = futex_get_value_locked(&uval, uaddr);
+ if (1) {
+ CLASS(hb, hb)(&q->key);
- if (!ret && uval == val) {
- /*
- * The bucket lock can't be held while dealing with the
- * next futex. Queue each futex at this moment so hb can
- * be unlocked.
- */
- futex_queue(q, hb);
- continue;
- }
+ futex_q_lock(q, hb);
+ ret = futex_get_value_locked(&uval, uaddr);
+
+ if (!ret && uval == val) {
+ /*
+ * The bucket lock can't be held while dealing with the
+ * next futex. Queue each futex at this moment so hb can
+ * be unlocked.
+ */
+ futex_queue(q, hb, current);
+ continue;
+ }
- futex_q_unlock(hb);
+ futex_q_unlock(hb);
+ }
__set_current_state(TASK_RUNNING);
/*
@@ -458,7 +470,7 @@ retry:
* was woken, we don't return error and return this index to
* userspace
*/
- *woken = unqueue_multiple(vs, i);
+ *woken = futex_unqueue_multiple(vs, i);
if (*woken >= 0)
return 1;
@@ -504,7 +516,7 @@ static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count,
return;
}
- freezable_schedule();
+ schedule();
}
/**
@@ -543,7 +555,7 @@ int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
__set_current_state(TASK_RUNNING);
- ret = unqueue_multiple(vs, count);
+ ret = futex_unqueue_multiple(vs, count);
if (ret >= 0)
return ret;
@@ -564,7 +576,8 @@ int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
* @val: the expected value
* @flags: futex flags (FLAGS_SHARED, etc.)
* @q: the associated futex_q
- * @hb: storage for hash_bucket pointer to be returned to caller
+ * @key2: the second futex_key if used for requeue PI
+ * @task: Task queueing this futex
*
* Setup the futex_q and locate the hash_bucket. Get the futex value and
* compare it with the expected value. Handle atomic faults internally.
@@ -572,10 +585,12 @@ int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
*
* Return:
* - 0 - uaddr contains val and hb has been locked;
- * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
+ * - <0 - On error and the hb is unlocked. A possible reason: the uaddr can not
+ * be read, does not contain the expected value or is not properly aligned.
*/
int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
- struct futex_q *q, struct futex_hash_bucket **hb)
+ struct futex_q *q, union futex_key *key2,
+ struct task_struct *task)
{
u32 uval;
int ret;
@@ -599,69 +614,84 @@ int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
* while the syscall executes.
*/
retry:
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ);
+ ret = get_futex_key(uaddr, flags, &q->key, FUTEX_READ);
if (unlikely(ret != 0))
return ret;
retry_private:
- *hb = futex_q_lock(q);
+ if (1) {
+ CLASS(hb, hb)(&q->key);
- ret = futex_get_value_locked(&uval, uaddr);
+ futex_q_lock(q, hb);
- if (ret) {
- futex_q_unlock(*hb);
+ ret = futex_get_value_locked(&uval, uaddr);
- ret = get_user(uval, uaddr);
- if (ret)
- return ret;
+ if (ret) {
+ futex_q_unlock(hb);
+
+ ret = get_user(uval, uaddr);
+ if (ret)
+ return ret;
- if (!(flags & FLAGS_SHARED))
- goto retry_private;
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
- goto retry;
- }
+ goto retry;
+ }
+
+ if (uval != val) {
+ futex_q_unlock(hb);
+ return -EWOULDBLOCK;
+ }
+
+ if (key2 && futex_match(&q->key, key2)) {
+ futex_q_unlock(hb);
+ return -EINVAL;
+ }
- if (uval != val) {
- futex_q_unlock(*hb);
- ret = -EWOULDBLOCK;
+ /*
+ * The task state is guaranteed to be set before another task can
+ * wake it. set_current_state() is implemented using smp_store_mb() and
+ * futex_queue() calls spin_unlock() upon completion, both serializing
+ * access to the hash list and forcing another memory barrier.
+ */
+ if (task == current)
+ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
+ futex_queue(q, hb, task);
}
return ret;
}
-int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset)
+int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
+ struct hrtimer_sleeper *to, u32 bitset)
{
- struct hrtimer_sleeper timeout, *to;
- struct restart_block *restart;
- struct futex_hash_bucket *hb;
struct futex_q q = futex_q_init;
int ret;
if (!bitset)
return -EINVAL;
+
q.bitset = bitset;
- to = futex_setup_timer(abs_time, &timeout, flags,
- current->timer_slack_ns);
retry:
/*
* Prepare to wait on uaddr. On success, it holds hb->lock and q
* is initialized.
*/
- ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
+ ret = futex_wait_setup(uaddr, val, flags, &q, NULL, current);
if (ret)
- goto out;
+ return ret;
/* futex_queue and wait for wakeup, timeout, or a signal. */
- futex_wait_queue(hb, &q, to);
+ futex_do_wait(&q, to);
/* If we were woken (and unqueued), we succeeded, whatever. */
- ret = 0;
if (!futex_unqueue(&q))
- goto out;
- ret = -ETIMEDOUT;
+ return 0;
+
if (to && !to->task)
- goto out;
+ return -ETIMEDOUT;
/*
* We expect signal_pending(current), but we might be the
@@ -670,24 +700,38 @@ retry:
if (!signal_pending(current))
goto retry;
- ret = -ERESTARTSYS;
- if (!abs_time)
- goto out;
+ return -ERESTARTSYS;
+}
+
+int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset)
+{
+ struct hrtimer_sleeper timeout, *to;
+ struct restart_block *restart;
+ int ret;
- restart = &current->restart_block;
- restart->futex.uaddr = uaddr;
- restart->futex.val = val;
- restart->futex.time = *abs_time;
- restart->futex.bitset = bitset;
- restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
+ to = futex_setup_timer(abs_time, &timeout, flags,
+ current->timer_slack_ns);
- ret = set_restart_fn(restart, futex_wait_restart);
+ ret = __futex_wait(uaddr, flags, val, to, bitset);
+
+ /* No timeout, nothing to clean up. */
+ if (!to)
+ return ret;
-out:
- if (to) {
- hrtimer_cancel(&to->timer);
- destroy_hrtimer_on_stack(&to->timer);
+ hrtimer_cancel(&to->timer);
+ destroy_hrtimer_on_stack(&to->timer);
+
+ if (ret == -ERESTARTSYS) {
+ restart = &current->restart_block;
+ restart->futex.uaddr = uaddr;
+ restart->futex.val = val;
+ restart->futex.time = *abs_time;
+ restart->futex.bitset = bitset;
+ restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
+
+ return set_restart_fn(restart, futex_wait_restart);
}
+
return ret;
}
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index 16f8ecc7d882..ccd02afaeffb 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -3,4 +3,6 @@ ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
obj-y := base.o fs.o
obj-$(CONFIG_CC_IS_GCC) += gcc_base.o gcc_4_7.o
+CFLAGS_gcc_base.o += -Wno-missing-prototypes -Wno-missing-declarations
obj-$(CONFIG_CC_IS_CLANG) += clang.o
+CFLAGS_clang.o += -Wno-missing-prototypes -Wno-missing-declarations
diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c
index cbb0bed958ab..8b888a6193cc 100644
--- a/kernel/gcov/clang.c
+++ b/kernel/gcov/clang.c
@@ -264,10 +264,10 @@ int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
/**
* gcov_info_add - add up profiling data
- * @dest: profiling data set to which data is added
- * @source: profiling data set which is added
+ * @dst: profiling data set to which data is added
+ * @src: profiling data set which is added
*
- * Adds profiling counts of @source to @dest.
+ * Adds profiling counts of @src to @dst.
*/
void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
{
@@ -280,6 +280,8 @@ void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
for (i = 0; i < sfn_ptr->num_counters; i++)
dfn_ptr->counters[i] += sfn_ptr->counters[i];
+
+ sfn_ptr = list_next_entry(sfn_ptr, head);
}
}
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index 5c3086cad8f9..01520689b57c 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -99,7 +99,7 @@ struct gcov_iterator {
struct gcov_info *info;
size_t size;
loff_t pos;
- char buffer[];
+ char buffer[] __counted_by(size);
};
/**
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index 460c12b7dfea..ffde93d051a4 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -18,18 +18,25 @@
#include <linux/mm.h>
#include "gcov.h"
-#if (__GNUC__ >= 10)
-#define GCOV_COUNTERS 8
-#elif (__GNUC__ >= 7)
-#define GCOV_COUNTERS 9
-#elif (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1)
+#if (__GNUC__ >= 15)
#define GCOV_COUNTERS 10
+#elif (__GNUC__ >= 14)
+#define GCOV_COUNTERS 9
+#elif (__GNUC__ >= 10)
+#define GCOV_COUNTERS 8
#else
#define GCOV_COUNTERS 9
#endif
#define GCOV_TAG_FUNCTION_LENGTH 3
+/* Since GCC 12.1 sizes are in BYTES and not in WORDS (4B). */
+#if (__GNUC__ >= 12)
+#define GCOV_UNIT_SIZE 4
+#else
+#define GCOV_UNIT_SIZE 1
+#endif
+
static struct gcov_info *gcov_info_head;
/**
@@ -75,6 +82,7 @@ struct gcov_fn_info {
* @version: gcov version magic indicating the gcc version used for compilation
* @next: list head for a singly-linked list
* @stamp: uniquifying time stamp
+ * @checksum: unique object checksum
* @filename: name of the associated gcov data file
* @merge: merge functions (null for unused counter type)
* @n_functions: number of instrumented functions
@@ -87,6 +95,10 @@ struct gcov_info {
unsigned int version;
struct gcov_info *next;
unsigned int stamp;
+ /* Since GCC 12.1 a checksum field is added. */
+#if (__GNUC__ >= 12)
+ unsigned int checksum;
+#endif
const char *filename;
void (*merge[GCOV_COUNTERS])(gcov_type *, unsigned int);
unsigned int n_functions;
@@ -383,12 +395,18 @@ size_t convert_to_gcda(char *buffer, struct gcov_info *info)
pos += store_gcov_u32(buffer, pos, info->version);
pos += store_gcov_u32(buffer, pos, info->stamp);
+#if (__GNUC__ >= 12)
+ /* Use zero as checksum of the compilation unit. */
+ pos += store_gcov_u32(buffer, pos, 0);
+#endif
+
for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
fi_ptr = info->functions[fi_idx];
/* Function record. */
pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION);
- pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION_LENGTH);
+ pos += store_gcov_u32(buffer, pos,
+ GCOV_TAG_FUNCTION_LENGTH * GCOV_UNIT_SIZE);
pos += store_gcov_u32(buffer, pos, fi_ptr->ident);
pos += store_gcov_u32(buffer, pos, fi_ptr->lineno_checksum);
pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
@@ -402,7 +420,8 @@ size_t convert_to_gcda(char *buffer, struct gcov_info *info)
/* Counter record. */
pos += store_gcov_u32(buffer, pos,
GCOV_TAG_FOR_COUNTER(ct_idx));
- pos += store_gcov_u32(buffer, pos, ci_ptr->num * 2);
+ pos += store_gcov_u32(buffer, pos,
+ ci_ptr->num * 2 * GCOV_UNIT_SIZE);
for (cv_idx = 0; cv_idx < ci_ptr->num; cv_idx++) {
pos += store_gcov_u64(buffer, pos,
diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh
index 1966a749e0d9..896a503dfb29 100755
--- a/kernel/gen_kheaders.sh
+++ b/kernel/gen_kheaders.sh
@@ -4,92 +4,47 @@
# This script generates an archive consisting of kernel headers
# for CONFIG_IKHEADERS.
set -e
-sfile="$(readlink -f "$0")"
-outdir="$(pwd)"
tarfile=$1
-cpio_dir=$outdir/$tarfile.tmp
-
-dir_list="
-include/
-arch/$SRCARCH/include/
-"
-
-# Support incremental builds by skipping archive generation
-# if timestamps of files being archived are not changed.
-
-# This block is useful for debugging the incremental builds.
-# Uncomment it for debugging.
-# if [ ! -f /tmp/iter ]; then iter=1; echo 1 > /tmp/iter;
-# else iter=$(($(cat /tmp/iter) + 1)); echo $iter > /tmp/iter; fi
-# find $all_dirs -name "*.h" | xargs ls -l > /tmp/ls-$iter
-
-all_dirs=
-if [ "$building_out_of_srctree" ]; then
- for d in $dir_list; do
- all_dirs="$all_dirs $srctree/$d"
- done
-fi
-all_dirs="$all_dirs $dir_list"
-
-# include/generated/compile.h is ignored because it is touched even when none
-# of the source files changed.
-#
-# When Kconfig regenerates include/generated/autoconf.h, its timestamp is
-# updated, but the contents might be still the same. When any CONFIG option is
-# changed, Kconfig touches the corresponding timestamp file include/config/*.
-# Hence, the md5sum detects the configuration change anyway. We do not need to
-# check include/generated/autoconf.h explicitly.
-#
-# Ignore them for md5 calculation to avoid pointless regeneration.
-headers_md5="$(find $all_dirs -name "*.h" |
- grep -v "include/generated/compile.h" |
- grep -v "include/generated/autoconf.h" |
- xargs ls -l | md5sum | cut -d ' ' -f1)"
-
-# Any changes to this script will also cause a rebuild of the archive.
-this_file_md5="$(ls -l $sfile | md5sum | cut -d ' ' -f1)"
-if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi
-if [ -f kernel/kheaders.md5 ] &&
- [ "$(head -n 1 kernel/kheaders.md5)" = "$headers_md5" ] &&
- [ "$(head -n 2 kernel/kheaders.md5 | tail -n 1)" = "$this_file_md5" ] &&
- [ "$(tail -n 1 kernel/kheaders.md5)" = "$tarfile_md5" ]; then
- exit
-fi
-
-echo " GEN $tarfile"
-
-rm -rf $cpio_dir
-mkdir $cpio_dir
-
-if [ "$building_out_of_srctree" ]; then
- (
- cd $srctree
- for f in $dir_list
- do find "$f" -name "*.h";
- done | cpio --quiet -pd $cpio_dir
- )
-fi
-
-# The second CPIO can complain if files already exist which can happen with out
-# of tree builds having stale headers in srctree. Just silence CPIO for now.
-for f in $dir_list;
- do find "$f" -name "*.h";
-done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1
+srclist=$2
+objlist=$3
+timestamp=$4
+
+dir=$(dirname "${tarfile}")
+tmpdir=${dir}/.tmp_dir
+depfile=${dir}/.$(basename "${tarfile}").d
+
+# generate dependency list.
+{
+ echo
+ echo "deps_${tarfile} := \\"
+ sed 's:\(.*\): \1 \\:' "${srclist}"
+ sed -n '/^include\/generated\/autoconf\.h$/!s:\(.*\): \1 \\:p' "${objlist}"
+ echo
+ echo "${tarfile}: \$(deps_${tarfile})"
+ echo
+ echo "\$(deps_${tarfile}):"
+
+} > "${depfile}"
+
+rm -rf "${tmpdir}"
+mkdir "${tmpdir}"
+
+# shellcheck disable=SC2154 # srctree is passed as an env variable
+sed "s:^${srctree}/::" "${srclist}" | ${TAR} -c -f - -C "${srctree}" -T - | ${TAR} -xf - -C "${tmpdir}"
+${TAR} -c -f - -T "${objlist}" | ${TAR} -xf - -C "${tmpdir}"
# Remove comments except SDPX lines
-find $cpio_dir -type f -print0 |
- xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;'
+# Use a temporary file to store directory contents to prevent find/xargs from
+# seeing temporary files created by perl.
+find "${tmpdir}" -type f -print0 > "${tmpdir}.contents.txt"
+xargs -0 -P8 -n1 \
+ perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;' \
+ < "${tmpdir}.contents.txt"
+rm -f "${tmpdir}.contents.txt"
# Create archive and try to normalize metadata for reproducibility.
-# For compatibility with older versions of tar, files are fed to tar
-# pre-sorted, as --sort=name might not be available.
-find $cpio_dir -printf "./%P\n" | LC_ALL=C sort | \
- tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \
- --owner=0 --group=0 --numeric-owner --no-recursion \
- -I $XZ -cf $tarfile -C $cpio_dir/ -T - > /dev/null
-
-echo $headers_md5 > kernel/kheaders.md5
-echo "$this_file_md5" >> kernel/kheaders.md5
-echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5
+${TAR} "${timestamp:+--mtime=$timestamp}" \
+ --owner=0 --group=0 --sort=name --numeric-owner --mode=u=rw,go=r,a+X \
+ -I "${XZ}" -cf "${tarfile}" -C "${tmpdir}/" . > /dev/null
-rm -rf $cpio_dir
+rm -rf "${tmpdir}"
diff --git a/kernel/groups.c b/kernel/groups.c
index 787b381c7c00..9b43da22647d 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -19,7 +19,7 @@ struct group_info *groups_alloc(int gidsetsize)
if (!gi)
return NULL;
- atomic_set(&gi->usage, 1);
+ refcount_set(&gi->usage, 1);
gi->ngroups = gidsetsize;
return gi;
}
@@ -134,13 +134,26 @@ EXPORT_SYMBOL(set_groups);
int set_current_groups(struct group_info *group_info)
{
struct cred *new;
+ const struct cred *old;
+ int retval;
new = prepare_creds();
if (!new)
return -ENOMEM;
+ old = current_cred();
+
set_groups(new, group_info);
+
+ retval = security_task_fix_setgroups(new, old);
+ if (retval < 0)
+ goto error;
+
return commit_creds(new);
+
+error:
+ abort_creds(new);
+ return retval;
}
EXPORT_SYMBOL(set_current_groups);
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 52501e5f7655..d2254c91450b 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -22,13 +22,21 @@
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
#include <linux/sched/sysctl.h>
+#include <linux/hung_task.h>
+#include <linux/rwsem.h>
+#include <linux/sys_info.h>
#include <trace/events/sched.h>
/*
* The number of tasks checked:
*/
-int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
+static int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
+
+/*
+ * Total number of tasks detected as hung since boot:
+ */
+static unsigned long __read_mostly sysctl_hung_task_detect_count;
/*
* Limit number of tasks checked in a batch.
@@ -47,17 +55,22 @@ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_
/*
* Zero (default value) means use sysctl_hung_task_timeout_secs:
*/
-unsigned long __read_mostly sysctl_hung_task_check_interval_secs;
+static unsigned long __read_mostly sysctl_hung_task_check_interval_secs;
-int __read_mostly sysctl_hung_task_warnings = 10;
+static int __read_mostly sysctl_hung_task_warnings = 10;
static int __read_mostly did_panic;
-static bool hung_task_show_lock;
static bool hung_task_call_panic;
-static bool hung_task_show_all_bt;
static struct task_struct *watchdog_task;
+/*
+ * A bitmask to control what kinds of system info to be printed when
+ * a hung task is detected, it could be task, memory, lock etc. Refer
+ * include/linux/sys_info.h for detailed bit definition.
+ */
+static unsigned long hung_task_si_mask;
+
#ifdef CONFIG_SMP
/*
* Should we dump all CPUs backtraces in a hung task event?
@@ -72,8 +85,8 @@ static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace;
* Should we panic (and reboot, if panic_timeout= is set) when a
* hung task is detected:
*/
-unsigned int __read_mostly sysctl_hung_task_panic =
- CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE;
+static unsigned int __read_mostly sysctl_hung_task_panic =
+ CONFIG_BOOTPARAM_HUNG_TASK_PANIC;
static int
hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
@@ -87,16 +100,19 @@ static struct notifier_block panic_block = {
.notifier_call = hung_task_panic,
};
-static void check_hung_task(struct task_struct *t, unsigned long timeout)
+static bool task_is_hung(struct task_struct *t, unsigned long timeout)
{
unsigned long switch_count = t->nvcsw + t->nivcsw;
+ unsigned int state = READ_ONCE(t->__state);
/*
- * Ensure the task is not frozen.
- * Also, skip vfork and any other user process that freezer should skip.
+ * skip the TASK_KILLABLE tasks -- these can be killed
+ * skip the TASK_IDLE tasks -- those are genuinely idle
+ * skip the TASK_FROZEN task -- it reasonably stops scheduling by freezer
*/
- if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP)))
- return;
+ if (!(state & TASK_UNINTERRUPTIBLE) ||
+ (state & (TASK_WAKEKILL | TASK_NOLOAD | TASK_FROZEN)))
+ return false;
/*
* When a freshly created task is scheduled once, changes its state to
@@ -104,21 +120,128 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
* musn't be checked.
*/
if (unlikely(!switch_count))
- return;
+ return false;
if (switch_count != t->last_switch_count) {
t->last_switch_count = switch_count;
t->last_switch_time = jiffies;
- return;
+ return false;
}
if (time_is_after_jiffies(t->last_switch_time + timeout * HZ))
+ return false;
+
+ return true;
+}
+
+#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
+static void debug_show_blocker(struct task_struct *task, unsigned long timeout)
+{
+ struct task_struct *g, *t;
+ unsigned long owner, blocker, blocker_type;
+ const char *rwsem_blocked_by, *rwsem_blocked_as;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "No rcu lock held");
+
+ blocker = READ_ONCE(task->blocker);
+ if (!blocker)
+ return;
+
+ blocker_type = hung_task_get_blocker_type(blocker);
+
+ switch (blocker_type) {
+ case BLOCKER_TYPE_MUTEX:
+ owner = mutex_get_owner(hung_task_blocker_to_lock(blocker));
+ break;
+ case BLOCKER_TYPE_SEM:
+ owner = sem_last_holder(hung_task_blocker_to_lock(blocker));
+ break;
+ case BLOCKER_TYPE_RWSEM_READER:
+ case BLOCKER_TYPE_RWSEM_WRITER:
+ owner = (unsigned long)rwsem_owner(
+ hung_task_blocker_to_lock(blocker));
+ rwsem_blocked_as = (blocker_type == BLOCKER_TYPE_RWSEM_READER) ?
+ "reader" : "writer";
+ rwsem_blocked_by = is_rwsem_reader_owned(
+ hung_task_blocker_to_lock(blocker)) ?
+ "reader" : "writer";
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+
+ if (unlikely(!owner)) {
+ switch (blocker_type) {
+ case BLOCKER_TYPE_MUTEX:
+ pr_err("INFO: task %s:%d is blocked on a mutex, but the owner is not found.\n",
+ task->comm, task->pid);
+ break;
+ case BLOCKER_TYPE_SEM:
+ pr_err("INFO: task %s:%d is blocked on a semaphore, but the last holder is not found.\n",
+ task->comm, task->pid);
+ break;
+ case BLOCKER_TYPE_RWSEM_READER:
+ case BLOCKER_TYPE_RWSEM_WRITER:
+ pr_err("INFO: task %s:%d is blocked on an rw-semaphore, but the owner is not found.\n",
+ task->comm, task->pid);
+ break;
+ }
+ return;
+ }
+
+ /* Ensure the owner information is correct. */
+ for_each_process_thread(g, t) {
+ if ((unsigned long)t != owner)
+ continue;
+
+ switch (blocker_type) {
+ case BLOCKER_TYPE_MUTEX:
+ pr_err("INFO: task %s:%d is blocked on a mutex likely owned by task %s:%d.\n",
+ task->comm, task->pid, t->comm, t->pid);
+ break;
+ case BLOCKER_TYPE_SEM:
+ pr_err("INFO: task %s:%d blocked on a semaphore likely last held by task %s:%d\n",
+ task->comm, task->pid, t->comm, t->pid);
+ break;
+ case BLOCKER_TYPE_RWSEM_READER:
+ case BLOCKER_TYPE_RWSEM_WRITER:
+ pr_err("INFO: task %s:%d <%s> blocked on an rw-semaphore likely owned by task %s:%d <%s>\n",
+ task->comm, task->pid, rwsem_blocked_as, t->comm,
+ t->pid, rwsem_blocked_by);
+ break;
+ }
+ /* Avoid duplicated task dump, skip if the task is also hung. */
+ if (!task_is_hung(t, timeout))
+ sched_show_task(t);
+ return;
+ }
+}
+#else
+static inline void debug_show_blocker(struct task_struct *task, unsigned long timeout)
+{
+}
+#endif
+
+static void check_hung_task(struct task_struct *t, unsigned long timeout,
+ unsigned long prev_detect_count)
+{
+ unsigned long total_hung_task;
+
+ if (!task_is_hung(t, timeout))
return;
+ /*
+ * This counter tracks the total number of tasks detected as hung
+ * since boot.
+ */
+ sysctl_hung_task_detect_count++;
+
+ total_hung_task = sysctl_hung_task_detect_count - prev_detect_count;
trace_sched_process_hang(t);
- if (sysctl_hung_task_panic) {
+ if (sysctl_hung_task_panic && total_hung_task >= sysctl_hung_task_panic) {
console_verbose();
- hung_task_show_lock = true;
hung_task_call_panic = true;
}
@@ -126,7 +249,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
* Ok, the task did not get scheduled for more than 2 minutes,
* complain:
*/
- if (sysctl_hung_task_warnings) {
+ if (sysctl_hung_task_warnings || hung_task_call_panic) {
if (sysctl_hung_task_warnings > 0)
sysctl_hung_task_warnings--;
pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
@@ -135,13 +258,15 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
print_tainted(), init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version);
+ if (t->flags & PF_POSTCOREDUMP)
+ pr_err(" Blocked by coredump.\n");
pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
" disables this message.\n");
sched_show_task(t);
- hung_task_show_lock = true;
+ debug_show_blocker(t, timeout);
- if (sysctl_hung_task_all_cpu_backtrace)
- hung_task_show_all_bt = true;
+ if (!sysctl_hung_task_warnings)
+ pr_info("Future hung task reports are suppressed, see sysctl kernel.hung_task_warnings\n");
}
touch_nmi_watchdog();
@@ -180,6 +305,9 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
int max_count = sysctl_hung_task_check_count;
unsigned long last_break = jiffies;
struct task_struct *g, *t;
+ unsigned long prev_detect_count = sysctl_hung_task_detect_count;
+ int need_warning = sysctl_hung_task_warnings;
+ unsigned long si_mask = hung_task_si_mask;
/*
* If the system crashed already then all bets are off,
@@ -188,9 +316,10 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
if (test_taint(TAINT_DIE) || did_panic)
return;
- hung_task_show_lock = false;
+
rcu_read_lock();
for_each_process_thread(g, t) {
+
if (!max_count--)
goto unlock;
if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) {
@@ -198,20 +327,24 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
goto unlock;
last_break = jiffies;
}
- /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
- if (READ_ONCE(t->__state) == TASK_UNINTERRUPTIBLE)
- check_hung_task(t, timeout);
+
+ check_hung_task(t, timeout, prev_detect_count);
}
unlock:
rcu_read_unlock();
- if (hung_task_show_lock)
- debug_show_all_locks();
- if (hung_task_show_all_bt) {
- hung_task_show_all_bt = false;
- trigger_all_cpu_backtrace();
+ if (!(sysctl_hung_task_detect_count - prev_detect_count))
+ return;
+
+ if (need_warning || hung_task_call_panic) {
+ si_mask |= SYS_INFO_LOCKS;
+
+ if (sysctl_hung_task_all_cpu_backtrace)
+ si_mask |= SYS_INFO_ALL_BT;
}
+ sys_info(si_mask);
+
if (hung_task_call_panic)
panic("hung_task: blocked tasks");
}
@@ -228,8 +361,8 @@ static long hung_timeout_jiffies(unsigned long last_checked,
/*
* Process updating of timeout sysctl
*/
-static int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
- void __user *buffer,
+static int proc_dohung_task_timeout_secs(const struct ctl_table *table, int write,
+ void *buffer,
size_t *lenp, loff_t *ppos)
{
int ret;
@@ -250,7 +383,7 @@ static int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
* and hung_task_check_interval_secs
*/
static const unsigned long hung_task_timeout_max = (LONG_MAX / HZ);
-static struct ctl_table hung_task_sysctls[] = {
+static const struct ctl_table hung_task_sysctls[] = {
#ifdef CONFIG_SMP
{
.procname = "hung_task_all_cpu_backtrace",
@@ -269,7 +402,7 @@ static struct ctl_table hung_task_sysctls[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
},
{
.procname = "hung_task_check_count",
@@ -303,7 +436,20 @@ static struct ctl_table hung_task_sysctls[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_NEG_ONE,
},
- {}
+ {
+ .procname = "hung_task_detect_count",
+ .data = &sysctl_hung_task_detect_count,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0444,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "hung_task_sys_info",
+ .data = &hung_task_si_mask,
+ .maxlen = sizeof(hung_task_si_mask),
+ .mode = 0644,
+ .proc_handler = sysctl_sys_info_handler,
+ },
};
static void __init hung_task_sysctl_init(void)
diff --git a/kernel/iomem.c b/kernel/iomem.c
index 62c92e43aa0d..75e61c1c6bc0 100644
--- a/kernel/iomem.c
+++ b/kernel/iomem.c
@@ -3,19 +3,17 @@
#include <linux/types.h>
#include <linux/io.h>
#include <linux/mm.h>
-
-#ifndef ioremap_cache
-/* temporary while we convert existing ioremap_cache users to memremap */
-__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
-{
- return ioremap(offset, size);
-}
-#endif
+#include <linux/ioremap.h>
#ifndef arch_memremap_wb
-static void *arch_memremap_wb(resource_size_t offset, unsigned long size)
+static void *arch_memremap_wb(resource_size_t offset, unsigned long size,
+ unsigned long flags)
{
+#ifdef ioremap_cache
return (__force void *)ioremap_cache(offset, size);
+#else
+ return (__force void *)ioremap(offset, size);
+#endif
}
#endif
@@ -94,7 +92,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
if (is_ram == REGION_INTERSECTS)
addr = try_ram_remap(offset, size, flags);
if (!addr)
- addr = arch_memremap_wb(offset, size);
+ addr = arch_memremap_wb(offset, size, flags);
}
/*
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 10929eda9825..1b4254d19a73 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -6,10 +6,6 @@ menu "IRQ subsystem"
config MAY_HAVE_SPARSE_IRQ
bool
-# Legacy support, required for itanic
-config GENERIC_IRQ_LEGACY
- bool
-
# Enable the generic irq autoprobe mechanism
config GENERIC_IRQ_PROBE
bool
@@ -24,6 +20,7 @@ config GENERIC_IRQ_SHOW_LEVEL
# Supports effective affinity mask
config GENERIC_IRQ_EFFECTIVE_AFF_MASK
+ depends on SMP
bool
# Support for delayed migration from interrupt context
@@ -46,10 +43,6 @@ config GENERIC_IRQ_INJECTION
config HARDIRQS_SW_RESEND
bool
-# Edge style eoi based handler (cell)
-config IRQ_EDGE_EOI_HANDLER
- bool
-
# Generic configurable interrupt chip implementation
config GENERIC_IRQ_CHIP
bool
@@ -82,18 +75,20 @@ config IRQ_FASTEOI_HIERARCHY_HANDLERS
# Generic IRQ IPI support
config GENERIC_IRQ_IPI
bool
+ depends on SMP
select IRQ_DOMAIN_HIERARCHY
-# Generic MSI interrupt support
-config GENERIC_MSI_IRQ
+# Generic IRQ IPI Mux support
+config GENERIC_IRQ_IPI_MUX
bool
+ depends on SMP
# Generic MSI hierarchical interrupt domain support
-config GENERIC_MSI_IRQ_DOMAIN
+config GENERIC_MSI_IRQ
bool
select IRQ_DOMAIN_HIERARCHY
- select GENERIC_MSI_IRQ
+# irqchip drivers should select this if they call iommu_dma_prepare_msi()
config IRQ_MSI_IOMMU
bool
@@ -106,6 +101,10 @@ config GENERIC_IRQ_MATRIX_ALLOCATOR
config GENERIC_IRQ_RESERVATION_MODE
bool
+# Snapshot for interrupt statistics
+config GENERIC_IRQ_STAT_SNAPSHOT
+ bool
+
# Support forced irq threading
config IRQ_FORCED_THREADING
bool
@@ -135,6 +134,25 @@ config GENERIC_IRQ_DEBUGFS
If you don't know what to do here, say N.
+# Clear forwarded VM interrupts during kexec.
+# This option ensures the kernel clears active states for interrupts
+# forwarded to virtual machines (VMs) during a machine kexec.
+config GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD
+ bool
+
+config IRQ_KUNIT_TEST
+ bool "KUnit tests for IRQ management APIs" if !KUNIT_ALL_TESTS
+ depends on KUNIT=y
+ depends on SPARSE_IRQ
+ default KUNIT_ALL_TESTS
+ select IRQ_DOMAIN
+ imply SMP
+ help
+ This option enables KUnit tests for the IRQ subsystem API. These are
+ only for development and testing, not for regular kernel use cases.
+
+ If unsure, say N.
+
endmenu
config GENERIC_IRQ_MULTI_HANDLER
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index b4f53717d143..6ab3a4055667 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
-obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
+obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o kexec.o
obj-$(CONFIG_IRQ_TIMINGS) += timings.o
ifeq ($(CONFIG_TEST_IRQ_TIMINGS),y)
CFLAGS_timings.o += -DDEBUG
@@ -15,6 +15,8 @@ obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
obj-$(CONFIG_PM_SLEEP) += pm.o
obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
+obj-$(CONFIG_GENERIC_IRQ_IPI_MUX) += ipi-mux.o
obj-$(CONFIG_SMP) += affinity.o
obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o
obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o
+obj-$(CONFIG_IRQ_KUNIT_TEST) += irq_test.o
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index f7ff8919dc9b..4013e6ad2b2f 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -7,397 +7,7 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/cpu.h>
-#include <linux/sort.h>
-
-static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
- unsigned int cpus_per_vec)
-{
- const struct cpumask *siblmsk;
- int cpu, sibl;
-
- for ( ; cpus_per_vec > 0; ) {
- cpu = cpumask_first(nmsk);
-
- /* Should not happen, but I'm too lazy to think about it */
- if (cpu >= nr_cpu_ids)
- return;
-
- cpumask_clear_cpu(cpu, nmsk);
- cpumask_set_cpu(cpu, irqmsk);
- cpus_per_vec--;
-
- /* If the cpu has siblings, use them first */
- siblmsk = topology_sibling_cpumask(cpu);
- for (sibl = -1; cpus_per_vec > 0; ) {
- sibl = cpumask_next(sibl, siblmsk);
- if (sibl >= nr_cpu_ids)
- break;
- if (!cpumask_test_and_clear_cpu(sibl, nmsk))
- continue;
- cpumask_set_cpu(sibl, irqmsk);
- cpus_per_vec--;
- }
- }
-}
-
-static cpumask_var_t *alloc_node_to_cpumask(void)
-{
- cpumask_var_t *masks;
- int node;
-
- masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL);
- if (!masks)
- return NULL;
-
- for (node = 0; node < nr_node_ids; node++) {
- if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL))
- goto out_unwind;
- }
-
- return masks;
-
-out_unwind:
- while (--node >= 0)
- free_cpumask_var(masks[node]);
- kfree(masks);
- return NULL;
-}
-
-static void free_node_to_cpumask(cpumask_var_t *masks)
-{
- int node;
-
- for (node = 0; node < nr_node_ids; node++)
- free_cpumask_var(masks[node]);
- kfree(masks);
-}
-
-static void build_node_to_cpumask(cpumask_var_t *masks)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
-}
-
-static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
- const struct cpumask *mask, nodemask_t *nodemsk)
-{
- int n, nodes = 0;
-
- /* Calculate the number of nodes in the supplied affinity mask */
- for_each_node(n) {
- if (cpumask_intersects(mask, node_to_cpumask[n])) {
- node_set(n, *nodemsk);
- nodes++;
- }
- }
- return nodes;
-}
-
-struct node_vectors {
- unsigned id;
-
- union {
- unsigned nvectors;
- unsigned ncpus;
- };
-};
-
-static int ncpus_cmp_func(const void *l, const void *r)
-{
- const struct node_vectors *ln = l;
- const struct node_vectors *rn = r;
-
- return ln->ncpus - rn->ncpus;
-}
-
-/*
- * Allocate vector number for each node, so that for each node:
- *
- * 1) the allocated number is >= 1
- *
- * 2) the allocated numbver is <= active CPU number of this node
- *
- * The actual allocated total vectors may be less than @numvecs when
- * active total CPU number is less than @numvecs.
- *
- * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
- * for each node.
- */
-static void alloc_nodes_vectors(unsigned int numvecs,
- cpumask_var_t *node_to_cpumask,
- const struct cpumask *cpu_mask,
- const nodemask_t nodemsk,
- struct cpumask *nmsk,
- struct node_vectors *node_vectors)
-{
- unsigned n, remaining_ncpus = 0;
-
- for (n = 0; n < nr_node_ids; n++) {
- node_vectors[n].id = n;
- node_vectors[n].ncpus = UINT_MAX;
- }
-
- for_each_node_mask(n, nodemsk) {
- unsigned ncpus;
-
- cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
- ncpus = cpumask_weight(nmsk);
-
- if (!ncpus)
- continue;
- remaining_ncpus += ncpus;
- node_vectors[n].ncpus = ncpus;
- }
-
- numvecs = min_t(unsigned, remaining_ncpus, numvecs);
-
- sort(node_vectors, nr_node_ids, sizeof(node_vectors[0]),
- ncpus_cmp_func, NULL);
-
- /*
- * Allocate vectors for each node according to the ratio of this
- * node's nr_cpus to remaining un-assigned ncpus. 'numvecs' is
- * bigger than number of active numa nodes. Always start the
- * allocation from the node with minimized nr_cpus.
- *
- * This way guarantees that each active node gets allocated at
- * least one vector, and the theory is simple: over-allocation
- * is only done when this node is assigned by one vector, so
- * other nodes will be allocated >= 1 vector, since 'numvecs' is
- * bigger than number of numa nodes.
- *
- * One perfect invariant is that number of allocated vectors for
- * each node is <= CPU count of this node:
- *
- * 1) suppose there are two nodes: A and B
- * ncpu(X) is CPU count of node X
- * vecs(X) is the vector count allocated to node X via this
- * algorithm
- *
- * ncpu(A) <= ncpu(B)
- * ncpu(A) + ncpu(B) = N
- * vecs(A) + vecs(B) = V
- *
- * vecs(A) = max(1, round_down(V * ncpu(A) / N))
- * vecs(B) = V - vecs(A)
- *
- * both N and V are integer, and 2 <= V <= N, suppose
- * V = N - delta, and 0 <= delta <= N - 2
- *
- * 2) obviously vecs(A) <= ncpu(A) because:
- *
- * if vecs(A) is 1, then vecs(A) <= ncpu(A) given
- * ncpu(A) >= 1
- *
- * otherwise,
- * vecs(A) <= V * ncpu(A) / N <= ncpu(A), given V <= N
- *
- * 3) prove how vecs(B) <= ncpu(B):
- *
- * if round_down(V * ncpu(A) / N) == 0, vecs(B) won't be
- * over-allocated, so vecs(B) <= ncpu(B),
- *
- * otherwise:
- *
- * vecs(A) =
- * round_down(V * ncpu(A) / N) =
- * round_down((N - delta) * ncpu(A) / N) =
- * round_down((N * ncpu(A) - delta * ncpu(A)) / N) >=
- * round_down((N * ncpu(A) - delta * N) / N) =
- * cpu(A) - delta
- *
- * then:
- *
- * vecs(A) - V >= ncpu(A) - delta - V
- * =>
- * V - vecs(A) <= V + delta - ncpu(A)
- * =>
- * vecs(B) <= N - ncpu(A)
- * =>
- * vecs(B) <= cpu(B)
- *
- * For nodes >= 3, it can be thought as one node and another big
- * node given that is exactly what this algorithm is implemented,
- * and we always re-calculate 'remaining_ncpus' & 'numvecs', and
- * finally for each node X: vecs(X) <= ncpu(X).
- *
- */
- for (n = 0; n < nr_node_ids; n++) {
- unsigned nvectors, ncpus;
-
- if (node_vectors[n].ncpus == UINT_MAX)
- continue;
-
- WARN_ON_ONCE(numvecs == 0);
-
- ncpus = node_vectors[n].ncpus;
- nvectors = max_t(unsigned, 1,
- numvecs * ncpus / remaining_ncpus);
- WARN_ON_ONCE(nvectors > ncpus);
-
- node_vectors[n].nvectors = nvectors;
-
- remaining_ncpus -= ncpus;
- numvecs -= nvectors;
- }
-}
-
-static int __irq_build_affinity_masks(unsigned int startvec,
- unsigned int numvecs,
- unsigned int firstvec,
- cpumask_var_t *node_to_cpumask,
- const struct cpumask *cpu_mask,
- struct cpumask *nmsk,
- struct irq_affinity_desc *masks)
-{
- unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0;
- unsigned int last_affv = firstvec + numvecs;
- unsigned int curvec = startvec;
- nodemask_t nodemsk = NODE_MASK_NONE;
- struct node_vectors *node_vectors;
-
- if (!cpumask_weight(cpu_mask))
- return 0;
-
- nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);
-
- /*
- * If the number of nodes in the mask is greater than or equal the
- * number of vectors we just spread the vectors across the nodes.
- */
- if (numvecs <= nodes) {
- for_each_node_mask(n, nodemsk) {
- cpumask_or(&masks[curvec].mask, &masks[curvec].mask,
- node_to_cpumask[n]);
- if (++curvec == last_affv)
- curvec = firstvec;
- }
- return numvecs;
- }
-
- node_vectors = kcalloc(nr_node_ids,
- sizeof(struct node_vectors),
- GFP_KERNEL);
- if (!node_vectors)
- return -ENOMEM;
-
- /* allocate vector number for each node */
- alloc_nodes_vectors(numvecs, node_to_cpumask, cpu_mask,
- nodemsk, nmsk, node_vectors);
-
- for (i = 0; i < nr_node_ids; i++) {
- unsigned int ncpus, v;
- struct node_vectors *nv = &node_vectors[i];
-
- if (nv->nvectors == UINT_MAX)
- continue;
-
- /* Get the cpus on this node which are in the mask */
- cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]);
- ncpus = cpumask_weight(nmsk);
- if (!ncpus)
- continue;
-
- WARN_ON_ONCE(nv->nvectors > ncpus);
-
- /* Account for rounding errors */
- extra_vecs = ncpus - nv->nvectors * (ncpus / nv->nvectors);
-
- /* Spread allocated vectors on CPUs of the current node */
- for (v = 0; v < nv->nvectors; v++, curvec++) {
- cpus_per_vec = ncpus / nv->nvectors;
-
- /* Account for extra vectors to compensate rounding errors */
- if (extra_vecs) {
- cpus_per_vec++;
- --extra_vecs;
- }
-
- /*
- * wrapping has to be considered given 'startvec'
- * may start anywhere
- */
- if (curvec >= last_affv)
- curvec = firstvec;
- irq_spread_init_one(&masks[curvec].mask, nmsk,
- cpus_per_vec);
- }
- done += nv->nvectors;
- }
- kfree(node_vectors);
- return done;
-}
-
-/*
- * build affinity in two stages:
- * 1) spread present CPU on these vectors
- * 2) spread other possible CPUs on these vectors
- */
-static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
- unsigned int firstvec,
- struct irq_affinity_desc *masks)
-{
- unsigned int curvec = startvec, nr_present = 0, nr_others = 0;
- cpumask_var_t *node_to_cpumask;
- cpumask_var_t nmsk, npresmsk;
- int ret = -ENOMEM;
-
- if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
- return ret;
-
- if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
- goto fail_nmsk;
-
- node_to_cpumask = alloc_node_to_cpumask();
- if (!node_to_cpumask)
- goto fail_npresmsk;
-
- /* Stabilize the cpumasks */
- cpus_read_lock();
- build_node_to_cpumask(node_to_cpumask);
-
- /* Spread on present CPUs starting from affd->pre_vectors */
- ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
- node_to_cpumask, cpu_present_mask,
- nmsk, masks);
- if (ret < 0)
- goto fail_build_affinity;
- nr_present = ret;
-
- /*
- * Spread on non present CPUs starting from the next vector to be
- * handled. If the spreading of present CPUs already exhausted the
- * vector space, assign the non present CPUs to the already spread
- * out vectors.
- */
- if (nr_present >= numvecs)
- curvec = firstvec;
- else
- curvec = firstvec + nr_present;
- cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
- ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
- node_to_cpumask, npresmsk, nmsk,
- masks);
- if (ret >= 0)
- nr_others = ret;
-
- fail_build_affinity:
- cpus_read_unlock();
-
- if (ret >= 0)
- WARN_ON(nr_present + nr_others < numvecs);
-
- free_node_to_cpumask(node_to_cpumask);
-
- fail_npresmsk:
- free_cpumask_var(npresmsk);
-
- fail_nmsk:
- free_cpumask_var(nmsk);
- return ret < 0 ? ret : 0;
-}
+#include <linux/group_cpus.h>
static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
{
@@ -459,17 +69,20 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
* have multiple sets, build each sets affinity mask separately.
*/
for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
- unsigned int this_vecs = affd->set_size[i];
- int ret;
+ unsigned int nr_masks, this_vecs = affd->set_size[i];
+ struct cpumask *result = group_cpus_evenly(this_vecs, &nr_masks);
- ret = irq_build_affinity_masks(curvec, this_vecs,
- curvec, masks);
- if (ret) {
+ if (!result) {
kfree(masks);
return NULL;
}
- curvec += this_vecs;
- usedvecs += this_vecs;
+
+ for (int j = 0; j < nr_masks; j++)
+ cpumask_copy(&masks[curvec + j].mask, &result[j]);
+ kfree(result);
+
+ curvec += nr_masks;
+ usedvecs += nr_masks;
}
/* Fill out vectors at the end that don't need affinity */
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index ae60cae24e9a..d0af8a8b3ae6 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -43,18 +43,16 @@ unsigned long probe_irq_on(void)
* flush such a longstanding irq before considering it as spurious.
*/
for_each_irq_desc_reverse(i, desc) {
- raw_spin_lock_irq(&desc->lock);
+ guard(raw_spinlock_irq)(&desc->lock);
if (!desc->action && irq_settings_can_probe(desc)) {
/*
* Some chips need to know about probing in
* progress:
*/
if (desc->irq_data.chip->irq_set_type)
- desc->irq_data.chip->irq_set_type(&desc->irq_data,
- IRQ_TYPE_PROBE);
+ desc->irq_data.chip->irq_set_type(&desc->irq_data, IRQ_TYPE_PROBE);
irq_activate_and_startup(desc, IRQ_NORESEND);
}
- raw_spin_unlock_irq(&desc->lock);
}
/* Wait for longstanding interrupts to trigger. */
@@ -66,13 +64,12 @@ unsigned long probe_irq_on(void)
* happened in the previous stage, it may have masked itself)
*/
for_each_irq_desc_reverse(i, desc) {
- raw_spin_lock_irq(&desc->lock);
+ guard(raw_spinlock_irq)(&desc->lock);
if (!desc->action && irq_settings_can_probe(desc)) {
desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
if (irq_activate_and_startup(desc, IRQ_NORESEND))
desc->istate |= IRQS_PENDING;
}
- raw_spin_unlock_irq(&desc->lock);
}
/*
@@ -84,18 +81,16 @@ unsigned long probe_irq_on(void)
* Now filter out any obviously spurious interrupts
*/
for_each_irq_desc(i, desc) {
- raw_spin_lock_irq(&desc->lock);
-
+ guard(raw_spinlock_irq)(&desc->lock);
if (desc->istate & IRQS_AUTODETECT) {
/* It triggered already - consider it spurious. */
if (!(desc->istate & IRQS_WAITING)) {
desc->istate &= ~IRQS_AUTODETECT;
irq_shutdown_and_deactivate(desc);
- } else
- if (i < 32)
- mask |= 1 << i;
+ } else if (i < 32) {
+ mask |= 1 << i;
+ }
}
- raw_spin_unlock_irq(&desc->lock);
}
return mask;
@@ -121,7 +116,7 @@ unsigned int probe_irq_mask(unsigned long val)
int i;
for_each_irq_desc(i, desc) {
- raw_spin_lock_irq(&desc->lock);
+ guard(raw_spinlock_irq)(&desc->lock);
if (desc->istate & IRQS_AUTODETECT) {
if (i < 16 && !(desc->istate & IRQS_WAITING))
mask |= 1 << i;
@@ -129,7 +124,6 @@ unsigned int probe_irq_mask(unsigned long val)
desc->istate &= ~IRQS_AUTODETECT;
irq_shutdown_and_deactivate(desc);
}
- raw_spin_unlock_irq(&desc->lock);
}
mutex_unlock(&probing_active);
@@ -160,8 +154,7 @@ int probe_irq_off(unsigned long val)
struct irq_desc *desc;
for_each_irq_desc(i, desc) {
- raw_spin_lock_irq(&desc->lock);
-
+ guard(raw_spinlock_irq)(&desc->lock);
if (desc->istate & IRQS_AUTODETECT) {
if (!(desc->istate & IRQS_WAITING)) {
if (!nr_of_irqs)
@@ -171,7 +164,6 @@ int probe_irq_off(unsigned long val)
desc->istate &= ~IRQS_AUTODETECT;
irq_shutdown_and_deactivate(desc);
}
- raw_spin_unlock_irq(&desc->lock);
}
mutex_unlock(&probing_active);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index c09324663088..678f094d261a 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -34,101 +34,80 @@ struct irqaction chained_action = {
};
/**
- * irq_set_chip - set the irq chip for an irq
- * @irq: irq number
- * @chip: pointer to irq chip description structure
+ * irq_set_chip - set the irq chip for an irq
+ * @irq: irq number
+ * @chip: pointer to irq chip description structure
*/
-int irq_set_chip(unsigned int irq, struct irq_chip *chip)
+int irq_set_chip(unsigned int irq, const struct irq_chip *chip)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
+ int ret = -EINVAL;
- if (!desc)
- return -EINVAL;
-
- if (!chip)
- chip = &no_irq_chip;
-
- desc->irq_data.chip = chip;
- irq_put_desc_unlock(desc, flags);
- /*
- * For !CONFIG_SPARSE_IRQ make the irq show up in
- * allocated_irqs.
- */
- irq_mark_irq(irq);
- return 0;
+ scoped_irqdesc_get_and_lock(irq, 0) {
+ scoped_irqdesc->irq_data.chip = (struct irq_chip *)(chip ?: &no_irq_chip);
+ ret = 0;
+ }
+ /* For !CONFIG_SPARSE_IRQ make the irq show up in allocated_irqs. */
+ if (!ret)
+ irq_mark_irq(irq);
+ return ret;
}
EXPORT_SYMBOL(irq_set_chip);
/**
- * irq_set_irq_type - set the irq trigger type for an irq
- * @irq: irq number
- * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
+ * irq_set_irq_type - set the irq trigger type for an irq
+ * @irq: irq number
+ * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
*/
int irq_set_irq_type(unsigned int irq, unsigned int type)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
- int ret = 0;
-
- if (!desc)
- return -EINVAL;
-
- ret = __irq_set_trigger(desc, type);
- irq_put_desc_busunlock(desc, flags);
- return ret;
+ scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL)
+ return __irq_set_trigger(scoped_irqdesc, type);
+ return -EINVAL;
}
EXPORT_SYMBOL(irq_set_irq_type);
/**
- * irq_set_handler_data - set irq handler data for an irq
- * @irq: Interrupt number
- * @data: Pointer to interrupt specific data
+ * irq_set_handler_data - set irq handler data for an irq
+ * @irq: Interrupt number
+ * @data: Pointer to interrupt specific data
*
- * Set the hardware irq controller data for an irq
+ * Set the hardware irq controller data for an irq
*/
int irq_set_handler_data(unsigned int irq, void *data)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
-
- if (!desc)
- return -EINVAL;
- desc->irq_common_data.handler_data = data;
- irq_put_desc_unlock(desc, flags);
- return 0;
+ scoped_irqdesc_get_and_lock(irq, 0) {
+ scoped_irqdesc->irq_common_data.handler_data = data;
+ return 0;
+ }
+ return -EINVAL;
}
EXPORT_SYMBOL(irq_set_handler_data);
/**
- * irq_set_msi_desc_off - set MSI descriptor data for an irq at offset
- * @irq_base: Interrupt number base
- * @irq_offset: Interrupt number offset
- * @entry: Pointer to MSI descriptor data
+ * irq_set_msi_desc_off - set MSI descriptor data for an irq at offset
+ * @irq_base: Interrupt number base
+ * @irq_offset: Interrupt number offset
+ * @entry: Pointer to MSI descriptor data
*
- * Set the MSI descriptor entry for an irq at offset
+ * Set the MSI descriptor entry for an irq at offset
*/
-int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset,
- struct msi_desc *entry)
-{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq_base + irq_offset, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
-
- if (!desc)
- return -EINVAL;
- desc->irq_common_data.msi_desc = entry;
- if (entry && !irq_offset)
- entry->irq = irq_base;
- irq_put_desc_unlock(desc, flags);
- return 0;
+int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset, struct msi_desc *entry)
+{
+ scoped_irqdesc_get_and_lock(irq_base + irq_offset, IRQ_GET_DESC_CHECK_GLOBAL) {
+ scoped_irqdesc->irq_common_data.msi_desc = entry;
+ if (entry && !irq_offset)
+ entry->irq = irq_base;
+ return 0;
+ }
+ return -EINVAL;
}
/**
- * irq_set_msi_desc - set MSI descriptor data for an irq
- * @irq: Interrupt number
- * @entry: Pointer to MSI descriptor data
+ * irq_set_msi_desc - set MSI descriptor data for an irq
+ * @irq: Interrupt number
+ * @entry: Pointer to MSI descriptor data
*
- * Set the MSI descriptor entry for an irq
+ * Set the MSI descriptor entry for an irq
*/
int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
{
@@ -136,22 +115,19 @@ int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
}
/**
- * irq_set_chip_data - set irq chip data for an irq
- * @irq: Interrupt number
- * @data: Pointer to chip specific data
+ * irq_set_chip_data - set irq chip data for an irq
+ * @irq: Interrupt number
+ * @data: Pointer to chip specific data
*
- * Set the hardware irq chip data for an irq
+ * Set the hardware irq chip data for an irq
*/
int irq_set_chip_data(unsigned int irq, void *data)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
-
- if (!desc)
- return -EINVAL;
- desc->irq_data.chip_data = data;
- irq_put_desc_unlock(desc, flags);
- return 0;
+ scoped_irqdesc_get_and_lock(irq, 0) {
+ scoped_irqdesc->irq_data.chip_data = data;
+ return 0;
+ }
+ return -EINVAL;
}
EXPORT_SYMBOL(irq_set_chip_data);
@@ -191,7 +167,8 @@ enum {
#ifdef CONFIG_SMP
static int
-__irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
+__irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff,
+ bool force)
{
struct irq_data *d = irq_desc_get_irq_data(desc);
@@ -200,7 +177,7 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
irqd_clr_managed_shutdown(d);
- if (cpumask_any_and(aff, cpu_online_mask) >= nr_cpu_ids) {
+ if (!cpumask_intersects(aff, cpu_online_mask)) {
/*
* Catch code which fiddles with enable_irq() on a managed
* and potentially shutdown IRQ. Chained interrupt
@@ -225,14 +202,51 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
return IRQ_STARTUP_ABORT;
return IRQ_STARTUP_MANAGED;
}
+
+void irq_startup_managed(struct irq_desc *desc)
+{
+ struct irq_data *d = irq_desc_get_irq_data(desc);
+
+ /*
+ * Clear managed-shutdown flag, so we don't repeat managed-startup for
+ * multiple hotplugs, and cause imbalanced disable depth.
+ */
+ irqd_clr_managed_shutdown(d);
+
+ /*
+ * Only start it up when the disable depth is 1, so that a disable,
+ * hotunplug, hotplug sequence does not end up enabling it during
+ * hotplug unconditionally.
+ */
+ desc->depth--;
+ if (!desc->depth)
+ irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
+}
+
#else
static __always_inline int
-__irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
+__irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff,
+ bool force)
{
return IRQ_STARTUP_NORMAL;
}
#endif
+static void irq_enable(struct irq_desc *desc)
+{
+ if (!irqd_irq_disabled(&desc->irq_data)) {
+ unmask_irq(desc);
+ } else {
+ irq_state_clr_disabled(desc);
+ if (desc->irq_data.chip->irq_enable) {
+ desc->irq_data.chip->irq_enable(&desc->irq_data);
+ irq_state_clr_masked(desc);
+ } else {
+ unmask_irq(desc);
+ }
+ }
+}
+
static int __irq_startup(struct irq_desc *desc)
{
struct irq_data *d = irq_desc_get_irq_data(desc);
@@ -255,7 +269,7 @@ static int __irq_startup(struct irq_desc *desc)
int irq_startup(struct irq_desc *desc, bool resend, bool force)
{
struct irq_data *d = irq_desc_get_irq_data(desc);
- struct cpumask *aff = irq_data_get_affinity_mask(d);
+ const struct cpumask *aff = irq_data_get_affinity_mask(d);
int ret = 0;
desc->depth = 0;
@@ -276,6 +290,7 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force)
ret = __irq_startup(desc);
break;
case IRQ_STARTUP_ABORT:
+ desc->depth = 1;
irqd_set_managed_shutdown(d);
return 0;
}
@@ -307,7 +322,14 @@ static void __irq_disable(struct irq_desc *desc, bool mask);
void irq_shutdown(struct irq_desc *desc)
{
if (irqd_is_started(&desc->irq_data)) {
- desc->depth = 1;
+ clear_irq_resend(desc);
+ /*
+ * Increment disable depth, so that a managed shutdown on
+ * CPU hotunplug preserves the actual disabled state when the
+ * CPU comes back online. See irq_startup_managed().
+ */
+ desc->depth++;
+
if (desc->irq_data.chip->irq_shutdown) {
desc->irq_data.chip->irq_shutdown(&desc->irq_data);
irq_state_set_disabled(desc);
@@ -332,21 +354,6 @@ void irq_shutdown_and_deactivate(struct irq_desc *desc)
irq_domain_deactivate_irq(&desc->irq_data);
}
-void irq_enable(struct irq_desc *desc)
-{
- if (!irqd_irq_disabled(&desc->irq_data)) {
- unmask_irq(desc);
- } else {
- irq_state_clr_disabled(desc);
- if (desc->irq_data.chip->irq_enable) {
- desc->irq_data.chip->irq_enable(&desc->irq_data);
- irq_state_clr_masked(desc);
- } else {
- unmask_irq(desc);
- }
- }
-}
-
static void __irq_disable(struct irq_desc *desc, bool mask)
{
if (irqd_irq_disabled(&desc->irq_data)) {
@@ -450,67 +457,33 @@ void unmask_threaded_irq(struct irq_desc *desc)
unmask_irq(desc);
}
-/*
- * handle_nested_irq - Handle a nested irq from a irq thread
- * @irq: the interrupt number
- *
- * Handle interrupts which are nested into a threaded interrupt
- * handler. The handler function is called inside the calling
- * threads context.
- */
-void handle_nested_irq(unsigned int irq)
+/* Busy wait until INPROGRESS is cleared */
+static bool irq_wait_on_inprogress(struct irq_desc *desc)
{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irqaction *action;
- irqreturn_t action_ret;
-
- might_sleep();
-
- raw_spin_lock_irq(&desc->lock);
-
- desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+ if (IS_ENABLED(CONFIG_SMP)) {
+ do {
+ raw_spin_unlock(&desc->lock);
+ while (irqd_irq_inprogress(&desc->irq_data))
+ cpu_relax();
+ raw_spin_lock(&desc->lock);
+ } while (irqd_irq_inprogress(&desc->irq_data));
- action = desc->action;
- if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) {
- desc->istate |= IRQS_PENDING;
- goto out_unlock;
+ /* Might have been disabled in meantime */
+ return !irqd_irq_disabled(&desc->irq_data) && desc->action;
}
-
- kstat_incr_irqs_this_cpu(desc);
- irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
- raw_spin_unlock_irq(&desc->lock);
-
- action_ret = IRQ_NONE;
- for_each_action_of_desc(desc, action)
- action_ret |= action->thread_fn(action->irq, action->dev_id);
-
- if (!irq_settings_no_debug(desc))
- note_interrupt(desc, action_ret);
-
- raw_spin_lock_irq(&desc->lock);
- irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
-
-out_unlock:
- raw_spin_unlock_irq(&desc->lock);
+ return false;
}
-EXPORT_SYMBOL_GPL(handle_nested_irq);
-static bool irq_check_poll(struct irq_desc *desc)
+static bool irq_can_handle_pm(struct irq_desc *desc)
{
- if (!(desc->istate & IRQS_POLL_INPROGRESS))
- return false;
- return irq_wait_for_poll(desc);
-}
-
-static bool irq_may_run(struct irq_desc *desc)
-{
- unsigned int mask = IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED;
+ struct irq_data *irqd = &desc->irq_data;
+ const struct cpumask *aff;
/*
* If the interrupt is not in progress and is not an armed
* wakeup interrupt, proceed.
*/
- if (!irqd_has_set(&desc->irq_data, mask))
+ if (!irqd_has_set(irqd, IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED))
return true;
/*
@@ -518,86 +491,167 @@ static bool irq_may_run(struct irq_desc *desc)
* and suspended, disable it and notify the pm core about the
* event.
*/
- if (irq_pm_check_wakeup(desc))
+ if (unlikely(irqd_has_set(irqd, IRQD_WAKEUP_ARMED))) {
+ irq_pm_handle_wakeup(desc);
+ return false;
+ }
+
+ /* Check whether the interrupt is polled on another CPU */
+ if (unlikely(desc->istate & IRQS_POLL_INPROGRESS)) {
+ if (WARN_ONCE(irq_poll_cpu == smp_processor_id(),
+ "irq poll in progress on cpu %d for irq %d\n",
+ smp_processor_id(), desc->irq_data.irq))
+ return false;
+ return irq_wait_on_inprogress(desc);
+ }
+
+ /* The below works only for single target interrupts */
+ if (!IS_ENABLED(CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK) ||
+ !irqd_is_single_target(irqd) || desc->handle_irq != handle_edge_irq)
return false;
/*
- * Handle a potential concurrent poll on a different core.
+ * If the interrupt affinity was moved to this CPU and the
+ * interrupt is currently handled on the previous target CPU, then
+ * busy wait for INPROGRESS to be cleared. Otherwise for edge type
+ * interrupts the handler might get stuck on the previous target:
+ *
+ * CPU 0 CPU 1 (new target)
+ * handle_edge_irq()
+ * repeat:
+ * handle_event() handle_edge_irq()
+ * if (INPROGESS) {
+ * set(PENDING);
+ * mask();
+ * return;
+ * }
+ * if (PENDING) {
+ * clear(PENDING);
+ * unmask();
+ * goto repeat;
+ * }
+ *
+ * This happens when the device raises interrupts with a high rate
+ * and always before handle_event() completes and the CPU0 handler
+ * can clear INPROGRESS. This has been observed in virtual machines.
*/
- return irq_check_poll(desc);
+ aff = irq_data_get_effective_affinity_mask(irqd);
+ if (cpumask_first(aff) != smp_processor_id())
+ return false;
+ return irq_wait_on_inprogress(desc);
+}
+
+static inline bool irq_can_handle_actions(struct irq_desc *desc)
+{
+ desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+
+ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
+ desc->istate |= IRQS_PENDING;
+ return false;
+ }
+ return true;
+}
+
+static inline bool irq_can_handle(struct irq_desc *desc)
+{
+ if (!irq_can_handle_pm(desc))
+ return false;
+
+ return irq_can_handle_actions(desc);
}
/**
- * handle_simple_irq - Simple and software-decoded IRQs.
- * @desc: the interrupt description structure for this irq
+ * handle_nested_irq - Handle a nested irq from a irq thread
+ * @irq: the interrupt number
*
- * Simple interrupts are either sent from a demultiplexing interrupt
- * handler or come from hardware, where no interrupt hardware control
- * is necessary.
- *
- * Note: The caller is expected to handle the ack, clear, mask and
- * unmask issues if necessary.
+ * Handle interrupts which are nested into a threaded interrupt
+ * handler. The handler function is called inside the calling threads
+ * context.
*/
-void handle_simple_irq(struct irq_desc *desc)
+void handle_nested_irq(unsigned int irq)
{
- raw_spin_lock(&desc->lock);
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irqaction *action;
+ irqreturn_t action_ret;
- if (!irq_may_run(desc))
- goto out_unlock;
+ might_sleep();
- desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+ scoped_guard(raw_spinlock_irq, &desc->lock) {
+ if (!irq_can_handle_actions(desc))
+ return;
- if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
- desc->istate |= IRQS_PENDING;
- goto out_unlock;
+ action = desc->action;
+ kstat_incr_irqs_this_cpu(desc);
+ atomic_inc(&desc->threads_active);
}
+ action_ret = IRQ_NONE;
+ for_each_action_of_desc(desc, action)
+ action_ret |= action->thread_fn(action->irq, action->dev_id);
+
+ if (!irq_settings_no_debug(desc))
+ note_interrupt(desc, action_ret);
+
+ wake_threads_waitq(desc);
+}
+EXPORT_SYMBOL_GPL(handle_nested_irq);
+
+/**
+ * handle_simple_irq - Simple and software-decoded IRQs.
+ * @desc: the interrupt description structure for this irq
+ *
+ * Simple interrupts are either sent from a demultiplexing interrupt
+ * handler or come from hardware, where no interrupt hardware control is
+ * necessary.
+ *
+ * Note: The caller is expected to handle the ack, clear, mask and unmask
+ * issues if necessary.
+ */
+void handle_simple_irq(struct irq_desc *desc)
+{
+ guard(raw_spinlock)(&desc->lock);
+
+ if (!irq_can_handle_pm(desc)) {
+ if (irqd_needs_resend_when_in_progress(&desc->irq_data))
+ desc->istate |= IRQS_PENDING;
+ return;
+ }
+
+ if (!irq_can_handle_actions(desc))
+ return;
+
kstat_incr_irqs_this_cpu(desc);
handle_irq_event(desc);
-
-out_unlock:
- raw_spin_unlock(&desc->lock);
}
EXPORT_SYMBOL_GPL(handle_simple_irq);
/**
- * handle_untracked_irq - Simple and software-decoded IRQs.
- * @desc: the interrupt description structure for this irq
+ * handle_untracked_irq - Simple and software-decoded IRQs.
+ * @desc: the interrupt description structure for this irq
*
- * Untracked interrupts are sent from a demultiplexing interrupt
- * handler when the demultiplexer does not know which device it its
- * multiplexed irq domain generated the interrupt. IRQ's handled
- * through here are not subjected to stats tracking, randomness, or
- * spurious interrupt detection.
+ * Untracked interrupts are sent from a demultiplexing interrupt handler
+ * when the demultiplexer does not know which device it its multiplexed irq
+ * domain generated the interrupt. IRQ's handled through here are not
+ * subjected to stats tracking, randomness, or spurious interrupt
+ * detection.
*
- * Note: Like handle_simple_irq, the caller is expected to handle
- * the ack, clear, mask and unmask issues if necessary.
+ * Note: Like handle_simple_irq, the caller is expected to handle the ack,
+ * clear, mask and unmask issues if necessary.
*/
void handle_untracked_irq(struct irq_desc *desc)
{
- raw_spin_lock(&desc->lock);
-
- if (!irq_may_run(desc))
- goto out_unlock;
-
- desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+ scoped_guard(raw_spinlock, &desc->lock) {
+ if (!irq_can_handle(desc))
+ return;
- if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
- desc->istate |= IRQS_PENDING;
- goto out_unlock;
+ desc->istate &= ~IRQS_PENDING;
+ irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
}
- desc->istate &= ~IRQS_PENDING;
- irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
- raw_spin_unlock(&desc->lock);
-
__handle_irq_event_percpu(desc);
- raw_spin_lock(&desc->lock);
- irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
-
-out_unlock:
- raw_spin_unlock(&desc->lock);
+ scoped_guard(raw_spinlock, &desc->lock)
+ irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
}
EXPORT_SYMBOL_GPL(handle_untracked_irq);
@@ -620,40 +674,26 @@ static void cond_unmask_irq(struct irq_desc *desc)
}
/**
- * handle_level_irq - Level type irq handler
- * @desc: the interrupt description structure for this irq
+ * handle_level_irq - Level type irq handler
+ * @desc: the interrupt description structure for this irq
*
- * Level type interrupts are active as long as the hardware line has
- * the active level. This may require to mask the interrupt and unmask
- * it after the associated handler has acknowledged the device, so the
- * interrupt line is back to inactive.
+ * Level type interrupts are active as long as the hardware line has the
+ * active level. This may require to mask the interrupt and unmask it after
+ * the associated handler has acknowledged the device, so the interrupt
+ * line is back to inactive.
*/
void handle_level_irq(struct irq_desc *desc)
{
- raw_spin_lock(&desc->lock);
+ guard(raw_spinlock)(&desc->lock);
mask_ack_irq(desc);
- if (!irq_may_run(desc))
- goto out_unlock;
-
- desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
-
- /*
- * If its disabled or no action available
- * keep it masked and get out of here
- */
- if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
- desc->istate |= IRQS_PENDING;
- goto out_unlock;
- }
+ if (!irq_can_handle(desc))
+ return;
kstat_incr_irqs_this_cpu(desc);
handle_irq_event(desc);
cond_unmask_irq(desc);
-
-out_unlock:
- raw_spin_unlock(&desc->lock);
}
EXPORT_SYMBOL_GPL(handle_level_irq);
@@ -678,34 +718,43 @@ static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip)
}
}
+static inline void cond_eoi_irq(struct irq_chip *chip, struct irq_data *data)
+{
+ if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED))
+ chip->irq_eoi(data);
+}
+
/**
- * handle_fasteoi_irq - irq handler for transparent controllers
- * @desc: the interrupt description structure for this irq
+ * handle_fasteoi_irq - irq handler for transparent controllers
+ * @desc: the interrupt description structure for this irq
*
- * Only a single callback will be issued to the chip: an ->eoi()
- * call when the interrupt has been serviced. This enables support
- * for modern forms of interrupt handlers, which handle the flow
- * details in hardware, transparently.
+ * Only a single callback will be issued to the chip: an ->eoi() call when
+ * the interrupt has been serviced. This enables support for modern forms
+ * of interrupt handlers, which handle the flow details in hardware,
+ * transparently.
*/
void handle_fasteoi_irq(struct irq_desc *desc)
{
struct irq_chip *chip = desc->irq_data.chip;
- raw_spin_lock(&desc->lock);
-
- if (!irq_may_run(desc))
- goto out;
-
- desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+ guard(raw_spinlock)(&desc->lock);
/*
- * If its disabled or no action available
- * then mask it and get out of here:
+ * When an affinity change races with IRQ handling, the next interrupt
+ * can arrive on the new CPU before the original CPU has completed
+ * handling the previous one - it may need to be resent.
*/
- if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
- desc->istate |= IRQS_PENDING;
+ if (!irq_can_handle_pm(desc)) {
+ if (irqd_needs_resend_when_in_progress(&desc->irq_data))
+ desc->istate |= IRQS_PENDING;
+ cond_eoi_irq(chip, &desc->irq_data);
+ return;
+ }
+
+ if (!irq_can_handle_actions(desc)) {
mask_irq(desc);
- goto out;
+ cond_eoi_irq(chip, &desc->irq_data);
+ return;
}
kstat_incr_irqs_this_cpu(desc);
@@ -716,12 +765,11 @@ void handle_fasteoi_irq(struct irq_desc *desc)
cond_unmask_eoi_irq(desc, chip);
- raw_spin_unlock(&desc->lock);
- return;
-out:
- if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED))
- chip->irq_eoi(&desc->irq_data);
- raw_spin_unlock(&desc->lock);
+ /*
+ * When the race described above happens this will resend the interrupt.
+ */
+ if (unlikely(desc->istate & IRQS_PENDING))
+ check_irq_resend(desc, false);
}
EXPORT_SYMBOL_GPL(handle_fasteoi_irq);
@@ -759,40 +807,27 @@ void handle_fasteoi_nmi(struct irq_desc *desc)
EXPORT_SYMBOL_GPL(handle_fasteoi_nmi);
/**
- * handle_edge_irq - edge type IRQ handler
- * @desc: the interrupt description structure for this irq
+ * handle_edge_irq - edge type IRQ handler
+ * @desc: the interrupt description structure for this irq
*
- * Interrupt occurs on the falling and/or rising edge of a hardware
- * signal. The occurrence is latched into the irq controller hardware
- * and must be acked in order to be reenabled. After the ack another
- * interrupt can happen on the same source even before the first one
- * is handled by the associated event handler. If this happens it
- * might be necessary to disable (mask) the interrupt depending on the
- * controller hardware. This requires to reenable the interrupt inside
- * of the loop which handles the interrupts which have arrived while
- * the handler was running. If all pending interrupts are handled, the
- * loop is left.
+ * Interrupt occurs on the falling and/or rising edge of a hardware
+ * signal. The occurrence is latched into the irq controller hardware and
+ * must be acked in order to be reenabled. After the ack another interrupt
+ * can happen on the same source even before the first one is handled by
+ * the associated event handler. If this happens it might be necessary to
+ * disable (mask) the interrupt depending on the controller hardware. This
+ * requires to reenable the interrupt inside of the loop which handles the
+ * interrupts which have arrived while the handler was running. If all
+ * pending interrupts are handled, the loop is left.
*/
void handle_edge_irq(struct irq_desc *desc)
{
- raw_spin_lock(&desc->lock);
-
- desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
-
- if (!irq_may_run(desc)) {
- desc->istate |= IRQS_PENDING;
- mask_ack_irq(desc);
- goto out_unlock;
- }
+ guard(raw_spinlock)(&desc->lock);
- /*
- * If its disabled or no action available then mask it and get
- * out of here.
- */
- if (irqd_irq_disabled(&desc->irq_data) || !desc->action) {
+ if (!irq_can_handle(desc)) {
desc->istate |= IRQS_PENDING;
mask_ack_irq(desc);
- goto out_unlock;
+ return;
}
kstat_incr_irqs_this_cpu(desc);
@@ -803,7 +838,7 @@ void handle_edge_irq(struct irq_desc *desc)
do {
if (unlikely(!desc->action)) {
mask_irq(desc);
- goto out_unlock;
+ return;
}
/*
@@ -819,61 +854,10 @@ void handle_edge_irq(struct irq_desc *desc)
handle_irq_event(desc);
- } while ((desc->istate & IRQS_PENDING) &&
- !irqd_irq_disabled(&desc->irq_data));
-
-out_unlock:
- raw_spin_unlock(&desc->lock);
+ } while ((desc->istate & IRQS_PENDING) && !irqd_irq_disabled(&desc->irq_data));
}
EXPORT_SYMBOL(handle_edge_irq);
-#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER
-/**
- * handle_edge_eoi_irq - edge eoi type IRQ handler
- * @desc: the interrupt description structure for this irq
- *
- * Similar as the above handle_edge_irq, but using eoi and w/o the
- * mask/unmask logic.
- */
-void handle_edge_eoi_irq(struct irq_desc *desc)
-{
- struct irq_chip *chip = irq_desc_get_chip(desc);
-
- raw_spin_lock(&desc->lock);
-
- desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
-
- if (!irq_may_run(desc)) {
- desc->istate |= IRQS_PENDING;
- goto out_eoi;
- }
-
- /*
- * If its disabled or no action available then mask it and get
- * out of here.
- */
- if (irqd_irq_disabled(&desc->irq_data) || !desc->action) {
- desc->istate |= IRQS_PENDING;
- goto out_eoi;
- }
-
- kstat_incr_irqs_this_cpu(desc);
-
- do {
- if (unlikely(!desc->action))
- goto out_eoi;
-
- handle_irq_event(desc);
-
- } while ((desc->istate & IRQS_PENDING) &&
- !irqd_irq_disabled(&desc->irq_data));
-
-out_eoi:
- chip->irq_eoi(&desc->irq_data);
- raw_spin_unlock(&desc->lock);
-}
-#endif
-
/**
* handle_percpu_irq - Per CPU local irq handler
* @desc: the interrupt description structure for this irq
@@ -913,8 +897,9 @@ void handle_percpu_irq(struct irq_desc *desc)
void handle_percpu_devid_irq(struct irq_desc *desc)
{
struct irq_chip *chip = irq_desc_get_chip(desc);
- struct irqaction *action = desc->action;
unsigned int irq = irq_desc_get_irq(desc);
+ unsigned int cpu = smp_processor_id();
+ struct irqaction *action;
irqreturn_t res;
/*
@@ -926,12 +911,15 @@ void handle_percpu_devid_irq(struct irq_desc *desc)
if (chip->irq_ack)
chip->irq_ack(&desc->irq_data);
+ for (action = desc->action; action; action = action->next)
+ if (cpumask_test_cpu(cpu, action->affinity))
+ break;
+
if (likely(action)) {
trace_irq_handler_entry(irq, action);
res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id));
trace_irq_handler_exit(irq, action, res);
} else {
- unsigned int cpu = smp_processor_id();
bool enabled = cpumask_test_cpu(cpu, desc->percpu_enabled);
if (enabled)
@@ -945,31 +933,6 @@ void handle_percpu_devid_irq(struct irq_desc *desc)
chip->irq_eoi(&desc->irq_data);
}
-/**
- * handle_percpu_devid_fasteoi_nmi - Per CPU local NMI handler with per cpu
- * dev ids
- * @desc: the interrupt description structure for this irq
- *
- * Similar to handle_fasteoi_nmi, but handling the dev_id cookie
- * as a percpu pointer.
- */
-void handle_percpu_devid_fasteoi_nmi(struct irq_desc *desc)
-{
- struct irq_chip *chip = irq_desc_get_chip(desc);
- struct irqaction *action = desc->action;
- unsigned int irq = irq_desc_get_irq(desc);
- irqreturn_t res;
-
- __kstat_incr_irqs_this_cpu(desc);
-
- trace_irq_handler_entry(irq, action);
- res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id));
- trace_irq_handler_exit(irq, action, res);
-
- if (chip->irq_eoi)
- chip->irq_eoi(&desc->irq_data);
-}
-
static void
__irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
int is_chained, const char *name)
@@ -1009,8 +972,10 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
if (desc->irq_data.chip != &no_irq_chip)
mask_ack_irq(desc);
irq_state_set_disabled(desc);
- if (is_chained)
+ if (is_chained) {
desc->action = NULL;
+ WARN_ON(irq_chip_pm_put(irq_desc_get_irq_data(desc)));
+ }
desc->depth = 1;
}
desc->handle_irq = handle;
@@ -1036,44 +1001,33 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
irq_settings_set_norequest(desc);
irq_settings_set_nothread(desc);
desc->action = &chained_action;
+ WARN_ON(irq_chip_pm_get(irq_desc_get_irq_data(desc)));
irq_activate_and_startup(desc, IRQ_RESEND);
}
}
-void
-__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
- const char *name)
+void __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
+ const char *name)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0);
-
- if (!desc)
- return;
-
- __irq_do_set_handler(desc, handle, is_chained, name);
- irq_put_desc_busunlock(desc, flags);
+ scoped_irqdesc_get_and_buslock(irq, 0)
+ __irq_do_set_handler(scoped_irqdesc, handle, is_chained, name);
}
EXPORT_SYMBOL_GPL(__irq_set_handler);
-void
-irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle,
- void *data)
+void irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle,
+ void *data)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0);
-
- if (!desc)
- return;
-
- desc->irq_common_data.handler_data = data;
- __irq_do_set_handler(desc, handle, 1, NULL);
+ scoped_irqdesc_get_and_buslock(irq, 0) {
+ struct irq_desc *desc = scoped_irqdesc;
- irq_put_desc_busunlock(desc, flags);
+ desc->irq_common_data.handler_data = data;
+ __irq_do_set_handler(desc, handle, 1, NULL);
+ }
}
EXPORT_SYMBOL_GPL(irq_set_chained_handler_and_data);
void
-irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
+irq_set_chip_and_handler_name(unsigned int irq, const struct irq_chip *chip,
irq_flow_handler_t handle, const char *name)
{
irq_set_chip(irq, chip);
@@ -1083,40 +1037,34 @@ EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name);
void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
{
- unsigned long flags, trigger, tmp;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
-
- if (!desc)
- return;
-
- /*
- * Warn when a driver sets the no autoenable flag on an already
- * active interrupt.
- */
- WARN_ON_ONCE(!desc->depth && (set & _IRQ_NOAUTOEN));
-
- irq_settings_clr_and_set(desc, clr, set);
+ scoped_irqdesc_get_and_lock(irq, 0) {
+ struct irq_desc *desc = scoped_irqdesc;
+ unsigned long trigger, tmp;
+ /*
+ * Warn when a driver sets the no autoenable flag on an already
+ * active interrupt.
+ */
+ WARN_ON_ONCE(!desc->depth && (set & _IRQ_NOAUTOEN));
- trigger = irqd_get_trigger_type(&desc->irq_data);
+ irq_settings_clr_and_set(desc, clr, set);
- irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
- IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT);
- if (irq_settings_has_no_balance_set(desc))
- irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
- if (irq_settings_is_per_cpu(desc))
- irqd_set(&desc->irq_data, IRQD_PER_CPU);
- if (irq_settings_can_move_pcntxt(desc))
- irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT);
- if (irq_settings_is_level(desc))
- irqd_set(&desc->irq_data, IRQD_LEVEL);
+ trigger = irqd_get_trigger_type(&desc->irq_data);
- tmp = irq_settings_get_trigger_mask(desc);
- if (tmp != IRQ_TYPE_NONE)
- trigger = tmp;
+ irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
+ IRQD_TRIGGER_MASK | IRQD_LEVEL);
+ if (irq_settings_has_no_balance_set(desc))
+ irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
+ if (irq_settings_is_per_cpu(desc))
+ irqd_set(&desc->irq_data, IRQD_PER_CPU);
+ if (irq_settings_is_level(desc))
+ irqd_set(&desc->irq_data, IRQD_LEVEL);
- irqd_set(&desc->irq_data, trigger);
+ tmp = irq_settings_get_trigger_mask(desc);
+ if (tmp != IRQ_TYPE_NONE)
+ trigger = tmp;
- irq_put_desc_unlock(desc, flags);
+ irqd_set(&desc->irq_data, trigger);
+ }
}
EXPORT_SYMBOL_GPL(irq_modify_status);
@@ -1129,25 +1077,21 @@ EXPORT_SYMBOL_GPL(irq_modify_status);
*/
void irq_cpu_online(void)
{
- struct irq_desc *desc;
- struct irq_chip *chip;
- unsigned long flags;
unsigned int irq;
for_each_active_irq(irq) {
- desc = irq_to_desc(irq);
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_chip *chip;
+
if (!desc)
continue;
- raw_spin_lock_irqsave(&desc->lock, flags);
-
+ guard(raw_spinlock_irqsave)(&desc->lock);
chip = irq_data_get_irq_chip(&desc->irq_data);
if (chip && chip->irq_cpu_online &&
(!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) ||
!irqd_irq_disabled(&desc->irq_data)))
chip->irq_cpu_online(&desc->irq_data);
-
- raw_spin_unlock_irqrestore(&desc->lock, flags);
}
}
@@ -1159,25 +1103,21 @@ void irq_cpu_online(void)
*/
void irq_cpu_offline(void)
{
- struct irq_desc *desc;
- struct irq_chip *chip;
- unsigned long flags;
unsigned int irq;
for_each_active_irq(irq) {
- desc = irq_to_desc(irq);
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_chip *chip;
+
if (!desc)
continue;
- raw_spin_lock_irqsave(&desc->lock, flags);
-
+ guard(raw_spinlock_irqsave)(&desc->lock);
chip = irq_data_get_irq_chip(&desc->irq_data);
if (chip && chip->irq_cpu_offline &&
(!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) ||
!irqd_irq_disabled(&desc->irq_data)))
chip->irq_cpu_offline(&desc->irq_data);
-
- raw_spin_unlock_irqrestore(&desc->lock, flags);
}
}
#endif
@@ -1186,102 +1126,69 @@ void irq_cpu_offline(void)
#ifdef CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS
/**
- * handle_fasteoi_ack_irq - irq handler for edge hierarchy
- * stacked on transparent controllers
+ * handle_fasteoi_ack_irq - irq handler for edge hierarchy stacked on
+ * transparent controllers
*
- * @desc: the interrupt description structure for this irq
+ * @desc: the interrupt description structure for this irq
*
- * Like handle_fasteoi_irq(), but for use with hierarchy where
- * the irq_chip also needs to have its ->irq_ack() function
- * called.
+ * Like handle_fasteoi_irq(), but for use with hierarchy where the irq_chip
+ * also needs to have its ->irq_ack() function called.
*/
void handle_fasteoi_ack_irq(struct irq_desc *desc)
{
struct irq_chip *chip = desc->irq_data.chip;
- raw_spin_lock(&desc->lock);
-
- if (!irq_may_run(desc))
- goto out;
+ guard(raw_spinlock)(&desc->lock);
- desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+ if (!irq_can_handle_pm(desc)) {
+ cond_eoi_irq(chip, &desc->irq_data);
+ return;
+ }
- /*
- * If its disabled or no action available
- * then mask it and get out of here:
- */
- if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
- desc->istate |= IRQS_PENDING;
+ if (unlikely(!irq_can_handle_actions(desc))) {
mask_irq(desc);
- goto out;
+ cond_eoi_irq(chip, &desc->irq_data);
+ return;
}
kstat_incr_irqs_this_cpu(desc);
if (desc->istate & IRQS_ONESHOT)
mask_irq(desc);
- /* Start handling the irq */
desc->irq_data.chip->irq_ack(&desc->irq_data);
handle_irq_event(desc);
cond_unmask_eoi_irq(desc, chip);
-
- raw_spin_unlock(&desc->lock);
- return;
-out:
- if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED))
- chip->irq_eoi(&desc->irq_data);
- raw_spin_unlock(&desc->lock);
}
EXPORT_SYMBOL_GPL(handle_fasteoi_ack_irq);
/**
- * handle_fasteoi_mask_irq - irq handler for level hierarchy
- * stacked on transparent controllers
+ * handle_fasteoi_mask_irq - irq handler for level hierarchy stacked on
+ * transparent controllers
*
- * @desc: the interrupt description structure for this irq
+ * @desc: the interrupt description structure for this irq
*
- * Like handle_fasteoi_irq(), but for use with hierarchy where
- * the irq_chip also needs to have its ->irq_mask_ack() function
- * called.
+ * Like handle_fasteoi_irq(), but for use with hierarchy where the irq_chip
+ * also needs to have its ->irq_mask_ack() function called.
*/
void handle_fasteoi_mask_irq(struct irq_desc *desc)
{
struct irq_chip *chip = desc->irq_data.chip;
- raw_spin_lock(&desc->lock);
+ guard(raw_spinlock)(&desc->lock);
mask_ack_irq(desc);
- if (!irq_may_run(desc))
- goto out;
-
- desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
-
- /*
- * If its disabled or no action available
- * then mask it and get out of here:
- */
- if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
- desc->istate |= IRQS_PENDING;
- mask_irq(desc);
- goto out;
+ if (!irq_can_handle(desc)) {
+ cond_eoi_irq(chip, &desc->irq_data);
+ return;
}
kstat_incr_irqs_this_cpu(desc);
- if (desc->istate & IRQS_ONESHOT)
- mask_irq(desc);
handle_irq_event(desc);
cond_unmask_eoi_irq(desc, chip);
-
- raw_spin_unlock(&desc->lock);
- return;
-out:
- if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED))
- chip->irq_eoi(&desc->irq_data);
- raw_spin_unlock(&desc->lock);
}
EXPORT_SYMBOL_GPL(handle_fasteoi_mask_irq);
@@ -1332,6 +1239,43 @@ int irq_chip_get_parent_state(struct irq_data *data,
EXPORT_SYMBOL_GPL(irq_chip_get_parent_state);
/**
+ * irq_chip_shutdown_parent - Shutdown the parent interrupt
+ * @data: Pointer to interrupt specific data
+ *
+ * Invokes the irq_shutdown() callback of the parent if available or falls
+ * back to irq_chip_disable_parent().
+ */
+void irq_chip_shutdown_parent(struct irq_data *data)
+{
+ struct irq_data *parent = data->parent_data;
+
+ if (parent->chip->irq_shutdown)
+ parent->chip->irq_shutdown(parent);
+ else
+ irq_chip_disable_parent(data);
+}
+EXPORT_SYMBOL_GPL(irq_chip_shutdown_parent);
+
+/**
+ * irq_chip_startup_parent - Startup the parent interrupt
+ * @data: Pointer to interrupt specific data
+ *
+ * Invokes the irq_startup() callback of the parent if available or falls
+ * back to irq_chip_enable_parent().
+ */
+unsigned int irq_chip_startup_parent(struct irq_data *data)
+{
+ struct irq_data *parent = data->parent_data;
+
+ if (parent->chip->irq_startup)
+ return parent->chip->irq_startup(parent);
+
+ irq_chip_enable_parent(data);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_chip_startup_parent);
+
+/**
* irq_chip_enable_parent - Enable the parent interrupt (defaults to unmask if
* NULL)
* @data: Pointer to interrupt specific data
@@ -1516,7 +1460,8 @@ int irq_chip_request_resources_parent(struct irq_data *data)
if (data->chip->irq_request_resources)
return data->chip->irq_request_resources(data);
- return -ENOSYS;
+ /* no error on missing optional irq_chip::irq_request_resources */
+ return 0;
}
EXPORT_SYMBOL_GPL(irq_chip_request_resources_parent);
@@ -1558,6 +1503,14 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
return 0;
}
+static struct device *irq_get_pm_device(struct irq_data *data)
+{
+ if (data->domain)
+ return data->domain->pm_dev;
+
+ return NULL;
+}
+
/**
* irq_chip_pm_get - Enable power for an IRQ chip
* @data: Pointer to interrupt specific data
@@ -1567,17 +1520,13 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
*/
int irq_chip_pm_get(struct irq_data *data)
{
- int retval;
+ struct device *dev = irq_get_pm_device(data);
+ int retval = 0;
- if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) {
- retval = pm_runtime_get_sync(data->chip->parent_device);
- if (retval < 0) {
- pm_runtime_put_noidle(data->chip->parent_device);
- return retval;
- }
- }
+ if (IS_ENABLED(CONFIG_PM) && dev)
+ retval = pm_runtime_resume_and_get(dev);
- return 0;
+ return retval;
}
/**
@@ -1590,10 +1539,11 @@ int irq_chip_pm_get(struct irq_data *data)
*/
int irq_chip_pm_put(struct irq_data *data)
{
+ struct device *dev = irq_get_pm_device(data);
int retval = 0;
- if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device)
- retval = pm_runtime_put(data->chip->parent_device);
+ if (IS_ENABLED(CONFIG_PM) && dev)
+ retval = pm_runtime_put(dev);
return (retval < 0) ? retval : 0;
}
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 39a41c56ad4f..755346ea9819 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -37,7 +37,7 @@ static inline bool irq_needs_fixup(struct irq_data *d)
* has been removed from the online mask already.
*/
if (cpumask_any_but(m, cpu) < nr_cpu_ids &&
- cpumask_any_and(m, cpu_online_mask) >= nr_cpu_ids) {
+ !cpumask_intersects(m, cpu_online_mask)) {
/*
* If this happens then there was a missed IRQ fixup at some
* point. Warn about it and enforce fixup.
@@ -70,6 +70,14 @@ static bool migrate_one_irq(struct irq_desc *desc)
}
/*
+ * Complete an eventually pending irq move cleanup. If this
+ * interrupt was moved in hard irq context, then the vectors need
+ * to be cleaned up. It can't wait until this interrupt actually
+ * happens and this CPU was involved.
+ */
+ irq_force_complete_move(desc);
+
+ /*
* No move required, if:
* - Interrupt is per cpu
* - Interrupt is not started
@@ -88,14 +96,6 @@ static bool migrate_one_irq(struct irq_desc *desc)
}
/*
- * Complete an eventually pending irq move cleanup. If this
- * interrupt was moved in hard irq context, then the vectors need
- * to be cleaned up. It can't wait until this interrupt actually
- * happens and this CPU was involved.
- */
- irq_force_complete_move(desc);
-
- /*
* If there is a setaffinity pending, then try to reuse the pending
* mask, so the last change of the affinity does not get lost. If
* there is no move pending or the pending mask does not contain
@@ -110,7 +110,7 @@ static bool migrate_one_irq(struct irq_desc *desc)
if (maskchip && chip->irq_mask)
chip->irq_mask(d);
- if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
+ if (!cpumask_intersects(affinity, cpu_online_mask)) {
/*
* If the interrupt is managed, then shut it down and leave
* the affinity untouched.
@@ -130,6 +130,22 @@ static bool migrate_one_irq(struct irq_desc *desc)
* CPU.
*/
err = irq_do_set_affinity(d, affinity, false);
+
+ /*
+ * If there are online CPUs in the affinity mask, but they have no
+ * vectors left to make the migration work, try to break the
+ * affinity by migrating to any online CPU.
+ */
+ if (err == -ENOSPC && !irqd_affinity_is_managed(d) && affinity != cpu_online_mask) {
+ pr_debug("IRQ%u: set affinity failed for %*pbl, re-try with online CPUs\n",
+ d->irq, cpumask_pr_args(affinity));
+
+ affinity = cpu_online_mask;
+ brokeaff = true;
+
+ err = irq_do_set_affinity(d, affinity, false);
+ }
+
if (err) {
pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n",
d->irq, err);
@@ -161,9 +177,8 @@ void irq_migrate_all_off_this_cpu(void)
bool affinity_broken;
desc = irq_to_desc(irq);
- raw_spin_lock(&desc->lock);
- affinity_broken = migrate_one_irq(desc);
- raw_spin_unlock(&desc->lock);
+ scoped_guard(raw_spinlock, &desc->lock)
+ affinity_broken = migrate_one_irq(desc);
if (affinity_broken) {
pr_debug_ratelimited("IRQ %u: no longer affine to CPU%u\n",
@@ -176,10 +191,10 @@ static bool hk_should_isolate(struct irq_data *data, unsigned int cpu)
{
const struct cpumask *hk_mask;
- if (!housekeeping_enabled(HK_FLAG_MANAGED_IRQ))
+ if (!housekeeping_enabled(HK_TYPE_MANAGED_IRQ))
return false;
- hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ);
+ hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ);
if (cpumask_subset(irq_data_get_effective_affinity_mask(data), hk_mask))
return false;
@@ -195,10 +210,8 @@ static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu)
!irq_data_get_irq_chip(data) || !cpumask_test_cpu(cpu, affinity))
return;
- if (irqd_is_managed_and_shutdown(data)) {
- irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
- return;
- }
+ if (irqd_is_managed_and_shutdown(data))
+ irq_startup_managed(desc);
/*
* If the interrupt can only be directed to a single target
@@ -223,9 +236,8 @@ int irq_affinity_online_cpu(unsigned int cpu)
irq_lock_sparse();
for_each_active_irq(irq) {
desc = irq_to_desc(irq);
- raw_spin_lock_irq(&desc->lock);
- irq_restore_affinity_of_irq(desc, cpu);
- raw_spin_unlock_irq(&desc->lock);
+ scoped_guard(raw_spinlock_irq, &desc->lock)
+ irq_restore_affinity_of_irq(desc, cpu);
}
irq_unlock_sparse();
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index e4cff358b437..3527defd2890 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -9,14 +9,8 @@
static struct dentry *irq_dir;
-struct irq_bit_descr {
- unsigned int mask;
- char *name;
-};
-#define BIT_MASK_DESCR(m) { .mask = m, .name = #m }
-
-static void irq_debug_show_bits(struct seq_file *m, int ind, unsigned int state,
- const struct irq_bit_descr *sd, int size)
+void irq_debug_show_bits(struct seq_file *m, int ind, unsigned int state,
+ const struct irq_bit_descr *sd, int size)
{
int i;
@@ -30,7 +24,7 @@ static void irq_debug_show_bits(struct seq_file *m, int ind, unsigned int state,
static void irq_debug_show_masks(struct seq_file *m, struct irq_desc *desc)
{
struct irq_data *data = irq_desc_get_irq_data(desc);
- struct cpumask *msk;
+ const struct cpumask *msk;
msk = irq_data_get_affinity_mask(data);
seq_printf(m, "affinity: %*pbl\n", cpumask_pr_args(msk));
@@ -58,6 +52,8 @@ static const struct irq_bit_descr irqchip_flags[] = {
BIT_MASK_DESCR(IRQCHIP_SUPPORTS_LEVEL_MSI),
BIT_MASK_DESCR(IRQCHIP_SUPPORTS_NMI),
BIT_MASK_DESCR(IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND),
+ BIT_MASK_DESCR(IRQCHIP_IMMUTABLE),
+ BIT_MASK_DESCR(IRQCHIP_MOVE_DEFERRED),
};
static void
@@ -69,8 +65,12 @@ irq_debug_show_chip(struct seq_file *m, struct irq_data *data, int ind)
seq_printf(m, "chip: None\n");
return;
}
- seq_printf(m, "%*schip: %s\n", ind, "", chip->name);
- seq_printf(m, "%*sflags: 0x%lx\n", ind + 1, "", chip->flags);
+ seq_printf(m, "%*schip: ", ind, "");
+ if (chip->irq_print_chip)
+ chip->irq_print_chip(data, m);
+ else
+ seq_printf(m, "%s", chip->name);
+ seq_printf(m, "\n%*sflags: 0x%lx\n", ind + 1, "", chip->flags);
irq_debug_show_bits(m, ind, chip->flags, irqchip_flags,
ARRAY_SIZE(irqchip_flags));
}
@@ -109,14 +109,12 @@ static const struct irq_bit_descr irqdata_states[] = {
BIT_MASK_DESCR(IRQD_NO_BALANCING),
BIT_MASK_DESCR(IRQD_SINGLE_TARGET),
- BIT_MASK_DESCR(IRQD_MOVE_PCNTXT),
BIT_MASK_DESCR(IRQD_AFFINITY_SET),
BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING),
BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED),
BIT_MASK_DESCR(IRQD_AFFINITY_ON_ACTIVATE),
BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN),
BIT_MASK_DESCR(IRQD_CAN_RESERVE),
- BIT_MASK_DESCR(IRQD_MSI_NOMASK_QUIRK),
BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU),
@@ -128,6 +126,8 @@ static const struct irq_bit_descr irqdata_states[] = {
BIT_MASK_DESCR(IRQD_HANDLE_ENFORCE_IRQCTX),
BIT_MASK_DESCR(IRQD_IRQ_ENABLED_ON_SUSPEND),
+
+ BIT_MASK_DESCR(IRQD_RESEND_WHEN_IN_PROGRESS),
};
static const struct irq_bit_descr irqdesc_states[] = {
@@ -160,7 +160,7 @@ static int irq_debug_show(struct seq_file *m, void *p)
struct irq_desc *desc = m->private;
struct irq_data *data;
- raw_spin_lock_irq(&desc->lock);
+ guard(raw_spinlock_irq)(&desc->lock);
data = irq_desc_get_irq_data(desc);
seq_printf(m, "handler: %ps\n", desc->handle_irq);
seq_printf(m, "device: %s\n", desc->dev_name);
@@ -178,7 +178,6 @@ static int irq_debug_show(struct seq_file *m, void *p)
seq_printf(m, "node: %d\n", irq_data_get_node(data));
irq_debug_show_masks(m, desc);
irq_debug_show_data(m, data, 0);
- raw_spin_unlock_irq(&desc->lock);
return 0;
}
@@ -226,12 +225,12 @@ void irq_debugfs_copy_devname(int irq, struct device *dev)
void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc)
{
- char name [10];
+ char name [12];
if (!irq_dir || !desc || desc->debugfs_file)
return;
- sprintf(name, "%d", irq);
+ sprintf(name, "%u", irq);
desc->debugfs_file = debugfs_create_file(name, 0644, irq_dir, desc,
&dfs_irq_ops);
}
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index f6e5515ee077..b41188698622 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/module.h>
#include <linux/interrupt.h>
+#include <linux/irqdomain.h>
#include <linux/device.h>
#include <linux/gfp.h>
#include <linux/irq.h>
@@ -29,29 +30,22 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
return this->irq == match->irq && this->dev_id == match->dev_id;
}
-/**
- * devm_request_threaded_irq - allocate an interrupt line for a managed device
- * @dev: device to request interrupt for
- * @irq: Interrupt line to allocate
- * @handler: Function to be called when the IRQ occurs
- * @thread_fn: function to be called in a threaded interrupt context. NULL
- * for devices which handle everything in @handler
- * @irqflags: Interrupt type flags
- * @devname: An ascii name for the claiming device, dev_name(dev) if NULL
- * @dev_id: A cookie passed back to the handler function
- *
- * Except for the extra @dev argument, this function takes the
- * same arguments and performs the same function as
- * request_threaded_irq(). IRQs requested with this function will be
- * automatically freed on driver detach.
- *
- * If an IRQ allocated with this function needs to be freed
- * separately, devm_free_irq() must be used.
- */
-int devm_request_threaded_irq(struct device *dev, unsigned int irq,
- irq_handler_t handler, irq_handler_t thread_fn,
- unsigned long irqflags, const char *devname,
- void *dev_id)
+static int devm_request_result(struct device *dev, int rc, unsigned int irq,
+ irq_handler_t handler, irq_handler_t thread_fn,
+ const char *devname)
+{
+ if (rc >= 0)
+ return rc;
+
+ return dev_err_probe(dev, rc, "request_irq(%u) %ps %ps %s\n",
+ irq, handler, thread_fn, devname ? : "");
+}
+
+static int __devm_request_threaded_irq(struct device *dev, unsigned int irq,
+ irq_handler_t handler,
+ irq_handler_t thread_fn,
+ unsigned long irqflags,
+ const char *devname, void *dev_id)
{
struct irq_devres *dr;
int rc;
@@ -77,28 +71,48 @@ int devm_request_threaded_irq(struct device *dev, unsigned int irq,
return 0;
}
-EXPORT_SYMBOL(devm_request_threaded_irq);
/**
- * devm_request_any_context_irq - allocate an interrupt line for a managed device
- * @dev: device to request interrupt for
- * @irq: Interrupt line to allocate
- * @handler: Function to be called when the IRQ occurs
- * @irqflags: Interrupt type flags
- * @devname: An ascii name for the claiming device, dev_name(dev) if NULL
- * @dev_id: A cookie passed back to the handler function
+ * devm_request_threaded_irq - allocate an interrupt line for a managed device with error logging
+ * @dev: Device to request interrupt for
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the interrupt occurs
+ * @thread_fn: Function to be called in a threaded interrupt context. NULL
+ * for devices which handle everything in @handler
+ * @irqflags: Interrupt type flags
+ * @devname: An ascii name for the claiming device, dev_name(dev) if NULL
+ * @dev_id: A cookie passed back to the handler function
*
- * Except for the extra @dev argument, this function takes the
- * same arguments and performs the same function as
- * request_any_context_irq(). IRQs requested with this function will be
- * automatically freed on driver detach.
+ * Except for the extra @dev argument, this function takes the same
+ * arguments and performs the same function as request_threaded_irq().
+ * Interrupts requested with this function will be automatically freed on
+ * driver detach.
+ *
+ * If an interrupt allocated with this function needs to be freed
+ * separately, devm_free_irq() must be used.
+ *
+ * When the request fails, an error message is printed with contextual
+ * information (device name, interrupt number, handler functions and
+ * error code). Don't add extra error messages at the call sites.
*
- * If an IRQ allocated with this function needs to be freed
- * separately, devm_free_irq() must be used.
+ * Return: 0 on success or a negative error number.
*/
-int devm_request_any_context_irq(struct device *dev, unsigned int irq,
- irq_handler_t handler, unsigned long irqflags,
- const char *devname, void *dev_id)
+int devm_request_threaded_irq(struct device *dev, unsigned int irq,
+ irq_handler_t handler, irq_handler_t thread_fn,
+ unsigned long irqflags, const char *devname,
+ void *dev_id)
+{
+ int rc = __devm_request_threaded_irq(dev, irq, handler, thread_fn,
+ irqflags, devname, dev_id);
+
+ return devm_request_result(dev, rc, irq, handler, thread_fn, devname);
+}
+EXPORT_SYMBOL(devm_request_threaded_irq);
+
+static int __devm_request_any_context_irq(struct device *dev, unsigned int irq,
+ irq_handler_t handler,
+ unsigned long irqflags,
+ const char *devname, void *dev_id)
{
struct irq_devres *dr;
int rc;
@@ -123,6 +137,40 @@ int devm_request_any_context_irq(struct device *dev, unsigned int irq,
return rc;
}
+
+/**
+ * devm_request_any_context_irq - allocate an interrupt line for a managed device with error logging
+ * @dev: Device to request interrupt for
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the interrupt occurs
+ * @irqflags: Interrupt type flags
+ * @devname: An ascii name for the claiming device, dev_name(dev) if NULL
+ * @dev_id: A cookie passed back to the handler function
+ *
+ * Except for the extra @dev argument, this function takes the same
+ * arguments and performs the same function as request_any_context_irq().
+ * Interrupts requested with this function will be automatically freed on
+ * driver detach.
+ *
+ * If an interrupt allocated with this function needs to be freed
+ * separately, devm_free_irq() must be used.
+ *
+ * When the request fails, an error message is printed with contextual
+ * information (device name, interrupt number, handler functions and
+ * error code). Don't add extra error messages at the call sites.
+ *
+ * Return: IRQC_IS_HARDIRQ or IRQC_IS_NESTED on success, or a negative error
+ * number.
+ */
+int devm_request_any_context_irq(struct device *dev, unsigned int irq,
+ irq_handler_t handler, unsigned long irqflags,
+ const char *devname, void *dev_id)
+{
+ int rc = __devm_request_any_context_irq(dev, irq, handler, irqflags,
+ devname, dev_id);
+
+ return devm_request_result(dev, rc, irq, handler, NULL, devname);
+}
EXPORT_SYMBOL(devm_request_any_context_irq);
/**
@@ -140,9 +188,8 @@ void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id)
{
struct irq_devres match_data = { irq, dev_id };
- WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match,
+ WARN_ON(devres_release(dev, devm_irq_release, devm_irq_match,
&match_data));
- free_irq(irq, dev_id);
}
EXPORT_SYMBOL(devm_free_irq);
@@ -282,3 +329,43 @@ int devm_irq_setup_generic_chip(struct device *dev, struct irq_chip_generic *gc,
}
EXPORT_SYMBOL_GPL(devm_irq_setup_generic_chip);
#endif /* CONFIG_GENERIC_IRQ_CHIP */
+
+#ifdef CONFIG_IRQ_DOMAIN
+static void devm_irq_domain_remove(struct device *dev, void *res)
+{
+ struct irq_domain **domain = res;
+
+ irq_domain_remove(*domain);
+}
+
+/**
+ * devm_irq_domain_instantiate() - Instantiate a new irq domain data for a
+ * managed device.
+ * @dev: Device to instantiate the domain for
+ * @info: Domain information pointer pointing to the information for this
+ * domain
+ *
+ * Return: A pointer to the instantiated irq domain or an ERR_PTR value.
+ */
+struct irq_domain *devm_irq_domain_instantiate(struct device *dev,
+ const struct irq_domain_info *info)
+{
+ struct irq_domain *domain;
+ struct irq_domain **dr;
+
+ dr = devres_alloc(devm_irq_domain_remove, sizeof(*dr), GFP_KERNEL);
+ if (!dr)
+ return ERR_PTR(-ENOMEM);
+
+ domain = irq_domain_instantiate(info);
+ if (!IS_ERR(domain)) {
+ *dr = domain;
+ devres_add(dev, dr);
+ } else {
+ devres_free(dr);
+ }
+
+ return domain;
+}
+EXPORT_SYMBOL_GPL(devm_irq_domain_instantiate);
+#endif /* CONFIG_IRQ_DOMAIN */
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index f0862eb6b506..3cd0c40282c0 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -40,10 +40,9 @@ void irq_gc_mask_disable_reg(struct irq_data *d)
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
irq_reg_writel(gc, mask, ct->regs.disable);
*ct->mask_cache &= ~mask;
- irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_mask_disable_reg);
@@ -60,10 +59,9 @@ void irq_gc_mask_set_bit(struct irq_data *d)
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
*ct->mask_cache |= mask;
irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask);
- irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit);
@@ -80,10 +78,9 @@ void irq_gc_mask_clr_bit(struct irq_data *d)
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
*ct->mask_cache &= ~mask;
irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask);
- irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit);
@@ -100,10 +97,9 @@ void irq_gc_unmask_enable_reg(struct irq_data *d)
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
irq_reg_writel(gc, mask, ct->regs.enable);
*ct->mask_cache |= mask;
- irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_unmask_enable_reg);
@@ -117,9 +113,8 @@ void irq_gc_ack_set_bit(struct irq_data *d)
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
irq_reg_writel(gc, mask, ct->regs.ack);
- irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit);
@@ -133,9 +128,8 @@ void irq_gc_ack_clr_bit(struct irq_data *d)
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = ~d->mask;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
irq_reg_writel(gc, mask, ct->regs.ack);
- irq_gc_unlock(gc);
}
/**
@@ -156,12 +150,12 @@ void irq_gc_mask_disable_and_ack_set(struct irq_data *d)
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
irq_reg_writel(gc, mask, ct->regs.disable);
*ct->mask_cache &= ~mask;
irq_reg_writel(gc, mask, ct->regs.ack);
- irq_gc_unlock(gc);
}
+EXPORT_SYMBOL_GPL(irq_gc_mask_disable_and_ack_set);
/**
* irq_gc_eoi - EOI interrupt
@@ -173,9 +167,8 @@ void irq_gc_eoi(struct irq_data *d)
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
irq_reg_writel(gc, mask, ct->regs.eoi);
- irq_gc_unlock(gc);
}
/**
@@ -195,12 +188,11 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on)
if (!(mask & gc->wake_enabled))
return -EINVAL;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
if (on)
gc->wake_active |= mask;
else
gc->wake_active &= ~mask;
- irq_gc_unlock(gc);
return 0;
}
EXPORT_SYMBOL_GPL(irq_gc_set_wake);
@@ -219,11 +211,15 @@ void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
int num_ct, unsigned int irq_base,
void __iomem *reg_base, irq_flow_handler_t handler)
{
+ struct irq_chip_type *ct = gc->chip_types;
+ int i;
+
raw_spin_lock_init(&gc->lock);
gc->num_ct = num_ct;
gc->irq_base = irq_base;
gc->reg_base = reg_base;
- gc->chip_types->chip.name = name;
+ for (i = 0; i < num_ct; i++)
+ ct[i].chip.name = name;
gc->chip_types->handler = handler;
}
@@ -272,51 +268,45 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags)
}
/**
- * __irq_alloc_domain_generic_chips - Allocate generic chips for an irq domain
- * @d: irq domain for which to allocate chips
- * @irqs_per_chip: Number of interrupts each chip handles (max 32)
- * @num_ct: Number of irq_chip_type instances associated with this
- * @name: Name of the irq chip
- * @handler: Default flow handler associated with these chips
- * @clr: IRQ_* bits to clear in the mapping function
- * @set: IRQ_* bits to set in the mapping function
- * @gcflags: Generic chip specific setup flags
+ * irq_domain_alloc_generic_chips - Allocate generic chips for an irq domain
+ * @d: irq domain for which to allocate chips
+ * @info: Generic chip information
+ *
+ * Return: 0 on success, negative error code on failure
*/
-int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
- int num_ct, const char *name,
- irq_flow_handler_t handler,
- unsigned int clr, unsigned int set,
- enum irq_gc_flags gcflags)
+int irq_domain_alloc_generic_chips(struct irq_domain *d,
+ const struct irq_domain_chip_generic_info *info)
{
struct irq_domain_chip_generic *dgc;
struct irq_chip_generic *gc;
- unsigned long flags;
int numchips, i;
size_t dgc_sz;
size_t gc_sz;
size_t sz;
void *tmp;
+ int ret;
if (d->gc)
return -EBUSY;
- numchips = DIV_ROUND_UP(d->revmap_size, irqs_per_chip);
+ numchips = DIV_ROUND_UP(d->revmap_size, info->irqs_per_chip);
if (!numchips)
return -EINVAL;
/* Allocate a pointer, generic chip and chiptypes for each chip */
- gc_sz = struct_size(gc, chip_types, num_ct);
+ gc_sz = struct_size(gc, chip_types, info->num_ct);
dgc_sz = struct_size(dgc, gc, numchips);
sz = dgc_sz + numchips * gc_sz;
tmp = dgc = kzalloc(sz, GFP_KERNEL);
if (!dgc)
return -ENOMEM;
- dgc->irqs_per_chip = irqs_per_chip;
+ dgc->irqs_per_chip = info->irqs_per_chip;
dgc->num_chips = numchips;
- dgc->irq_flags_to_set = set;
- dgc->irq_flags_to_clear = clr;
- dgc->gc_flags = gcflags;
+ dgc->irq_flags_to_set = info->irq_flags_to_set;
+ dgc->irq_flags_to_clear = info->irq_flags_to_clear;
+ dgc->gc_flags = info->gc_flags;
+ dgc->exit = info->exit;
d->gc = dgc;
/* Calc pointer to the first generic chip */
@@ -324,22 +314,91 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
for (i = 0; i < numchips; i++) {
/* Store the pointer to the generic chip */
dgc->gc[i] = gc = tmp;
- irq_init_generic_chip(gc, name, num_ct, i * irqs_per_chip,
- NULL, handler);
+ irq_init_generic_chip(gc, info->name, info->num_ct,
+ i * dgc->irqs_per_chip, NULL,
+ info->handler);
gc->domain = d;
- if (gcflags & IRQ_GC_BE_IO) {
+ if (dgc->gc_flags & IRQ_GC_BE_IO) {
gc->reg_readl = &irq_readl_be;
gc->reg_writel = &irq_writel_be;
}
- raw_spin_lock_irqsave(&gc_lock, flags);
- list_add_tail(&gc->list, &gc_list);
- raw_spin_unlock_irqrestore(&gc_lock, flags);
+ if (info->init) {
+ ret = info->init(gc);
+ if (ret)
+ goto err;
+ }
+
+ scoped_guard (raw_spinlock_irqsave, &gc_lock)
+ list_add_tail(&gc->list, &gc_list);
/* Calc pointer to the next generic chip */
tmp += gc_sz;
}
return 0;
+
+err:
+ while (i--) {
+ if (dgc->exit)
+ dgc->exit(dgc->gc[i]);
+ irq_remove_generic_chip(dgc->gc[i], ~0U, 0, 0);
+ }
+ d->gc = NULL;
+ kfree(dgc);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(irq_domain_alloc_generic_chips);
+
+/**
+ * irq_domain_remove_generic_chips - Remove generic chips from an irq domain
+ * @d: irq domain for which generic chips are to be removed
+ */
+void irq_domain_remove_generic_chips(struct irq_domain *d)
+{
+ struct irq_domain_chip_generic *dgc = d->gc;
+ unsigned int i;
+
+ if (!dgc)
+ return;
+
+ for (i = 0; i < dgc->num_chips; i++) {
+ if (dgc->exit)
+ dgc->exit(dgc->gc[i]);
+ irq_remove_generic_chip(dgc->gc[i], ~0U, 0, 0);
+ }
+ d->gc = NULL;
+ kfree(dgc);
+}
+EXPORT_SYMBOL_GPL(irq_domain_remove_generic_chips);
+
+/**
+ * __irq_alloc_domain_generic_chips - Allocate generic chips for an irq domain
+ * @d: irq domain for which to allocate chips
+ * @irqs_per_chip: Number of interrupts each chip handles (max 32)
+ * @num_ct: Number of irq_chip_type instances associated with this
+ * @name: Name of the irq chip
+ * @handler: Default flow handler associated with these chips
+ * @clr: IRQ_* bits to clear in the mapping function
+ * @set: IRQ_* bits to set in the mapping function
+ * @gcflags: Generic chip specific setup flags
+ */
+int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
+ int num_ct, const char *name,
+ irq_flow_handler_t handler,
+ unsigned int clr, unsigned int set,
+ enum irq_gc_flags gcflags)
+{
+ struct irq_domain_chip_generic_info info = {
+ .irqs_per_chip = irqs_per_chip,
+ .num_ct = num_ct,
+ .name = name,
+ .handler = handler,
+ .irq_flags_to_clear = clr,
+ .irq_flags_to_set = set,
+ .gc_flags = gcflags,
+ };
+
+ return irq_domain_alloc_generic_chips(d, &info);
}
EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips);
@@ -389,7 +448,6 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
struct irq_chip_generic *gc;
struct irq_chip_type *ct;
struct irq_chip *chip;
- unsigned long flags;
int idx;
gc = __irq_get_domain_generic_chip(d, hw_irq);
@@ -409,9 +467,8 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
/* We only init the cache for the first mapping of a generic chip */
if (!gc->installed) {
- raw_spin_lock_irqsave(&gc->lock, flags);
+ guard(raw_spinlock_irqsave)(&gc->lock);
irq_gc_init_mask_cache(gc, dgc->gc_flags);
- raw_spin_unlock_irqrestore(&gc->lock, flags);
}
/* Mark the interrupt as installed */
@@ -431,7 +488,7 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
return 0;
}
-static void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq)
+void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq)
{
struct irq_data *data = irq_domain_get_irq_data(d, virq);
struct irq_domain_chip_generic *dgc = d->gc;
@@ -478,9 +535,8 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
struct irq_chip *chip = &ct->chip;
unsigned int i;
- raw_spin_lock(&gc_lock);
- list_add_tail(&gc->list, &gc_list);
- raw_spin_unlock(&gc_lock);
+ scoped_guard (raw_spinlock, &gc_lock)
+ list_add_tail(&gc->list, &gc_list);
irq_gc_init_mask_cache(gc, flags);
@@ -544,21 +600,33 @@ EXPORT_SYMBOL_GPL(irq_setup_alt_chip);
void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
unsigned int clr, unsigned int set)
{
- unsigned int i = gc->irq_base;
+ unsigned int i, virq;
- raw_spin_lock(&gc_lock);
- list_del(&gc->list);
- raw_spin_unlock(&gc_lock);
+ scoped_guard (raw_spinlock, &gc_lock)
+ list_del(&gc->list);
- for (; msk; msk >>= 1, i++) {
+ for (i = 0; msk; msk >>= 1, i++) {
if (!(msk & 0x01))
continue;
+ /*
+ * Interrupt domain based chips store the base hardware
+ * interrupt number in gc::irq_base. Otherwise gc::irq_base
+ * contains the base Linux interrupt number.
+ */
+ if (gc->domain) {
+ virq = irq_find_mapping(gc->domain, gc->irq_base + i);
+ if (!virq)
+ continue;
+ } else {
+ virq = gc->irq_base + i;
+ }
+
/* Remove handler first. That will mask the irq line */
- irq_set_handler(i, NULL);
- irq_set_chip(i, &no_irq_chip);
- irq_set_chip_data(i, NULL);
- irq_modify_status(i, clr, set);
+ irq_set_handler(virq, NULL);
+ irq_set_chip(virq, &no_irq_chip);
+ irq_set_chip_data(virq, NULL);
+ irq_modify_status(virq, clr, set);
}
}
EXPORT_SYMBOL_GPL(irq_remove_generic_chip);
@@ -582,7 +650,7 @@ static struct irq_data *irq_gc_get_irq_data(struct irq_chip_generic *gc)
}
#ifdef CONFIG_PM
-static int irq_gc_suspend(void)
+static int irq_gc_suspend(void *data)
{
struct irq_chip_generic *gc;
@@ -602,7 +670,7 @@ static int irq_gc_suspend(void)
return 0;
}
-static void irq_gc_resume(void)
+static void irq_gc_resume(void *data)
{
struct irq_chip_generic *gc;
@@ -625,7 +693,7 @@ static void irq_gc_resume(void)
#define irq_gc_resume NULL
#endif
-static void irq_gc_shutdown(void)
+static void irq_gc_shutdown(void *data)
{
struct irq_chip_generic *gc;
@@ -641,15 +709,19 @@ static void irq_gc_shutdown(void)
}
}
-static struct syscore_ops irq_gc_syscore_ops = {
+static const struct syscore_ops irq_gc_syscore_ops = {
.suspend = irq_gc_suspend,
.resume = irq_gc_resume,
.shutdown = irq_gc_shutdown,
};
+static struct syscore irq_gc_syscore = {
+ .ops = &irq_gc_syscore_ops,
+};
+
static int __init irq_gc_init_ops(void)
{
- register_syscore_ops(&irq_gc_syscore_ops);
+ register_syscore(&irq_gc_syscore);
return 0;
}
device_initcall(irq_gc_init_ops);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 9489f93b3db3..786f5570a640 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -133,7 +133,53 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
*/
atomic_inc(&desc->threads_active);
- wake_up_process(action->thread);
+ /*
+ * This might be a premature wakeup before the thread reached the
+ * thread function and set the IRQTF_READY bit. It's waiting in
+ * kthread code with state UNINTERRUPTIBLE. Once it reaches the
+ * thread function it waits with INTERRUPTIBLE. The wakeup is not
+ * lost in that case because the thread is guaranteed to observe
+ * the RUN flag before it goes to sleep in wait_for_interrupt().
+ */
+ wake_up_state(action->thread, TASK_INTERRUPTIBLE);
+}
+
+static DEFINE_STATIC_KEY_FALSE(irqhandler_duration_check_enabled);
+static u64 irqhandler_duration_threshold_ns __ro_after_init;
+
+static int __init irqhandler_duration_check_setup(char *arg)
+{
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul(arg, 0, &val);
+ if (ret) {
+ pr_err("Unable to parse irqhandler.duration_warn_us setting: ret=%d\n", ret);
+ return 0;
+ }
+
+ if (!val) {
+ pr_err("Invalid irqhandler.duration_warn_us setting, must be > 0\n");
+ return 0;
+ }
+
+ irqhandler_duration_threshold_ns = val * 1000;
+ static_branch_enable(&irqhandler_duration_check_enabled);
+
+ return 1;
+}
+__setup("irqhandler.duration_warn_us=", irqhandler_duration_check_setup);
+
+static inline void irqhandler_duration_check(u64 ts_start, unsigned int irq,
+ const struct irqaction *action)
+{
+ u64 delta_ns = local_clock() - ts_start;
+
+ if (unlikely(delta_ns > irqhandler_duration_threshold_ns)) {
+ pr_warn_ratelimited("[CPU%u] long duration of IRQ[%u:%ps], took: %llu us\n",
+ smp_processor_id(), irq, action->handler,
+ div_u64(delta_ns, NSEC_PER_USEC));
+ }
}
irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc)
@@ -155,7 +201,16 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc)
lockdep_hardirq_threaded();
trace_irq_handler_entry(irq, action);
- res = action->handler(irq, action->dev_id);
+
+ if (static_branch_unlikely(&irqhandler_duration_check_enabled)) {
+ u64 ts_start = local_clock();
+
+ res = action->handler(irq, action->dev_id);
+ irqhandler_duration_check(ts_start, irq, action);
+ } else {
+ res = action->handler(irq, action->dev_id);
+ }
+
trace_irq_handler_exit(irq, action, res);
if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pS enabled interrupts\n",
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 99cbdf55a8bd..0164ca48da59 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -12,14 +12,15 @@
#include <linux/sched/clock.h>
#ifdef CONFIG_SPARSE_IRQ
-# define IRQ_BITMAP_BITS (NR_IRQS + 8196)
+# define MAX_SPARSE_IRQS INT_MAX
#else
-# define IRQ_BITMAP_BITS NR_IRQS
+# define MAX_SPARSE_IRQS NR_IRQS
#endif
#define istate core_internal_state__do_not_mess_with_it
extern bool noirqdebug;
+extern int irq_poll_cpu;
extern struct irqaction chained_action;
@@ -29,12 +30,14 @@ extern struct irqaction chained_action;
* IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
* IRQTF_AFFINITY - irq thread is requested to adjust affinity
* IRQTF_FORCED_THREAD - irq action is force threaded
+ * IRQTF_READY - signals that irq thread is ready
*/
enum {
IRQTF_RUNTHREAD,
IRQTF_WARNED,
IRQTF_AFFINITY,
IRQTF_FORCED_THREAD,
+ IRQTF_READY,
};
/*
@@ -45,11 +48,15 @@ enum {
* detection
* IRQS_POLL_INPROGRESS - polling in progress
* IRQS_ONESHOT - irq is not unmasked in primary handler
- * IRQS_REPLAY - irq is replayed
+ * IRQS_REPLAY - irq has been resent and will not be resent
+ * again until the handler has run and cleared
+ * this flag.
* IRQS_WAITING - irq is waiting
- * IRQS_PENDING - irq is pending and replayed later
+ * IRQS_PENDING - irq needs to be resent and should be resent
+ * at the next available opportunity.
* IRQS_SUSPENDED - irq is suspended
* IRQS_NMI - irq line is used to deliver NMIs
+ * IRQS_SYSFS - descriptor has been added to sysfs
*/
enum {
IRQS_AUTODETECT = 0x00000001,
@@ -62,6 +69,7 @@ enum {
IRQS_SUSPENDED = 0x00000800,
IRQS_TIMINGS = 0x00001000,
IRQS_NMI = 0x00002000,
+ IRQS_SYSFS = 0x00004000,
};
#include "debug.h"
@@ -80,10 +88,10 @@ extern void __enable_irq(struct irq_desc *desc);
extern int irq_activate(struct irq_desc *desc);
extern int irq_activate_and_startup(struct irq_desc *desc, bool resend);
extern int irq_startup(struct irq_desc *desc, bool resend, bool force);
+extern void irq_startup_managed(struct irq_desc *desc);
extern void irq_shutdown(struct irq_desc *desc);
extern void irq_shutdown_and_deactivate(struct irq_desc *desc);
-extern void irq_enable(struct irq_desc *desc);
extern void irq_disable(struct irq_desc *desc);
extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu);
extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu);
@@ -97,21 +105,18 @@ static inline void irq_mark_irq(unsigned int irq) { }
extern void irq_mark_irq(unsigned int irq);
#endif
-extern int __irq_get_irqchip_state(struct irq_data *data,
- enum irqchip_irq_state which,
- bool *state);
-
-extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
-
irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc);
irqreturn_t handle_irq_event_percpu(struct irq_desc *desc);
irqreturn_t handle_irq_event(struct irq_desc *desc);
/* Resending of interrupts :*/
int check_irq_resend(struct irq_desc *desc, bool inject);
-bool irq_wait_for_poll(struct irq_desc *desc);
+void clear_irq_resend(struct irq_desc *desc);
+void irq_resend_init(struct irq_desc *desc);
void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action);
+void wake_threads_waitq(struct irq_desc *desc);
+
#ifdef CONFIG_PROC_FS
extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
extern void unregister_irq_proc(unsigned int irq, struct irq_desc *desc);
@@ -128,8 +133,6 @@ static inline void unregister_handler_proc(unsigned int irq,
extern bool irq_can_set_affinity_usr(unsigned int irq);
-extern void irq_set_thread_affinity(struct irq_desc *desc);
-
extern int irq_do_set_affinity(struct irq_data *data,
const struct cpumask *dest, bool force);
@@ -139,6 +142,10 @@ extern int irq_setup_affinity(struct irq_desc *desc);
static inline int irq_setup_affinity(struct irq_desc *desc) { return 0; }
#endif
+
+#define for_each_action_of_desc(desc, act) \
+ for (act = desc->action; act; act = act->next)
+
/* Inline functions for support of irq chips on slow busses */
static inline void chip_bus_lock(struct irq_desc *desc)
{
@@ -158,38 +165,33 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc)
#define IRQ_GET_DESC_CHECK_GLOBAL (_IRQ_DESC_CHECK)
#define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU)
-#define for_each_action_of_desc(desc, act) \
- for (act = desc->action; act; act = act->next)
-
-struct irq_desc *
-__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
- unsigned int check);
+struct irq_desc *__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
+ unsigned int check);
void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus);
-static inline struct irq_desc *
-irq_get_desc_buslock(unsigned int irq, unsigned long *flags, unsigned int check)
-{
- return __irq_get_desc_lock(irq, flags, true, check);
-}
+__DEFINE_CLASS_IS_CONDITIONAL(irqdesc_lock, true);
+__DEFINE_UNLOCK_GUARD(irqdesc_lock, struct irq_desc,
+ __irq_put_desc_unlock(_T->lock, _T->flags, _T->bus),
+ unsigned long flags; bool bus);
-static inline void
-irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags)
+static inline class_irqdesc_lock_t class_irqdesc_lock_constructor(unsigned int irq, bool bus,
+ unsigned int check)
{
- __irq_put_desc_unlock(desc, flags, true);
-}
+ class_irqdesc_lock_t _t = { .bus = bus, };
-static inline struct irq_desc *
-irq_get_desc_lock(unsigned int irq, unsigned long *flags, unsigned int check)
-{
- return __irq_get_desc_lock(irq, flags, false, check);
-}
+ _t.lock = __irq_get_desc_lock(irq, &_t.flags, bus, check);
-static inline void
-irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags)
-{
- __irq_put_desc_unlock(desc, flags, false);
+ return _t;
}
+#define scoped_irqdesc_get_and_lock(_irq, _check) \
+ scoped_guard(irqdesc_lock, _irq, false, _check)
+
+#define scoped_irqdesc_get_and_buslock(_irq, _check) \
+ scoped_guard(irqdesc_lock, _irq, true, _check)
+
+#define scoped_irqdesc ((struct irq_desc *)(__guard_ptr(irqdesc_lock)(&scope)))
+
#define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
static inline unsigned int irqd_get(struct irq_data *d)
@@ -249,7 +251,7 @@ static inline void irq_state_set_masked(struct irq_desc *desc)
static inline void __kstat_incr_irqs_this_cpu(struct irq_desc *desc)
{
- __this_cpu_inc(*desc->kstat_irqs);
+ __this_cpu_inc(desc->kstat_irqs->cnt);
__this_cpu_inc(kstat.irqs_sum);
}
@@ -269,12 +271,17 @@ static inline int irq_desc_is_chained(struct irq_desc *desc)
return (desc->action && desc->action == &chained_action);
}
+static inline bool irq_is_nmi(struct irq_desc *desc)
+{
+ return desc->istate & IRQS_NMI;
+}
+
#ifdef CONFIG_PM_SLEEP
-bool irq_pm_check_wakeup(struct irq_desc *desc);
+void irq_pm_handle_wakeup(struct irq_desc *desc);
void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action);
void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action);
#else
-static inline bool irq_pm_check_wakeup(struct irq_desc *desc) { return false; }
+static inline void irq_pm_handle_wakeup(struct irq_desc *desc) { }
static inline void
irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { }
static inline void
@@ -405,7 +412,7 @@ irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
#ifdef CONFIG_GENERIC_PENDING_IRQ
static inline bool irq_can_move_pcntxt(struct irq_data *data)
{
- return irqd_can_move_in_process_context(data);
+ return !(data->chip->flags & IRQCHIP_MOVE_DEFERRED);
}
static inline bool irq_move_pending(struct irq_data *data)
{
@@ -425,11 +432,8 @@ static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc)
{
return desc->pending_mask;
}
-static inline bool handle_enforce_irqctx(struct irq_data *data)
-{
- return irqd_is_handle_enforce_irqctx(data);
-}
bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear);
+void irq_force_complete_move(struct irq_desc *desc);
#else /* CONFIG_GENERIC_PENDING_IRQ */
static inline bool irq_can_move_pcntxt(struct irq_data *data)
{
@@ -455,10 +459,7 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear)
{
return false;
}
-static inline bool handle_enforce_irqctx(struct irq_data *data)
-{
- return false;
-}
+static inline void irq_force_complete_move(struct irq_desc *desc) { }
#endif /* !CONFIG_GENERIC_PENDING_IRQ */
#if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY)
@@ -485,6 +486,16 @@ static inline struct irq_data *irqd_get_parent_data(struct irq_data *irqd)
#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
#include <linux/debugfs.h>
+struct irq_bit_descr {
+ unsigned int mask;
+ char *name;
+};
+
+#define BIT_MASK_DESCR(m) { .mask = m, .name = #m }
+
+void irq_debug_show_bits(struct seq_file *m, int ind, unsigned int state,
+ const struct irq_bit_descr *sd, int size);
+
void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc);
static inline void irq_remove_debugfs_entry(struct irq_desc *desc)
{
diff --git a/kernel/irq/ipi-mux.c b/kernel/irq/ipi-mux.c
new file mode 100644
index 000000000000..fa4fc18c6131
--- /dev/null
+++ b/kernel/irq/ipi-mux.c
@@ -0,0 +1,206 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Multiplex several virtual IPIs over a single HW IPI.
+ *
+ * Copyright The Asahi Linux Contributors
+ * Copyright (c) 2022 Ventana Micro Systems Inc.
+ */
+
+#define pr_fmt(fmt) "ipi-mux: " fmt
+#include <linux/cpu.h>
+#include <linux/init.h>
+#include <linux/irq.h>
+#include <linux/irqchip.h>
+#include <linux/irqchip/chained_irq.h>
+#include <linux/irqdomain.h>
+#include <linux/jump_label.h>
+#include <linux/percpu.h>
+#include <linux/smp.h>
+
+struct ipi_mux_cpu {
+ atomic_t enable;
+ atomic_t bits;
+};
+
+static struct ipi_mux_cpu __percpu *ipi_mux_pcpu;
+static struct irq_domain *ipi_mux_domain;
+static void (*ipi_mux_send)(unsigned int cpu);
+
+static void ipi_mux_mask(struct irq_data *d)
+{
+ struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
+
+ atomic_andnot(BIT(irqd_to_hwirq(d)), &icpu->enable);
+}
+
+static void ipi_mux_unmask(struct irq_data *d)
+{
+ struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
+ u32 ibit = BIT(irqd_to_hwirq(d));
+
+ atomic_or(ibit, &icpu->enable);
+
+ /*
+ * The atomic_or() above must complete before the atomic_read()
+ * below to avoid racing ipi_mux_send_mask().
+ */
+ smp_mb__after_atomic();
+
+ /* If a pending IPI was unmasked, raise a parent IPI immediately. */
+ if (atomic_read(&icpu->bits) & ibit)
+ ipi_mux_send(smp_processor_id());
+}
+
+static void ipi_mux_send_mask(struct irq_data *d, const struct cpumask *mask)
+{
+ struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
+ u32 ibit = BIT(irqd_to_hwirq(d));
+ unsigned long pending;
+ int cpu;
+
+ for_each_cpu(cpu, mask) {
+ icpu = per_cpu_ptr(ipi_mux_pcpu, cpu);
+
+ /*
+ * This sequence is the mirror of the one in ipi_mux_unmask();
+ * see the comment there. Additionally, release semantics
+ * ensure that the vIPI flag set is ordered after any shared
+ * memory accesses that precede it. This therefore also pairs
+ * with the atomic_fetch_andnot in ipi_mux_process().
+ */
+ pending = atomic_fetch_or_release(ibit, &icpu->bits);
+
+ /*
+ * The atomic_fetch_or_release() above must complete
+ * before the atomic_read() below to avoid racing with
+ * ipi_mux_unmask().
+ */
+ smp_mb__after_atomic();
+
+ /*
+ * The flag writes must complete before the physical IPI is
+ * issued to another CPU. This is implied by the control
+ * dependency on the result of atomic_read() below, which is
+ * itself already ordered after the vIPI flag write.
+ */
+ if (!(pending & ibit) && (atomic_read(&icpu->enable) & ibit))
+ ipi_mux_send(cpu);
+ }
+}
+
+static const struct irq_chip ipi_mux_chip = {
+ .name = "IPI Mux",
+ .irq_mask = ipi_mux_mask,
+ .irq_unmask = ipi_mux_unmask,
+ .ipi_send_mask = ipi_mux_send_mask,
+};
+
+static int ipi_mux_domain_alloc(struct irq_domain *d, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ int i;
+
+ for (i = 0; i < nr_irqs; i++) {
+ irq_set_percpu_devid(virq + i);
+ irq_domain_set_info(d, virq + i, i, &ipi_mux_chip, NULL,
+ handle_percpu_devid_irq, NULL, NULL);
+ }
+
+ return 0;
+}
+
+static const struct irq_domain_ops ipi_mux_domain_ops = {
+ .alloc = ipi_mux_domain_alloc,
+ .free = irq_domain_free_irqs_top,
+};
+
+/**
+ * ipi_mux_process - Process multiplexed virtual IPIs
+ */
+void ipi_mux_process(void)
+{
+ struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
+ irq_hw_number_t hwirq;
+ unsigned long ipis;
+ unsigned int en;
+
+ /*
+ * Reading enable mask does not need to be ordered as long as
+ * this function is called from interrupt handler because only
+ * the CPU itself can change it's own enable mask.
+ */
+ en = atomic_read(&icpu->enable);
+
+ /*
+ * Clear the IPIs we are about to handle. This pairs with the
+ * atomic_fetch_or_release() in ipi_mux_send_mask().
+ */
+ ipis = atomic_fetch_andnot(en, &icpu->bits) & en;
+
+ for_each_set_bit(hwirq, &ipis, BITS_PER_TYPE(int))
+ generic_handle_domain_irq(ipi_mux_domain, hwirq);
+}
+
+/**
+ * ipi_mux_create - Create virtual IPIs multiplexed on top of a single
+ * parent IPI.
+ * @nr_ipi: number of virtual IPIs to create. This should
+ * be <= BITS_PER_TYPE(int)
+ * @mux_send: callback to trigger parent IPI for a particular CPU
+ *
+ * Returns first virq of the newly created virtual IPIs upon success
+ * or <=0 upon failure
+ */
+int ipi_mux_create(unsigned int nr_ipi, void (*mux_send)(unsigned int cpu))
+{
+ struct fwnode_handle *fwnode;
+ struct irq_domain *domain;
+ int rc;
+
+ if (ipi_mux_domain)
+ return -EEXIST;
+
+ if (BITS_PER_TYPE(int) < nr_ipi || !mux_send)
+ return -EINVAL;
+
+ ipi_mux_pcpu = alloc_percpu(typeof(*ipi_mux_pcpu));
+ if (!ipi_mux_pcpu)
+ return -ENOMEM;
+
+ fwnode = irq_domain_alloc_named_fwnode("IPI-Mux");
+ if (!fwnode) {
+ pr_err("unable to create IPI Mux fwnode\n");
+ rc = -ENOMEM;
+ goto fail_free_cpu;
+ }
+
+ domain = irq_domain_create_linear(fwnode, nr_ipi,
+ &ipi_mux_domain_ops, NULL);
+ if (!domain) {
+ pr_err("unable to add IPI Mux domain\n");
+ rc = -ENOMEM;
+ goto fail_free_fwnode;
+ }
+
+ domain->flags |= IRQ_DOMAIN_FLAG_IPI_SINGLE;
+ irq_domain_update_bus_token(domain, DOMAIN_BUS_IPI);
+
+ rc = irq_domain_alloc_irqs(domain, nr_ipi, NUMA_NO_NODE, NULL);
+ if (rc <= 0) {
+ pr_err("unable to alloc IRQs from IPI Mux domain\n");
+ goto fail_free_domain;
+ }
+
+ ipi_mux_domain = domain;
+ ipi_mux_send = mux_send;
+
+ return rc;
+
+fail_free_domain:
+ irq_domain_remove(domain);
+fail_free_fwnode:
+ irq_domain_free_fwnode(fwnode);
+fail_free_cpu:
+ free_percpu(ipi_mux_pcpu);
+ return rc;
+}
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index 08ce7da3b57c..961d4af76af3 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -115,11 +115,11 @@ free_descs:
int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest)
{
struct irq_data *data = irq_get_irq_data(irq);
- struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL;
+ const struct cpumask *ipimask;
struct irq_domain *domain;
unsigned int nr_irqs;
- if (!irq || !data || !ipimask)
+ if (!irq || !data)
return -EINVAL;
domain = data->domain;
@@ -131,7 +131,8 @@ int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest)
return -EINVAL;
}
- if (WARN_ON(!cpumask_subset(dest, ipimask)))
+ ipimask = irq_data_get_affinity_mask(data);
+ if (!ipimask || WARN_ON(!cpumask_subset(dest, ipimask)))
/*
* Must be destroying a subset of CPUs to which this IPI
* was set up to target
@@ -162,12 +163,13 @@ int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest)
irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu)
{
struct irq_data *data = irq_get_irq_data(irq);
- struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL;
+ const struct cpumask *ipimask;
- if (!data || !ipimask || cpu >= nr_cpu_ids)
+ if (!data || cpu >= nr_cpu_ids)
return INVALID_HWIRQ;
- if (!cpumask_test_cpu(cpu, ipimask))
+ ipimask = irq_data_get_affinity_mask(data);
+ if (!ipimask || !cpumask_test_cpu(cpu, ipimask))
return INVALID_HWIRQ;
/*
@@ -186,9 +188,9 @@ EXPORT_SYMBOL_GPL(ipi_get_hwirq);
static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data,
const struct cpumask *dest, unsigned int cpu)
{
- struct cpumask *ipimask = irq_data_get_affinity_mask(data);
+ const struct cpumask *ipimask;
- if (!chip || !ipimask)
+ if (!chip || !data)
return -EINVAL;
if (!chip->ipi_send_single && !chip->ipi_send_mask)
@@ -197,6 +199,10 @@ static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data,
if (cpu >= nr_cpu_ids)
return -EINVAL;
+ ipimask = irq_data_get_affinity_mask(data);
+ if (!ipimask)
+ return -EINVAL;
+
if (dest) {
if (!cpumask_subset(dest, ipimask))
return -EINVAL;
diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c
index 0cd02efa3a74..ae4c9cbd1b4b 100644
--- a/kernel/irq/irq_sim.c
+++ b/kernel/irq/irq_sim.c
@@ -4,22 +4,23 @@
* Copyright (C) 2020 Bartosz Golaszewski <bgolaszewski@baylibre.com>
*/
+#include <linux/cleanup.h>
+#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/irq_sim.h>
#include <linux/irq_work.h>
-#include <linux/interrupt.h>
#include <linux/slab.h>
struct irq_sim_work_ctx {
struct irq_work work;
- int irq_base;
unsigned int irq_count;
unsigned long *pending;
struct irq_domain *domain;
+ struct irq_sim_ops ops;
+ void *user_data;
};
struct irq_sim_irq_ctx {
- int irqnum;
bool enabled;
struct irq_sim_work_ctx *work_ctx;
};
@@ -88,6 +89,31 @@ static int irq_sim_set_irqchip_state(struct irq_data *data,
return 0;
}
+static int irq_sim_request_resources(struct irq_data *data)
+{
+ struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data);
+ struct irq_sim_work_ctx *work_ctx = irq_ctx->work_ctx;
+ irq_hw_number_t hwirq = irqd_to_hwirq(data);
+
+ if (work_ctx->ops.irq_sim_irq_requested)
+ return work_ctx->ops.irq_sim_irq_requested(work_ctx->domain,
+ hwirq,
+ work_ctx->user_data);
+
+ return 0;
+}
+
+static void irq_sim_release_resources(struct irq_data *data)
+{
+ struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data);
+ struct irq_sim_work_ctx *work_ctx = irq_ctx->work_ctx;
+ irq_hw_number_t hwirq = irqd_to_hwirq(data);
+
+ if (work_ctx->ops.irq_sim_irq_released)
+ work_ctx->ops.irq_sim_irq_released(work_ctx->domain, hwirq,
+ work_ctx->user_data);
+}
+
static struct irq_chip irq_sim_irqchip = {
.name = "irq_sim",
.irq_mask = irq_sim_irqmask,
@@ -95,6 +121,8 @@ static struct irq_chip irq_sim_irqchip = {
.irq_set_type = irq_sim_set_type,
.irq_get_irqchip_state = irq_sim_get_irqchip_state,
.irq_set_irqchip_state = irq_sim_set_irqchip_state,
+ .irq_request_resources = irq_sim_request_resources,
+ .irq_release_resources = irq_sim_release_resources,
};
static void irq_sim_handle_irq(struct irq_work *work)
@@ -164,35 +192,42 @@ static const struct irq_domain_ops irq_sim_domain_ops = {
struct irq_domain *irq_domain_create_sim(struct fwnode_handle *fwnode,
unsigned int num_irqs)
{
- struct irq_sim_work_ctx *work_ctx;
+ return irq_domain_create_sim_full(fwnode, num_irqs, NULL, NULL);
+}
+EXPORT_SYMBOL_GPL(irq_domain_create_sim);
+
+struct irq_domain *irq_domain_create_sim_full(struct fwnode_handle *fwnode,
+ unsigned int num_irqs,
+ const struct irq_sim_ops *ops,
+ void *data)
+{
+ struct irq_sim_work_ctx *work_ctx __free(kfree) =
+ kzalloc(sizeof(*work_ctx), GFP_KERNEL);
- work_ctx = kmalloc(sizeof(*work_ctx), GFP_KERNEL);
if (!work_ctx)
- goto err_out;
+ return ERR_PTR(-ENOMEM);
- work_ctx->pending = bitmap_zalloc(num_irqs, GFP_KERNEL);
- if (!work_ctx->pending)
- goto err_free_work_ctx;
+ unsigned long *pending __free(bitmap) = bitmap_zalloc(num_irqs, GFP_KERNEL);
+ if (!pending)
+ return ERR_PTR(-ENOMEM);
work_ctx->domain = irq_domain_create_linear(fwnode, num_irqs,
&irq_sim_domain_ops,
work_ctx);
if (!work_ctx->domain)
- goto err_free_bitmap;
+ return ERR_PTR(-ENOMEM);
work_ctx->irq_count = num_irqs;
- init_irq_work(&work_ctx->work, irq_sim_handle_irq);
+ work_ctx->work = IRQ_WORK_INIT_HARD(irq_sim_handle_irq);
+ work_ctx->pending = no_free_ptr(pending);
+ work_ctx->user_data = data;
- return work_ctx->domain;
+ if (ops)
+ memcpy(&work_ctx->ops, ops, sizeof(*ops));
-err_free_bitmap:
- bitmap_free(work_ctx->pending);
-err_free_work_ctx:
- kfree(work_ctx);
-err_out:
- return ERR_PTR(-ENOMEM);
+ return no_free_ptr(work_ctx)->domain;
}
-EXPORT_SYMBOL_GPL(irq_domain_create_sim);
+EXPORT_SYMBOL_GPL(irq_domain_create_sim_full);
/**
* irq_domain_remove_sim - Deinitialize the interrupt simulator domain: free
@@ -234,10 +269,22 @@ struct irq_domain *devm_irq_domain_create_sim(struct device *dev,
struct fwnode_handle *fwnode,
unsigned int num_irqs)
{
+ return devm_irq_domain_create_sim_full(dev, fwnode, num_irqs,
+ NULL, NULL);
+}
+EXPORT_SYMBOL_GPL(devm_irq_domain_create_sim);
+
+struct irq_domain *
+devm_irq_domain_create_sim_full(struct device *dev,
+ struct fwnode_handle *fwnode,
+ unsigned int num_irqs,
+ const struct irq_sim_ops *ops,
+ void *data)
+{
struct irq_domain *domain;
int ret;
- domain = irq_domain_create_sim(fwnode, num_irqs);
+ domain = irq_domain_create_sim_full(fwnode, num_irqs, ops, data);
if (IS_ERR(domain))
return domain;
@@ -247,4 +294,4 @@ struct irq_domain *devm_irq_domain_create_sim(struct device *dev,
return domain;
}
-EXPORT_SYMBOL_GPL(devm_irq_domain_create_sim);
+EXPORT_SYMBOL_GPL(devm_irq_domain_create_sim_full);
diff --git a/kernel/irq/irq_test.c b/kernel/irq/irq_test.c
new file mode 100644
index 000000000000..e2d31914b3c4
--- /dev/null
+++ b/kernel/irq/irq_test.c
@@ -0,0 +1,236 @@
+// SPDX-License-Identifier: LGPL-2.1+
+
+#include <linux/cleanup.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/irqdesc.h>
+#include <linux/irqdomain.h>
+#include <linux/nodemask.h>
+#include <kunit/test.h>
+
+#include "internals.h"
+
+static irqreturn_t noop_handler(int irq, void *data)
+{
+ return IRQ_HANDLED;
+}
+
+static void noop(struct irq_data *data) { }
+static unsigned int noop_ret(struct irq_data *data) { return 0; }
+
+static int noop_affinity(struct irq_data *data, const struct cpumask *dest,
+ bool force)
+{
+ irq_data_update_effective_affinity(data, dest);
+
+ return 0;
+}
+
+static struct irq_chip fake_irq_chip = {
+ .name = "fake",
+ .irq_startup = noop_ret,
+ .irq_shutdown = noop,
+ .irq_enable = noop,
+ .irq_disable = noop,
+ .irq_ack = noop,
+ .irq_mask = noop,
+ .irq_unmask = noop,
+ .irq_set_affinity = noop_affinity,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
+};
+
+static int irq_test_setup_fake_irq(struct kunit *test, struct irq_affinity_desc *affd)
+{
+ struct irq_desc *desc;
+ int virq;
+
+ virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, affd);
+ KUNIT_ASSERT_GE(test, virq, 0);
+
+ irq_set_chip_and_handler(virq, &fake_irq_chip, handle_simple_irq);
+
+ desc = irq_to_desc(virq);
+ KUNIT_ASSERT_PTR_NE(test, desc, NULL);
+
+ /* On some architectures, IRQs are NOREQUEST | NOPROBE by default. */
+ irq_settings_clr_norequest(desc);
+
+ return virq;
+}
+
+static void irq_disable_depth_test(struct kunit *test)
+{
+ struct irq_desc *desc;
+ int virq, ret;
+
+ virq = irq_test_setup_fake_irq(test, NULL);
+
+ desc = irq_to_desc(virq);
+ KUNIT_ASSERT_PTR_NE(test, desc, NULL);
+
+ ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+ disable_irq(virq);
+ KUNIT_EXPECT_EQ(test, desc->depth, 1);
+
+ enable_irq(virq);
+ KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+ free_irq(virq, NULL);
+}
+
+static void irq_free_disabled_test(struct kunit *test)
+{
+ struct irq_desc *desc;
+ int virq, ret;
+
+ virq = irq_test_setup_fake_irq(test, NULL);
+
+ desc = irq_to_desc(virq);
+ KUNIT_ASSERT_PTR_NE(test, desc, NULL);
+
+ ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+ disable_irq(virq);
+ KUNIT_EXPECT_EQ(test, desc->depth, 1);
+
+ free_irq(virq, NULL);
+ KUNIT_EXPECT_GE(test, desc->depth, 1);
+
+ ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+ KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+ free_irq(virq, NULL);
+}
+
+static void irq_shutdown_depth_test(struct kunit *test)
+{
+ struct irq_desc *desc;
+ struct irq_data *data;
+ int virq, ret;
+ struct irq_affinity_desc affinity = {
+ .is_managed = 1,
+ .mask = CPU_MASK_ALL,
+ };
+
+ if (!IS_ENABLED(CONFIG_SMP))
+ kunit_skip(test, "requires CONFIG_SMP for managed shutdown");
+
+ virq = irq_test_setup_fake_irq(test, &affinity);
+
+ desc = irq_to_desc(virq);
+ KUNIT_ASSERT_PTR_NE(test, desc, NULL);
+
+ data = irq_desc_get_irq_data(desc);
+ KUNIT_ASSERT_PTR_NE(test, data, NULL);
+
+ ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ KUNIT_EXPECT_TRUE(test, irqd_is_activated(data));
+ KUNIT_EXPECT_TRUE(test, irqd_is_started(data));
+ KUNIT_EXPECT_TRUE(test, irqd_affinity_is_managed(data));
+
+ KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+ disable_irq(virq);
+ KUNIT_EXPECT_EQ(test, desc->depth, 1);
+
+ scoped_guard(raw_spinlock_irqsave, &desc->lock)
+ irq_shutdown_and_deactivate(desc);
+
+ KUNIT_EXPECT_FALSE(test, irqd_is_activated(data));
+ KUNIT_EXPECT_FALSE(test, irqd_is_started(data));
+
+ KUNIT_EXPECT_EQ(test, irq_activate(desc), 0);
+#ifdef CONFIG_SMP
+ irq_startup_managed(desc);
+#endif
+
+ KUNIT_EXPECT_EQ(test, desc->depth, 1);
+
+ enable_irq(virq);
+ KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+ free_irq(virq, NULL);
+}
+
+static void irq_cpuhotplug_test(struct kunit *test)
+{
+ struct irq_desc *desc;
+ struct irq_data *data;
+ int virq, ret;
+ struct irq_affinity_desc affinity = {
+ .is_managed = 1,
+ };
+
+ if (!IS_ENABLED(CONFIG_SMP))
+ kunit_skip(test, "requires CONFIG_SMP for CPU hotplug");
+ if (!get_cpu_device(1))
+ kunit_skip(test, "requires more than 1 CPU for CPU hotplug");
+ if (!cpu_is_hotpluggable(1))
+ kunit_skip(test, "CPU 1 must be hotpluggable");
+ if (!cpu_online(1))
+ kunit_skip(test, "CPU 1 must be online");
+
+ cpumask_copy(&affinity.mask, cpumask_of(1));
+
+ virq = irq_test_setup_fake_irq(test, &affinity);
+
+ desc = irq_to_desc(virq);
+ KUNIT_ASSERT_PTR_NE(test, desc, NULL);
+
+ data = irq_desc_get_irq_data(desc);
+ KUNIT_ASSERT_PTR_NE(test, data, NULL);
+
+ ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ KUNIT_EXPECT_TRUE(test, irqd_is_activated(data));
+ KUNIT_EXPECT_TRUE(test, irqd_is_started(data));
+ KUNIT_EXPECT_TRUE(test, irqd_affinity_is_managed(data));
+
+ KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+ disable_irq(virq);
+ KUNIT_EXPECT_EQ(test, desc->depth, 1);
+
+ KUNIT_EXPECT_EQ(test, remove_cpu(1), 0);
+ KUNIT_EXPECT_GE(test, desc->depth, 1);
+ KUNIT_EXPECT_EQ(test, add_cpu(1), 0);
+
+ KUNIT_EXPECT_EQ(test, desc->depth, 1);
+
+ enable_irq(virq);
+ KUNIT_EXPECT_TRUE(test, irqd_is_activated(data));
+ KUNIT_EXPECT_TRUE(test, irqd_is_started(data));
+ KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+ free_irq(virq, NULL);
+}
+
+static struct kunit_case irq_test_cases[] = {
+ KUNIT_CASE(irq_disable_depth_test),
+ KUNIT_CASE(irq_free_disabled_test),
+ KUNIT_CASE(irq_shutdown_depth_test),
+ KUNIT_CASE(irq_cpuhotplug_test),
+ {}
+};
+
+static struct kunit_suite irq_test_suite = {
+ .name = "irq_test_cases",
+ .test_cases = irq_test_cases,
+};
+
+kunit_test_suite(irq_test_suite);
+MODULE_DESCRIPTION("IRQ unit test suite");
+MODULE_LICENSE("GPL");
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 2267e6527db3..6acf268f005b 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -12,10 +12,10 @@
#include <linux/export.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
-#include <linux/radix-tree.h>
-#include <linux/bitmap.h>
+#include <linux/maple_tree.h>
#include <linux/irqdomain.h>
#include <linux/sysfs.h>
+#include <linux/string_choices.h>
#include "internals.h"
@@ -93,11 +93,23 @@ static void desc_smp_init(struct irq_desc *desc, int node,
#endif
}
+static void free_masks(struct irq_desc *desc)
+{
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+ free_cpumask_var(desc->pending_mask);
+#endif
+ free_cpumask_var(desc->irq_common_data.affinity);
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+ free_cpumask_var(desc->irq_common_data.effective_affinity);
+#endif
+}
+
#else
static inline int
alloc_masks(struct irq_desc *desc, int node) { return 0; }
static inline void
desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { }
+static inline void free_masks(struct irq_desc *desc) { }
#endif
static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
@@ -123,15 +135,106 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
desc->name = NULL;
desc->owner = owner;
for_each_possible_cpu(cpu)
- *per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
+ *per_cpu_ptr(desc->kstat_irqs, cpu) = (struct irqstat) { };
desc_smp_init(desc, node, affinity);
}
-int nr_irqs = NR_IRQS;
-EXPORT_SYMBOL_GPL(nr_irqs);
+static unsigned int nr_irqs = NR_IRQS;
+
+/**
+ * irq_get_nr_irqs() - Number of interrupts supported by the system.
+ */
+unsigned int irq_get_nr_irqs(void)
+{
+ return nr_irqs;
+}
+EXPORT_SYMBOL_GPL(irq_get_nr_irqs);
+
+/**
+ * irq_set_nr_irqs() - Set the number of interrupts supported by the system.
+ * @nr: New number of interrupts.
+ *
+ * Return: @nr.
+ */
+unsigned int irq_set_nr_irqs(unsigned int nr)
+{
+ nr_irqs = nr;
+
+ return nr;
+}
+EXPORT_SYMBOL_GPL(irq_set_nr_irqs);
static DEFINE_MUTEX(sparse_irq_lock);
-static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS);
+static struct maple_tree sparse_irqs = MTREE_INIT_EXT(sparse_irqs,
+ MT_FLAGS_ALLOC_RANGE |
+ MT_FLAGS_LOCK_EXTERN |
+ MT_FLAGS_USE_RCU,
+ sparse_irq_lock);
+
+static int irq_find_free_area(unsigned int from, unsigned int cnt)
+{
+ MA_STATE(mas, &sparse_irqs, 0, 0);
+
+ if (mas_empty_area(&mas, from, MAX_SPARSE_IRQS, cnt))
+ return -ENOSPC;
+ return mas.index;
+}
+
+static unsigned int irq_find_at_or_after(unsigned int offset)
+{
+ unsigned long index = offset;
+ struct irq_desc *desc;
+
+ guard(rcu)();
+ desc = mt_find(&sparse_irqs, &index, nr_irqs);
+
+ return desc ? irq_desc_get_irq(desc) : nr_irqs;
+}
+
+static void irq_insert_desc(unsigned int irq, struct irq_desc *desc)
+{
+ MA_STATE(mas, &sparse_irqs, irq, irq);
+ WARN_ON(mas_store_gfp(&mas, desc, GFP_KERNEL) != 0);
+}
+
+static void delete_irq_desc(unsigned int irq)
+{
+ MA_STATE(mas, &sparse_irqs, irq, irq);
+ mas_erase(&mas);
+}
+
+#ifdef CONFIG_SPARSE_IRQ
+static const struct kobj_type irq_kobj_type;
+#endif
+
+static int init_desc(struct irq_desc *desc, int irq, int node,
+ unsigned int flags,
+ const struct cpumask *affinity,
+ struct module *owner)
+{
+ desc->kstat_irqs = alloc_percpu(struct irqstat);
+ if (!desc->kstat_irqs)
+ return -ENOMEM;
+
+ if (alloc_masks(desc, node)) {
+ free_percpu(desc->kstat_irqs);
+ return -ENOMEM;
+ }
+
+ raw_spin_lock_init(&desc->lock);
+ lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+ mutex_init(&desc->request_mutex);
+ init_waitqueue_head(&desc->wait_for_threads);
+ desc_set_defaults(irq, desc, node, affinity, owner);
+ irqd_set(&desc->irq_data, flags);
+ irq_resend_init(desc);
+#ifdef CONFIG_SPARSE_IRQ
+ kobject_init(&desc->kobj, &irq_kobj_type);
+ init_rcu_head(&desc->rcu);
+#endif
+
+ return 0;
+}
#ifdef CONFIG_SPARSE_IRQ
@@ -143,8 +246,7 @@ static struct kobject *irq_kobj_base;
#define IRQ_ATTR_RO(_name) \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
-static ssize_t per_cpu_count_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+static ssize_t per_cpu_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
ssize_t ret = 0;
@@ -154,113 +256,83 @@ static ssize_t per_cpu_count_show(struct kobject *kobj,
for_each_possible_cpu(cpu) {
unsigned int c = irq_desc_kstat_cpu(desc, cpu);
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%u", p, c);
+ ret += sysfs_emit_at(buf, ret, "%s%u", p, c);
p = ",";
}
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
+ ret += sysfs_emit_at(buf, ret, "\n");
return ret;
}
IRQ_ATTR_RO(per_cpu_count);
-static ssize_t chip_name_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+static ssize_t chip_name_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
- ssize_t ret = 0;
- raw_spin_lock_irq(&desc->lock);
- if (desc->irq_data.chip && desc->irq_data.chip->name) {
- ret = scnprintf(buf, PAGE_SIZE, "%s\n",
- desc->irq_data.chip->name);
- }
- raw_spin_unlock_irq(&desc->lock);
-
- return ret;
+ guard(raw_spinlock_irq)(&desc->lock);
+ if (desc->irq_data.chip && desc->irq_data.chip->name)
+ return sysfs_emit(buf, "%s\n", desc->irq_data.chip->name);
+ return 0;
}
IRQ_ATTR_RO(chip_name);
-static ssize_t hwirq_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+static ssize_t hwirq_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
- ssize_t ret = 0;
- raw_spin_lock_irq(&desc->lock);
+ guard(raw_spinlock_irq)(&desc->lock);
if (desc->irq_data.domain)
- ret = sprintf(buf, "%lu\n", desc->irq_data.hwirq);
- raw_spin_unlock_irq(&desc->lock);
-
- return ret;
+ return sysfs_emit(buf, "%lu\n", desc->irq_data.hwirq);
+ return 0;
}
IRQ_ATTR_RO(hwirq);
-static ssize_t type_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+static ssize_t type_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
- ssize_t ret = 0;
-
- raw_spin_lock_irq(&desc->lock);
- ret = sprintf(buf, "%s\n",
- irqd_is_level_type(&desc->irq_data) ? "level" : "edge");
- raw_spin_unlock_irq(&desc->lock);
- return ret;
+ guard(raw_spinlock_irq)(&desc->lock);
+ return sysfs_emit(buf, "%s\n", irqd_is_level_type(&desc->irq_data) ? "level" : "edge");
}
IRQ_ATTR_RO(type);
-static ssize_t wakeup_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+static ssize_t wakeup_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
- ssize_t ret = 0;
-
- raw_spin_lock_irq(&desc->lock);
- ret = sprintf(buf, "%s\n",
- irqd_is_wakeup_set(&desc->irq_data) ? "enabled" : "disabled");
- raw_spin_unlock_irq(&desc->lock);
-
- return ret;
+ guard(raw_spinlock_irq)(&desc->lock);
+ return sysfs_emit(buf, "%s\n", str_enabled_disabled(irqd_is_wakeup_set(&desc->irq_data)));
}
IRQ_ATTR_RO(wakeup);
-static ssize_t name_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+static ssize_t name_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
- ssize_t ret = 0;
- raw_spin_lock_irq(&desc->lock);
+ guard(raw_spinlock_irq)(&desc->lock);
if (desc->name)
- ret = scnprintf(buf, PAGE_SIZE, "%s\n", desc->name);
- raw_spin_unlock_irq(&desc->lock);
-
- return ret;
+ return sysfs_emit(buf, "%s\n", desc->name);
+ return 0;
}
IRQ_ATTR_RO(name);
-static ssize_t actions_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+static ssize_t actions_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
struct irqaction *action;
ssize_t ret = 0;
char *p = "";
- raw_spin_lock_irq(&desc->lock);
- for (action = desc->action; action != NULL; action = action->next) {
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
- p, action->name);
- p = ",";
+ scoped_guard(raw_spinlock_irq, &desc->lock) {
+ for_each_action_of_desc(desc, action) {
+ ret += sysfs_emit_at(buf, ret, "%s%s", p, action->name);
+ p = ",";
+ }
}
- raw_spin_unlock_irq(&desc->lock);
if (ret)
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
-
+ ret += sysfs_emit_at(buf, ret, "\n");
return ret;
}
IRQ_ATTR_RO(actions);
@@ -277,7 +349,7 @@ static struct attribute *irq_attrs[] = {
};
ATTRIBUTE_GROUPS(irq);
-static struct kobj_type irq_kobj_type = {
+static const struct kobj_type irq_kobj_type = {
.release = irq_kobj_release,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = irq_groups,
@@ -288,22 +360,25 @@ static void irq_sysfs_add(int irq, struct irq_desc *desc)
if (irq_kobj_base) {
/*
* Continue even in case of failure as this is nothing
- * crucial.
+ * crucial and failures in the late irq_sysfs_init()
+ * cannot be rolled back.
*/
if (kobject_add(&desc->kobj, irq_kobj_base, "%d", irq))
pr_warn("Failed to add kobject for irq %d\n", irq);
+ else
+ desc->istate |= IRQS_SYSFS;
}
}
static void irq_sysfs_del(struct irq_desc *desc)
{
/*
- * If irq_sysfs_init() has not yet been invoked (early boot), then
- * irq_kobj_base is NULL and the descriptor was never added.
- * kobject_del() complains about a object with no parent, so make
- * it conditional.
+ * Only invoke kobject_del() when kobject_add() was successfully
+ * invoked for the descriptor. This covers both early boot, where
+ * sysfs is not initialized yet, and the case of a failed
+ * kobject_add() invocation.
*/
- if (irq_kobj_base)
+ if (desc->istate & IRQS_SYSFS)
kobject_del(&desc->kobj);
}
@@ -313,26 +388,21 @@ static int __init irq_sysfs_init(void)
int irq;
/* Prevent concurrent irq alloc/free */
- irq_lock_sparse();
-
+ guard(mutex)(&sparse_irq_lock);
irq_kobj_base = kobject_create_and_add("irq", kernel_kobj);
- if (!irq_kobj_base) {
- irq_unlock_sparse();
+ if (!irq_kobj_base)
return -ENOMEM;
- }
/* Add the already allocated interrupts */
for_each_irq_desc(irq, desc)
irq_sysfs_add(irq, desc);
- irq_unlock_sparse();
-
return 0;
}
postcore_initcall(irq_sysfs_init);
#else /* !CONFIG_SYSFS */
-static struct kobj_type irq_kobj_type = {
+static const struct kobj_type irq_kobj_type = {
.release = irq_kobj_release,
};
@@ -341,41 +411,14 @@ static void irq_sysfs_del(struct irq_desc *desc) {}
#endif /* CONFIG_SYSFS */
-static RADIX_TREE(irq_desc_tree, GFP_KERNEL);
-
-static void irq_insert_desc(unsigned int irq, struct irq_desc *desc)
-{
- radix_tree_insert(&irq_desc_tree, irq, desc);
-}
-
struct irq_desc *irq_to_desc(unsigned int irq)
{
- return radix_tree_lookup(&irq_desc_tree, irq);
+ return mtree_load(&sparse_irqs, irq);
}
#ifdef CONFIG_KVM_BOOK3S_64_HV_MODULE
EXPORT_SYMBOL_GPL(irq_to_desc);
#endif
-static void delete_irq_desc(unsigned int irq)
-{
- radix_tree_delete(&irq_desc_tree, irq);
-}
-
-#ifdef CONFIG_SMP
-static void free_masks(struct irq_desc *desc)
-{
-#ifdef CONFIG_GENERIC_PENDING_IRQ
- free_cpumask_var(desc->pending_mask);
-#endif
- free_cpumask_var(desc->irq_common_data.affinity);
-#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
- free_cpumask_var(desc->irq_common_data.effective_affinity);
-#endif
-}
-#else
-static inline void free_masks(struct irq_desc *desc) { }
-#endif
-
void irq_lock_sparse(void)
{
mutex_lock(&sparse_irq_lock);
@@ -391,34 +434,19 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
struct module *owner)
{
struct irq_desc *desc;
+ int ret;
desc = kzalloc_node(sizeof(*desc), GFP_KERNEL, node);
if (!desc)
return NULL;
- /* allocate based on nr_cpu_ids */
- desc->kstat_irqs = alloc_percpu(unsigned int);
- if (!desc->kstat_irqs)
- goto err_desc;
-
- if (alloc_masks(desc, node))
- goto err_kstat;
-
- raw_spin_lock_init(&desc->lock);
- lockdep_set_class(&desc->lock, &irq_desc_lock_class);
- mutex_init(&desc->request_mutex);
- init_rcu_head(&desc->rcu);
- desc_set_defaults(irq, desc, node, affinity, owner);
- irqd_set(&desc->irq_data, flags);
- kobject_init(&desc->kobj, &irq_kobj_type);
+ ret = init_desc(desc, irq, node, flags, affinity, owner);
+ if (unlikely(ret)) {
+ kfree(desc);
+ return NULL;
+ }
return desc;
-
-err_kstat:
- free_percpu(desc->kstat_irqs);
-err_desc:
- kfree(desc);
- return NULL;
}
static void irq_kobj_release(struct kobject *kobj)
@@ -489,6 +517,7 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node,
flags = IRQD_AFFINITY_MANAGED |
IRQD_MANAGED_SHUTDOWN;
}
+ flags |= IRQD_AFFINITY_SET;
mask = &affinity->mask;
node = cpu_to_node(cpumask_first(mask));
affinity++;
@@ -501,7 +530,6 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node,
irq_sysfs_add(start + i, desc);
irq_add_debugfs_entry(start + i, desc);
}
- bitmap_set(allocated_irqs, start, cnt);
return start;
err:
@@ -510,12 +538,12 @@ err:
return -ENOMEM;
}
-static int irq_expand_nr_irqs(unsigned int nr)
+static bool irq_expand_nr_irqs(unsigned int nr)
{
- if (nr > IRQ_BITMAP_BITS)
- return -ENOMEM;
+ if (nr > MAX_SPARSE_IRQS)
+ return false;
nr_irqs = nr;
- return 0;
+ return true;
}
int __init early_irq_init(void)
@@ -530,18 +558,17 @@ int __init early_irq_init(void)
printk(KERN_INFO "NR_IRQS: %d, nr_irqs: %d, preallocated irqs: %d\n",
NR_IRQS, nr_irqs, initcnt);
- if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS))
- nr_irqs = IRQ_BITMAP_BITS;
+ if (WARN_ON(nr_irqs > MAX_SPARSE_IRQS))
+ nr_irqs = MAX_SPARSE_IRQS;
- if (WARN_ON(initcnt > IRQ_BITMAP_BITS))
- initcnt = IRQ_BITMAP_BITS;
+ if (WARN_ON(initcnt > MAX_SPARSE_IRQS))
+ initcnt = MAX_SPARSE_IRQS;
if (initcnt > nr_irqs)
nr_irqs = initcnt;
for (i = 0; i < initcnt; i++) {
desc = alloc_desc(i, node, 0, NULL, NULL);
- set_bit(i, allocated_irqs);
irq_insert_desc(i, desc);
}
return arch_early_irq_init();
@@ -560,24 +587,29 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
int __init early_irq_init(void)
{
int count, i, node = first_online_node;
- struct irq_desc *desc;
+ int ret;
init_irq_default_affinity();
printk(KERN_INFO "NR_IRQS: %d\n", NR_IRQS);
- desc = irq_desc;
count = ARRAY_SIZE(irq_desc);
for (i = 0; i < count; i++) {
- desc[i].kstat_irqs = alloc_percpu(unsigned int);
- alloc_masks(&desc[i], node);
- raw_spin_lock_init(&desc[i].lock);
- lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
- mutex_init(&desc[i].request_mutex);
- desc_set_defaults(i, &desc[i], node, NULL, NULL);
+ ret = init_desc(irq_desc + i, i, node, 0, NULL, NULL);
+ if (unlikely(ret))
+ goto __free_desc_res;
}
+
return arch_early_irq_init();
+
+__free_desc_res:
+ while (--i >= 0) {
+ free_masks(irq_desc + i);
+ free_percpu(irq_desc[i].kstat_irqs);
+ }
+
+ return ret;
}
struct irq_desc *irq_to_desc(unsigned int irq)
@@ -589,11 +621,10 @@ EXPORT_SYMBOL(irq_to_desc);
static void free_desc(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
- unsigned long flags;
- raw_spin_lock_irqsave(&desc->lock, flags);
- desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ scoped_guard(raw_spinlock_irqsave, &desc->lock)
+ desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL);
+ delete_irq_desc(irq);
}
static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
@@ -606,29 +637,21 @@ static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
struct irq_desc *desc = irq_to_desc(start + i);
desc->owner = owner;
+ irq_insert_desc(start + i, desc);
}
- bitmap_set(allocated_irqs, start, cnt);
return start;
}
-static int irq_expand_nr_irqs(unsigned int nr)
+static inline bool irq_expand_nr_irqs(unsigned int nr)
{
- return -ENOMEM;
+ return false;
}
void irq_mark_irq(unsigned int irq)
{
- mutex_lock(&sparse_irq_lock);
- bitmap_set(allocated_irqs, irq, 1);
- mutex_unlock(&sparse_irq_lock);
-}
-
-#ifdef CONFIG_GENERIC_IRQ_LEGACY
-void irq_init_desc(unsigned int irq)
-{
- free_desc(irq);
+ guard(mutex)(&sparse_irq_lock);
+ irq_insert_desc(irq, irq_desc + irq);
}
-#endif
#endif /* !CONFIG_SPARSE_IRQ */
@@ -640,7 +663,7 @@ int handle_irq_desc(struct irq_desc *desc)
return -EINVAL;
data = irq_desc_get_irq_data(desc);
- if (WARN_ON_ONCE(!in_irq() && handle_enforce_irqctx(data)))
+ if (WARN_ON_ONCE(!in_hardirq() && irqd_is_handle_enforce_irqctx(data)))
return -EPERM;
generic_handle_irq_desc(desc);
@@ -662,6 +685,29 @@ int generic_handle_irq(unsigned int irq)
}
EXPORT_SYMBOL_GPL(generic_handle_irq);
+/**
+ * generic_handle_irq_safe - Invoke the handler for a particular irq from any
+ * context.
+ * @irq: The irq number to handle
+ *
+ * Returns: 0 on success, a negative value on error.
+ *
+ * This function can be called from any context (IRQ or process context). It
+ * will report an error if not invoked from IRQ context and the irq has been
+ * marked to enforce IRQ-context only.
+ */
+int generic_handle_irq_safe(unsigned int irq)
+{
+ unsigned long flags;
+ int ret;
+
+ local_irq_save(flags);
+ ret = handle_irq_desc(irq_to_desc(irq));
+ local_irq_restore(flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(generic_handle_irq_safe);
+
#ifdef CONFIG_IRQ_DOMAIN
/**
* generic_handle_domain_irq - Invoke the handler for a HW irq belonging
@@ -676,11 +722,34 @@ EXPORT_SYMBOL_GPL(generic_handle_irq);
*/
int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq)
{
- WARN_ON_ONCE(!in_irq());
return handle_irq_desc(irq_resolve_mapping(domain, hwirq));
}
EXPORT_SYMBOL_GPL(generic_handle_domain_irq);
+ /**
+ * generic_handle_irq_safe - Invoke the handler for a HW irq belonging
+ * to a domain from any context.
+ * @domain: The domain where to perform the lookup
+ * @hwirq: The HW irq number to convert to a logical one
+ *
+ * Returns: 0 on success, a negative value on error.
+ *
+ * This function can be called from any context (IRQ or process
+ * context). If the interrupt is marked as 'enforce IRQ-context only' then
+ * the function must be invoked from hard interrupt context.
+ */
+int generic_handle_domain_irq_safe(struct irq_domain *domain, unsigned int hwirq)
+{
+ unsigned long flags;
+ int ret;
+
+ local_irq_save(flags);
+ ret = handle_irq_desc(irq_resolve_mapping(domain, hwirq));
+ local_irq_restore(flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(generic_handle_domain_irq_safe);
+
/**
* generic_handle_domain_nmi - Invoke the handler for a HW nmi belonging
* to a domain.
@@ -713,12 +782,9 @@ void irq_free_descs(unsigned int from, unsigned int cnt)
if (from >= nr_irqs || (from + cnt) > nr_irqs)
return;
- mutex_lock(&sparse_irq_lock);
+ guard(mutex)(&sparse_irq_lock);
for (i = 0; i < cnt; i++)
free_desc(from + i);
-
- bitmap_clear(allocated_irqs, from, cnt);
- mutex_unlock(&sparse_irq_lock);
}
EXPORT_SYMBOL_GPL(irq_free_descs);
@@ -735,11 +801,10 @@ EXPORT_SYMBOL_GPL(irq_free_descs);
*
* Returns the first irq number or error code
*/
-int __ref
-__irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
- struct module *owner, const struct irq_affinity_desc *affinity)
+int __ref __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
+ struct module *owner, const struct irq_affinity_desc *affinity)
{
- int start, ret;
+ int start;
if (!cnt)
return -EINVAL;
@@ -757,23 +822,17 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
from = arch_dynirq_lower_bound(from);
}
- mutex_lock(&sparse_irq_lock);
+ guard(mutex)(&sparse_irq_lock);
- start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS,
- from, cnt, 0);
- ret = -EEXIST;
+ start = irq_find_free_area(from, cnt);
if (irq >=0 && start != irq)
- goto unlock;
+ return -EEXIST;
if (start + cnt > nr_irqs) {
- ret = irq_expand_nr_irqs(start + cnt);
- if (ret)
- goto unlock;
+ if (!irq_expand_nr_irqs(start + cnt))
+ return -ENOMEM;
}
- ret = alloc_descs(start, cnt, node, affinity, owner);
-unlock:
- mutex_unlock(&sparse_irq_lock);
- return ret;
+ return alloc_descs(start, cnt, node, affinity, owner);
}
EXPORT_SYMBOL_GPL(__irq_alloc_descs);
@@ -785,30 +844,30 @@ EXPORT_SYMBOL_GPL(__irq_alloc_descs);
*/
unsigned int irq_get_next_irq(unsigned int offset)
{
- return find_next_bit(allocated_irqs, nr_irqs, offset);
+ return irq_find_at_or_after(offset);
}
-struct irq_desc *
-__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
- unsigned int check)
+struct irq_desc *__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
+ unsigned int check)
{
- struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_desc *desc;
- if (desc) {
- if (check & _IRQ_DESC_CHECK) {
- if ((check & _IRQ_DESC_PERCPU) &&
- !irq_settings_is_per_cpu_devid(desc))
- return NULL;
-
- if (!(check & _IRQ_DESC_PERCPU) &&
- irq_settings_is_per_cpu_devid(desc))
- return NULL;
- }
+ desc = irq_to_desc(irq);
+ if (!desc)
+ return NULL;
+
+ if (check & _IRQ_DESC_CHECK) {
+ if ((check & _IRQ_DESC_PERCPU) && !irq_settings_is_per_cpu_devid(desc))
+ return NULL;
- if (bus)
- chip_bus_lock(desc);
- raw_spin_lock_irqsave(&desc->lock, *flags);
+ if (!(check & _IRQ_DESC_PERCPU) && irq_settings_is_per_cpu_devid(desc))
+ return NULL;
}
+
+ if (bus)
+ chip_bus_lock(desc);
+ raw_spin_lock_irqsave(&desc->lock, *flags);
+
return desc;
}
@@ -820,15 +879,11 @@ void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus)
chip_bus_sync_unlock(desc);
}
-int irq_set_percpu_devid_partition(unsigned int irq,
- const struct cpumask *affinity)
+int irq_set_percpu_devid(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
- if (!desc)
- return -EINVAL;
-
- if (desc->percpu_enabled)
+ if (!desc || desc->percpu_enabled)
return -EINVAL;
desc->percpu_enabled = kzalloc(sizeof(*desc->percpu_enabled), GFP_KERNEL);
@@ -836,34 +891,10 @@ int irq_set_percpu_devid_partition(unsigned int irq,
if (!desc->percpu_enabled)
return -ENOMEM;
- if (affinity)
- desc->percpu_affinity = affinity;
- else
- desc->percpu_affinity = cpu_possible_mask;
-
irq_set_percpu_devid_flags(irq);
return 0;
}
-int irq_set_percpu_devid(unsigned int irq)
-{
- return irq_set_percpu_devid_partition(irq, NULL);
-}
-
-int irq_get_percpu_devid_partition(unsigned int irq, struct cpumask *affinity)
-{
- struct irq_desc *desc = irq_to_desc(irq);
-
- if (!desc || !desc->percpu_enabled)
- return -EINVAL;
-
- if (affinity)
- cpumask_copy(affinity, desc->percpu_affinity);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(irq_get_percpu_devid_partition);
-
void kstat_incr_irq_this_cpu(unsigned int irq)
{
kstat_incr_irqs_this_cpu(irq_to_desc(irq));
@@ -882,33 +913,58 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
{
struct irq_desc *desc = irq_to_desc(irq);
- return desc && desc->kstat_irqs ?
- *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
+ return desc && desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, cpu) : 0;
}
-static bool irq_is_nmi(struct irq_desc *desc)
+static unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask)
{
- return desc->istate & IRQS_NMI;
-}
-
-static unsigned int kstat_irqs(unsigned int irq)
-{
- struct irq_desc *desc = irq_to_desc(irq);
unsigned int sum = 0;
int cpu;
- if (!desc || !desc->kstat_irqs)
- return 0;
if (!irq_settings_is_per_cpu_devid(desc) &&
!irq_settings_is_per_cpu(desc) &&
!irq_is_nmi(desc))
return data_race(desc->tot_count);
- for_each_possible_cpu(cpu)
- sum += data_race(*per_cpu_ptr(desc->kstat_irqs, cpu));
+ for_each_cpu(cpu, cpumask)
+ sum += data_race(per_cpu(desc->kstat_irqs->cnt, cpu));
return sum;
}
+static unsigned int kstat_irqs(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (!desc || !desc->kstat_irqs)
+ return 0;
+ return kstat_irqs_desc(desc, cpu_possible_mask);
+}
+
+#ifdef CONFIG_GENERIC_IRQ_STAT_SNAPSHOT
+
+void kstat_snapshot_irqs(void)
+{
+ struct irq_desc *desc;
+ unsigned int irq;
+
+ for_each_irq_desc(irq, desc) {
+ if (!desc->kstat_irqs)
+ continue;
+ this_cpu_write(desc->kstat_irqs->ref, this_cpu_read(desc->kstat_irqs->cnt));
+ }
+}
+
+unsigned int kstat_get_irq_since_snapshot(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (!desc || !desc->kstat_irqs)
+ return 0;
+ return this_cpu_read(desc->kstat_irqs->cnt) - this_cpu_read(desc->kstat_irqs->ref);
+}
+
+#endif
+
/**
* kstat_irqs_usr - Get the statistics for an interrupt from thread context
* @irq: The interrupt number
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index bf38c546aa25..2652c4cfd877 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -25,7 +25,11 @@ static DEFINE_MUTEX(irq_domain_mutex);
static struct irq_domain *irq_default_domain;
+static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base,
+ unsigned int nr_irqs, int node, void *arg,
+ bool realloc, const struct irq_affinity_desc *affinity);
static void irq_domain_check_hierarchy(struct irq_domain *domain);
+static void irq_domain_free_one_irq(struct irq_domain *domain, unsigned int virq);
struct irqchip_fwid {
struct fwnode_handle fwnode;
@@ -107,6 +111,7 @@ EXPORT_SYMBOL_GPL(__irq_domain_alloc_fwnode);
/**
* irq_domain_free_fwnode - Free a non-OF-backed fwnode_handle
+ * @fwnode: fwnode_handle to free
*
* Free a fwnode_handle allocated with irq_domain_alloc_fwnode.
*/
@@ -114,7 +119,7 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode)
{
struct irqchip_fwid *fwid;
- if (WARN_ON(!is_fwnode_irqchip(fwnode)))
+ if (!fwnode || WARN_ON(!is_fwnode_irqchip(fwnode)))
return;
fwid = container_of(fwnode, struct irqchip_fwid, fwnode);
@@ -123,119 +128,252 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode)
}
EXPORT_SYMBOL_GPL(irq_domain_free_fwnode);
-/**
- * __irq_domain_add() - Allocate a new irq_domain data structure
- * @fwnode: firmware node for the interrupt controller
- * @size: Size of linear map; 0 for radix mapping only
- * @hwirq_max: Maximum number of interrupts supported by controller
- * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
- * direct mapping
- * @ops: domain callbacks
- * @host_data: Controller private data pointer
- *
- * Allocates and initializes an irq_domain structure.
- * Returns pointer to IRQ domain, or NULL on failure.
- */
-struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int size,
- irq_hw_number_t hwirq_max, int direct_max,
- const struct irq_domain_ops *ops,
- void *host_data)
+static int alloc_name(struct irq_domain *domain, char *base, enum irq_domain_bus_token bus_token)
{
- struct irqchip_fwid *fwid;
- struct irq_domain *domain;
+ if (bus_token == DOMAIN_BUS_ANY)
+ domain->name = kasprintf(GFP_KERNEL, "%s", base);
+ else
+ domain->name = kasprintf(GFP_KERNEL, "%s-%d", base, bus_token);
+ if (!domain->name)
+ return -ENOMEM;
+
+ domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
+ return 0;
+}
+static int alloc_fwnode_name(struct irq_domain *domain, const struct fwnode_handle *fwnode,
+ enum irq_domain_bus_token bus_token, const char *suffix)
+{
+ const char *sep = suffix ? "-" : "";
+ const char *suf = suffix ? : "";
+ char *name;
+
+ if (bus_token == DOMAIN_BUS_ANY)
+ name = kasprintf(GFP_KERNEL, "%pfw%s%s", fwnode, sep, suf);
+ else
+ name = kasprintf(GFP_KERNEL, "%pfw%s%s-%d", fwnode, sep, suf, bus_token);
+ if (!name)
+ return -ENOMEM;
+
+ /*
+ * fwnode paths contain '/', which debugfs is legitimately unhappy
+ * about. Replace them with ':', which does the trick and is not as
+ * offensive as '\'...
+ */
+ domain->name = strreplace(name, '/', ':');
+ domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
+ return 0;
+}
+
+static int alloc_unknown_name(struct irq_domain *domain, enum irq_domain_bus_token bus_token)
+{
static atomic_t unknown_domains;
+ int id = atomic_inc_return(&unknown_domains);
- if (WARN_ON((size && direct_max) ||
- (!IS_ENABLED(CONFIG_IRQ_DOMAIN_NOMAP) && direct_max)))
- return NULL;
+ if (bus_token == DOMAIN_BUS_ANY)
+ domain->name = kasprintf(GFP_KERNEL, "unknown-%d", id);
+ else
+ domain->name = kasprintf(GFP_KERNEL, "unknown-%d-%d", id, bus_token);
+ if (!domain->name)
+ return -ENOMEM;
- domain = kzalloc_node(struct_size(domain, revmap, size),
- GFP_KERNEL, of_node_to_nid(to_of_node(fwnode)));
- if (!domain)
- return NULL;
+ domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
+ return 0;
+}
+
+static int irq_domain_set_name(struct irq_domain *domain, const struct irq_domain_info *info)
+{
+ enum irq_domain_bus_token bus_token = info->bus_token;
+ const struct fwnode_handle *fwnode = info->fwnode;
if (is_fwnode_irqchip(fwnode)) {
- fwid = container_of(fwnode, struct irqchip_fwid, fwnode);
+ struct irqchip_fwid *fwid = container_of(fwnode, struct irqchip_fwid, fwnode);
+
+ /*
+ * The name_suffix is only intended to be used to avoid a name
+ * collision when multiple domains are created for a single
+ * device and the name is picked using a real device node.
+ * (Typical use-case is regmap-IRQ controllers for devices
+ * providing more than one physical IRQ.) There should be no
+ * need to use name_suffix with irqchip-fwnode.
+ */
+ if (info->name_suffix)
+ return -EINVAL;
switch (fwid->type) {
case IRQCHIP_FWNODE_NAMED:
case IRQCHIP_FWNODE_NAMED_ID:
- domain->fwnode = fwnode;
- domain->name = kstrdup(fwid->name, GFP_KERNEL);
- if (!domain->name) {
- kfree(domain);
- return NULL;
- }
- domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
- break;
+ return alloc_name(domain, fwid->name, bus_token);
default:
- domain->fwnode = fwnode;
domain->name = fwid->name;
- break;
+ if (bus_token != DOMAIN_BUS_ANY)
+ return alloc_name(domain, fwid->name, bus_token);
}
- } else if (is_of_node(fwnode) || is_acpi_device_node(fwnode) ||
- is_software_node(fwnode)) {
- char *name;
- /*
- * fwnode paths contain '/', which debugfs is legitimately
- * unhappy about. Replace them with ':', which does
- * the trick and is not as offensive as '\'...
- */
- name = kasprintf(GFP_KERNEL, "%pfw", fwnode);
- if (!name) {
- kfree(domain);
- return NULL;
- }
+ } else if (is_of_node(fwnode) || is_acpi_device_node(fwnode) || is_software_node(fwnode)) {
+ return alloc_fwnode_name(domain, fwnode, bus_token, info->name_suffix);
+ }
- strreplace(name, '/', ':');
+ if (domain->name)
+ return 0;
- domain->name = name;
- domain->fwnode = fwnode;
- domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
- }
+ if (fwnode)
+ pr_err("Invalid fwnode type for irqdomain\n");
+ return alloc_unknown_name(domain, bus_token);
+}
- if (!domain->name) {
- if (fwnode)
- pr_err("Invalid fwnode type for irqdomain\n");
- domain->name = kasprintf(GFP_KERNEL, "unknown-%d",
- atomic_inc_return(&unknown_domains));
- if (!domain->name) {
- kfree(domain);
- return NULL;
- }
- domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
+static struct irq_domain *__irq_domain_create(const struct irq_domain_info *info)
+{
+ struct irq_domain *domain;
+ int err;
+
+ if (WARN_ON((info->size && info->direct_max) ||
+ (!IS_ENABLED(CONFIG_IRQ_DOMAIN_NOMAP) && info->direct_max) ||
+ (info->direct_max && info->direct_max != info->hwirq_max)))
+ return ERR_PTR(-EINVAL);
+
+ domain = kzalloc_node(struct_size(domain, revmap, info->size),
+ GFP_KERNEL, of_node_to_nid(to_of_node(info->fwnode)));
+ if (!domain)
+ return ERR_PTR(-ENOMEM);
+
+ err = irq_domain_set_name(domain, info);
+ if (err) {
+ kfree(domain);
+ return ERR_PTR(err);
}
- fwnode_handle_get(fwnode);
- fwnode_dev_initialized(fwnode, true);
+ domain->fwnode = fwnode_handle_get(info->fwnode);
+ fwnode_dev_initialized(domain->fwnode, true);
/* Fill structure */
INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL);
- mutex_init(&domain->revmap_mutex);
- domain->ops = ops;
- domain->host_data = host_data;
- domain->hwirq_max = hwirq_max;
+ domain->ops = info->ops;
+ domain->host_data = info->host_data;
+ domain->bus_token = info->bus_token;
+ domain->hwirq_max = info->hwirq_max;
- if (direct_max) {
- size = direct_max;
+ if (info->direct_max)
domain->flags |= IRQ_DOMAIN_FLAG_NO_MAP;
- }
- domain->revmap_size = size;
+ domain->revmap_size = info->size;
+
+ /*
+ * Hierarchical domains use the domain lock of the root domain
+ * (innermost domain).
+ *
+ * For non-hierarchical domains (as for root domains), the root
+ * pointer is set to the domain itself so that &domain->root->mutex
+ * always points to the right lock.
+ */
+ mutex_init(&domain->mutex);
+ domain->root = domain;
irq_domain_check_hierarchy(domain);
+ return domain;
+}
+
+static void __irq_domain_publish(struct irq_domain *domain)
+{
mutex_lock(&irq_domain_mutex);
debugfs_add_domain_dir(domain);
list_add(&domain->link, &irq_domain_list);
mutex_unlock(&irq_domain_mutex);
pr_debug("Added domain %s\n", domain->name);
+}
+
+static void irq_domain_free(struct irq_domain *domain)
+{
+ fwnode_dev_initialized(domain->fwnode, false);
+ fwnode_handle_put(domain->fwnode);
+ if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED)
+ kfree(domain->name);
+ kfree(domain);
+}
+
+static void irq_domain_instantiate_descs(const struct irq_domain_info *info)
+{
+ if (!IS_ENABLED(CONFIG_SPARSE_IRQ))
+ return;
+
+ if (irq_alloc_descs(info->virq_base, info->virq_base, info->size,
+ of_node_to_nid(to_of_node(info->fwnode))) < 0) {
+ pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",
+ info->virq_base);
+ }
+}
+
+static struct irq_domain *__irq_domain_instantiate(const struct irq_domain_info *info,
+ bool cond_alloc_descs, bool force_associate)
+{
+ struct irq_domain *domain;
+ int err;
+
+ domain = __irq_domain_create(info);
+ if (IS_ERR(domain))
+ return domain;
+
+ domain->flags |= info->domain_flags;
+ domain->exit = info->exit;
+ domain->dev = info->dev;
+
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ if (info->parent) {
+ domain->root = info->parent->root;
+ domain->parent = info->parent;
+ }
+#endif
+
+ if (info->dgc_info) {
+ err = irq_domain_alloc_generic_chips(domain, info->dgc_info);
+ if (err)
+ goto err_domain_free;
+ }
+
+ if (info->init) {
+ err = info->init(domain);
+ if (err)
+ goto err_domain_gc_remove;
+ }
+
+ __irq_domain_publish(domain);
+
+ if (cond_alloc_descs && info->virq_base > 0)
+ irq_domain_instantiate_descs(info);
+
+ /*
+ * Legacy interrupt domains have a fixed Linux interrupt number
+ * associated. Other interrupt domains can request association by
+ * providing a Linux interrupt number > 0.
+ */
+ if (force_associate || info->virq_base > 0) {
+ irq_domain_associate_many(domain, info->virq_base, info->hwirq_base,
+ info->size - info->hwirq_base);
+ }
+
return domain;
+
+err_domain_gc_remove:
+ if (info->dgc_info)
+ irq_domain_remove_generic_chips(domain);
+err_domain_free:
+ irq_domain_free(domain);
+ return ERR_PTR(err);
}
-EXPORT_SYMBOL_GPL(__irq_domain_add);
+
+/**
+ * irq_domain_instantiate() - Instantiate a new irq domain data structure
+ * @info: Domain information pointer pointing to the information for this domain
+ *
+ * Return: A pointer to the instantiated irq domain or an ERR_PTR value.
+ */
+struct irq_domain *irq_domain_instantiate(const struct irq_domain_info *info)
+{
+ return __irq_domain_instantiate(info, false, false);
+}
+EXPORT_SYMBOL_GPL(irq_domain_instantiate);
/**
* irq_domain_remove() - Remove an irq domain.
@@ -247,6 +385,9 @@ EXPORT_SYMBOL_GPL(__irq_domain_add);
*/
void irq_domain_remove(struct irq_domain *domain)
{
+ if (domain->exit)
+ domain->exit(domain);
+
mutex_lock(&irq_domain_mutex);
debugfs_remove_domain_dir(domain);
@@ -258,17 +399,15 @@ void irq_domain_remove(struct irq_domain *domain)
* If the going away domain is the default one, reset it.
*/
if (unlikely(irq_default_domain == domain))
- irq_set_default_host(NULL);
+ irq_set_default_domain(NULL);
mutex_unlock(&irq_domain_mutex);
- pr_debug("Removed domain %s\n", domain->name);
+ if (domain->flags & IRQ_DOMAIN_FLAG_DESTROY_GC)
+ irq_domain_remove_generic_chips(domain);
- fwnode_dev_initialized(domain->fwnode, false);
- fwnode_handle_put(domain->fwnode);
- if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED)
- kfree(domain->name);
- kfree(domain);
+ pr_debug("Removed domain %s\n", domain->name);
+ irq_domain_free(domain);
}
EXPORT_SYMBOL_GPL(irq_domain_remove);
@@ -328,55 +467,20 @@ struct irq_domain *irq_domain_create_simple(struct fwnode_handle *fwnode,
const struct irq_domain_ops *ops,
void *host_data)
{
- struct irq_domain *domain;
-
- domain = __irq_domain_add(fwnode, size, size, 0, ops, host_data);
- if (!domain)
- return NULL;
-
- if (first_irq > 0) {
- if (IS_ENABLED(CONFIG_SPARSE_IRQ)) {
- /* attempt to allocated irq_descs */
- int rc = irq_alloc_descs(first_irq, first_irq, size,
- of_node_to_nid(to_of_node(fwnode)));
- if (rc < 0)
- pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",
- first_irq);
- }
- irq_domain_associate_many(domain, first_irq, 0, size);
- }
+ struct irq_domain_info info = {
+ .fwnode = fwnode,
+ .size = size,
+ .hwirq_max = size,
+ .virq_base = first_irq,
+ .ops = ops,
+ .host_data = host_data,
+ };
+ struct irq_domain *domain = __irq_domain_instantiate(&info, true, false);
- return domain;
+ return IS_ERR(domain) ? NULL : domain;
}
EXPORT_SYMBOL_GPL(irq_domain_create_simple);
-/**
- * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain.
- * @of_node: pointer to interrupt controller's device tree node.
- * @size: total number of irqs in legacy mapping
- * @first_irq: first number of irq block assigned to the domain
- * @first_hwirq: first hwirq number to use for the translation. Should normally
- * be '0', but a positive integer can be used if the effective
- * hwirqs numbering does not begin at zero.
- * @ops: map/unmap domain callbacks
- * @host_data: Controller private data pointer
- *
- * Note: the map() callback will be called before this function returns
- * for all legacy interrupts except 0 (which is always the invalid irq for
- * a legacy controller).
- */
-struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
- unsigned int size,
- unsigned int first_irq,
- irq_hw_number_t first_hwirq,
- const struct irq_domain_ops *ops,
- void *host_data)
-{
- return irq_domain_create_legacy(of_node_to_fwnode(of_node), size,
- first_irq, first_hwirq, ops, host_data);
-}
-EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
-
struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode,
unsigned int size,
unsigned int first_irq,
@@ -384,13 +488,18 @@ struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode,
const struct irq_domain_ops *ops,
void *host_data)
{
- struct irq_domain *domain;
-
- domain = __irq_domain_add(fwnode, first_hwirq + size, first_hwirq + size, 0, ops, host_data);
- if (domain)
- irq_domain_associate_many(domain, first_irq, first_hwirq, size);
+ struct irq_domain_info info = {
+ .fwnode = fwnode,
+ .size = first_hwirq + size,
+ .hwirq_max = first_hwirq + size,
+ .hwirq_base = first_hwirq,
+ .virq_base = first_irq,
+ .ops = ops,
+ .host_data = host_data,
+ };
+ struct irq_domain *domain = __irq_domain_instantiate(&info, false, true);
- return domain;
+ return IS_ERR(domain) ? NULL : domain;
}
EXPORT_SYMBOL_GPL(irq_domain_create_legacy);
@@ -406,7 +515,8 @@ struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec,
struct fwnode_handle *fwnode = fwspec->fwnode;
int rc;
- /* We might want to match the legacy controller last since
+ /*
+ * We might want to match the legacy controller last since
* it might potentially be set to match all interrupts in
* the absence of a device node. This isn't a problem so far
* yet though...
@@ -417,7 +527,7 @@ struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec,
*/
mutex_lock(&irq_domain_mutex);
list_for_each_entry(h, &irq_domain_list, link) {
- if (h->ops->select && fwspec->param_count)
+ if (h->ops->select && bus_token != DOMAIN_BUS_ANY)
rc = h->ops->select(h, fwspec, bus_token);
else if (h->ops->match)
rc = h->ops->match(h, to_of_node(fwnode), bus_token);
@@ -437,32 +547,7 @@ struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec,
EXPORT_SYMBOL_GPL(irq_find_matching_fwspec);
/**
- * irq_domain_check_msi_remap - Check whether all MSI irq domains implement
- * IRQ remapping
- *
- * Return: false if any MSI irq domain does not support IRQ remapping,
- * true otherwise (including if there is no MSI irq domain)
- */
-bool irq_domain_check_msi_remap(void)
-{
- struct irq_domain *h;
- bool ret = true;
-
- mutex_lock(&irq_domain_mutex);
- list_for_each_entry(h, &irq_domain_list, link) {
- if (irq_domain_is_msi(h) &&
- !irq_domain_hierarchical_is_msi_remap(h)) {
- ret = false;
- break;
- }
- }
- mutex_unlock(&irq_domain_mutex);
- return ret;
-}
-EXPORT_SYMBOL_GPL(irq_domain_check_msi_remap);
-
-/**
- * irq_set_default_host() - Set a "default" irq domain
+ * irq_set_default_domain() - Set a "default" irq domain
* @domain: default domain pointer
*
* For convenience, it's possible to set a "default" domain that will be used
@@ -470,16 +555,16 @@ EXPORT_SYMBOL_GPL(irq_domain_check_msi_remap);
* platforms that want to manipulate a few hard coded interrupt numbers that
* aren't properly represented in the device-tree.
*/
-void irq_set_default_host(struct irq_domain *domain)
+void irq_set_default_domain(struct irq_domain *domain)
{
pr_debug("Default domain set to @0x%p\n", domain);
irq_default_domain = domain;
}
-EXPORT_SYMBOL_GPL(irq_set_default_host);
+EXPORT_SYMBOL_GPL(irq_set_default_domain);
/**
- * irq_get_default_host() - Retrieve the "default" irq domain
+ * irq_get_default_domain() - Retrieve the "default" irq domain
*
* Returns: the default domain, if any.
*
@@ -487,11 +572,11 @@ EXPORT_SYMBOL_GPL(irq_set_default_host);
* systems that cannot implement a firmware->fwnode mapping (which
* both DT and ACPI provide).
*/
-struct irq_domain *irq_get_default_host(void)
+struct irq_domain *irq_get_default_domain(void)
{
return irq_default_domain;
}
-EXPORT_SYMBOL_GPL(irq_get_default_host);
+EXPORT_SYMBOL_GPL(irq_get_default_domain);
static bool irq_domain_is_nomap(struct irq_domain *domain)
{
@@ -502,30 +587,34 @@ static bool irq_domain_is_nomap(struct irq_domain *domain)
static void irq_domain_clear_mapping(struct irq_domain *domain,
irq_hw_number_t hwirq)
{
+ lockdep_assert_held(&domain->root->mutex);
+
if (irq_domain_is_nomap(domain))
return;
- mutex_lock(&domain->revmap_mutex);
if (hwirq < domain->revmap_size)
rcu_assign_pointer(domain->revmap[hwirq], NULL);
else
radix_tree_delete(&domain->revmap_tree, hwirq);
- mutex_unlock(&domain->revmap_mutex);
}
static void irq_domain_set_mapping(struct irq_domain *domain,
irq_hw_number_t hwirq,
struct irq_data *irq_data)
{
+ /*
+ * This also makes sure that all domains point to the same root when
+ * called from irq_domain_insert_irq() for each domain in a hierarchy.
+ */
+ lockdep_assert_held(&domain->root->mutex);
+
if (irq_domain_is_nomap(domain))
return;
- mutex_lock(&domain->revmap_mutex);
if (hwirq < domain->revmap_size)
rcu_assign_pointer(domain->revmap[hwirq], irq_data);
else
radix_tree_insert(&domain->revmap_tree, hwirq, irq_data);
- mutex_unlock(&domain->revmap_mutex);
}
static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
@@ -538,6 +627,9 @@ static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
return;
hwirq = irq_data->hwirq;
+
+ mutex_lock(&domain->root->mutex);
+
irq_set_status_flags(irq, IRQ_NOREQUEST);
/* remove chip and handler */
@@ -557,10 +649,12 @@ static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
/* Clear reverse map for this hwirq */
irq_domain_clear_mapping(domain, hwirq);
+
+ mutex_unlock(&domain->root->mutex);
}
-int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
- irq_hw_number_t hwirq)
+static int irq_domain_associate_locked(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq)
{
struct irq_data *irq_data = irq_get_irq_data(virq);
int ret;
@@ -573,7 +667,6 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
if (WARN(irq_data->domain, "error: virq%i is already associated", virq))
return -EINVAL;
- mutex_lock(&irq_domain_mutex);
irq_data->hwirq = hwirq;
irq_data->domain = domain;
if (domain->ops->map) {
@@ -590,23 +683,29 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
}
irq_data->domain = NULL;
irq_data->hwirq = 0;
- mutex_unlock(&irq_domain_mutex);
return ret;
}
-
- /* If not already assigned, give the domain the chip's name */
- if (!domain->name && irq_data->chip)
- domain->name = irq_data->chip->name;
}
domain->mapcount++;
irq_domain_set_mapping(domain, hwirq, irq_data);
- mutex_unlock(&irq_domain_mutex);
irq_clear_status_flags(virq, IRQ_NOREQUEST);
return 0;
}
+
+int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq)
+{
+ int ret;
+
+ mutex_lock(&domain->root->mutex);
+ ret = irq_domain_associate_locked(domain, virq, hwirq);
+ mutex_unlock(&domain->root->mutex);
+
+ return ret;
+}
EXPORT_SYMBOL_GPL(irq_domain_associate);
void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
@@ -619,9 +718,8 @@ void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__,
of_node_full_name(of_node), irq_base, (int)hwirq_base, count);
- for (i = 0; i < count; i++) {
+ for (i = 0; i < count; i++)
irq_domain_associate(domain, irq_base + i, hwirq_base + i);
- }
}
EXPORT_SYMBOL_GPL(irq_domain_associate_many);
@@ -650,9 +748,9 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
pr_debug("create_direct virq allocation failed\n");
return 0;
}
- if (virq >= domain->revmap_size) {
- pr_err("ERROR: no free irqs available below %i maximum\n",
- domain->revmap_size);
+ if (virq >= domain->hwirq_max) {
+ pr_err("ERROR: no free irqs available below %lu maximum\n",
+ domain->hwirq_max);
irq_free_desc(virq);
return 0;
}
@@ -668,6 +766,34 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
EXPORT_SYMBOL_GPL(irq_create_direct_mapping);
#endif
+static unsigned int irq_create_mapping_affinity_locked(struct irq_domain *domain,
+ irq_hw_number_t hwirq,
+ const struct irq_affinity_desc *affinity)
+{
+ struct device_node *of_node = irq_domain_get_of_node(domain);
+ int virq;
+
+ pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
+
+ /* Allocate a virtual interrupt number */
+ virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node),
+ affinity);
+ if (virq <= 0) {
+ pr_debug("-> virq allocation failed\n");
+ return 0;
+ }
+
+ if (irq_domain_associate_locked(domain, virq, hwirq)) {
+ irq_free_desc(virq);
+ return 0;
+ }
+
+ pr_debug("irq %lu on domain %s mapped to virtual irq %u\n",
+ hwirq, of_node_full_name(of_node), virq);
+
+ return virq;
+}
+
/**
* irq_create_mapping_affinity() - Map a hardware interrupt into linux irq space
* @domain: domain owning this hardware interrupt or NULL for default domain
@@ -680,14 +806,11 @@ EXPORT_SYMBOL_GPL(irq_create_direct_mapping);
* on the number returned from that call.
*/
unsigned int irq_create_mapping_affinity(struct irq_domain *domain,
- irq_hw_number_t hwirq,
- const struct irq_affinity_desc *affinity)
+ irq_hw_number_t hwirq,
+ const struct irq_affinity_desc *affinity)
{
- struct device_node *of_node;
int virq;
- pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
-
/* Look for default domain if necessary */
if (domain == NULL)
domain = irq_default_domain;
@@ -695,32 +818,19 @@ unsigned int irq_create_mapping_affinity(struct irq_domain *domain,
WARN(1, "%s(, %lx) called with NULL domain\n", __func__, hwirq);
return 0;
}
- pr_debug("-> using domain @%p\n", domain);
- of_node = irq_domain_get_of_node(domain);
+ mutex_lock(&domain->root->mutex);
/* Check if mapping already exists */
virq = irq_find_mapping(domain, hwirq);
if (virq) {
- pr_debug("-> existing mapping on virq %d\n", virq);
- return virq;
+ pr_debug("existing mapping on virq %d\n", virq);
+ goto out;
}
- /* Allocate a virtual interrupt number */
- virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node),
- affinity);
- if (virq <= 0) {
- pr_debug("-> virq allocation failed\n");
- return 0;
- }
-
- if (irq_domain_associate(domain, virq, hwirq)) {
- irq_free_desc(virq);
- return 0;
- }
-
- pr_debug("irq %lu on domain %s mapped to virtual irq %u\n",
- hwirq, of_node_full_name(of_node), virq);
+ virq = irq_create_mapping_affinity_locked(domain, hwirq, affinity);
+out:
+ mutex_unlock(&domain->root->mutex);
return virq;
}
@@ -749,7 +859,7 @@ void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
{
int i;
- fwspec->fwnode = of_node_to_fwnode(np);
+ fwspec->fwnode = of_fwnode_handle(np);
fwspec->param_count = count;
for (i = 0; i < count; i++)
@@ -757,13 +867,9 @@ void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
}
EXPORT_SYMBOL_GPL(of_phandle_args_to_fwspec);
-unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
+static struct irq_domain *fwspec_to_domain(struct irq_fwspec *fwspec)
{
struct irq_domain *domain;
- struct irq_data *irq_data;
- irq_hw_number_t hwirq;
- unsigned int type = IRQ_TYPE_NONE;
- int virq;
if (fwspec->fwnode) {
domain = irq_find_matching_fwspec(fwspec, DOMAIN_BUS_WIRED);
@@ -773,6 +879,32 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
domain = irq_default_domain;
}
+ return domain;
+}
+
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+int irq_populate_fwspec_info(struct irq_fwspec *fwspec, struct irq_fwspec_info *info)
+{
+ struct irq_domain *domain = fwspec_to_domain(fwspec);
+
+ memset(info, 0, sizeof(*info));
+
+ if (!domain || !domain->ops->get_fwspec_info)
+ return 0;
+
+ return domain->ops->get_fwspec_info(fwspec, info);
+}
+#endif
+
+unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
+{
+ unsigned int type = IRQ_TYPE_NONE;
+ struct irq_domain *domain;
+ struct irq_data *irq_data;
+ irq_hw_number_t hwirq;
+ int virq;
+
+ domain = fwspec_to_domain(fwspec);
if (!domain) {
pr_warn("no irq domain found for %s !\n",
of_node_full_name(to_of_node(fwspec->fwnode)));
@@ -789,6 +921,8 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
if (WARN_ON(type & ~IRQ_TYPE_SENSE_MASK))
type &= IRQ_TYPE_SENSE_MASK;
+ mutex_lock(&domain->root->mutex);
+
/*
* If we've already configured this interrupt,
* don't do it again, or hell will break loose.
@@ -801,7 +935,7 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
* interrupt number.
*/
if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq))
- return virq;
+ goto out;
/*
* If the trigger type has not been set yet, then set
@@ -809,40 +943,50 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
*/
if (irq_get_trigger_type(virq) == IRQ_TYPE_NONE) {
irq_data = irq_get_irq_data(virq);
- if (!irq_data)
- return 0;
+ if (!irq_data) {
+ virq = 0;
+ goto out;
+ }
irqd_set_trigger_type(irq_data, type);
- return virq;
+ goto out;
}
pr_warn("type mismatch, failed to map hwirq-%lu for %s!\n",
hwirq, of_node_full_name(to_of_node(fwspec->fwnode)));
- return 0;
+ virq = 0;
+ goto out;
}
if (irq_domain_is_hierarchy(domain)) {
- virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, fwspec);
- if (virq <= 0)
- return 0;
+ if (irq_domain_is_msi_device(domain)) {
+ mutex_unlock(&domain->root->mutex);
+ virq = msi_device_domain_alloc_wired(domain, hwirq, type);
+ mutex_lock(&domain->root->mutex);
+ } else
+ virq = irq_domain_alloc_irqs_locked(domain, -1, 1, NUMA_NO_NODE,
+ fwspec, false, NULL);
+ if (virq <= 0) {
+ virq = 0;
+ goto out;
+ }
} else {
/* Create mapping */
- virq = irq_create_mapping(domain, hwirq);
+ virq = irq_create_mapping_affinity_locked(domain, hwirq, NULL);
if (!virq)
- return virq;
+ goto out;
}
irq_data = irq_get_irq_data(virq);
- if (!irq_data) {
- if (irq_domain_is_hierarchy(domain))
- irq_domain_free_irqs(virq, 1);
- else
- irq_dispose_mapping(virq);
- return 0;
+ if (WARN_ON(!irq_data)) {
+ virq = 0;
+ goto out;
}
/* Store trigger type */
irqd_set_trigger_type(irq_data, type);
+out:
+ mutex_unlock(&domain->root->mutex);
return virq;
}
@@ -865,10 +1009,11 @@ EXPORT_SYMBOL_GPL(irq_create_of_mapping);
*/
void irq_dispose_mapping(unsigned int virq)
{
- struct irq_data *irq_data = irq_get_irq_data(virq);
+ struct irq_data *irq_data;
struct irq_domain *domain;
- if (!virq || !irq_data)
+ irq_data = virq ? irq_get_irq_data(virq) : NULL;
+ if (!irq_data)
return;
domain = irq_data->domain;
@@ -876,7 +1021,7 @@ void irq_dispose_mapping(unsigned int virq)
return;
if (irq_domain_is_hierarchy(domain)) {
- irq_domain_free_irqs(virq, 1);
+ irq_domain_free_one_irq(domain, virq);
} else {
irq_domain_disassociate(domain, virq);
irq_free_desc(virq);
@@ -906,10 +1051,12 @@ struct irq_desc *__irq_resolve_mapping(struct irq_domain *domain,
return desc;
if (irq_domain_is_nomap(domain)) {
- if (hwirq < domain->revmap_size) {
+ if (hwirq < domain->hwirq_max) {
data = irq_domain_get_irq_data(domain, hwirq);
if (data && data->hwirq == hwirq)
desc = irq_data_to_desc(data);
+ if (irq && desc)
+ *irq = hwirq;
}
return desc;
@@ -935,6 +1082,12 @@ EXPORT_SYMBOL_GPL(__irq_resolve_mapping);
/**
* irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings
+ * @d: Interrupt domain involved in the translation
+ * @ctrlr: The device tree node for the device whose interrupt is translated
+ * @intspec: The interrupt specifier data from the device tree
+ * @intsize: The number of entries in @intspec
+ * @out_hwirq: Pointer to storage for the hardware interrupt number
+ * @out_type: Pointer to storage for the interrupt type
*
* Device Tree IRQ specifier translation function which works with one cell
* bindings where the cell value maps directly to the hwirq number.
@@ -953,6 +1106,12 @@ EXPORT_SYMBOL_GPL(irq_domain_xlate_onecell);
/**
* irq_domain_xlate_twocell() - Generic xlate for direct two cell bindings
+ * @d: Interrupt domain involved in the translation
+ * @ctrlr: The device tree node for the device whose interrupt is translated
+ * @intspec: The interrupt specifier data from the device tree
+ * @intsize: The number of entries in @intspec
+ * @out_hwirq: Pointer to storage for the hardware interrupt number
+ * @out_type: Pointer to storage for the interrupt type
*
* Device Tree IRQ specifier translation function which works with two cell
* bindings where the cell values map directly to the hwirq number
@@ -970,7 +1129,38 @@ int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr,
EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell);
/**
+ * irq_domain_xlate_twothreecell() - Generic xlate for direct two or three cell bindings
+ * @d: Interrupt domain involved in the translation
+ * @ctrlr: The device tree node for the device whose interrupt is translated
+ * @intspec: The interrupt specifier data from the device tree
+ * @intsize: The number of entries in @intspec
+ * @out_hwirq: Pointer to storage for the hardware interrupt number
+ * @out_type: Pointer to storage for the interrupt type
+ *
+ * Device Tree interrupt specifier translation function for two or three
+ * cell bindings, where the cell values map directly to the hardware
+ * interrupt number and the type specifier.
+ */
+int irq_domain_xlate_twothreecell(struct irq_domain *d, struct device_node *ctrlr,
+ const u32 *intspec, unsigned int intsize,
+ irq_hw_number_t *out_hwirq, unsigned int *out_type)
+{
+ struct irq_fwspec fwspec;
+
+ of_phandle_args_to_fwspec(ctrlr, intspec, intsize, &fwspec);
+
+ return irq_domain_translate_twothreecell(d, &fwspec, out_hwirq, out_type);
+}
+EXPORT_SYMBOL_GPL(irq_domain_xlate_twothreecell);
+
+/**
* irq_domain_xlate_onetwocell() - Generic xlate for one or two cell bindings
+ * @d: Interrupt domain involved in the translation
+ * @ctrlr: The device tree node for the device whose interrupt is translated
+ * @intspec: The interrupt specifier data from the device tree
+ * @intsize: The number of entries in @intspec
+ * @out_hwirq: Pointer to storage for the hardware interrupt number
+ * @out_type: Pointer to storage for the interrupt type
*
* Device Tree IRQ specifier translation function which works with either one
* or two cell bindings where the cell values map directly to the hwirq number
@@ -1004,6 +1194,10 @@ EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
/**
* irq_domain_translate_onecell() - Generic translate for direct one cell
* bindings
+ * @d: Interrupt domain involved in the translation
+ * @fwspec: The firmware interrupt specifier to translate
+ * @out_hwirq: Pointer to storage for the hardware interrupt number
+ * @out_type: Pointer to storage for the interrupt type
*/
int irq_domain_translate_onecell(struct irq_domain *d,
struct irq_fwspec *fwspec,
@@ -1021,6 +1215,10 @@ EXPORT_SYMBOL_GPL(irq_domain_translate_onecell);
/**
* irq_domain_translate_twocell() - Generic translate for direct two cell
* bindings
+ * @d: Interrupt domain involved in the translation
+ * @fwspec: The firmware interrupt specifier to translate
+ * @out_hwirq: Pointer to storage for the hardware interrupt number
+ * @out_type: Pointer to storage for the interrupt type
*
* Device Tree IRQ specifier translation function which works with two cell
* bindings where the cell values map directly to the hwirq number
@@ -1039,6 +1237,37 @@ int irq_domain_translate_twocell(struct irq_domain *d,
}
EXPORT_SYMBOL_GPL(irq_domain_translate_twocell);
+/**
+ * irq_domain_translate_twothreecell() - Generic translate for direct two or three cell
+ * bindings
+ * @d: Interrupt domain involved in the translation
+ * @fwspec: The firmware interrupt specifier to translate
+ * @out_hwirq: Pointer to storage for the hardware interrupt number
+ * @out_type: Pointer to storage for the interrupt type
+ *
+ * Firmware interrupt specifier translation function for two or three cell
+ * specifications, where the parameter values map directly to the hardware
+ * interrupt number and the type specifier.
+ */
+int irq_domain_translate_twothreecell(struct irq_domain *d, struct irq_fwspec *fwspec,
+ unsigned long *out_hwirq, unsigned int *out_type)
+{
+ if (fwspec->param_count == 2) {
+ *out_hwirq = fwspec->param[0];
+ *out_type = fwspec->param[1] & IRQ_TYPE_SENSE_MASK;
+ return 0;
+ }
+
+ if (fwspec->param_count == 3) {
+ *out_hwirq = fwspec->param[1];
+ *out_type = fwspec->param[2] & IRQ_TYPE_SENSE_MASK;
+ return 0;
+ }
+
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(irq_domain_translate_twothreecell);
+
int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq,
int node, const struct irq_affinity_desc *affinity)
{
@@ -1048,7 +1277,7 @@ int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq,
virq = __irq_alloc_descs(virq, virq, cnt, node, THIS_MODULE,
affinity);
} else {
- hint = hwirq % nr_irqs;
+ hint = hwirq % irq_get_nr_irqs();
if (hint == 0)
hint++;
virq = __irq_alloc_descs(-1, hint, cnt, node, THIS_MODULE,
@@ -1075,43 +1304,6 @@ void irq_domain_reset_irq_data(struct irq_data *irq_data)
EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data);
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
-/**
- * irq_domain_create_hierarchy - Add a irqdomain into the hierarchy
- * @parent: Parent irq domain to associate with the new domain
- * @flags: Irq domain flags associated to the domain
- * @size: Size of the domain. See below
- * @fwnode: Optional fwnode of the interrupt controller
- * @ops: Pointer to the interrupt domain callbacks
- * @host_data: Controller private data pointer
- *
- * If @size is 0 a tree domain is created, otherwise a linear domain.
- *
- * If successful the parent is associated to the new domain and the
- * domain flags are set.
- * Returns pointer to IRQ domain, or NULL on failure.
- */
-struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent,
- unsigned int flags,
- unsigned int size,
- struct fwnode_handle *fwnode,
- const struct irq_domain_ops *ops,
- void *host_data)
-{
- struct irq_domain *domain;
-
- if (size)
- domain = irq_domain_create_linear(fwnode, size, ops, host_data);
- else
- domain = irq_domain_create_tree(fwnode, ops, host_data);
- if (domain) {
- domain->parent = parent;
- domain->flags |= flags;
- }
-
- return domain;
-}
-EXPORT_SYMBOL_GPL(irq_domain_create_hierarchy);
-
static void irq_domain_insert_irq(int virq)
{
struct irq_data *data;
@@ -1121,10 +1313,6 @@ static void irq_domain_insert_irq(int virq)
domain->mapcount++;
irq_domain_set_mapping(domain, data->hwirq, data);
-
- /* If not already assigned, give the domain the chip's name */
- if (!domain->name && data->chip)
- domain->name = data->chip->name;
}
irq_clear_status_flags(virq, IRQ_NOREQUEST);
@@ -1226,7 +1414,7 @@ static int irq_domain_trim_hierarchy(unsigned int virq)
tail = NULL;
/* The first entry must have a valid irqchip */
- if (!irq_data->chip || IS_ERR(irq_data->chip))
+ if (IS_ERR_OR_NULL(irq_data->chip))
return -EINVAL;
/*
@@ -1319,7 +1507,8 @@ EXPORT_SYMBOL_GPL(irq_domain_get_irq_data);
* @chip_data: The associated chip data
*/
int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq,
- irq_hw_number_t hwirq, struct irq_chip *chip,
+ irq_hw_number_t hwirq,
+ const struct irq_chip *chip,
void *chip_data)
{
struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq);
@@ -1328,7 +1517,7 @@ int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq,
return -ENOENT;
irq_data->hwirq = hwirq;
- irq_data->chip = chip ? chip : &no_irq_chip;
+ irq_data->chip = (struct irq_chip *)(chip ? chip : &no_irq_chip);
irq_data->chip_data = chip_data;
return 0;
@@ -1347,7 +1536,7 @@ EXPORT_SYMBOL_GPL(irq_domain_set_hwirq_and_chip);
* @handler_name: The interrupt handler name
*/
void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
- irq_hw_number_t hwirq, struct irq_chip *chip,
+ irq_hw_number_t hwirq, const struct irq_chip *chip,
void *chip_data, irq_flow_handler_t handler,
void *handler_data, const char *handler_name)
{
@@ -1395,6 +1584,7 @@ void irq_domain_free_irqs_top(struct irq_domain *domain, unsigned int virq,
}
irq_domain_free_irqs_common(domain, virq, nr_irqs);
}
+EXPORT_SYMBOL_GPL(irq_domain_free_irqs_top);
static void irq_domain_free_irqs_hierarchy(struct irq_domain *domain,
unsigned int irq_base,
@@ -1411,9 +1601,8 @@ static void irq_domain_free_irqs_hierarchy(struct irq_domain *domain,
}
}
-int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain,
- unsigned int irq_base,
- unsigned int nr_irqs, void *arg)
+static int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, unsigned int irq_base,
+ unsigned int nr_irqs, void *arg)
{
if (!domain->ops->alloc) {
pr_debug("domain->ops->alloc() is NULL\n");
@@ -1423,40 +1612,12 @@ int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain,
return domain->ops->alloc(domain, irq_base, nr_irqs, arg);
}
-/**
- * __irq_domain_alloc_irqs - Allocate IRQs from domain
- * @domain: domain to allocate from
- * @irq_base: allocate specified IRQ number if irq_base >= 0
- * @nr_irqs: number of IRQs to allocate
- * @node: NUMA node id for memory allocation
- * @arg: domain specific argument
- * @realloc: IRQ descriptors have already been allocated if true
- * @affinity: Optional irq affinity mask for multiqueue devices
- *
- * Allocate IRQ numbers and initialized all data structures to support
- * hierarchy IRQ domains.
- * Parameter @realloc is mainly to support legacy IRQs.
- * Returns error code or allocated IRQ number
- *
- * The whole process to setup an IRQ has been split into two steps.
- * The first step, __irq_domain_alloc_irqs(), is to allocate IRQ
- * descriptor and required hardware resources. The second step,
- * irq_domain_activate_irq(), is to program the hardware with preallocated
- * resources. In this way, it's easier to rollback when failing to
- * allocate resources.
- */
-int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
- unsigned int nr_irqs, int node, void *arg,
- bool realloc, const struct irq_affinity_desc *affinity)
+static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base,
+ unsigned int nr_irqs, int node, void *arg,
+ bool realloc, const struct irq_affinity_desc *affinity)
{
int i, ret, virq;
- if (domain == NULL) {
- domain = irq_default_domain;
- if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n"))
- return -EINVAL;
- }
-
if (realloc && irq_base >= 0) {
virq = irq_base;
} else {
@@ -1475,24 +1636,18 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
goto out_free_desc;
}
- mutex_lock(&irq_domain_mutex);
ret = irq_domain_alloc_irqs_hierarchy(domain, virq, nr_irqs, arg);
- if (ret < 0) {
- mutex_unlock(&irq_domain_mutex);
+ if (ret < 0)
goto out_free_irq_data;
- }
for (i = 0; i < nr_irqs; i++) {
ret = irq_domain_trim_hierarchy(virq + i);
- if (ret) {
- mutex_unlock(&irq_domain_mutex);
+ if (ret)
goto out_free_irq_data;
- }
}
-
+
for (i = 0; i < nr_irqs; i++)
irq_domain_insert_irq(virq + i);
- mutex_unlock(&irq_domain_mutex);
return virq;
@@ -1502,6 +1657,48 @@ out_free_desc:
irq_free_descs(virq, nr_irqs);
return ret;
}
+
+/**
+ * __irq_domain_alloc_irqs - Allocate IRQs from domain
+ * @domain: domain to allocate from
+ * @irq_base: allocate specified IRQ number if irq_base >= 0
+ * @nr_irqs: number of IRQs to allocate
+ * @node: NUMA node id for memory allocation
+ * @arg: domain specific argument
+ * @realloc: IRQ descriptors have already been allocated if true
+ * @affinity: Optional irq affinity mask for multiqueue devices
+ *
+ * Allocate IRQ numbers and initialized all data structures to support
+ * hierarchy IRQ domains.
+ * Parameter @realloc is mainly to support legacy IRQs.
+ * Returns error code or allocated IRQ number
+ *
+ * The whole process to setup an IRQ has been split into two steps.
+ * The first step, __irq_domain_alloc_irqs(), is to allocate IRQ
+ * descriptor and required hardware resources. The second step,
+ * irq_domain_activate_irq(), is to program the hardware with preallocated
+ * resources. In this way, it's easier to rollback when failing to
+ * allocate resources.
+ */
+int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
+ unsigned int nr_irqs, int node, void *arg,
+ bool realloc, const struct irq_affinity_desc *affinity)
+{
+ int ret;
+
+ if (domain == NULL) {
+ domain = irq_default_domain;
+ if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n"))
+ return -EINVAL;
+ }
+
+ mutex_lock(&domain->root->mutex);
+ ret = irq_domain_alloc_irqs_locked(domain, irq_base, nr_irqs, node, arg,
+ realloc, affinity);
+ mutex_unlock(&domain->root->mutex);
+
+ return ret;
+}
EXPORT_SYMBOL_GPL(__irq_domain_alloc_irqs);
/* The irq_data was moved, fix the revmap to refer to the new location */
@@ -1509,11 +1706,12 @@ static void irq_domain_fix_revmap(struct irq_data *d)
{
void __rcu **slot;
+ lockdep_assert_held(&d->domain->root->mutex);
+
if (irq_domain_is_nomap(d->domain))
return;
/* Fix up the revmap. */
- mutex_lock(&d->domain->revmap_mutex);
if (d->hwirq < d->domain->revmap_size) {
/* Not using radix tree */
rcu_assign_pointer(d->domain->revmap[d->hwirq], d);
@@ -1522,7 +1720,6 @@ static void irq_domain_fix_revmap(struct irq_data *d)
if (slot)
radix_tree_replace_slot(&d->domain->revmap_tree, slot, d);
}
- mutex_unlock(&d->domain->revmap_mutex);
}
/**
@@ -1538,8 +1735,8 @@ static void irq_domain_fix_revmap(struct irq_data *d)
*/
int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg)
{
- struct irq_data *child_irq_data;
- struct irq_data *root_irq_data = irq_get_irq_data(virq);
+ struct irq_data *irq_data = irq_get_irq_data(virq);
+ struct irq_data *parent_irq_data;
struct irq_desc *desc;
int rv = 0;
@@ -1564,47 +1761,46 @@ int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg)
if (WARN_ON(!irq_domain_is_hierarchy(domain)))
return -EINVAL;
- if (!root_irq_data)
+ if (!irq_data)
return -EINVAL;
- if (domain->parent != root_irq_data->domain)
+ if (domain->parent != irq_data->domain)
return -EINVAL;
- child_irq_data = kzalloc_node(sizeof(*child_irq_data), GFP_KERNEL,
- irq_data_get_node(root_irq_data));
- if (!child_irq_data)
+ parent_irq_data = kzalloc_node(sizeof(*parent_irq_data), GFP_KERNEL,
+ irq_data_get_node(irq_data));
+ if (!parent_irq_data)
return -ENOMEM;
- mutex_lock(&irq_domain_mutex);
+ mutex_lock(&domain->root->mutex);
/* Copy the original irq_data. */
- *child_irq_data = *root_irq_data;
+ *parent_irq_data = *irq_data;
/*
- * Overwrite the root_irq_data, which is embedded in struct
- * irq_desc, with values for this domain.
+ * Overwrite the irq_data, which is embedded in struct irq_desc, with
+ * values for this domain.
*/
- root_irq_data->parent_data = child_irq_data;
- root_irq_data->domain = domain;
- root_irq_data->mask = 0;
- root_irq_data->hwirq = 0;
- root_irq_data->chip = NULL;
- root_irq_data->chip_data = NULL;
+ irq_data->parent_data = parent_irq_data;
+ irq_data->domain = domain;
+ irq_data->mask = 0;
+ irq_data->hwirq = 0;
+ irq_data->chip = NULL;
+ irq_data->chip_data = NULL;
/* May (probably does) set hwirq, chip, etc. */
rv = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg);
if (rv) {
/* Restore the original irq_data. */
- *root_irq_data = *child_irq_data;
- kfree(child_irq_data);
+ *irq_data = *parent_irq_data;
+ kfree(parent_irq_data);
goto error;
}
- irq_domain_fix_revmap(child_irq_data);
- irq_domain_set_mapping(domain, root_irq_data->hwirq, root_irq_data);
-
+ irq_domain_fix_revmap(parent_irq_data);
+ irq_domain_set_mapping(domain, irq_data->hwirq, irq_data);
error:
- mutex_unlock(&irq_domain_mutex);
+ mutex_unlock(&domain->root->mutex);
return rv;
}
@@ -1620,8 +1816,8 @@ EXPORT_SYMBOL_GPL(irq_domain_push_irq);
*/
int irq_domain_pop_irq(struct irq_domain *domain, int virq)
{
- struct irq_data *root_irq_data = irq_get_irq_data(virq);
- struct irq_data *child_irq_data;
+ struct irq_data *irq_data = irq_get_irq_data(virq);
+ struct irq_data *parent_irq_data;
struct irq_data *tmp_irq_data;
struct irq_desc *desc;
@@ -1643,37 +1839,37 @@ int irq_domain_pop_irq(struct irq_domain *domain, int virq)
if (domain == NULL)
return -EINVAL;
- if (!root_irq_data)
+ if (!irq_data)
return -EINVAL;
tmp_irq_data = irq_domain_get_irq_data(domain, virq);
/* We can only "pop" if this domain is at the top of the list */
- if (WARN_ON(root_irq_data != tmp_irq_data))
+ if (WARN_ON(irq_data != tmp_irq_data))
return -EINVAL;
- if (WARN_ON(root_irq_data->domain != domain))
+ if (WARN_ON(irq_data->domain != domain))
return -EINVAL;
- child_irq_data = root_irq_data->parent_data;
- if (WARN_ON(!child_irq_data))
+ parent_irq_data = irq_data->parent_data;
+ if (WARN_ON(!parent_irq_data))
return -EINVAL;
- mutex_lock(&irq_domain_mutex);
+ mutex_lock(&domain->root->mutex);
- root_irq_data->parent_data = NULL;
+ irq_data->parent_data = NULL;
- irq_domain_clear_mapping(domain, root_irq_data->hwirq);
+ irq_domain_clear_mapping(domain, irq_data->hwirq);
irq_domain_free_irqs_hierarchy(domain, virq, 1);
/* Restore the original irq_data. */
- *root_irq_data = *child_irq_data;
+ *irq_data = *parent_irq_data;
- irq_domain_fix_revmap(root_irq_data);
+ irq_domain_fix_revmap(irq_data);
- mutex_unlock(&irq_domain_mutex);
+ mutex_unlock(&domain->root->mutex);
- kfree(child_irq_data);
+ kfree(parent_irq_data);
return 0;
}
@@ -1687,22 +1883,33 @@ EXPORT_SYMBOL_GPL(irq_domain_pop_irq);
void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs)
{
struct irq_data *data = irq_get_irq_data(virq);
+ struct irq_domain *domain;
int i;
if (WARN(!data || !data->domain || !data->domain->ops->free,
"NULL pointer, cannot free irq\n"))
return;
- mutex_lock(&irq_domain_mutex);
+ domain = data->domain;
+
+ mutex_lock(&domain->root->mutex);
for (i = 0; i < nr_irqs; i++)
irq_domain_remove_irq(virq + i);
- irq_domain_free_irqs_hierarchy(data->domain, virq, nr_irqs);
- mutex_unlock(&irq_domain_mutex);
+ irq_domain_free_irqs_hierarchy(domain, virq, nr_irqs);
+ mutex_unlock(&domain->root->mutex);
irq_domain_free_irq_data(virq, nr_irqs);
irq_free_descs(virq, nr_irqs);
}
+static void irq_domain_free_one_irq(struct irq_domain *domain, unsigned int virq)
+{
+ if (irq_domain_is_msi_device(domain))
+ msi_device_domain_free_wired(domain, virq);
+ else
+ irq_domain_free_irqs(virq, 1);
+}
+
/**
* irq_domain_alloc_irqs_parent - Allocate interrupts from parent domain
* @domain: Domain below which interrupts must be allocated
@@ -1812,22 +2019,8 @@ static void irq_domain_check_hierarchy(struct irq_domain *domain)
if (domain->ops->alloc)
domain->flags |= IRQ_DOMAIN_FLAG_HIERARCHY;
}
-
-/**
- * irq_domain_hierarchical_is_msi_remap - Check if the domain or any
- * parent has MSI remapping support
- * @domain: domain pointer
- */
-bool irq_domain_hierarchical_is_msi_remap(struct irq_domain *domain)
-{
- for (; domain; domain = domain->parent) {
- if (irq_domain_is_msi_remap(domain))
- return true;
- }
- return false;
-}
#else /* CONFIG_IRQ_DOMAIN_HIERARCHY */
-/**
+/*
* irq_domain_get_irq_data - Get irq_data associated with @virq and @domain
* @domain: domain to match
* @virq: IRQ number to get irq_data
@@ -1841,7 +2034,7 @@ struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
}
EXPORT_SYMBOL_GPL(irq_domain_get_irq_data);
-/**
+/*
* irq_domain_set_info - Set the complete data for a @virq in @domain
* @domain: Interrupt domain to match
* @virq: IRQ number
@@ -1853,7 +2046,7 @@ EXPORT_SYMBOL_GPL(irq_domain_get_irq_data);
* @handler_name: The interrupt handler name
*/
void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
- irq_hw_number_t hwirq, struct irq_chip *chip,
+ irq_hw_number_t hwirq, const struct irq_chip *chip,
void *chip_data, irq_flow_handler_t handler,
void *handler_data, const char *handler_name)
{
@@ -1862,21 +2055,43 @@ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
irq_set_handler_data(virq, handler_data);
}
-static void irq_domain_check_hierarchy(struct irq_domain *domain)
+static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base,
+ unsigned int nr_irqs, int node, void *arg,
+ bool realloc, const struct irq_affinity_desc *affinity)
{
+ return -EINVAL;
}
+
+static void irq_domain_check_hierarchy(struct irq_domain *domain) { }
+static void irq_domain_free_one_irq(struct irq_domain *domain, unsigned int virq) { }
+
#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */
#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+#include "internals.h"
+
static struct dentry *domain_dir;
-static void
-irq_domain_debug_show_one(struct seq_file *m, struct irq_domain *d, int ind)
+static const struct irq_bit_descr irqdomain_flags[] = {
+ BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_HIERARCHY),
+ BIT_MASK_DESCR(IRQ_DOMAIN_NAME_ALLOCATED),
+ BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_IPI_PER_CPU),
+ BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_IPI_SINGLE),
+ BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_MSI),
+ BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_ISOLATED_MSI),
+ BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_NO_MAP),
+ BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_MSI_PARENT),
+ BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_MSI_DEVICE),
+ BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_NONCORE),
+};
+
+static void irq_domain_debug_show_one(struct seq_file *m, struct irq_domain *d, int ind)
{
seq_printf(m, "%*sname: %s\n", ind, "", d->name);
seq_printf(m, "%*ssize: %u\n", ind + 1, "", d->revmap_size);
seq_printf(m, "%*smapped: %u\n", ind + 1, "", d->mapcount);
seq_printf(m, "%*sflags: 0x%08x\n", ind +1 , "", d->flags);
+ irq_debug_show_bits(m, ind, d->flags, irqdomain_flags, ARRAY_SIZE(irqdomain_flags));
if (d->ops && d->ops->debug_show)
d->ops->debug_show(m, d, NULL, ind + 1);
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
@@ -1912,7 +2127,7 @@ static void debugfs_add_domain_dir(struct irq_domain *d)
static void debugfs_remove_domain_dir(struct irq_domain *d)
{
- debugfs_remove(debugfs_lookup(d->name, domain_dir));
+ debugfs_lookup_and_remove(d->name, domain_dir);
}
void __init irq_domain_debugfs_init(struct dentry *root)
diff --git a/kernel/irq/kexec.c b/kernel/irq/kexec.c
new file mode 100644
index 000000000000..1a3deffe6b5b
--- /dev/null
+++ b/kernel/irq/kexec.c
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/irqdesc.h>
+#include <linux/irqnr.h>
+
+#include "internals.h"
+
+void machine_kexec_mask_interrupts(void)
+{
+ struct irq_desc *desc;
+ unsigned int i;
+
+ for_each_irq_desc(i, desc) {
+ struct irq_chip *chip;
+ int check_eoi = 1;
+
+ chip = irq_desc_get_chip(desc);
+ if (!chip || !irqd_is_started(&desc->irq_data))
+ continue;
+
+ if (IS_ENABLED(CONFIG_GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD)) {
+ /*
+ * First try to remove the active state from an interrupt which is forwarded
+ * to a VM. If the interrupt is not forwarded, try to EOI the interrupt.
+ */
+ check_eoi = irq_set_irqchip_state(i, IRQCHIP_STATE_ACTIVE, false);
+ }
+
+ if (check_eoi && chip->irq_eoi && irqd_irq_inprogress(&desc->irq_data))
+ chip->irq_eoi(&desc->irq_data);
+
+ irq_shutdown(desc);
+ }
+}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index f23ffd30385b..0bb29316b436 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -35,14 +35,14 @@ static int __init setup_forced_irqthreads(char *arg)
early_param("threadirqs", setup_forced_irqthreads);
#endif
+static int __irq_get_irqchip_state(struct irq_data *d, enum irqchip_irq_state which, bool *state);
+
static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip)
{
struct irq_data *irqd = irq_desc_get_irq_data(desc);
bool inprogress;
do {
- unsigned long flags;
-
/*
* Wait until we're out of the critical section. This might
* give the wrong answer due to the lack of memory barriers.
@@ -51,7 +51,7 @@ static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip)
cpu_relax();
/* Ok, that indicated we're done: double-check carefully. */
- raw_spin_lock_irqsave(&desc->lock, flags);
+ guard(raw_spinlock_irqsave)(&desc->lock);
inprogress = irqd_irq_inprogress(&desc->irq_data);
/*
@@ -67,33 +67,30 @@ static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip)
__irq_get_irqchip_state(irqd, IRQCHIP_STATE_ACTIVE,
&inprogress);
}
- raw_spin_unlock_irqrestore(&desc->lock, flags);
-
/* Oops, that failed? */
} while (inprogress);
}
/**
- * synchronize_hardirq - wait for pending hard IRQ handlers (on other CPUs)
- * @irq: interrupt number to wait for
+ * synchronize_hardirq - wait for pending hard IRQ handlers (on other CPUs)
+ * @irq: interrupt number to wait for
*
- * This function waits for any pending hard IRQ handlers for this
- * interrupt to complete before returning. If you use this
- * function while holding a resource the IRQ handler may need you
- * will deadlock. It does not take associated threaded handlers
- * into account.
+ * This function waits for any pending hard IRQ handlers for this interrupt
+ * to complete before returning. If you use this function while holding a
+ * resource the IRQ handler may need you will deadlock. It does not take
+ * associated threaded handlers into account.
*
- * Do not use this for shutdown scenarios where you must be sure
- * that all parts (hardirq and threaded handler) have completed.
+ * Do not use this for shutdown scenarios where you must be sure that all
+ * parts (hardirq and threaded handler) have completed.
*
- * Returns: false if a threaded handler is active.
+ * Returns: false if a threaded handler is active.
*
- * This function may be called - with care - from IRQ context.
+ * This function may be called - with care - from IRQ context.
*
- * It does not check whether there is an interrupt in flight at the
- * hardware level, but not serviced yet, as this might deadlock when
- * called with interrupts disabled and the target CPU of the interrupt
- * is the current CPU.
+ * It does not check whether there is an interrupt in flight at the
+ * hardware level, but not serviced yet, as this might deadlock when called
+ * with interrupts disabled and the target CPU of the interrupt is the
+ * current CPU.
*/
bool synchronize_hardirq(unsigned int irq)
{
@@ -108,35 +105,37 @@ bool synchronize_hardirq(unsigned int irq)
}
EXPORT_SYMBOL(synchronize_hardirq);
+static void __synchronize_irq(struct irq_desc *desc)
+{
+ __synchronize_hardirq(desc, true);
+ /*
+ * We made sure that no hardirq handler is running. Now verify that no
+ * threaded handlers are active.
+ */
+ wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active));
+}
+
/**
- * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
- * @irq: interrupt number to wait for
+ * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
+ * @irq: interrupt number to wait for
*
- * This function waits for any pending IRQ handlers for this interrupt
- * to complete before returning. If you use this function while
- * holding a resource the IRQ handler may need you will deadlock.
+ * This function waits for any pending IRQ handlers for this interrupt to
+ * complete before returning. If you use this function while holding a
+ * resource the IRQ handler may need you will deadlock.
*
- * Can only be called from preemptible code as it might sleep when
- * an interrupt thread is associated to @irq.
+ * Can only be called from preemptible code as it might sleep when
+ * an interrupt thread is associated to @irq.
*
- * It optionally makes sure (when the irq chip supports that method)
- * that the interrupt is not pending in any CPU and waiting for
- * service.
+ * It optionally makes sure (when the irq chip supports that method)
+ * that the interrupt is not pending in any CPU and waiting for
+ * service.
*/
void synchronize_irq(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
- if (desc) {
- __synchronize_hardirq(desc, true);
- /*
- * We made sure that no hardirq handler is
- * running. Now verify that no threaded handlers are
- * active.
- */
- wait_event(desc->wait_for_threads,
- !atomic_read(&desc->threads_active));
- }
+ if (desc)
+ __synchronize_irq(desc);
}
EXPORT_SYMBOL(synchronize_irq);
@@ -152,8 +151,8 @@ static bool __irq_can_set_affinity(struct irq_desc *desc)
}
/**
- * irq_can_set_affinity - Check if the affinity of a given irq can be set
- * @irq: Interrupt to check
+ * irq_can_set_affinity - Check if the affinity of a given irq can be set
+ * @irq: Interrupt to check
*
*/
int irq_can_set_affinity(unsigned int irq)
@@ -177,21 +176,28 @@ bool irq_can_set_affinity_usr(unsigned int irq)
}
/**
- * irq_set_thread_affinity - Notify irq threads to adjust affinity
- * @desc: irq descriptor which has affinity changed
+ * irq_set_thread_affinity - Notify irq threads to adjust affinity
+ * @desc: irq descriptor which has affinity changed
*
- * We just set IRQTF_AFFINITY and delegate the affinity setting
- * to the interrupt thread itself. We can not call
- * set_cpus_allowed_ptr() here as we hold desc->lock and this
- * code can be called from hard interrupt context.
+ * Just set IRQTF_AFFINITY and delegate the affinity setting to the
+ * interrupt thread itself. We can not call set_cpus_allowed_ptr() here as
+ * we hold desc->lock and this code can be called from hard interrupt
+ * context.
*/
-void irq_set_thread_affinity(struct irq_desc *desc)
+static void irq_set_thread_affinity(struct irq_desc *desc)
{
struct irqaction *action;
- for_each_action_of_desc(desc, action)
- if (action->thread)
+ for_each_action_of_desc(desc, action) {
+ if (action->thread) {
set_bit(IRQTF_AFFINITY, &action->thread_flags);
+ wake_up_process(action->thread);
+ }
+ if (action->secondary && action->secondary->thread) {
+ set_bit(IRQTF_AFFINITY, &action->secondary->thread_flags);
+ wake_up_process(action->secondary->thread);
+ }
+ }
}
#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
@@ -205,23 +211,19 @@ static void irq_validate_effective_affinity(struct irq_data *data)
pr_warn_once("irq_chip %s did not update eff. affinity mask of irq %u\n",
chip->name, data->irq);
}
-
-static inline void irq_init_effective_affinity(struct irq_data *data,
- const struct cpumask *mask)
-{
- cpumask_copy(irq_data_get_effective_affinity_mask(data), mask);
-}
#else
static inline void irq_validate_effective_affinity(struct irq_data *data) { }
-static inline void irq_init_effective_affinity(struct irq_data *data,
- const struct cpumask *mask) { }
#endif
+static DEFINE_PER_CPU(struct cpumask, __tmp_mask);
+
int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
bool force)
{
+ struct cpumask *tmp_mask = this_cpu_ptr(&__tmp_mask);
struct irq_desc *desc = irq_data_to_desc(data);
struct irq_chip *chip = irq_data_get_irq_chip(data);
+ const struct cpumask *prog_mask;
int ret;
if (!chip || !chip->irq_set_affinity)
@@ -247,25 +249,33 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
* online.
*/
if (irqd_affinity_is_managed(data) &&
- housekeeping_enabled(HK_FLAG_MANAGED_IRQ)) {
- const struct cpumask *hk_mask, *prog_mask;
+ housekeeping_enabled(HK_TYPE_MANAGED_IRQ)) {
+ const struct cpumask *hk_mask;
- static DEFINE_RAW_SPINLOCK(tmp_mask_lock);
- static struct cpumask tmp_mask;
+ hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ);
- hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ);
-
- raw_spin_lock(&tmp_mask_lock);
- cpumask_and(&tmp_mask, mask, hk_mask);
- if (!cpumask_intersects(&tmp_mask, cpu_online_mask))
+ cpumask_and(tmp_mask, mask, hk_mask);
+ if (!cpumask_intersects(tmp_mask, cpu_online_mask))
prog_mask = mask;
else
- prog_mask = &tmp_mask;
- ret = chip->irq_set_affinity(data, prog_mask, force);
- raw_spin_unlock(&tmp_mask_lock);
+ prog_mask = tmp_mask;
} else {
- ret = chip->irq_set_affinity(data, mask, force);
+ prog_mask = mask;
}
+
+ /*
+ * Make sure we only provide online CPUs to the irqchip,
+ * unless we are being asked to force the affinity (in which
+ * case we do as we are told).
+ */
+ cpumask_and(tmp_mask, prog_mask, cpu_online_mask);
+ if (!force && !cpumask_empty(tmp_mask))
+ ret = chip->irq_set_affinity(data, tmp_mask, force);
+ else if (force)
+ ret = chip->irq_set_affinity(data, mask, force);
+ else
+ ret = -EINVAL;
+
switch (ret) {
case IRQ_SET_MASK_OK:
case IRQ_SET_MASK_OK_DONE:
@@ -314,7 +324,7 @@ static int irq_try_set_affinity(struct irq_data *data,
}
static bool irq_set_affinity_deactivated(struct irq_data *data,
- const struct cpumask *mask, bool force)
+ const struct cpumask *mask)
{
struct irq_desc *desc = irq_data_to_desc(data);
@@ -332,7 +342,7 @@ static bool irq_set_affinity_deactivated(struct irq_data *data,
return false;
cpumask_copy(desc->irq_common_data.affinity, mask);
- irq_init_effective_affinity(data, mask);
+ irq_data_update_effective_affinity(data, mask);
irqd_set(data, IRQD_AFFINITY_SET);
return true;
}
@@ -347,7 +357,7 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
if (!chip || !chip->irq_set_affinity)
return -EINVAL;
- if (irq_set_affinity_deactivated(data, mask, force))
+ if (irq_set_affinity_deactivated(data, mask))
return 0;
if (irq_can_move_pcntxt(data) && !irqd_is_setaffinity_pending(data)) {
@@ -385,14 +395,8 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
* an interrupt which is already started or which has already been configured
* as managed will also fail, as these mean invalid init state or double init.
*/
-int irq_update_affinity_desc(unsigned int irq,
- struct irq_affinity_desc *affinity)
+int irq_update_affinity_desc(unsigned int irq, struct irq_affinity_desc *affinity)
{
- struct irq_desc *desc;
- unsigned long flags;
- bool activated;
- int ret = 0;
-
/*
* Supporting this with the reservation scheme used by x86 needs
* some more thought. Fail it for now.
@@ -400,60 +404,50 @@ int irq_update_affinity_desc(unsigned int irq,
if (IS_ENABLED(CONFIG_GENERIC_IRQ_RESERVATION_MODE))
return -EOPNOTSUPP;
- desc = irq_get_desc_buslock(irq, &flags, 0);
- if (!desc)
- return -EINVAL;
+ scoped_irqdesc_get_and_buslock(irq, 0) {
+ struct irq_desc *desc = scoped_irqdesc;
+ bool activated;
- /* Requires the interrupt to be shut down */
- if (irqd_is_started(&desc->irq_data)) {
- ret = -EBUSY;
- goto out_unlock;
- }
-
- /* Interrupts which are already managed cannot be modified */
- if (irqd_affinity_is_managed(&desc->irq_data)) {
- ret = -EBUSY;
- goto out_unlock;
- }
+ /* Requires the interrupt to be shut down */
+ if (irqd_is_started(&desc->irq_data))
+ return -EBUSY;
- /*
- * Deactivate the interrupt. That's required to undo
- * anything an earlier activation has established.
- */
- activated = irqd_is_activated(&desc->irq_data);
- if (activated)
- irq_domain_deactivate_irq(&desc->irq_data);
-
- if (affinity->is_managed) {
- irqd_set(&desc->irq_data, IRQD_AFFINITY_MANAGED);
- irqd_set(&desc->irq_data, IRQD_MANAGED_SHUTDOWN);
- }
+ /* Interrupts which are already managed cannot be modified */
+ if (irqd_affinity_is_managed(&desc->irq_data))
+ return -EBUSY;
+ /*
+ * Deactivate the interrupt. That's required to undo
+ * anything an earlier activation has established.
+ */
+ activated = irqd_is_activated(&desc->irq_data);
+ if (activated)
+ irq_domain_deactivate_irq(&desc->irq_data);
- cpumask_copy(desc->irq_common_data.affinity, &affinity->mask);
+ if (affinity->is_managed) {
+ irqd_set(&desc->irq_data, IRQD_AFFINITY_MANAGED);
+ irqd_set(&desc->irq_data, IRQD_MANAGED_SHUTDOWN);
+ }
- /* Restore the activation state */
- if (activated)
- irq_domain_activate_irq(&desc->irq_data, false);
+ cpumask_copy(desc->irq_common_data.affinity, &affinity->mask);
-out_unlock:
- irq_put_desc_busunlock(desc, flags);
- return ret;
+ /* Restore the activation state */
+ if (activated)
+ irq_domain_activate_irq(&desc->irq_data, false);
+ return 0;
+ }
+ return -EINVAL;
}
static int __irq_set_affinity(unsigned int irq, const struct cpumask *mask,
bool force)
{
struct irq_desc *desc = irq_to_desc(irq);
- unsigned long flags;
- int ret;
if (!desc)
return -EINVAL;
- raw_spin_lock_irqsave(&desc->lock, flags);
- ret = irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask, force);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
- return ret;
+ guard(raw_spinlock_irqsave)(&desc->lock);
+ return irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask, force);
}
/**
@@ -486,39 +480,36 @@ int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask)
}
EXPORT_SYMBOL_GPL(irq_force_affinity);
-int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m,
- bool setaffinity)
+int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m, bool setaffinity)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
+ int ret = -EINVAL;
- if (!desc)
- return -EINVAL;
- desc->affinity_hint = m;
- irq_put_desc_unlock(desc, flags);
- if (m && setaffinity)
+ scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) {
+ scoped_irqdesc->affinity_hint = m;
+ ret = 0;
+ }
+
+ if (!ret && m && setaffinity)
__irq_set_affinity(irq, m, false);
- return 0;
+ return ret;
}
EXPORT_SYMBOL_GPL(__irq_apply_affinity_hint);
static void irq_affinity_notify(struct work_struct *work)
{
- struct irq_affinity_notify *notify =
- container_of(work, struct irq_affinity_notify, work);
+ struct irq_affinity_notify *notify = container_of(work, struct irq_affinity_notify, work);
struct irq_desc *desc = irq_to_desc(notify->irq);
cpumask_var_t cpumask;
- unsigned long flags;
if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL))
goto out;
- raw_spin_lock_irqsave(&desc->lock, flags);
- if (irq_move_pending(&desc->irq_data))
- irq_get_pending(cpumask, desc);
- else
- cpumask_copy(cpumask, desc->irq_common_data.affinity);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ scoped_guard(raw_spinlock_irqsave, &desc->lock) {
+ if (irq_move_pending(&desc->irq_data))
+ irq_get_pending(cpumask, desc);
+ else
+ cpumask_copy(cpumask, desc->irq_common_data.affinity);
+ }
notify->notify(notify, cpumask);
@@ -528,27 +519,25 @@ out:
}
/**
- * irq_set_affinity_notifier - control notification of IRQ affinity changes
- * @irq: Interrupt for which to enable/disable notification
- * @notify: Context for notification, or %NULL to disable
- * notification. Function pointers must be initialised;
- * the other fields will be initialised by this function.
- *
- * Must be called in process context. Notification may only be enabled
- * after the IRQ is allocated and must be disabled before the IRQ is
- * freed using free_irq().
+ * irq_set_affinity_notifier - control notification of IRQ affinity changes
+ * @irq: Interrupt for which to enable/disable notification
+ * @notify: Context for notification, or %NULL to disable
+ * notification. Function pointers must be initialised;
+ * the other fields will be initialised by this function.
+ *
+ * Must be called in process context. Notification may only be enabled
+ * after the IRQ is allocated and must be disabled before the IRQ is freed
+ * using free_irq().
*/
-int
-irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
+int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irq_affinity_notify *old_notify;
- unsigned long flags;
/* The release function is promised process context */
might_sleep();
- if (!desc || desc->istate & IRQS_NMI)
+ if (!desc || irq_is_nmi(desc))
return -EINVAL;
/* Complete initialisation of *notify */
@@ -558,10 +547,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
INIT_WORK(&notify->work, irq_affinity_notify);
}
- raw_spin_lock_irqsave(&desc->lock, flags);
- old_notify = desc->affinity_notify;
- desc->affinity_notify = notify;
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ scoped_guard(raw_spinlock_irq, &desc->lock) {
+ old_notify = desc->affinity_notify;
+ desc->affinity_notify = notify;
+ }
if (old_notify) {
if (cancel_work_sync(&old_notify->work)) {
@@ -582,7 +571,8 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
int irq_setup_affinity(struct irq_desc *desc)
{
struct cpumask *set = irq_default_affinity;
- int ret, node = irq_desc_get_node(desc);
+ int node = irq_desc_get_node(desc);
+
static DEFINE_RAW_SPINLOCK(mask_lock);
static struct cpumask mask;
@@ -590,7 +580,7 @@ int irq_setup_affinity(struct irq_desc *desc)
if (!__irq_can_set_affinity(desc))
return 0;
- raw_spin_lock(&mask_lock);
+ guard(raw_spinlock)(&mask_lock);
/*
* Preserve the managed affinity setting and a userspace affinity
* setup, but make sure that one of the targets is online.
@@ -615,9 +605,7 @@ int irq_setup_affinity(struct irq_desc *desc)
if (cpumask_intersects(&mask, nodemask))
cpumask_and(&mask, &mask, nodemask);
}
- ret = irq_do_set_affinity(&desc->irq_data, &mask, false);
- raw_spin_unlock(&mask_lock);
- return ret;
+ return irq_do_set_affinity(&desc->irq_data, &mask, false);
}
#else
/* Wrapper for ALPHA specific affinity selector magic */
@@ -630,44 +618,36 @@ int irq_setup_affinity(struct irq_desc *desc)
/**
- * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt
- * @irq: interrupt number to set affinity
- * @vcpu_info: vCPU specific data or pointer to a percpu array of vCPU
- * specific data for percpu_devid interrupts
- *
- * This function uses the vCPU specific data to set the vCPU
- * affinity for an irq. The vCPU specific data is passed from
- * outside, such as KVM. One example code path is as below:
- * KVM -> IOMMU -> irq_set_vcpu_affinity().
+ * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt
+ * @irq: interrupt number to set affinity
+ * @vcpu_info: vCPU specific data or pointer to a percpu array of vCPU
+ * specific data for percpu_devid interrupts
+ *
+ * This function uses the vCPU specific data to set the vCPU affinity for
+ * an irq. The vCPU specific data is passed from outside, such as KVM. One
+ * example code path is as below: KVM -> IOMMU -> irq_set_vcpu_affinity().
*/
int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
- struct irq_data *data;
- struct irq_chip *chip;
- int ret = -ENOSYS;
+ scoped_irqdesc_get_and_lock(irq, 0) {
+ struct irq_desc *desc = scoped_irqdesc;
+ struct irq_data *data;
+ struct irq_chip *chip;
- if (!desc)
- return -EINVAL;
-
- data = irq_desc_get_irq_data(desc);
- do {
- chip = irq_data_get_irq_chip(data);
- if (chip && chip->irq_set_vcpu_affinity)
- break;
-#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
- data = data->parent_data;
-#else
- data = NULL;
-#endif
- } while (data);
+ data = irq_desc_get_irq_data(desc);
+ do {
+ chip = irq_data_get_irq_chip(data);
+ if (chip && chip->irq_set_vcpu_affinity)
+ break;
- if (data)
- ret = chip->irq_set_vcpu_affinity(data, vcpu_info);
- irq_put_desc_unlock(desc, flags);
+ data = irqd_get_parent_data(data);
+ } while (data);
- return ret;
+ if (!data)
+ return -ENOSYS;
+ return chip->irq_set_vcpu_affinity(data, vcpu_info);
+ }
+ return -EINVAL;
}
EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity);
@@ -679,26 +659,23 @@ void __disable_irq(struct irq_desc *desc)
static int __disable_irq_nosync(unsigned int irq)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
-
- if (!desc)
- return -EINVAL;
- __disable_irq(desc);
- irq_put_desc_busunlock(desc, flags);
- return 0;
+ scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) {
+ __disable_irq(scoped_irqdesc);
+ return 0;
+ }
+ return -EINVAL;
}
/**
- * disable_irq_nosync - disable an irq without waiting
- * @irq: Interrupt to disable
+ * disable_irq_nosync - disable an irq without waiting
+ * @irq: Interrupt to disable
*
- * Disable the selected interrupt line. Disables and Enables are
- * nested.
- * Unlike disable_irq(), this function does not ensure existing
- * instances of the IRQ handler have completed before returning.
+ * Disable the selected interrupt line. Disables and Enables are
+ * nested.
+ * Unlike disable_irq(), this function does not ensure existing
+ * instances of the IRQ handler have completed before returning.
*
- * This function may be called from IRQ context.
+ * This function may be called from IRQ context.
*/
void disable_irq_nosync(unsigned int irq)
{
@@ -707,59 +684,61 @@ void disable_irq_nosync(unsigned int irq)
EXPORT_SYMBOL(disable_irq_nosync);
/**
- * disable_irq - disable an irq and wait for completion
- * @irq: Interrupt to disable
+ * disable_irq - disable an irq and wait for completion
+ * @irq: Interrupt to disable
+ *
+ * Disable the selected interrupt line. Enables and Disables are nested.
*
- * Disable the selected interrupt line. Enables and Disables are
- * nested.
- * This function waits for any pending IRQ handlers for this interrupt
- * to complete before returning. If you use this function while
- * holding a resource the IRQ handler may need you will deadlock.
+ * This function waits for any pending IRQ handlers for this interrupt to
+ * complete before returning. If you use this function while holding a
+ * resource the IRQ handler may need you will deadlock.
+ *
+ * Can only be called from preemptible code as it might sleep when an
+ * interrupt thread is associated to @irq.
*
- * This function may be called - with care - from IRQ context.
*/
void disable_irq(unsigned int irq)
{
+ might_sleep();
if (!__disable_irq_nosync(irq))
synchronize_irq(irq);
}
EXPORT_SYMBOL(disable_irq);
/**
- * disable_hardirq - disables an irq and waits for hardirq completion
- * @irq: Interrupt to disable
+ * disable_hardirq - disables an irq and waits for hardirq completion
+ * @irq: Interrupt to disable
+ *
+ * Disable the selected interrupt line. Enables and Disables are nested.
*
- * Disable the selected interrupt line. Enables and Disables are
- * nested.
- * This function waits for any pending hard IRQ handlers for this
- * interrupt to complete before returning. If you use this function while
- * holding a resource the hard IRQ handler may need you will deadlock.
+ * This function waits for any pending hard IRQ handlers for this interrupt
+ * to complete before returning. If you use this function while holding a
+ * resource the hard IRQ handler may need you will deadlock.
*
- * When used to optimistically disable an interrupt from atomic context
- * the return value must be checked.
+ * When used to optimistically disable an interrupt from atomic context the
+ * return value must be checked.
*
- * Returns: false if a threaded handler is active.
+ * Returns: false if a threaded handler is active.
*
- * This function may be called - with care - from IRQ context.
+ * This function may be called - with care - from IRQ context.
*/
bool disable_hardirq(unsigned int irq)
{
if (!__disable_irq_nosync(irq))
return synchronize_hardirq(irq);
-
return false;
}
EXPORT_SYMBOL_GPL(disable_hardirq);
/**
- * disable_nmi_nosync - disable an nmi without waiting
- * @irq: Interrupt to disable
- *
- * Disable the selected interrupt line. Disables and enables are
- * nested.
- * The interrupt to disable must have been requested through request_nmi.
- * Unlike disable_nmi(), this function does not ensure existing
- * instances of the IRQ handler have completed before returning.
+ * disable_nmi_nosync - disable an nmi without waiting
+ * @irq: Interrupt to disable
+ *
+ * Disable the selected interrupt line. Disables and enables are nested.
+ *
+ * The interrupt to disable must have been requested through request_nmi.
+ * Unlike disable_nmi(), this function does not ensure existing
+ * instances of the IRQ handler have completed before returning.
*/
void disable_nmi_nosync(unsigned int irq)
{
@@ -781,10 +760,14 @@ void __enable_irq(struct irq_desc *desc)
irq_settings_set_noprobe(desc);
/*
* Call irq_startup() not irq_enable() here because the
- * interrupt might be marked NOAUTOEN. So irq_startup()
- * needs to be invoked when it gets enabled the first
- * time. If it was already started up, then irq_startup()
- * will invoke irq_enable() under the hood.
+ * interrupt might be marked NOAUTOEN so irq_startup()
+ * needs to be invoked when it gets enabled the first time.
+ * This is also required when __enable_irq() is invoked for
+ * a managed and shutdown interrupt from the S3 resume
+ * path.
+ *
+ * If it was already started up, then irq_startup() will
+ * invoke irq_enable() under the hood.
*/
irq_startup(desc, IRQ_RESEND, IRQ_START_FORCE);
break;
@@ -795,41 +778,34 @@ void __enable_irq(struct irq_desc *desc)
}
/**
- * enable_irq - enable handling of an irq
- * @irq: Interrupt to enable
+ * enable_irq - enable handling of an irq
+ * @irq: Interrupt to enable
*
- * Undoes the effect of one call to disable_irq(). If this
- * matches the last disable, processing of interrupts on this
- * IRQ line is re-enabled.
+ * Undoes the effect of one call to disable_irq(). If this matches the
+ * last disable, processing of interrupts on this IRQ line is re-enabled.
*
- * This function may be called from IRQ context only when
- * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL !
+ * This function may be called from IRQ context only when
+ * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL !
*/
void enable_irq(unsigned int irq)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
-
- if (!desc)
- return;
- if (WARN(!desc->irq_data.chip,
- KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
- goto out;
+ scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) {
+ struct irq_desc *desc = scoped_irqdesc;
- __enable_irq(desc);
-out:
- irq_put_desc_busunlock(desc, flags);
+ if (WARN(!desc->irq_data.chip, "enable_irq before setup/request_irq: irq %u\n", irq))
+ return;
+ __enable_irq(desc);
+ }
}
EXPORT_SYMBOL(enable_irq);
/**
- * enable_nmi - enable handling of an nmi
- * @irq: Interrupt to enable
+ * enable_nmi - enable handling of an nmi
+ * @irq: Interrupt to enable
*
- * The interrupt to enable must have been requested through request_nmi.
- * Undoes the effect of one call to disable_nmi(). If this
- * matches the last disable, processing of interrupts on this
- * IRQ line is re-enabled.
+ * The interrupt to enable must have been requested through request_nmi.
+ * Undoes the effect of one call to disable_nmi(). If this matches the last
+ * disable, processing of interrupts on this IRQ line is re-enabled.
*/
void enable_nmi(unsigned int irq)
{
@@ -851,65 +827,59 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
}
/**
- * irq_set_irq_wake - control irq power management wakeup
- * @irq: interrupt to control
- * @on: enable/disable power management wakeup
- *
- * Enable/disable power management wakeup mode, which is
- * disabled by default. Enables and disables must match,
- * just as they match for non-wakeup mode support.
- *
- * Wakeup mode lets this IRQ wake the system from sleep
- * states like "suspend to RAM".
- *
- * Note: irq enable/disable state is completely orthogonal
- * to the enable/disable state of irq wake. An irq can be
- * disabled with disable_irq() and still wake the system as
- * long as the irq has wake enabled. If this does not hold,
- * then the underlying irq chip and the related driver need
- * to be investigated.
+ * irq_set_irq_wake - control irq power management wakeup
+ * @irq: interrupt to control
+ * @on: enable/disable power management wakeup
+ *
+ * Enable/disable power management wakeup mode, which is disabled by
+ * default. Enables and disables must match, just as they match for
+ * non-wakeup mode support.
+ *
+ * Wakeup mode lets this IRQ wake the system from sleep states like
+ * "suspend to RAM".
+ *
+ * Note: irq enable/disable state is completely orthogonal to the
+ * enable/disable state of irq wake. An irq can be disabled with
+ * disable_irq() and still wake the system as long as the irq has wake
+ * enabled. If this does not hold, then the underlying irq chip and the
+ * related driver need to be investigated.
*/
int irq_set_irq_wake(unsigned int irq, unsigned int on)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
- int ret = 0;
+ scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) {
+ struct irq_desc *desc = scoped_irqdesc;
+ int ret = 0;
- if (!desc)
- return -EINVAL;
-
- /* Don't use NMIs as wake up interrupts please */
- if (desc->istate & IRQS_NMI) {
- ret = -EINVAL;
- goto out_unlock;
- }
+ /* Don't use NMIs as wake up interrupts please */
+ if (irq_is_nmi(desc))
+ return -EINVAL;
- /* wakeup-capable irqs can be shared between drivers that
- * don't need to have the same sleep mode behaviors.
- */
- if (on) {
- if (desc->wake_depth++ == 0) {
- ret = set_irq_wake_real(irq, on);
- if (ret)
- desc->wake_depth = 0;
- else
- irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE);
- }
- } else {
- if (desc->wake_depth == 0) {
- WARN(1, "Unbalanced IRQ %d wake disable\n", irq);
- } else if (--desc->wake_depth == 0) {
- ret = set_irq_wake_real(irq, on);
- if (ret)
- desc->wake_depth = 1;
- else
- irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE);
+ /*
+ * wakeup-capable irqs can be shared between drivers that
+ * don't need to have the same sleep mode behaviors.
+ */
+ if (on) {
+ if (desc->wake_depth++ == 0) {
+ ret = set_irq_wake_real(irq, on);
+ if (ret)
+ desc->wake_depth = 0;
+ else
+ irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE);
+ }
+ } else {
+ if (desc->wake_depth == 0) {
+ WARN(1, "Unbalanced IRQ %d wake disable\n", irq);
+ } else if (--desc->wake_depth == 0) {
+ ret = set_irq_wake_real(irq, on);
+ if (ret)
+ desc->wake_depth = 1;
+ else
+ irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE);
+ }
}
+ return ret;
}
-
-out_unlock:
- irq_put_desc_busunlock(desc, flags);
- return ret;
+ return -EINVAL;
}
EXPORT_SYMBOL(irq_set_irq_wake);
@@ -918,22 +888,17 @@ EXPORT_SYMBOL(irq_set_irq_wake);
* particular irq has been exclusively allocated or is available
* for driver use.
*/
-int can_request_irq(unsigned int irq, unsigned long irqflags)
+bool can_request_irq(unsigned int irq, unsigned long irqflags)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
- int canrequest = 0;
+ scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) {
+ struct irq_desc *desc = scoped_irqdesc;
- if (!desc)
- return 0;
-
- if (irq_settings_can_request(desc)) {
- if (!desc->action ||
- irqflags & desc->action->flags & IRQF_SHARED)
- canrequest = 1;
+ if (irq_settings_can_request(desc)) {
+ if (!desc->action || irqflags & desc->action->flags & IRQF_SHARED)
+ return true;
+ }
}
- irq_put_desc_unlock(desc, flags);
- return canrequest;
+ return false;
}
int __irq_set_trigger(struct irq_desc *desc, unsigned long flags)
@@ -994,16 +959,11 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags)
#ifdef CONFIG_HARDIRQS_SW_RESEND
int irq_set_parent(int irq, int parent_irq)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
-
- if (!desc)
- return -EINVAL;
-
- desc->parent_irq = parent_irq;
-
- irq_put_desc_unlock(desc, flags);
- return 0;
+ scoped_irqdesc_get_and_lock(irq, 0) {
+ scoped_irqdesc->parent_irq = parent_irq;
+ return 0;
+ }
+ return -EINVAL;
}
EXPORT_SYMBOL_GPL(irq_set_parent);
#endif
@@ -1034,10 +994,48 @@ static irqreturn_t irq_forced_secondary_handler(int irq, void *dev_id)
return IRQ_NONE;
}
-static int irq_wait_for_interrupt(struct irqaction *action)
+#ifdef CONFIG_SMP
+/*
+ * Check whether we need to change the affinity of the interrupt thread.
+ */
+static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
+{
+ cpumask_var_t mask;
+
+ if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags))
+ return;
+
+ __set_current_state(TASK_RUNNING);
+
+ /*
+ * In case we are out of memory we set IRQTF_AFFINITY again and
+ * try again next time
+ */
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
+ set_bit(IRQTF_AFFINITY, &action->thread_flags);
+ return;
+ }
+
+ scoped_guard(raw_spinlock_irq, &desc->lock) {
+ const struct cpumask *m;
+
+ m = irq_data_get_effective_affinity_mask(&desc->irq_data);
+ cpumask_copy(mask, m);
+ }
+
+ set_cpus_allowed_ptr(current, mask);
+ free_cpumask_var(mask);
+}
+#else
+static inline void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
+#endif
+
+static int irq_wait_for_interrupt(struct irq_desc *desc,
+ struct irqaction *action)
{
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
+ irq_thread_check_affinity(desc, action);
if (kthread_should_stop()) {
/* may need to run one last time */
@@ -1114,51 +1112,21 @@ out_unlock:
chip_bus_sync_unlock(desc);
}
-#ifdef CONFIG_SMP
/*
- * Check whether we need to change the affinity of the interrupt thread.
+ * Interrupts explicitly requested as threaded interrupts want to be
+ * preemptible - many of them need to sleep and wait for slow busses to
+ * complete.
*/
-static void
-irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
+static irqreturn_t irq_thread_fn(struct irq_desc *desc, struct irqaction *action)
{
- cpumask_var_t mask;
- bool valid = true;
-
- if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags))
- return;
-
- /*
- * In case we are out of memory we set IRQTF_AFFINITY again and
- * try again next time
- */
- if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
- set_bit(IRQTF_AFFINITY, &action->thread_flags);
- return;
- }
+ irqreturn_t ret = action->thread_fn(action->irq, action->dev_id);
- raw_spin_lock_irq(&desc->lock);
- /*
- * This code is triggered unconditionally. Check the affinity
- * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out.
- */
- if (cpumask_available(desc->irq_common_data.affinity)) {
- const struct cpumask *m;
-
- m = irq_data_get_effective_affinity_mask(&desc->irq_data);
- cpumask_copy(mask, m);
- } else {
- valid = false;
- }
- raw_spin_unlock_irq(&desc->lock);
+ if (ret == IRQ_HANDLED)
+ atomic_inc(&desc->threads_handled);
- if (valid)
- set_cpus_allowed_ptr(current, mask);
- free_cpumask_var(mask);
+ irq_finalize_oneshot(desc, action);
+ return ret;
}
-#else
-static inline void
-irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
-#endif
/*
* Interrupts which are not explicitly requested as threaded
@@ -1166,44 +1134,21 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
* context. So we need to disable bh here to avoid deadlocks and other
* side effects.
*/
-static irqreturn_t
-irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
+static irqreturn_t irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
{
irqreturn_t ret;
local_bh_disable();
if (!IS_ENABLED(CONFIG_PREEMPT_RT))
local_irq_disable();
- ret = action->thread_fn(action->irq, action->dev_id);
- if (ret == IRQ_HANDLED)
- atomic_inc(&desc->threads_handled);
-
- irq_finalize_oneshot(desc, action);
+ ret = irq_thread_fn(desc, action);
if (!IS_ENABLED(CONFIG_PREEMPT_RT))
local_irq_enable();
local_bh_enable();
return ret;
}
-/*
- * Interrupts explicitly requested as threaded interrupts want to be
- * preemptible - many of them need to sleep and wait for slow busses to
- * complete.
- */
-static irqreturn_t irq_thread_fn(struct irq_desc *desc,
- struct irqaction *action)
-{
- irqreturn_t ret;
-
- ret = action->thread_fn(action->irq, action->dev_id);
- if (ret == IRQ_HANDLED)
- atomic_inc(&desc->threads_handled);
-
- irq_finalize_oneshot(desc, action);
- return ret;
-}
-
-static void wake_threads_waitq(struct irq_desc *desc)
+void wake_threads_waitq(struct irq_desc *desc)
{
if (atomic_dec_and_test(&desc->threads_active))
wake_up(&desc->wait_for_threads);
@@ -1243,9 +1188,33 @@ static void irq_wake_secondary(struct irq_desc *desc, struct irqaction *action)
if (WARN_ON_ONCE(!secondary))
return;
- raw_spin_lock_irq(&desc->lock);
+ guard(raw_spinlock_irq)(&desc->lock);
__irq_wake_thread(desc, secondary);
- raw_spin_unlock_irq(&desc->lock);
+}
+
+/*
+ * Internal function to notify that a interrupt thread is ready.
+ */
+static void irq_thread_set_ready(struct irq_desc *desc,
+ struct irqaction *action)
+{
+ set_bit(IRQTF_READY, &action->thread_flags);
+ wake_up(&desc->wait_for_threads);
+}
+
+/*
+ * Internal function to wake up a interrupt thread and wait until it is
+ * ready.
+ */
+static void wake_up_and_wait_for_irq_thread_ready(struct irq_desc *desc,
+ struct irqaction *action)
+{
+ if (!action || !action->thread)
+ return;
+
+ wake_up_process(action->thread);
+ wait_event(desc->wait_for_threads,
+ test_bit(IRQTF_READY, &action->thread_flags));
}
/*
@@ -1259,7 +1228,12 @@ static int irq_thread(void *data)
irqreturn_t (*handler_fn)(struct irq_desc *desc,
struct irqaction *action);
- sched_set_fifo(current);
+ irq_thread_set_ready(desc, action);
+
+ if (action->handler == irq_forced_secondary_handler)
+ sched_set_fifo_secondary(current);
+ else
+ sched_set_fifo(current);
if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD,
&action->thread_flags))
@@ -1270,13 +1244,9 @@ static int irq_thread(void *data)
init_task_work(&on_exit_work, irq_thread_dtor);
task_work_add(current, &on_exit_work, TWA_NONE);
- irq_thread_check_affinity(desc, action);
-
- while (!irq_wait_for_interrupt(action)) {
+ while (!irq_wait_for_interrupt(desc, action)) {
irqreturn_t action_ret;
- irq_thread_check_affinity(desc, action);
-
action_ret = handler_fn(desc, action);
if (action_ret == IRQ_WAKE_THREAD)
irq_wake_secondary(desc, action);
@@ -1290,26 +1260,24 @@ static int irq_thread(void *data)
* synchronize_hardirq(). So neither IRQTF_RUNTHREAD nor the
* oneshot mask bit can be set.
*/
- task_work_cancel(current, irq_thread_dtor);
+ task_work_cancel_func(current, irq_thread_dtor);
return 0;
}
/**
- * irq_wake_thread - wake the irq thread for the action identified by dev_id
- * @irq: Interrupt line
- * @dev_id: Device identity for which the thread should be woken
- *
+ * irq_wake_thread - wake the irq thread for the action identified by dev_id
+ * @irq: Interrupt line
+ * @dev_id: Device identity for which the thread should be woken
*/
void irq_wake_thread(unsigned int irq, void *dev_id)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irqaction *action;
- unsigned long flags;
if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc)))
return;
- raw_spin_lock_irqsave(&desc->lock, flags);
+ guard(raw_spinlock_irqsave)(&desc->lock);
for_each_action_of_desc(desc, action) {
if (action->dev_id == dev_id) {
if (action->thread)
@@ -1317,7 +1285,6 @@ void irq_wake_thread(unsigned int irq, void *dev_id)
break;
}
}
- raw_spin_unlock_irqrestore(&desc->lock, flags);
}
EXPORT_SYMBOL_GPL(irq_wake_thread);
@@ -1432,19 +1399,39 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
* references an already freed task_struct.
*/
new->thread = get_task_struct(t);
+
/*
- * Tell the thread to set its affinity. This is
- * important for shared interrupt handlers as we do
- * not invoke setup_affinity() for the secondary
- * handlers as everything is already set up. Even for
- * interrupts marked with IRQF_NO_BALANCE this is
- * correct as we want the thread to move to the cpu(s)
- * on which the requesting code placed the interrupt.
+ * The affinity can not be established yet, but it will be once the
+ * interrupt is enabled. Delay and defer the actual setting to the
+ * thread itself once it is ready to run. In the meantime, prevent
+ * it from ever being re-affined directly by cpuset or
+ * housekeeping. The proper way to do it is to re-affine the whole
+ * vector.
*/
- set_bit(IRQTF_AFFINITY, &new->thread_flags);
+ kthread_bind_mask(t, cpu_possible_mask);
+
+ /*
+ * Ensure the thread adjusts the affinity once it reaches the
+ * thread function.
+ */
+ new->thread_flags = BIT(IRQTF_AFFINITY);
+
return 0;
}
+static bool valid_percpu_irqaction(struct irqaction *old, struct irqaction *new)
+{
+ do {
+ if (cpumask_intersects(old->affinity, new->affinity) ||
+ old->percpu_dev_id == new->percpu_dev_id)
+ return false;
+
+ old = old->next;
+ } while (old);
+
+ return true;
+}
+
/*
* Internal function to register an irqaction - typically used to
* allocate special interrupts that are part of the architecture.
@@ -1465,6 +1452,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
struct irqaction *old, **old_ptr;
unsigned long flags, thread_mask = 0;
int ret, nested, shared = 0;
+ bool per_cpu_devid;
if (!desc)
return -EINVAL;
@@ -1474,6 +1462,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
if (!try_module_get(desc->owner))
return -ENODEV;
+ per_cpu_devid = irq_settings_is_per_cpu_devid(desc);
+
new->irq = irq;
/*
@@ -1581,13 +1571,20 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
*/
unsigned int oldtype;
- if (desc->istate & IRQS_NMI) {
+ if (irq_is_nmi(desc) && !per_cpu_devid) {
pr_err("Invalid attempt to share NMI for %s (irq %d) on irqchip %s.\n",
new->name, irq, desc->irq_data.chip->name);
ret = -EINVAL;
goto out_unlock;
}
+ if (per_cpu_devid && !valid_percpu_irqaction(old, new)) {
+ pr_err("Overlapping affinities for %s (irq %d) on irqchip %s.\n",
+ new->name, irq, desc->irq_data.chip->name);
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
/*
* If nobody did set the configuration before, inherit
* the one provided by the requester.
@@ -1600,8 +1597,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
}
if (!((old->flags & new->flags) & IRQF_SHARED) ||
- (oldtype != (new->flags & IRQF_TRIGGER_MASK)) ||
- ((old->flags ^ new->flags) & IRQF_ONESHOT))
+ (oldtype != (new->flags & IRQF_TRIGGER_MASK)))
+ goto mismatch;
+
+ if ((old->flags & IRQF_ONESHOT) &&
+ (new->flags & IRQF_COND_ONESHOT))
+ new->flags |= IRQF_ONESHOT;
+ else if ((old->flags ^ new->flags) & IRQF_ONESHOT)
goto mismatch;
/* All handlers must agree on per-cpuness */
@@ -1683,8 +1685,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
}
if (!shared) {
- init_waitqueue_head(&desc->wait_for_threads);
-
/* Setup the type (level, edge polarity) if configured: */
if (new->flags & IRQF_TRIGGER_MASK) {
ret = __irq_set_trigger(desc,
@@ -1735,7 +1735,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
if (!(new->flags & IRQF_NO_AUTOEN) &&
irq_settings_can_autoenable(desc)) {
irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
- } else {
+ } else if (!per_cpu_devid) {
/*
* Shared interrupts do not go well with disabling
* auto enable. The sharing interrupt might request
@@ -1780,14 +1780,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
irq_setup_timings(desc, new);
- /*
- * Strictly no need to wake it up, but hung_task complains
- * when no hard interrupt wakes the thread up.
- */
- if (new->thread)
- wake_up_process(new->thread);
- if (new->secondary)
- wake_up_process(new->secondary->thread);
+ wake_up_and_wait_for_irq_thread_ready(desc, new);
+ wake_up_and_wait_for_irq_thread_ready(desc, new->secondary);
register_irq_proc(irq, desc);
new->dir = NULL;
@@ -1818,15 +1812,13 @@ out_thread:
struct task_struct *t = new->thread;
new->thread = NULL;
- kthread_stop(t);
- put_task_struct(t);
+ kthread_stop_put(t);
}
if (new->secondary && new->secondary->thread) {
struct task_struct *t = new->secondary->thread;
new->secondary->thread = NULL;
- kthread_stop(t);
- put_task_struct(t);
+ kthread_stop_put(t);
}
out_mput:
module_put(desc->owner);
@@ -1912,7 +1904,7 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
* supports it also make sure that there is no (not yet serviced)
* interrupt in flight at the hardware level.
*/
- __synchronize_hardirq(desc, true);
+ __synchronize_irq(desc);
#ifdef CONFIG_DEBUG_SHIRQ
/*
@@ -1937,12 +1929,9 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
* the same bit to a newly requested action.
*/
if (action->thread) {
- kthread_stop(action->thread);
- put_task_struct(action->thread);
- if (action->secondary && action->secondary->thread) {
- kthread_stop(action->secondary->thread);
- put_task_struct(action->secondary->thread);
- }
+ kthread_stop_put(action->thread);
+ if (action->secondary && action->secondary->thread)
+ kthread_stop_put(action->secondary->thread);
}
/* Last action releases resources */
@@ -1956,9 +1945,8 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
* There is no interrupt on the fly anymore. Deactivate it
* completely.
*/
- raw_spin_lock_irqsave(&desc->lock, flags);
- irq_domain_deactivate_irq(&desc->irq_data);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ scoped_guard(raw_spinlock_irqsave, &desc->lock)
+ irq_domain_deactivate_irq(&desc->irq_data);
irq_release_resources(desc);
chip_bus_sync_unlock(desc);
@@ -1974,20 +1962,19 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
}
/**
- * free_irq - free an interrupt allocated with request_irq
- * @irq: Interrupt line to free
- * @dev_id: Device identity to free
+ * free_irq - free an interrupt allocated with request_irq
+ * @irq: Interrupt line to free
+ * @dev_id: Device identity to free
*
- * Remove an interrupt handler. The handler is removed and if the
- * interrupt line is no longer in use by any driver it is disabled.
- * On a shared IRQ the caller must ensure the interrupt is disabled
- * on the card it drives before calling this function. The function
- * does not return until any executing interrupts for this IRQ
- * have completed.
+ * Remove an interrupt handler. The handler is removed and if the interrupt
+ * line is no longer in use by any driver it is disabled. On a shared IRQ
+ * the caller must ensure the interrupt is disabled on the card it drives
+ * before calling this function. The function does not return until any
+ * executing interrupts for this IRQ have completed.
*
- * This function must not be called from interrupt context.
+ * This function must not be called from interrupt context.
*
- * Returns the devname argument passed to request_irq.
+ * Returns the devname argument passed to request_irq.
*/
const void *free_irq(unsigned int irq, void *dev_id)
{
@@ -2044,10 +2031,8 @@ static const void *__cleanup_nmi(unsigned int irq, struct irq_desc *desc)
const void *free_nmi(unsigned int irq, void *dev_id)
{
struct irq_desc *desc = irq_to_desc(irq);
- unsigned long flags;
- const void *devname;
- if (!desc || WARN_ON(!(desc->istate & IRQS_NMI)))
+ if (!desc || WARN_ON(!irq_is_nmi(desc)))
return NULL;
if (WARN_ON(irq_settings_is_per_cpu_devid(desc)))
@@ -2057,53 +2042,46 @@ const void *free_nmi(unsigned int irq, void *dev_id)
if (WARN_ON(desc->depth == 0))
disable_nmi_nosync(irq);
- raw_spin_lock_irqsave(&desc->lock, flags);
-
+ guard(raw_spinlock_irqsave)(&desc->lock);
irq_nmi_teardown(desc);
- devname = __cleanup_nmi(irq, desc);
-
- raw_spin_unlock_irqrestore(&desc->lock, flags);
-
- return devname;
+ return __cleanup_nmi(irq, desc);
}
/**
- * request_threaded_irq - allocate an interrupt line
- * @irq: Interrupt line to allocate
- * @handler: Function to be called when the IRQ occurs.
- * Primary handler for threaded interrupts.
- * If handler is NULL and thread_fn != NULL
- * the default primary handler is installed.
- * @thread_fn: Function called from the irq handler thread
- * If NULL, no irq thread is created
- * @irqflags: Interrupt type flags
- * @devname: An ascii name for the claiming device
- * @dev_id: A cookie passed back to the handler function
- *
- * This call allocates interrupt resources and enables the
- * interrupt line and IRQ handling. From the point this
- * call is made your handler function may be invoked. Since
- * your handler function must clear any interrupt the board
- * raises, you must take care both to initialise your hardware
- * and to set up the interrupt handler in the right order.
- *
- * If you want to set up a threaded irq handler for your device
- * then you need to supply @handler and @thread_fn. @handler is
- * still called in hard interrupt context and has to check
- * whether the interrupt originates from the device. If yes it
- * needs to disable the interrupt on the device and return
- * IRQ_WAKE_THREAD which will wake up the handler thread and run
- * @thread_fn. This split handler design is necessary to support
- * shared interrupts.
- *
- * Dev_id must be globally unique. Normally the address of the
- * device data structure is used as the cookie. Since the handler
- * receives this value it makes sense to use it.
- *
- * If your interrupt is shared you must pass a non NULL dev_id
- * as this is required when freeing the interrupt.
- *
- * Flags:
+ * request_threaded_irq - allocate an interrupt line
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs.
+ * Primary handler for threaded interrupts.
+ * If handler is NULL and thread_fn != NULL
+ * the default primary handler is installed.
+ * @thread_fn: Function called from the irq handler thread
+ * If NULL, no irq thread is created
+ * @irqflags: Interrupt type flags
+ * @devname: An ascii name for the claiming device
+ * @dev_id: A cookie passed back to the handler function
+ *
+ * This call allocates interrupt resources and enables the interrupt line
+ * and IRQ handling. From the point this call is made your handler function
+ * may be invoked. Since your handler function must clear any interrupt the
+ * board raises, you must take care both to initialise your hardware and to
+ * set up the interrupt handler in the right order.
+ *
+ * If you want to set up a threaded irq handler for your device then you
+ * need to supply @handler and @thread_fn. @handler is still called in hard
+ * interrupt context and has to check whether the interrupt originates from
+ * the device. If yes it needs to disable the interrupt on the device and
+ * return IRQ_WAKE_THREAD which will wake up the handler thread and run
+ * @thread_fn. This split handler design is necessary to support shared
+ * interrupts.
+ *
+ * @dev_id must be globally unique. Normally the address of the device data
+ * structure is used as the cookie. Since the handler receives this value
+ * it makes sense to use it.
+ *
+ * If your interrupt is shared you must pass a non NULL dev_id as this is
+ * required when freeing the interrupt.
+ *
+ * Flags:
*
* IRQF_SHARED Interrupt is shared
* IRQF_TRIGGER_* Specify active edge(s) or level
@@ -2201,21 +2179,20 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
EXPORT_SYMBOL(request_threaded_irq);
/**
- * request_any_context_irq - allocate an interrupt line
- * @irq: Interrupt line to allocate
- * @handler: Function to be called when the IRQ occurs.
- * Threaded handler for threaded interrupts.
- * @flags: Interrupt type flags
- * @name: An ascii name for the claiming device
- * @dev_id: A cookie passed back to the handler function
- *
- * This call allocates interrupt resources and enables the
- * interrupt line and IRQ handling. It selects either a
- * hardirq or threaded handling method depending on the
- * context.
- *
- * On failure, it returns a negative value. On success,
- * it returns either IRQC_IS_HARDIRQ or IRQC_IS_NESTED.
+ * request_any_context_irq - allocate an interrupt line
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs.
+ * Threaded handler for threaded interrupts.
+ * @flags: Interrupt type flags
+ * @name: An ascii name for the claiming device
+ * @dev_id: A cookie passed back to the handler function
+ *
+ * This call allocates interrupt resources and enables the interrupt line
+ * and IRQ handling. It selects either a hardirq or threaded handling
+ * method depending on the context.
+ *
+ * Returns: On failure, it returns a negative value. On success, it returns either
+ * IRQC_IS_HARDIRQ or IRQC_IS_NESTED.
*/
int request_any_context_irq(unsigned int irq, irq_handler_t handler,
unsigned long flags, const char *name, void *dev_id)
@@ -2242,37 +2219,35 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler,
EXPORT_SYMBOL_GPL(request_any_context_irq);
/**
- * request_nmi - allocate an interrupt line for NMI delivery
- * @irq: Interrupt line to allocate
- * @handler: Function to be called when the IRQ occurs.
- * Threaded handler for threaded interrupts.
- * @irqflags: Interrupt type flags
- * @name: An ascii name for the claiming device
- * @dev_id: A cookie passed back to the handler function
- *
- * This call allocates interrupt resources and enables the
- * interrupt line and IRQ handling. It sets up the IRQ line
- * to be handled as an NMI.
- *
- * An interrupt line delivering NMIs cannot be shared and IRQ handling
- * cannot be threaded.
- *
- * Interrupt lines requested for NMI delivering must produce per cpu
- * interrupts and have auto enabling setting disabled.
- *
- * Dev_id must be globally unique. Normally the address of the
- * device data structure is used as the cookie. Since the handler
- * receives this value it makes sense to use it.
- *
- * If the interrupt line cannot be used to deliver NMIs, function
- * will fail and return a negative value.
+ * request_nmi - allocate an interrupt line for NMI delivery
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs.
+ * Threaded handler for threaded interrupts.
+ * @irqflags: Interrupt type flags
+ * @name: An ascii name for the claiming device
+ * @dev_id: A cookie passed back to the handler function
+ *
+ * This call allocates interrupt resources and enables the interrupt line
+ * and IRQ handling. It sets up the IRQ line to be handled as an NMI.
+ *
+ * An interrupt line delivering NMIs cannot be shared and IRQ handling
+ * cannot be threaded.
+ *
+ * Interrupt lines requested for NMI delivering must produce per cpu
+ * interrupts and have auto enabling setting disabled.
+ *
+ * @dev_id must be globally unique. Normally the address of the device data
+ * structure is used as the cookie. Since the handler receives this value
+ * it makes sense to use it.
+ *
+ * If the interrupt line cannot be used to deliver NMIs, function will fail
+ * and return a negative value.
*/
int request_nmi(unsigned int irq, irq_handler_t handler,
unsigned long irqflags, const char *name, void *dev_id)
{
struct irqaction *action;
struct irq_desc *desc;
- unsigned long flags;
int retval;
if (irq == IRQ_NOTCONNECTED)
@@ -2314,21 +2289,17 @@ int request_nmi(unsigned int irq, irq_handler_t handler,
if (retval)
goto err_irq_setup;
- raw_spin_lock_irqsave(&desc->lock, flags);
-
- /* Setup NMI state */
- desc->istate |= IRQS_NMI;
- retval = irq_nmi_setup(desc);
- if (retval) {
- __cleanup_nmi(irq, desc);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
- return -EINVAL;
+ scoped_guard(raw_spinlock_irqsave, &desc->lock) {
+ /* Setup NMI state */
+ desc->istate |= IRQS_NMI;
+ retval = irq_nmi_setup(desc);
+ if (retval) {
+ __cleanup_nmi(irq, desc);
+ return -EINVAL;
+ }
+ return 0;
}
- raw_spin_unlock_irqrestore(&desc->lock, flags);
-
- return 0;
-
err_irq_setup:
irq_chip_pm_put(&desc->irq_data);
err_out:
@@ -2339,35 +2310,25 @@ err_out:
void enable_percpu_irq(unsigned int irq, unsigned int type)
{
- unsigned int cpu = smp_processor_id();
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);
-
- if (!desc)
- return;
-
- /*
- * If the trigger type is not specified by the caller, then
- * use the default for this interrupt.
- */
- type &= IRQ_TYPE_SENSE_MASK;
- if (type == IRQ_TYPE_NONE)
- type = irqd_get_trigger_type(&desc->irq_data);
-
- if (type != IRQ_TYPE_NONE) {
- int ret;
+ scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) {
+ struct irq_desc *desc = scoped_irqdesc;
- ret = __irq_set_trigger(desc, type);
-
- if (ret) {
- WARN(1, "failed to set type for IRQ%d\n", irq);
- goto out;
+ /*
+ * If the trigger type is not specified by the caller, then
+ * use the default for this interrupt.
+ */
+ type &= IRQ_TYPE_SENSE_MASK;
+ if (type == IRQ_TYPE_NONE)
+ type = irqd_get_trigger_type(&desc->irq_data);
+
+ if (type != IRQ_TYPE_NONE) {
+ if (__irq_set_trigger(desc, type)) {
+ WARN(1, "failed to set type for IRQ%d\n", irq);
+ return;
+ }
}
+ irq_percpu_enable(desc, smp_processor_id());
}
-
- irq_percpu_enable(desc, cpu);
-out:
- irq_put_desc_unlock(desc, flags);
}
EXPORT_SYMBOL_GPL(enable_percpu_irq);
@@ -2385,33 +2346,16 @@ void enable_percpu_nmi(unsigned int irq, unsigned int type)
*/
bool irq_percpu_is_enabled(unsigned int irq)
{
- unsigned int cpu = smp_processor_id();
- struct irq_desc *desc;
- unsigned long flags;
- bool is_enabled;
-
- desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);
- if (!desc)
- return false;
-
- is_enabled = cpumask_test_cpu(cpu, desc->percpu_enabled);
- irq_put_desc_unlock(desc, flags);
-
- return is_enabled;
+ scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU)
+ return cpumask_test_cpu(smp_processor_id(), scoped_irqdesc->percpu_enabled);
+ return false;
}
EXPORT_SYMBOL_GPL(irq_percpu_is_enabled);
void disable_percpu_irq(unsigned int irq)
{
- unsigned int cpu = smp_processor_id();
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);
-
- if (!desc)
- return;
-
- irq_percpu_disable(desc, cpu);
- irq_put_desc_unlock(desc, flags);
+ scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU)
+ irq_percpu_disable(scoped_irqdesc, smp_processor_id());
}
EXPORT_SYMBOL_GPL(disable_percpu_irq);
@@ -2426,72 +2370,60 @@ void disable_percpu_nmi(unsigned int irq)
static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_id)
{
struct irq_desc *desc = irq_to_desc(irq);
- struct irqaction *action;
- unsigned long flags;
+ struct irqaction *action, **action_ptr;
WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
if (!desc)
return NULL;
- raw_spin_lock_irqsave(&desc->lock, flags);
+ scoped_guard(raw_spinlock_irqsave, &desc->lock) {
+ action_ptr = &desc->action;
+ for (;;) {
+ action = *action_ptr;
- action = desc->action;
- if (!action || action->percpu_dev_id != dev_id) {
- WARN(1, "Trying to free already-free IRQ %d\n", irq);
- goto bad;
- }
+ if (!action) {
+ WARN(1, "Trying to free already-free IRQ %d\n", irq);
+ return NULL;
+ }
- if (!cpumask_empty(desc->percpu_enabled)) {
- WARN(1, "percpu IRQ %d still enabled on CPU%d!\n",
- irq, cpumask_first(desc->percpu_enabled));
- goto bad;
- }
+ if (action->percpu_dev_id == dev_id)
+ break;
- /* Found it - now remove it from the list of entries: */
- desc->action = NULL;
+ action_ptr = &action->next;
+ }
- desc->istate &= ~IRQS_NMI;
+ if (cpumask_intersects(desc->percpu_enabled, action->affinity)) {
+ WARN(1, "percpu IRQ %d still enabled on CPU%d!\n", irq,
+ cpumask_first_and(desc->percpu_enabled, action->affinity));
+ return NULL;
+ }
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ /* Found it - now remove it from the list of entries: */
+ *action_ptr = action->next;
- unregister_handler_proc(irq, action);
+ /* Demote from NMI if we killed the last action */
+ if (!desc->action)
+ desc->istate &= ~IRQS_NMI;
+ }
+ unregister_handler_proc(irq, action);
irq_chip_pm_put(&desc->irq_data);
module_put(desc->owner);
return action;
-
-bad:
- raw_spin_unlock_irqrestore(&desc->lock, flags);
- return NULL;
}
/**
- * remove_percpu_irq - free a per-cpu interrupt
- * @irq: Interrupt line to free
- * @act: irqaction for the interrupt
+ * free_percpu_irq - free an interrupt allocated with request_percpu_irq
+ * @irq: Interrupt line to free
+ * @dev_id: Device identity to free
*
- * Used to remove interrupts statically setup by the early boot process.
- */
-void remove_percpu_irq(unsigned int irq, struct irqaction *act)
-{
- struct irq_desc *desc = irq_to_desc(irq);
-
- if (desc && irq_settings_is_per_cpu_devid(desc))
- __free_percpu_irq(irq, act->percpu_dev_id);
-}
-
-/**
- * free_percpu_irq - free an interrupt allocated with request_percpu_irq
- * @irq: Interrupt line to free
- * @dev_id: Device identity to free
- *
- * Remove a percpu interrupt handler. The handler is removed, but
- * the interrupt line is not disabled. This must be done on each
- * CPU before calling this function. The function does not return
- * until any executing interrupts for this IRQ have completed.
+ * Remove a percpu interrupt handler. The handler is removed, but the
+ * interrupt line is not disabled. This must be done on each CPU before
+ * calling this function. The function does not return until any executing
+ * interrupts for this IRQ have completed.
*
- * This function must not be called from interrupt context.
+ * This function must not be called from interrupt context.
*/
void free_percpu_irq(unsigned int irq, void __percpu *dev_id)
{
@@ -2513,16 +2445,16 @@ void free_percpu_nmi(unsigned int irq, void __percpu *dev_id)
if (!desc || !irq_settings_is_per_cpu_devid(desc))
return;
- if (WARN_ON(!(desc->istate & IRQS_NMI)))
+ if (WARN_ON(!irq_is_nmi(desc)))
return;
kfree(__free_percpu_irq(irq, dev_id));
}
/**
- * setup_percpu_irq - setup a per-cpu interrupt
- * @irq: Interrupt line to setup
- * @act: irqaction for the interrupt
+ * setup_percpu_irq - setup a per-cpu interrupt
+ * @irq: Interrupt line to setup
+ * @act: irqaction for the interrupt
*
* Used to statically setup per-cpu interrupts in the early boot process.
*/
@@ -2546,26 +2478,57 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
return retval;
}
+static
+struct irqaction *create_percpu_irqaction(irq_handler_t handler, unsigned long flags,
+ const char *devname, const cpumask_t *affinity,
+ void __percpu *dev_id)
+{
+ struct irqaction *action;
+
+ if (!affinity)
+ affinity = cpu_possible_mask;
+
+ action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
+ if (!action)
+ return NULL;
+
+ action->handler = handler;
+ action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND;
+ action->name = devname;
+ action->percpu_dev_id = dev_id;
+ action->affinity = affinity;
+
+ /*
+ * We allow some form of sharing for non-overlapping affinity
+ * masks. Obviously, covering all CPUs prevents any sharing in
+ * the first place.
+ */
+ if (!cpumask_equal(affinity, cpu_possible_mask))
+ action->flags |= IRQF_SHARED;
+
+ return action;
+}
+
/**
- * __request_percpu_irq - allocate a percpu interrupt line
- * @irq: Interrupt line to allocate
- * @handler: Function to be called when the IRQ occurs.
- * @flags: Interrupt type flags (IRQF_TIMER only)
- * @devname: An ascii name for the claiming device
- * @dev_id: A percpu cookie passed back to the handler function
- *
- * This call allocates interrupt resources and enables the
- * interrupt on the local CPU. If the interrupt is supposed to be
- * enabled on other CPUs, it has to be done on each CPU using
- * enable_percpu_irq().
- *
- * Dev_id must be globally unique. It is a per-cpu variable, and
- * the handler gets called with the interrupted CPU's instance of
- * that variable.
+ * __request_percpu_irq - allocate a percpu interrupt line
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs.
+ * @flags: Interrupt type flags (IRQF_TIMER only)
+ * @devname: An ascii name for the claiming device
+ * @affinity: A cpumask describing the target CPUs for this interrupt
+ * @dev_id: A percpu cookie passed back to the handler function
+ *
+ * This call allocates interrupt resources, but doesn't enable the interrupt
+ * on any CPU, as all percpu-devid interrupts are flagged with IRQ_NOAUTOEN.
+ * It has to be done on each CPU using enable_percpu_irq().
+ *
+ * @dev_id must be globally unique. It is a per-cpu variable, and
+ * the handler gets called with the interrupted CPU's instance of
+ * that variable.
*/
int __request_percpu_irq(unsigned int irq, irq_handler_t handler,
unsigned long flags, const char *devname,
- void __percpu *dev_id)
+ const cpumask_t *affinity, void __percpu *dev_id)
{
struct irqaction *action;
struct irq_desc *desc;
@@ -2582,15 +2545,10 @@ int __request_percpu_irq(unsigned int irq, irq_handler_t handler,
if (flags && flags != IRQF_TIMER)
return -EINVAL;
- action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
+ action = create_percpu_irqaction(handler, flags, devname, affinity, dev_id);
if (!action)
return -ENOMEM;
- action->handler = handler;
- action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND;
- action->name = devname;
- action->percpu_dev_id = dev_id;
-
retval = irq_chip_pm_get(&desc->irq_data);
if (retval < 0) {
kfree(action);
@@ -2609,32 +2567,32 @@ int __request_percpu_irq(unsigned int irq, irq_handler_t handler,
EXPORT_SYMBOL_GPL(__request_percpu_irq);
/**
- * request_percpu_nmi - allocate a percpu interrupt line for NMI delivery
- * @irq: Interrupt line to allocate
- * @handler: Function to be called when the IRQ occurs.
- * @name: An ascii name for the claiming device
- * @dev_id: A percpu cookie passed back to the handler function
- *
- * This call allocates interrupt resources for a per CPU NMI. Per CPU NMIs
- * have to be setup on each CPU by calling prepare_percpu_nmi() before
- * being enabled on the same CPU by using enable_percpu_nmi().
- *
- * Dev_id must be globally unique. It is a per-cpu variable, and
- * the handler gets called with the interrupted CPU's instance of
- * that variable.
- *
- * Interrupt lines requested for NMI delivering should have auto enabling
- * setting disabled.
- *
- * If the interrupt line cannot be used to deliver NMIs, function
- * will fail returning a negative value.
+ * request_percpu_nmi - allocate a percpu interrupt line for NMI delivery
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs.
+ * @name: An ascii name for the claiming device
+ * @affinity: A cpumask describing the target CPUs for this interrupt
+ * @dev_id: A percpu cookie passed back to the handler function
+ *
+ * This call allocates interrupt resources for a per CPU NMI. Per CPU NMIs
+ * have to be setup on each CPU by calling prepare_percpu_nmi() before
+ * being enabled on the same CPU by using enable_percpu_nmi().
+ *
+ * @dev_id must be globally unique. It is a per-cpu variable, and the
+ * handler gets called with the interrupted CPU's instance of that
+ * variable.
+ *
+ * Interrupt lines requested for NMI delivering should have auto enabling
+ * setting disabled.
+ *
+ * If the interrupt line cannot be used to deliver NMIs, function
+ * will fail returning a negative value.
*/
-int request_percpu_nmi(unsigned int irq, irq_handler_t handler,
- const char *name, void __percpu *dev_id)
+int request_percpu_nmi(unsigned int irq, irq_handler_t handler, const char *name,
+ const struct cpumask *affinity, void __percpu *dev_id)
{
struct irqaction *action;
struct irq_desc *desc;
- unsigned long flags;
int retval;
if (!handler)
@@ -2648,20 +2606,16 @@ int request_percpu_nmi(unsigned int irq, irq_handler_t handler,
!irq_supports_nmi(desc))
return -EINVAL;
- /* The line cannot already be NMI */
- if (desc->istate & IRQS_NMI)
+ /* The line cannot be NMI already if the new request covers all CPUs */
+ if (irq_is_nmi(desc) &&
+ (!affinity || cpumask_equal(affinity, cpu_possible_mask)))
return -EINVAL;
- action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
+ action = create_percpu_irqaction(handler, IRQF_NO_THREAD | IRQF_NOBALANCING,
+ name, affinity, dev_id);
if (!action)
return -ENOMEM;
- action->handler = handler;
- action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND | IRQF_NO_THREAD
- | IRQF_NOBALANCING;
- action->name = name;
- action->percpu_dev_id = dev_id;
-
retval = irq_chip_pm_get(&desc->irq_data);
if (retval < 0)
goto err_out;
@@ -2670,10 +2624,8 @@ int request_percpu_nmi(unsigned int irq, irq_handler_t handler,
if (retval)
goto err_irq_setup;
- raw_spin_lock_irqsave(&desc->lock, flags);
- desc->istate |= IRQS_NMI;
- raw_spin_unlock_irqrestore(&desc->lock, flags);
-
+ scoped_guard(raw_spinlock_irqsave, &desc->lock)
+ desc->istate |= IRQS_NMI;
return 0;
err_irq_setup:
@@ -2685,83 +2637,58 @@ err_out:
}
/**
- * prepare_percpu_nmi - performs CPU local setup for NMI delivery
- * @irq: Interrupt line to prepare for NMI delivery
+ * prepare_percpu_nmi - performs CPU local setup for NMI delivery
+ * @irq: Interrupt line to prepare for NMI delivery
*
- * This call prepares an interrupt line to deliver NMI on the current CPU,
- * before that interrupt line gets enabled with enable_percpu_nmi().
+ * This call prepares an interrupt line to deliver NMI on the current CPU,
+ * before that interrupt line gets enabled with enable_percpu_nmi().
*
- * As a CPU local operation, this should be called from non-preemptible
- * context.
+ * As a CPU local operation, this should be called from non-preemptible
+ * context.
*
- * If the interrupt line cannot be used to deliver NMIs, function
- * will fail returning a negative value.
+ * If the interrupt line cannot be used to deliver NMIs, function will fail
+ * returning a negative value.
*/
int prepare_percpu_nmi(unsigned int irq)
{
- unsigned long flags;
- struct irq_desc *desc;
- int ret = 0;
+ int ret = -EINVAL;
WARN_ON(preemptible());
- desc = irq_get_desc_lock(irq, &flags,
- IRQ_GET_DESC_CHECK_PERCPU);
- if (!desc)
- return -EINVAL;
-
- if (WARN(!(desc->istate & IRQS_NMI),
- KERN_ERR "prepare_percpu_nmi called for a non-NMI interrupt: irq %u\n",
- irq)) {
- ret = -EINVAL;
- goto out;
- }
+ scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) {
+ if (WARN(!irq_is_nmi(scoped_irqdesc),
+ "prepare_percpu_nmi called for a non-NMI interrupt: irq %u\n", irq))
+ return -EINVAL;
- ret = irq_nmi_setup(desc);
- if (ret) {
- pr_err("Failed to setup NMI delivery: irq %u\n", irq);
- goto out;
+ ret = irq_nmi_setup(scoped_irqdesc);
+ if (ret)
+ pr_err("Failed to setup NMI delivery: irq %u\n", irq);
}
-
-out:
- irq_put_desc_unlock(desc, flags);
return ret;
}
/**
- * teardown_percpu_nmi - undoes NMI setup of IRQ line
- * @irq: Interrupt line from which CPU local NMI configuration should be
- * removed
- *
- * This call undoes the setup done by prepare_percpu_nmi().
+ * teardown_percpu_nmi - undoes NMI setup of IRQ line
+ * @irq: Interrupt line from which CPU local NMI configuration should be removed
*
- * IRQ line should not be enabled for the current CPU.
+ * This call undoes the setup done by prepare_percpu_nmi().
*
- * As a CPU local operation, this should be called from non-preemptible
- * context.
+ * IRQ line should not be enabled for the current CPU.
+ * As a CPU local operation, this should be called from non-preemptible
+ * context.
*/
void teardown_percpu_nmi(unsigned int irq)
{
- unsigned long flags;
- struct irq_desc *desc;
-
WARN_ON(preemptible());
- desc = irq_get_desc_lock(irq, &flags,
- IRQ_GET_DESC_CHECK_PERCPU);
- if (!desc)
- return;
-
- if (WARN_ON(!(desc->istate & IRQS_NMI)))
- goto out;
-
- irq_nmi_teardown(desc);
-out:
- irq_put_desc_unlock(desc, flags);
+ scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) {
+ if (WARN_ON(!irq_is_nmi(scoped_irqdesc)))
+ return;
+ irq_nmi_teardown(scoped_irqdesc);
+ }
}
-int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which,
- bool *state)
+static int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, bool *state)
{
struct irq_chip *chip;
int err = -EINVAL;
@@ -2785,87 +2712,62 @@ int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which,
}
/**
- * irq_get_irqchip_state - returns the irqchip state of a interrupt.
- * @irq: Interrupt line that is forwarded to a VM
- * @which: One of IRQCHIP_STATE_* the caller wants to know about
- * @state: a pointer to a boolean where the state is to be stored
+ * irq_get_irqchip_state - returns the irqchip state of a interrupt.
+ * @irq: Interrupt line that is forwarded to a VM
+ * @which: One of IRQCHIP_STATE_* the caller wants to know about
+ * @state: a pointer to a boolean where the state is to be stored
*
- * This call snapshots the internal irqchip state of an
- * interrupt, returning into @state the bit corresponding to
- * stage @which
+ * This call snapshots the internal irqchip state of an interrupt,
+ * returning into @state the bit corresponding to stage @which
*
- * This function should be called with preemption disabled if the
- * interrupt controller has per-cpu registers.
+ * This function should be called with preemption disabled if the interrupt
+ * controller has per-cpu registers.
*/
-int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
- bool *state)
+int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool *state)
{
- struct irq_desc *desc;
- struct irq_data *data;
- unsigned long flags;
- int err = -EINVAL;
-
- desc = irq_get_desc_buslock(irq, &flags, 0);
- if (!desc)
- return err;
+ scoped_irqdesc_get_and_buslock(irq, 0) {
+ struct irq_data *data = irq_desc_get_irq_data(scoped_irqdesc);
- data = irq_desc_get_irq_data(desc);
-
- err = __irq_get_irqchip_state(data, which, state);
-
- irq_put_desc_busunlock(desc, flags);
- return err;
+ return __irq_get_irqchip_state(data, which, state);
+ }
+ return -EINVAL;
}
EXPORT_SYMBOL_GPL(irq_get_irqchip_state);
/**
- * irq_set_irqchip_state - set the state of a forwarded interrupt.
- * @irq: Interrupt line that is forwarded to a VM
- * @which: State to be restored (one of IRQCHIP_STATE_*)
- * @val: Value corresponding to @which
+ * irq_set_irqchip_state - set the state of a forwarded interrupt.
+ * @irq: Interrupt line that is forwarded to a VM
+ * @which: State to be restored (one of IRQCHIP_STATE_*)
+ * @val: Value corresponding to @which
*
- * This call sets the internal irqchip state of an interrupt,
- * depending on the value of @which.
+ * This call sets the internal irqchip state of an interrupt, depending on
+ * the value of @which.
*
- * This function should be called with migration disabled if the
- * interrupt controller has per-cpu registers.
+ * This function should be called with migration disabled if the interrupt
+ * controller has per-cpu registers.
*/
-int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
- bool val)
+int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool val)
{
- struct irq_desc *desc;
- struct irq_data *data;
- struct irq_chip *chip;
- unsigned long flags;
- int err = -EINVAL;
+ scoped_irqdesc_get_and_buslock(irq, 0) {
+ struct irq_data *data = irq_desc_get_irq_data(scoped_irqdesc);
+ struct irq_chip *chip;
- desc = irq_get_desc_buslock(irq, &flags, 0);
- if (!desc)
- return err;
+ do {
+ chip = irq_data_get_irq_chip(data);
- data = irq_desc_get_irq_data(desc);
+ if (WARN_ON_ONCE(!chip))
+ return -ENODEV;
- do {
- chip = irq_data_get_irq_chip(data);
- if (WARN_ON_ONCE(!chip)) {
- err = -ENODEV;
- goto out_unlock;
- }
- if (chip->irq_set_irqchip_state)
- break;
-#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
- data = data->parent_data;
-#else
- data = NULL;
-#endif
- } while (data);
+ if (chip->irq_set_irqchip_state)
+ break;
- if (data)
- err = chip->irq_set_irqchip_state(data, which, val);
+ data = irqd_get_parent_data(data);
+ } while (data);
-out_unlock:
- irq_put_desc_busunlock(desc, flags);
- return err;
+ if (data)
+ return chip->irq_set_irqchip_state(data, which, val);
+ }
+ return -EINVAL;
}
EXPORT_SYMBOL_GPL(irq_set_irqchip_state);
diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index bbfb26489aa1..8f222d1cccec 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -8,8 +8,6 @@
#include <linux/cpu.h>
#include <linux/irq.h>
-#define IRQ_MATRIX_SIZE (BITS_TO_LONGS(IRQ_MATRIX_BITS))
-
struct cpumap {
unsigned int available;
unsigned int allocated;
@@ -17,8 +15,8 @@ struct cpumap {
unsigned int managed_allocated;
bool initialized;
bool online;
- unsigned long alloc_map[IRQ_MATRIX_SIZE];
- unsigned long managed_map[IRQ_MATRIX_SIZE];
+ unsigned long *managed_map;
+ unsigned long alloc_map[];
};
struct irq_matrix {
@@ -32,8 +30,8 @@ struct irq_matrix {
unsigned int total_allocated;
unsigned int online_maps;
struct cpumap __percpu *maps;
- unsigned long scratch_map[IRQ_MATRIX_SIZE];
- unsigned long system_map[IRQ_MATRIX_SIZE];
+ unsigned long *system_map;
+ unsigned long scratch_map[];
};
#define CREATE_TRACE_POINTS
@@ -50,24 +48,32 @@ __init struct irq_matrix *irq_alloc_matrix(unsigned int matrix_bits,
unsigned int alloc_start,
unsigned int alloc_end)
{
+ unsigned int cpu, matrix_size = BITS_TO_LONGS(matrix_bits);
struct irq_matrix *m;
- if (matrix_bits > IRQ_MATRIX_BITS)
- return NULL;
-
- m = kzalloc(sizeof(*m), GFP_KERNEL);
+ m = kzalloc(struct_size(m, scratch_map, matrix_size * 2), GFP_KERNEL);
if (!m)
return NULL;
+ m->system_map = &m->scratch_map[matrix_size];
+
m->matrix_bits = matrix_bits;
m->alloc_start = alloc_start;
m->alloc_end = alloc_end;
m->alloc_size = alloc_end - alloc_start;
- m->maps = alloc_percpu(*m->maps);
+ m->maps = __alloc_percpu(struct_size(m->maps, alloc_map, matrix_size * 2),
+ __alignof__(*m->maps));
if (!m->maps) {
kfree(m);
return NULL;
}
+
+ for_each_possible_cpu(cpu) {
+ struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+
+ cm->managed_map = &cm->alloc_map[matrix_size];
+ }
+
return m;
}
@@ -286,7 +292,7 @@ void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk)
int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk,
unsigned int *mapped_cpu)
{
- unsigned int bit, cpu, end = m->alloc_end;
+ unsigned int bit, cpu, end;
struct cpumap *cm;
if (cpumask_empty(msk))
@@ -466,16 +472,16 @@ unsigned int irq_matrix_reserved(struct irq_matrix *m)
}
/**
- * irq_matrix_allocated - Get the number of allocated irqs on the local cpu
+ * irq_matrix_allocated - Get the number of allocated non-managed irqs on the local CPU
* @m: Pointer to the matrix to search
*
- * This returns number of allocated irqs
+ * This returns number of allocated non-managed interrupts.
*/
unsigned int irq_matrix_allocated(struct irq_matrix *m)
{
struct cpumap *cm = this_cpu_ptr(m->maps);
- return cm->allocated;
+ return cm->allocated - cm->managed_allocated;
}
#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 61ca924ef4b4..f2b2929986ff 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -26,7 +26,7 @@ bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear)
* The outgoing CPU might be the last online target in a pending
* interrupt move. If that's the case clear the pending move bit.
*/
- if (cpumask_any_and(desc->pending_mask, cpu_online_mask) >= nr_cpu_ids) {
+ if (!cpumask_intersects(desc->pending_mask, cpu_online_mask)) {
irqd_clr_move_pending(data);
return false;
}
@@ -35,6 +35,16 @@ bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear)
return true;
}
+void irq_force_complete_move(struct irq_desc *desc)
+{
+ for (struct irq_data *d = irq_desc_get_irq_data(desc); d; d = irqd_get_parent_data(d)) {
+ if (d->chip && d->chip->irq_force_complete_move) {
+ d->chip->irq_force_complete_move(d);
+ return;
+ }
+ }
+}
+
void irq_move_masked_irq(struct irq_data *idata)
{
struct irq_desc *desc = irq_data_to_desc(idata);
@@ -74,7 +84,7 @@ void irq_move_masked_irq(struct irq_data *idata)
* For correct operation this depends on the caller
* masking the irqs.
*/
- if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids) {
+ if (cpumask_intersects(desc->pending_mask, cpu_online_mask)) {
int ret;
ret = irq_do_set_affinity(data, desc->pending_mask, false);
@@ -117,3 +127,13 @@ void __irq_move_irq(struct irq_data *idata)
if (!masked)
idata->chip->irq_unmask(idata);
}
+
+bool irq_can_move_in_process_context(struct irq_data *data)
+{
+ /*
+ * Get the top level irq_data in the hierarchy, which is optimized
+ * away when CONFIG_IRQ_DOMAIN_HIERARCHY is disabled.
+ */
+ data = irq_desc_get_irq_data(irq_data_to_desc(data));
+ return irq_can_move_pcntxt(data);
+}
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 2bdfce5edafd..68886881fe10 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -8,18 +8,59 @@
* This file contains common code to support Message Signaled Interrupts for
* PCI compatible and non PCI compatible devices.
*/
-#include <linux/types.h>
#include <linux/device.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/msi.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
#include <linux/slab.h>
+#include <linux/seq_file.h>
#include <linux/sysfs.h>
-#include <linux/pci.h>
+#include <linux/types.h>
+#include <linux/xarray.h>
#include "internals.h"
+/**
+ * struct msi_device_data - MSI per device data
+ * @properties: MSI properties which are interesting to drivers
+ * @mutex: Mutex protecting the MSI descriptor store
+ * @__domains: Internal data for per device MSI domains
+ * @__iter_idx: Index to search the next entry for iterators
+ */
+struct msi_device_data {
+ unsigned long properties;
+ struct mutex mutex;
+ struct msi_dev_domain __domains[MSI_MAX_DEVICE_IRQDOMAINS];
+ unsigned long __iter_idx;
+};
+
+/**
+ * struct msi_ctrl - MSI internal management control structure
+ * @domid: ID of the domain on which management operations should be done
+ * @first: First (hardware) slot index to operate on
+ * @last: Last (hardware) slot index to operate on
+ * @nirqs: The number of Linux interrupts to allocate. Can be larger
+ * than the range due to PCI/multi-MSI.
+ */
+struct msi_ctrl {
+ unsigned int domid;
+ unsigned int first;
+ unsigned int last;
+ unsigned int nirqs;
+};
+
+/* Invalid Xarray index which is outside of any searchable range */
+#define MSI_XA_MAX_INDEX (ULONG_MAX - 1)
+/* The maximum domain size */
+#define MSI_XA_DOMAIN_SIZE (MSI_MAX_INDEX + 1)
+
+static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl);
+static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid);
static inline int msi_sysfs_create_group(struct device *dev);
+static int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
+ int nvec, msi_alloc_info_t *arg);
/**
* msi_alloc_desc - Allocate an initialized msi_desc
@@ -33,7 +74,7 @@ static inline int msi_sysfs_create_group(struct device *dev);
* Return: pointer to allocated &msi_desc on success or %NULL on failure
*/
static struct msi_desc *msi_alloc_desc(struct device *dev, int nvec,
- const struct irq_affinity_desc *affinity)
+ const struct irq_affinity_desc *affinity)
{
struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL);
@@ -43,7 +84,7 @@ static struct msi_desc *msi_alloc_desc(struct device *dev, int nvec,
desc->dev = dev;
desc->nvec_used = nvec;
if (affinity) {
- desc->affinity = kmemdup(affinity, nvec * sizeof(*desc->affinity), GFP_KERNEL);
+ desc->affinity = kmemdup_array(affinity, nvec, sizeof(*desc->affinity), GFP_KERNEL);
if (!desc->affinity) {
kfree(desc);
return NULL;
@@ -58,25 +99,56 @@ static void msi_free_desc(struct msi_desc *desc)
kfree(desc);
}
-static int msi_insert_desc(struct msi_device_data *md, struct msi_desc *desc, unsigned int index)
+static int msi_insert_desc(struct device *dev, struct msi_desc *desc,
+ unsigned int domid, unsigned int index)
{
+ struct msi_device_data *md = dev->msi.data;
+ struct xarray *xa = &md->__domains[domid].store;
+ unsigned int hwsize;
int ret;
- desc->msi_index = index;
- ret = xa_insert(&md->__store, index, desc, GFP_KERNEL);
- if (ret)
- msi_free_desc(desc);
+ hwsize = msi_domain_get_hwsize(dev, domid);
+
+ if (index == MSI_ANY_INDEX) {
+ struct xa_limit limit = { .min = 0, .max = hwsize - 1 };
+ unsigned int index;
+
+ /* Let the xarray allocate a free index within the limit */
+ ret = xa_alloc(xa, &index, desc, limit, GFP_KERNEL);
+ if (ret)
+ goto fail;
+
+ desc->msi_index = index;
+ return 0;
+ } else {
+ if (index >= hwsize) {
+ ret = -ERANGE;
+ goto fail;
+ }
+
+ desc->msi_index = index;
+ ret = xa_insert(xa, index, desc, GFP_KERNEL);
+ if (ret)
+ goto fail;
+ return 0;
+ }
+fail:
+ msi_free_desc(desc);
return ret;
}
/**
- * msi_add_msi_desc - Allocate and initialize a MSI descriptor
+ * msi_domain_insert_msi_desc - Allocate and initialize a MSI descriptor and
+ * insert it at @init_desc->msi_index
+ *
* @dev: Pointer to the device for which the descriptor is allocated
+ * @domid: The id of the interrupt domain to which the desriptor is added
* @init_desc: Pointer to an MSI descriptor to initialize the new descriptor
*
* Return: 0 on success or an appropriate failure code.
*/
-int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc)
+int msi_domain_insert_msi_desc(struct device *dev, unsigned int domid,
+ struct msi_desc *init_desc)
{
struct msi_desc *desc;
@@ -88,40 +160,8 @@ int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc)
/* Copy type specific data to the new descriptor. */
desc->pci = init_desc->pci;
- return msi_insert_desc(dev->msi.data, desc, init_desc->msi_index);
-}
-
-/**
- * msi_add_simple_msi_descs - Allocate and initialize MSI descriptors
- * @dev: Pointer to the device for which the descriptors are allocated
- * @index: Index for the first MSI descriptor
- * @ndesc: Number of descriptors to allocate
- *
- * Return: 0 on success or an appropriate failure code.
- */
-static int msi_add_simple_msi_descs(struct device *dev, unsigned int index, unsigned int ndesc)
-{
- unsigned int idx, last = index + ndesc - 1;
- struct msi_desc *desc;
- int ret;
-
- lockdep_assert_held(&dev->msi.data->mutex);
- for (idx = index; idx <= last; idx++) {
- desc = msi_alloc_desc(dev, 1, NULL);
- if (!desc)
- goto fail_mem;
- ret = msi_insert_desc(dev->msi.data, desc, idx);
- if (ret)
- goto fail;
- }
- return 0;
-
-fail_mem:
- ret = -ENOMEM;
-fail:
- msi_free_msi_descs_range(dev, MSI_DESC_NOTASSOCIATED, index, last);
- return ret;
+ return msi_insert_desc(dev, desc, domid, init_desc->msi_index);
}
static bool msi_desc_match(struct msi_desc *desc, enum msi_desc_filter filter)
@@ -138,28 +178,97 @@ static bool msi_desc_match(struct msi_desc *desc, enum msi_desc_filter filter)
return false;
}
+static bool msi_ctrl_valid(struct device *dev, struct msi_ctrl *ctrl)
+{
+ unsigned int hwsize;
+
+ if (WARN_ON_ONCE(ctrl->domid >= MSI_MAX_DEVICE_IRQDOMAINS ||
+ (dev->msi.domain &&
+ !dev->msi.data->__domains[ctrl->domid].domain)))
+ return false;
+
+ hwsize = msi_domain_get_hwsize(dev, ctrl->domid);
+ if (WARN_ON_ONCE(ctrl->first > ctrl->last ||
+ ctrl->first >= hwsize ||
+ ctrl->last >= hwsize))
+ return false;
+ return true;
+}
+
+static void msi_domain_free_descs(struct device *dev, struct msi_ctrl *ctrl)
+{
+ struct msi_desc *desc;
+ struct xarray *xa;
+ unsigned long idx;
+
+ lockdep_assert_held(&dev->msi.data->mutex);
+
+ if (!msi_ctrl_valid(dev, ctrl))
+ return;
+
+ xa = &dev->msi.data->__domains[ctrl->domid].store;
+ xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) {
+ xa_erase(xa, idx);
+
+ /* Leak the descriptor when it is still referenced */
+ if (WARN_ON_ONCE(msi_desc_match(desc, MSI_DESC_ASSOCIATED)))
+ continue;
+ msi_free_desc(desc);
+ }
+}
+
+/**
+ * msi_domain_free_msi_descs_range - Free a range of MSI descriptors of a device in an irqdomain
+ * @dev: Device for which to free the descriptors
+ * @domid: Id of the domain to operate on
+ * @first: Index to start freeing from (inclusive)
+ * @last: Last index to be freed (inclusive)
+ */
+void msi_domain_free_msi_descs_range(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
+{
+ struct msi_ctrl ctrl = {
+ .domid = domid,
+ .first = first,
+ .last = last,
+ };
+
+ msi_domain_free_descs(dev, &ctrl);
+}
+
/**
- * msi_free_msi_descs_range - Free MSI descriptors of a device
- * @dev: Device to free the descriptors
- * @filter: Descriptor state filter
- * @first_index: Index to start freeing from
- * @last_index: Last index to be freed
+ * msi_domain_add_simple_msi_descs - Allocate and initialize MSI descriptors
+ * @dev: Pointer to the device for which the descriptors are allocated
+ * @ctrl: Allocation control struct
+ *
+ * Return: 0 on success or an appropriate failure code.
*/
-void msi_free_msi_descs_range(struct device *dev, enum msi_desc_filter filter,
- unsigned int first_index, unsigned int last_index)
+static int msi_domain_add_simple_msi_descs(struct device *dev, struct msi_ctrl *ctrl)
{
- struct xarray *xa = &dev->msi.data->__store;
struct msi_desc *desc;
- unsigned long idx;
+ unsigned int idx;
+ int ret;
lockdep_assert_held(&dev->msi.data->mutex);
- xa_for_each_range(xa, idx, desc, first_index, last_index) {
- if (msi_desc_match(desc, filter)) {
- xa_erase(xa, idx);
- msi_free_desc(desc);
- }
+ if (!msi_ctrl_valid(dev, ctrl))
+ return -EINVAL;
+
+ for (idx = ctrl->first; idx <= ctrl->last; idx++) {
+ desc = msi_alloc_desc(dev, 1, NULL);
+ if (!desc)
+ goto fail_mem;
+ ret = msi_insert_desc(dev, desc, ctrl->domid, idx);
+ if (ret)
+ goto fail;
}
+ return 0;
+
+fail_mem:
+ ret = -ENOMEM;
+fail:
+ msi_domain_free_descs(dev, ctrl);
+ return ret;
}
void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
@@ -178,9 +287,13 @@ EXPORT_SYMBOL_GPL(get_cached_msi_msg);
static void msi_device_data_release(struct device *dev, void *res)
{
struct msi_device_data *md = res;
+ int i;
- WARN_ON_ONCE(!xa_empty(&md->__store));
- xa_destroy(&md->__store);
+ for (i = 0; i < MSI_MAX_DEVICE_IRQDOMAINS; i++) {
+ msi_remove_device_irq_domain(dev, i);
+ WARN_ON_ONCE(!xa_empty(&md->__domains[i].store));
+ xa_destroy(&md->__domains[i].store);
+ }
dev->msi.data = NULL;
}
@@ -197,7 +310,7 @@ static void msi_device_data_release(struct device *dev, void *res)
int msi_setup_device_data(struct device *dev)
{
struct msi_device_data *md;
- int ret;
+ int ret, i;
if (dev->msi.data)
return 0;
@@ -212,7 +325,18 @@ int msi_setup_device_data(struct device *dev)
return ret;
}
- xa_init(&md->__store);
+ for (i = 0; i < MSI_MAX_DEVICE_IRQDOMAINS; i++)
+ xa_init_flags(&md->__domains[i].store, XA_FLAGS_ALLOC);
+
+ /*
+ * If @dev::msi::domain is set and is a global MSI domain, copy the
+ * pointer into the domain array so all code can operate on domain
+ * ids. The NULL pointer check is required to keep the legacy
+ * architecture specific PCI/MSI support working.
+ */
+ if (dev->msi.domain && !irq_domain_is_msi_parent(dev->msi.domain))
+ md->__domains[MSI_DEFAULT_DOMAIN].domain = dev->msi.domain;
+
mutex_init(&md->mutex);
dev->msi.data = md;
devres_add(dev, md);
@@ -220,42 +344,49 @@ int msi_setup_device_data(struct device *dev)
}
/**
- * msi_lock_descs - Lock the MSI descriptor storage of a device
+ * __msi_lock_descs - Lock the MSI descriptor storage of a device
* @dev: Device to operate on
+ *
+ * Internal function for guard(msi_descs_lock). Don't use in code.
*/
-void msi_lock_descs(struct device *dev)
+void __msi_lock_descs(struct device *dev)
{
mutex_lock(&dev->msi.data->mutex);
}
-EXPORT_SYMBOL_GPL(msi_lock_descs);
+EXPORT_SYMBOL_GPL(__msi_lock_descs);
/**
- * msi_unlock_descs - Unlock the MSI descriptor storage of a device
+ * __msi_unlock_descs - Unlock the MSI descriptor storage of a device
* @dev: Device to operate on
+ *
+ * Internal function for guard(msi_descs_lock). Don't use in code.
*/
-void msi_unlock_descs(struct device *dev)
+void __msi_unlock_descs(struct device *dev)
{
- /* Invalidate the index wich was cached by the iterator */
- dev->msi.data->__iter_idx = MSI_MAX_INDEX;
+ /* Invalidate the index which was cached by the iterator */
+ dev->msi.data->__iter_idx = MSI_XA_MAX_INDEX;
mutex_unlock(&dev->msi.data->mutex);
}
-EXPORT_SYMBOL_GPL(msi_unlock_descs);
+EXPORT_SYMBOL_GPL(__msi_unlock_descs);
-static struct msi_desc *msi_find_desc(struct msi_device_data *md, enum msi_desc_filter filter)
+static struct msi_desc *msi_find_desc(struct msi_device_data *md, unsigned int domid,
+ enum msi_desc_filter filter)
{
+ struct xarray *xa = &md->__domains[domid].store;
struct msi_desc *desc;
- xa_for_each_start(&md->__store, md->__iter_idx, desc, md->__iter_idx) {
+ xa_for_each_start(xa, md->__iter_idx, desc, md->__iter_idx) {
if (msi_desc_match(desc, filter))
return desc;
}
- md->__iter_idx = MSI_MAX_INDEX;
+ md->__iter_idx = MSI_XA_MAX_INDEX;
return NULL;
}
/**
- * msi_first_desc - Get the first MSI descriptor of a device
+ * msi_domain_first_desc - Get the first MSI descriptor of an irqdomain associated to a device
* @dev: Device to operate on
+ * @domid: The id of the interrupt domain which should be walked.
* @filter: Descriptor state filter
*
* Must be called with the MSI descriptor mutex held, i.e. msi_lock_descs()
@@ -264,23 +395,26 @@ static struct msi_desc *msi_find_desc(struct msi_device_data *md, enum msi_desc_
* Return: Pointer to the first MSI descriptor matching the search
* criteria, NULL if none found.
*/
-struct msi_desc *msi_first_desc(struct device *dev, enum msi_desc_filter filter)
+struct msi_desc *msi_domain_first_desc(struct device *dev, unsigned int domid,
+ enum msi_desc_filter filter)
{
struct msi_device_data *md = dev->msi.data;
- if (WARN_ON_ONCE(!md))
+ if (WARN_ON_ONCE(!md || domid >= MSI_MAX_DEVICE_IRQDOMAINS))
return NULL;
lockdep_assert_held(&md->mutex);
md->__iter_idx = 0;
- return msi_find_desc(md, filter);
+ return msi_find_desc(md, domid, filter);
}
-EXPORT_SYMBOL_GPL(msi_first_desc);
+EXPORT_SYMBOL_GPL(msi_domain_first_desc);
/**
* msi_next_desc - Get the next MSI descriptor of a device
* @dev: Device to operate on
+ * @domid: The id of the interrupt domain which should be walked.
+ * @filter: Descriptor state filter
*
* The first invocation of msi_next_desc() has to be preceeded by a
* successful invocation of __msi_first_desc(). Consecutive invocations are
@@ -290,11 +424,12 @@ EXPORT_SYMBOL_GPL(msi_first_desc);
* Return: Pointer to the next MSI descriptor matching the search
* criteria, NULL if none found.
*/
-struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter)
+struct msi_desc *msi_next_desc(struct device *dev, unsigned int domid,
+ enum msi_desc_filter filter)
{
struct msi_device_data *md = dev->msi.data;
- if (WARN_ON_ONCE(!md))
+ if (WARN_ON_ONCE(!md || domid >= MSI_MAX_DEVICE_IRQDOMAINS))
return NULL;
lockdep_assert_held(&md->mutex);
@@ -303,47 +438,51 @@ struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter)
return NULL;
md->__iter_idx++;
- return msi_find_desc(md, filter);
+ return msi_find_desc(md, domid, filter);
}
EXPORT_SYMBOL_GPL(msi_next_desc);
/**
- * msi_get_virq - Return Linux interrupt number of a MSI interrupt
+ * msi_domain_get_virq - Lookup the Linux interrupt number for a MSI index on a interrupt domain
* @dev: Device to operate on
+ * @domid: Domain ID of the interrupt domain associated to the device
* @index: MSI interrupt index to look for (0-based)
*
* Return: The Linux interrupt number on success (> 0), 0 if not found
*/
-unsigned int msi_get_virq(struct device *dev, unsigned int index)
+unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigned int index)
{
struct msi_desc *desc;
- unsigned int ret = 0;
- bool pcimsi;
+ bool pcimsi = false;
+ struct xarray *xa;
if (!dev->msi.data)
return 0;
- pcimsi = dev_is_pci(dev) ? to_pci_dev(dev)->msi_enabled : false;
+ if (WARN_ON_ONCE(index > MSI_MAX_INDEX || domid >= MSI_MAX_DEVICE_IRQDOMAINS))
+ return 0;
+
+ /* This check is only valid for the PCI default MSI domain */
+ if (dev_is_pci(dev) && domid == MSI_DEFAULT_DOMAIN)
+ pcimsi = to_pci_dev(dev)->msi_enabled;
- msi_lock_descs(dev);
- desc = xa_load(&dev->msi.data->__store, pcimsi ? 0 : index);
+ guard(msi_descs_lock)(dev);
+ xa = &dev->msi.data->__domains[domid].store;
+ desc = xa_load(xa, pcimsi ? 0 : index);
if (desc && desc->irq) {
/*
* PCI-MSI has only one descriptor for multiple interrupts.
* PCI-MSIX and platform MSI use a descriptor per
* interrupt.
*/
- if (pcimsi) {
- if (index < desc->nvec_used)
- ret = desc->irq + index;
- } else {
- ret = desc->irq;
- }
+ if (!pcimsi)
+ return desc->irq;
+ if (index < desc->nvec_used)
+ return desc->irq + index;
}
- msi_unlock_descs(dev);
- return ret;
+ return 0;
}
-EXPORT_SYMBOL_GPL(msi_get_virq);
+EXPORT_SYMBOL_GPL(msi_domain_get_virq);
#ifdef CONFIG_SYSFS
static struct attribute *msi_dev_attrs[] = {
@@ -420,7 +559,7 @@ fail:
return ret;
}
-#ifdef CONFIG_PCI_MSI_ARCH_FALLBACKS
+#if defined(CONFIG_PCI_MSI_ARCH_FALLBACKS) || defined(CONFIG_PCI_XEN)
/**
* msi_device_populate_sysfs - Populate msi_irqs sysfs entries for a device
* @dev: The device (PCI, platform etc) which will get sysfs entries
@@ -452,14 +591,46 @@ void msi_device_destroy_sysfs(struct device *dev)
msi_for_each_desc(desc, dev, MSI_DESC_ALL)
msi_sysfs_remove_desc(dev, desc);
}
-#endif /* CONFIG_PCI_MSI_ARCH_FALLBACK */
+#endif /* CONFIG_PCI_MSI_ARCH_FALLBACK || CONFIG_PCI_XEN */
#else /* CONFIG_SYSFS */
static inline int msi_sysfs_create_group(struct device *dev) { return 0; }
static inline int msi_sysfs_populate_desc(struct device *dev, struct msi_desc *desc) { return 0; }
static inline void msi_sysfs_remove_desc(struct device *dev, struct msi_desc *desc) { }
#endif /* !CONFIG_SYSFS */
-#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
+static struct irq_domain *msi_get_device_domain(struct device *dev, unsigned int domid)
+{
+ struct irq_domain *domain;
+
+ lockdep_assert_held(&dev->msi.data->mutex);
+
+ if (WARN_ON_ONCE(domid >= MSI_MAX_DEVICE_IRQDOMAINS))
+ return NULL;
+
+ domain = dev->msi.data->__domains[domid].domain;
+ if (!domain)
+ return NULL;
+
+ if (WARN_ON_ONCE(irq_domain_is_msi_parent(domain)))
+ return NULL;
+
+ return domain;
+}
+
+static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid)
+{
+ struct msi_domain_info *info;
+ struct irq_domain *domain;
+
+ domain = msi_get_device_domain(dev, domid);
+ if (domain) {
+ info = domain->host_data;
+ return info->hwsize;
+ }
+ /* No domain, default to MSI_XA_DOMAIN_SIZE */
+ return MSI_XA_DOMAIN_SIZE;
+}
+
static inline void irq_chip_write_msi_msg(struct irq_data *data,
struct msi_msg *msg)
{
@@ -535,7 +706,7 @@ static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq,
irq_hw_number_t hwirq = ops->get_hwirq(info, arg);
int i, ret;
- if (irq_find_mapping(domain, hwirq) > 0)
+ if (irq_resolve_mapping(domain, hwirq))
return -EEXIST;
if (domain->parent) {
@@ -548,7 +719,7 @@ static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq,
ret = ops->msi_init(domain, info, virq + i, hwirq + i, arg);
if (ret < 0) {
if (ops->msi_free) {
- for (i--; i > 0; i--)
+ for (i--; i >= 0; i--)
ops->msi_free(domain, info, virq + i);
}
irq_domain_free_irqs_top(domain, virq, nr_irqs);
@@ -572,11 +743,44 @@ static void msi_domain_free(struct irq_domain *domain, unsigned int virq,
irq_domain_free_irqs_top(domain, virq, nr_irqs);
}
+static int msi_domain_translate(struct irq_domain *domain, struct irq_fwspec *fwspec,
+ irq_hw_number_t *hwirq, unsigned int *type)
+{
+ struct msi_domain_info *info = domain->host_data;
+
+ /*
+ * This will catch allocations through the regular irqdomain path except
+ * for MSI domains which really support this, e.g. MBIGEN.
+ */
+ if (!info->ops->msi_translate)
+ return -ENOTSUPP;
+ return info->ops->msi_translate(domain, fwspec, hwirq, type);
+}
+
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+static void msi_domain_debug_show(struct seq_file *m, struct irq_domain *d,
+ struct irq_data *irqd, int ind)
+{
+ struct msi_desc *desc = irqd ? irq_data_get_msi_desc(irqd) : NULL;
+
+ if (!desc)
+ return;
+
+ seq_printf(m, "\n%*saddress_hi: 0x%08x", ind + 1, "", desc->msg.address_hi);
+ seq_printf(m, "\n%*saddress_lo: 0x%08x", ind + 1, "", desc->msg.address_lo);
+ seq_printf(m, "\n%*smsg_data: 0x%08x\n", ind + 1, "", desc->msg.data);
+}
+#endif
+
static const struct irq_domain_ops msi_domain_ops = {
.alloc = msi_domain_alloc,
.free = msi_domain_free,
.activate = msi_domain_activate,
.deactivate = msi_domain_deactivate,
+ .translate = msi_domain_translate,
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+ .debug_show = msi_domain_debug_show,
+#endif
};
static irq_hw_number_t msi_domain_ops_get_hwirq(struct msi_domain_info *info,
@@ -592,6 +796,10 @@ static int msi_domain_ops_prepare(struct irq_domain *domain, struct device *dev,
return 0;
}
+static void msi_domain_ops_teardown(struct irq_domain *domain, msi_alloc_info_t *arg)
+{
+}
+
static void msi_domain_ops_set_desc(msi_alloc_info_t *arg,
struct msi_desc *desc)
{
@@ -613,21 +821,12 @@ static int msi_domain_ops_init(struct irq_domain *domain,
return 0;
}
-static int msi_domain_ops_check(struct irq_domain *domain,
- struct msi_domain_info *info,
- struct device *dev)
-{
- return 0;
-}
-
static struct msi_domain_ops msi_domain_ops_default = {
.get_hwirq = msi_domain_ops_get_hwirq,
.msi_init = msi_domain_ops_init,
- .msi_check = msi_domain_ops_check,
.msi_prepare = msi_domain_ops_prepare,
+ .msi_teardown = msi_domain_ops_teardown,
.set_desc = msi_domain_ops_set_desc,
- .domain_alloc_irqs = __msi_domain_alloc_irqs,
- .domain_free_irqs = __msi_domain_free_irqs,
};
static void msi_domain_update_dom_ops(struct msi_domain_info *info)
@@ -639,11 +838,6 @@ static void msi_domain_update_dom_ops(struct msi_domain_info *info)
return;
}
- if (ops->domain_alloc_irqs == NULL)
- ops->domain_alloc_irqs = msi_domain_ops_default.domain_alloc_irqs;
- if (ops->domain_free_irqs == NULL)
- ops->domain_free_irqs = msi_domain_ops_default.domain_free_irqs;
-
if (!(info->flags & MSI_FLAG_USE_DEF_DOM_OPS))
return;
@@ -651,10 +845,10 @@ static void msi_domain_update_dom_ops(struct msi_domain_info *info)
ops->get_hwirq = msi_domain_ops_default.get_hwirq;
if (ops->msi_init == NULL)
ops->msi_init = msi_domain_ops_default.msi_init;
- if (ops->msi_check == NULL)
- ops->msi_check = msi_domain_ops_default.msi_check;
if (ops->msi_prepare == NULL)
ops->msi_prepare = msi_domain_ops_default.msi_prepare;
+ if (ops->msi_teardown == NULL)
+ ops->msi_teardown = msi_domain_ops_default.msi_teardown;
if (ops->set_desc == NULL)
ops->set_desc = msi_domain_ops_default.set_desc;
}
@@ -664,10 +858,45 @@ static void msi_domain_update_chip_ops(struct msi_domain_info *info)
struct irq_chip *chip = info->chip;
BUG_ON(!chip || !chip->irq_mask || !chip->irq_unmask);
- if (!chip->irq_set_affinity)
+ if (!chip->irq_set_affinity && !(info->flags & MSI_FLAG_NO_AFFINITY))
chip->irq_set_affinity = msi_domain_set_affinity;
}
+static struct irq_domain *__msi_create_irq_domain(struct fwnode_handle *fwnode,
+ struct msi_domain_info *info,
+ unsigned int flags,
+ struct irq_domain *parent)
+{
+ struct irq_domain *domain;
+
+ if (info->hwsize > MSI_XA_DOMAIN_SIZE)
+ return NULL;
+
+ /*
+ * Hardware size 0 is valid for backwards compatibility and for
+ * domains which are not backed by a hardware table. Grant the
+ * maximum index space.
+ */
+ if (!info->hwsize)
+ info->hwsize = MSI_XA_DOMAIN_SIZE;
+
+ msi_domain_update_dom_ops(info);
+ if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
+ msi_domain_update_chip_ops(info);
+
+ domain = irq_domain_create_hierarchy(parent, flags | IRQ_DOMAIN_FLAG_MSI, 0,
+ fwnode, &msi_domain_ops, info);
+
+ if (domain) {
+ irq_domain_update_bus_token(domain, info->bus_token);
+ domain->dev = info->dev;
+ if (info->flags & MSI_FLAG_PARENT_PM_DEV)
+ domain->pm_dev = parent->pm_dev;
+ }
+
+ return domain;
+}
+
/**
* msi_create_irq_domain - Create an MSI interrupt domain
* @fwnode: Optional fwnode of the interrupt controller
@@ -680,69 +909,260 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode,
struct msi_domain_info *info,
struct irq_domain *parent)
{
- struct irq_domain *domain;
+ return __msi_create_irq_domain(fwnode, info, 0, parent);
+}
- msi_domain_update_dom_ops(info);
- if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
- msi_domain_update_chip_ops(info);
+/**
+ * msi_create_parent_irq_domain - Create an MSI-parent interrupt domain
+ * @info: MSI irqdomain creation info
+ * @msi_parent_ops: MSI parent callbacks and configuration
+ *
+ * Return: pointer to the created &struct irq_domain or %NULL on failure
+ */
+struct irq_domain *msi_create_parent_irq_domain(struct irq_domain_info *info,
+ const struct msi_parent_ops *msi_parent_ops)
+{
+ struct irq_domain *d;
- domain = irq_domain_create_hierarchy(parent, IRQ_DOMAIN_FLAG_MSI, 0,
- fwnode, &msi_domain_ops, info);
+ info->hwirq_max = max(info->hwirq_max, info->size);
+ info->size = info->hwirq_max;
+ info->domain_flags |= IRQ_DOMAIN_FLAG_MSI_PARENT;
+ info->bus_token = msi_parent_ops->bus_select_token;
- if (domain && !domain->name && info->chip)
- domain->name = info->chip->name;
+ d = irq_domain_instantiate(info);
+ if (IS_ERR(d))
+ return NULL;
- return domain;
+ d->msi_parent_ops = msi_parent_ops;
+ return d;
}
+EXPORT_SYMBOL_GPL(msi_create_parent_irq_domain);
-int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
- int nvec, msi_alloc_info_t *arg)
+/**
+ * msi_parent_init_dev_msi_info - Delegate initialization of device MSI info down
+ * in the domain hierarchy
+ * @dev: The device for which the domain should be created
+ * @domain: The domain in the hierarchy this op is being called on
+ * @msi_parent_domain: The IRQ_DOMAIN_FLAG_MSI_PARENT domain for the child to
+ * be created
+ * @msi_child_info: The MSI domain info of the IRQ_DOMAIN_FLAG_MSI_DEVICE
+ * domain to be created
+ *
+ * Return: true on success, false otherwise
+ *
+ * This is the most complex problem of per device MSI domains and the
+ * underlying interrupt domain hierarchy:
+ *
+ * The device domain to be initialized requests the broadest feature set
+ * possible and the underlying domain hierarchy puts restrictions on it.
+ *
+ * That's trivial for a simple parent->child relationship, but it gets
+ * interesting with an intermediate domain: root->parent->child. The
+ * intermediate 'parent' can expand the capabilities which the 'root'
+ * domain is providing. So that creates a classic hen and egg problem:
+ * Which entity is doing the restrictions/expansions?
+ *
+ * One solution is to let the root domain handle the initialization that's
+ * why there is the @domain and the @msi_parent_domain pointer.
+ */
+bool msi_parent_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
+ struct irq_domain *msi_parent_domain,
+ struct msi_domain_info *msi_child_info)
{
- struct msi_domain_info *info = domain->host_data;
- struct msi_domain_ops *ops = info->ops;
- int ret;
+ struct irq_domain *parent = domain->parent;
+
+ if (WARN_ON_ONCE(!parent || !parent->msi_parent_ops ||
+ !parent->msi_parent_ops->init_dev_msi_info))
+ return false;
- ret = ops->msi_check(domain, info, dev);
- if (ret == 0)
- ret = ops->msi_prepare(domain, dev, nvec, arg);
+ return parent->msi_parent_ops->init_dev_msi_info(dev, parent, msi_parent_domain,
+ msi_child_info);
+}
- return ret;
+/**
+ * msi_create_device_irq_domain - Create a device MSI interrupt domain
+ * @dev: Pointer to the device
+ * @domid: Domain id
+ * @template: MSI domain info bundle used as template
+ * @hwsize: Maximum number of MSI table entries (0 if unknown or unlimited)
+ * @domain_data: Optional pointer to domain specific data which is set in
+ * msi_domain_info::data
+ * @chip_data: Optional pointer to chip specific data which is set in
+ * msi_domain_info::chip_data
+ *
+ * Return: True on success, false otherwise
+ *
+ * There is no firmware node required for this interface because the per
+ * device domains are software constructs which are actually closer to the
+ * hardware reality than any firmware can describe them.
+ *
+ * The domain name and the irq chip name for a MSI device domain are
+ * composed by: "$(PREFIX)$(CHIPNAME)-$(DEVNAME)"
+ *
+ * $PREFIX: Optional prefix provided by the underlying MSI parent domain
+ * via msi_parent_ops::prefix. If that pointer is NULL the prefix
+ * is empty.
+ * $CHIPNAME: The name of the irq_chip in @template
+ * $DEVNAME: The name of the device
+ *
+ * This results in understandable chip names and hardware interrupt numbers
+ * in e.g. /proc/interrupts
+ *
+ * PCI-MSI-0000:00:1c.0 0-edge Parent domain has no prefix
+ * IR-PCI-MSI-0000:00:1c.4 0-edge Same with interrupt remapping prefix 'IR-'
+ *
+ * IR-PCI-MSIX-0000:3d:00.0 0-edge Hardware interrupt numbers reflect
+ * IR-PCI-MSIX-0000:3d:00.0 1-edge the real MSI-X index on that device
+ * IR-PCI-MSIX-0000:3d:00.0 2-edge
+ *
+ * On IMS domains the hardware interrupt number is either a table entry
+ * index or a purely software managed index but it is guaranteed to be
+ * unique.
+ *
+ * The domain pointer is stored in @dev::msi::data::__irqdomains[]. All
+ * subsequent operations on the domain depend on the domain id.
+ *
+ * The domain is automatically freed when the device is removed via devres
+ * in the context of @dev::msi::data freeing, but it can also be
+ * independently removed via @msi_remove_device_irq_domain().
+ */
+bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
+ const struct msi_domain_template *template,
+ unsigned int hwsize, void *domain_data,
+ void *chip_data)
+{
+ struct irq_domain *domain, *parent = dev->msi.domain;
+ const struct msi_parent_ops *pops;
+ struct fwnode_handle *fwnode;
+
+ if (!irq_domain_is_msi_parent(parent))
+ return false;
+
+ if (domid >= MSI_MAX_DEVICE_IRQDOMAINS)
+ return false;
+
+ struct msi_domain_template *bundle __free(kfree) =
+ kmemdup(template, sizeof(*bundle), GFP_KERNEL);
+ if (!bundle)
+ return false;
+
+ bundle->info.hwsize = hwsize;
+ bundle->info.chip = &bundle->chip;
+ bundle->info.ops = &bundle->ops;
+ bundle->info.data = domain_data;
+ bundle->info.chip_data = chip_data;
+ bundle->info.alloc_data = &bundle->alloc_info;
+ bundle->info.dev = dev;
+
+ pops = parent->msi_parent_ops;
+ snprintf(bundle->name, sizeof(bundle->name), "%s%s-%s",
+ pops->prefix ? : "", bundle->chip.name, dev_name(dev));
+ bundle->chip.name = bundle->name;
+
+ /*
+ * Using the device firmware node is required for wire to MSI
+ * device domains so that the existing firmware results in a domain
+ * match.
+ * All other device domains like PCI/MSI use the named firmware
+ * node as they are not guaranteed to have a fwnode. They are never
+ * looked up and always handled in the context of the device.
+ */
+ struct fwnode_handle *fwnode_alloced __free(irq_domain_free_fwnode) = NULL;
+
+ if (!(bundle->info.flags & MSI_FLAG_USE_DEV_FWNODE))
+ fwnode = fwnode_alloced = irq_domain_alloc_named_fwnode(bundle->name);
+ else
+ fwnode = dev->fwnode;
+
+ if (!fwnode)
+ return false;
+
+ if (msi_setup_device_data(dev))
+ return false;
+
+ guard(msi_descs_lock)(dev);
+ if (WARN_ON_ONCE(msi_get_device_domain(dev, domid)))
+ return false;
+
+ if (!pops->init_dev_msi_info(dev, parent, parent, &bundle->info))
+ return false;
+
+ domain = __msi_create_irq_domain(fwnode, &bundle->info, IRQ_DOMAIN_FLAG_MSI_DEVICE, parent);
+ if (!domain)
+ return false;
+
+ dev->msi.data->__domains[domid].domain = domain;
+
+ if (msi_domain_prepare_irqs(domain, dev, hwsize, &bundle->alloc_info)) {
+ dev->msi.data->__domains[domid].domain = NULL;
+ irq_domain_remove(domain);
+ return false;
+ }
+
+ /* @bundle and @fwnode_alloced are now in use. Prevent cleanup */
+ retain_and_null_ptr(bundle);
+ retain_and_null_ptr(fwnode_alloced);
+ return true;
}
-int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
- int virq_base, int nvec, msi_alloc_info_t *arg)
+/**
+ * msi_remove_device_irq_domain - Free a device MSI interrupt domain
+ * @dev: Pointer to the device
+ * @domid: Domain id
+ */
+void msi_remove_device_irq_domain(struct device *dev, unsigned int domid)
{
- struct msi_domain_info *info = domain->host_data;
- struct msi_domain_ops *ops = info->ops;
- struct msi_desc *desc;
- int ret, virq;
+ struct fwnode_handle *fwnode = NULL;
+ struct msi_domain_info *info;
+ struct irq_domain *domain;
- msi_lock_descs(dev);
- ret = msi_add_simple_msi_descs(dev, virq_base, nvec);
- if (ret)
- goto unlock;
+ guard(msi_descs_lock)(dev);
+ domain = msi_get_device_domain(dev, domid);
+ if (!domain || !irq_domain_is_msi_device(domain))
+ return;
- for (virq = virq_base; virq < virq_base + nvec; virq++) {
- desc = xa_load(&dev->msi.data->__store, virq);
- desc->irq = virq;
+ dev->msi.data->__domains[domid].domain = NULL;
+ info = domain->host_data;
- ops->set_desc(arg, desc);
- ret = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg);
- if (ret)
- goto fail;
+ info->ops->msi_teardown(domain, info->alloc_data);
+
+ if (irq_domain_is_msi_device(domain))
+ fwnode = domain->fwnode;
+ irq_domain_remove(domain);
+ irq_domain_free_fwnode(fwnode);
+ kfree(container_of(info, struct msi_domain_template, info));
+}
+
+/**
+ * msi_match_device_irq_domain - Match a device irq domain against a bus token
+ * @dev: Pointer to the device
+ * @domid: Domain id
+ * @bus_token: Bus token to match against the domain bus token
+ *
+ * Return: True if device domain exists and bus tokens match.
+ */
+bool msi_match_device_irq_domain(struct device *dev, unsigned int domid,
+ enum irq_domain_bus_token bus_token)
+{
+ struct msi_domain_info *info;
+ struct irq_domain *domain;
- irq_set_msi_desc(virq, desc);
+ guard(msi_descs_lock)(dev);
+ domain = msi_get_device_domain(dev, domid);
+ if (domain && irq_domain_is_msi_device(domain)) {
+ info = domain->host_data;
+ return info->bus_token == bus_token;
}
- msi_unlock_descs(dev);
- return 0;
+ return false;
+}
-fail:
- for (--virq; virq >= virq_base; virq--)
- irq_domain_free_irqs_common(domain, virq, 1);
- msi_free_msi_descs_range(dev, MSI_DESC_ALL, virq_base, virq_base + nvec - 1);
-unlock:
- msi_unlock_descs(dev);
- return ret;
+static int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
+ int nvec, msi_alloc_info_t *arg)
+{
+ struct msi_domain_info *info = domain->host_data;
+ struct msi_domain_ops *ops = info->ops;
+
+ return ops->msi_prepare(domain, dev, nvec, arg);
}
/*
@@ -764,6 +1184,8 @@ static bool msi_check_reservation_mode(struct irq_domain *domain,
switch(domain->bus_token) {
case DOMAIN_BUS_PCI_MSI:
+ case DOMAIN_BUS_PCI_DEVICE_MSI:
+ case DOMAIN_BUS_PCI_DEVICE_MSIX:
case DOMAIN_BUS_VMD_MSI:
break;
default:
@@ -773,7 +1195,7 @@ static bool msi_check_reservation_mode(struct irq_domain *domain,
if (!(info->flags & MSI_FLAG_MUST_REACTIVATE))
return false;
- if (IS_ENABLED(CONFIG_PCI_MSI) && pci_msi_ignore_mask)
+ if (info->flags & MSI_FLAG_NO_MASK)
return false;
/*
@@ -789,6 +1211,8 @@ static int msi_handle_pci_fail(struct irq_domain *domain, struct msi_desc *desc,
{
switch(domain->bus_token) {
case DOMAIN_BUS_PCI_MSI:
+ case DOMAIN_BUS_PCI_DEVICE_MSI:
+ case DOMAIN_BUS_PCI_DEVICE_MSIX:
case DOMAIN_BUS_VMD_MSI:
if (IS_ENABLED(CONFIG_PCI_MSI))
break;
@@ -807,7 +1231,6 @@ static int msi_handle_pci_fail(struct irq_domain *domain, struct msi_desc *desc,
#define VIRQ_CAN_RESERVE 0x01
#define VIRQ_ACTIVATE 0x02
-#define VIRQ_NOMASK_QUIRK 0x04
static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflags)
{
@@ -816,8 +1239,21 @@ static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflag
if (!(vflags & VIRQ_CAN_RESERVE)) {
irqd_clr_can_reserve(irqd);
- if (vflags & VIRQ_NOMASK_QUIRK)
- irqd_set_msi_nomask_quirk(irqd);
+
+ /*
+ * If the interrupt is managed but no CPU is available to
+ * service it, shut it down until better times. Note that
+ * we only do this on the !RESERVE path as x86 (the only
+ * architecture using this flag) deals with this in a
+ * different way by using a catch-all vector.
+ */
+ if ((vflags & VIRQ_ACTIVATE) &&
+ irqd_affinity_is_managed(irqd) &&
+ !cpumask_intersects(irq_data_get_affinity_mask(irqd),
+ cpu_online_mask)) {
+ irqd_set_managed_shutdown(irqd);
+ return 0;
+ }
}
if (!(vflags & VIRQ_ACTIVATE))
@@ -835,18 +1271,37 @@ static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflag
return 0;
}
-int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
- int nvec)
+static int populate_alloc_info(struct irq_domain *domain, struct device *dev,
+ unsigned int nirqs, msi_alloc_info_t *arg)
{
struct msi_domain_info *info = domain->host_data;
+
+ /*
+ * If the caller has provided a template alloc info, use that. Once
+ * all users of msi_create_irq_domain() have been eliminated, this
+ * should be the only source of allocation information, and the
+ * prepare call below should be finally removed.
+ */
+ if (!info->alloc_data)
+ return msi_domain_prepare_irqs(domain, dev, nirqs, arg);
+
+ *arg = *info->alloc_data;
+ return 0;
+}
+
+static int __msi_domain_alloc_irqs(struct device *dev, struct irq_domain *domain,
+ struct msi_ctrl *ctrl)
+{
+ struct xarray *xa = &dev->msi.data->__domains[ctrl->domid].store;
+ struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
+ unsigned int vflags = 0, allocated = 0;
msi_alloc_info_t arg = { };
- unsigned int vflags = 0;
struct msi_desc *desc;
- int allocated = 0;
+ unsigned long idx;
int i, ret, virq;
- ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg);
+ ret = populate_alloc_info(domain, dev, ctrl->nirqs, &arg);
if (ret)
return ret;
@@ -862,17 +1317,20 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
* Interrupt can use a reserved vector and will not occupy
* a real device vector until the interrupt is requested.
*/
- if (msi_check_reservation_mode(domain, info, dev)) {
+ if (msi_check_reservation_mode(domain, info, dev))
vflags |= VIRQ_CAN_RESERVE;
- /*
- * MSI affinity setting requires a special quirk (X86) when
- * reservation mode is active.
- */
- if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK)
- vflags |= VIRQ_NOMASK_QUIRK;
- }
- msi_for_each_desc(desc, dev, MSI_DESC_NOTASSOCIATED) {
+ xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) {
+ if (!msi_desc_match(desc, MSI_DESC_NOTASSOCIATED))
+ continue;
+
+ /* This should return -ECONFUSED... */
+ if (WARN_ON_ONCE(allocated >= ctrl->nirqs))
+ return -EINVAL;
+
+ if (ops->prepare_desc)
+ ops->prepare_desc(domain, &arg, desc);
+
ops->set_desc(&arg, desc);
virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used,
@@ -898,76 +1356,259 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
return 0;
}
-static int msi_domain_add_simple_msi_descs(struct msi_domain_info *info,
- struct device *dev,
- unsigned int num_descs)
+static int msi_domain_alloc_simple_msi_descs(struct device *dev,
+ struct msi_domain_info *info,
+ struct msi_ctrl *ctrl)
{
if (!(info->flags & MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS))
return 0;
- return msi_add_simple_msi_descs(dev, 0, num_descs);
+ return msi_domain_add_simple_msi_descs(dev, ctrl);
+}
+
+static int __msi_domain_alloc_locked(struct device *dev, struct msi_ctrl *ctrl)
+{
+ struct msi_domain_info *info;
+ struct msi_domain_ops *ops;
+ struct irq_domain *domain;
+ int ret;
+
+ if (!msi_ctrl_valid(dev, ctrl))
+ return -EINVAL;
+
+ domain = msi_get_device_domain(dev, ctrl->domid);
+ if (!domain)
+ return -ENODEV;
+
+ info = domain->host_data;
+
+ ret = msi_domain_alloc_simple_msi_descs(dev, info, ctrl);
+ if (ret)
+ return ret;
+
+ ops = info->ops;
+ if (ops->domain_alloc_irqs)
+ return ops->domain_alloc_irqs(domain, dev, ctrl->nirqs);
+
+ return __msi_domain_alloc_irqs(dev, domain, ctrl);
+}
+
+static int msi_domain_alloc_locked(struct device *dev, struct msi_ctrl *ctrl)
+{
+ int ret = __msi_domain_alloc_locked(dev, ctrl);
+
+ if (ret)
+ msi_domain_free_locked(dev, ctrl);
+ return ret;
}
/**
- * msi_domain_alloc_irqs_descs_locked - Allocate interrupts from a MSI interrupt domain
- * @domain: The domain to allocate from
+ * msi_domain_alloc_irqs_range_locked - Allocate interrupts from a MSI interrupt domain
* @dev: Pointer to device struct of the device for which the interrupts
* are allocated
- * @nvec: The number of interrupts to allocate
+ * @domid: Id of the interrupt domain to operate on
+ * @first: First index to allocate (inclusive)
+ * @last: Last index to allocate (inclusive)
*
* Must be invoked from within a msi_lock_descs() / msi_unlock_descs()
- * pair. Use this for MSI irqdomains which implement their own vector
+ * pair. Use this for MSI irqdomains which implement their own descriptor
* allocation/free.
*
* Return: %0 on success or an error code.
*/
-int msi_domain_alloc_irqs_descs_locked(struct irq_domain *domain, struct device *dev,
- int nvec)
+int msi_domain_alloc_irqs_range_locked(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
{
- struct msi_domain_info *info = domain->host_data;
- struct msi_domain_ops *ops = info->ops;
- int ret;
-
- lockdep_assert_held(&dev->msi.data->mutex);
+ struct msi_ctrl ctrl = {
+ .domid = domid,
+ .first = first,
+ .last = last,
+ .nirqs = last + 1 - first,
+ };
+
+ return msi_domain_alloc_locked(dev, &ctrl);
+}
- ret = msi_domain_add_simple_msi_descs(info, dev, nvec);
- if (ret)
- return ret;
+/**
+ * msi_domain_alloc_irqs_range - Allocate interrupts from a MSI interrupt domain
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are allocated
+ * @domid: Id of the interrupt domain to operate on
+ * @first: First index to allocate (inclusive)
+ * @last: Last index to allocate (inclusive)
+ *
+ * Return: %0 on success or an error code.
+ */
+int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
+{
- ret = ops->domain_alloc_irqs(domain, dev, nvec);
- if (ret)
- msi_domain_free_irqs_descs_locked(domain, dev);
- return ret;
+ guard(msi_descs_lock)(dev);
+ return msi_domain_alloc_irqs_range_locked(dev, domid, first, last);
}
+EXPORT_SYMBOL_GPL(msi_domain_alloc_irqs_range);
/**
- * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain
- * @domain: The domain to allocate from
+ * msi_domain_alloc_irqs_all_locked - Allocate all interrupts from a MSI interrupt domain
+ *
* @dev: Pointer to device struct of the device for which the interrupts
* are allocated
- * @nvec: The number of interrupts to allocate
+ * @domid: Id of the interrupt domain to operate on
+ * @nirqs: The number of interrupts to allocate
+ *
+ * This function scans all MSI descriptors of the MSI domain and allocates interrupts
+ * for all unassigned ones. That function is to be used for MSI domain usage where
+ * the descriptor allocation is handled at the call site, e.g. PCI/MSI[X].
*
* Return: %0 on success or an error code.
*/
-int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int nvec)
+int msi_domain_alloc_irqs_all_locked(struct device *dev, unsigned int domid, int nirqs)
{
+ struct msi_ctrl ctrl = {
+ .domid = domid,
+ .first = 0,
+ .last = msi_domain_get_hwsize(dev, domid) - 1,
+ .nirqs = nirqs,
+ };
+
+ return msi_domain_alloc_locked(dev, &ctrl);
+}
+
+static struct msi_map __msi_domain_alloc_irq_at(struct device *dev, unsigned int domid,
+ unsigned int index,
+ const struct irq_affinity_desc *affdesc,
+ union msi_instance_cookie *icookie)
+{
+ struct msi_ctrl ctrl = { .domid = domid, .nirqs = 1, };
+ struct irq_domain *domain;
+ struct msi_map map = { };
+ struct msi_desc *desc;
int ret;
- msi_lock_descs(dev);
- ret = msi_domain_alloc_irqs_descs_locked(domain, dev, nvec);
- msi_unlock_descs(dev);
- return ret;
+ domain = msi_get_device_domain(dev, domid);
+ if (!domain) {
+ map.index = -ENODEV;
+ return map;
+ }
+
+ desc = msi_alloc_desc(dev, 1, affdesc);
+ if (!desc) {
+ map.index = -ENOMEM;
+ return map;
+ }
+
+ if (icookie)
+ desc->data.icookie = *icookie;
+
+ ret = msi_insert_desc(dev, desc, domid, index);
+ if (ret) {
+ map.index = ret;
+ return map;
+ }
+
+ ctrl.first = ctrl.last = desc->msi_index;
+
+ ret = __msi_domain_alloc_irqs(dev, domain, &ctrl);
+ if (ret) {
+ map.index = ret;
+ msi_domain_free_locked(dev, &ctrl);
+ } else {
+ map.index = desc->msi_index;
+ map.virq = desc->irq;
+ }
+ return map;
}
-void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+/**
+ * msi_domain_alloc_irq_at - Allocate an interrupt from a MSI interrupt domain at
+ * a given index - or at the next free index
+ *
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are allocated
+ * @domid: Id of the interrupt domain to operate on
+ * @index: Index for allocation. If @index == %MSI_ANY_INDEX the allocation
+ * uses the next free index.
+ * @affdesc: Optional pointer to an interrupt affinity descriptor structure
+ * @icookie: Optional pointer to a domain specific per instance cookie. If
+ * non-NULL the content of the cookie is stored in msi_desc::data.
+ * Must be NULL for MSI-X allocations
+ *
+ * This requires a MSI interrupt domain which lets the core code manage the
+ * MSI descriptors.
+ *
+ * Return: struct msi_map
+ *
+ * On success msi_map::index contains the allocated index number and
+ * msi_map::virq the corresponding Linux interrupt number
+ *
+ * On failure msi_map::index contains the error code and msi_map::virq
+ * is %0.
+ */
+struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, unsigned int index,
+ const struct irq_affinity_desc *affdesc,
+ union msi_instance_cookie *icookie)
{
+ guard(msi_descs_lock)(dev);
+ return __msi_domain_alloc_irq_at(dev, domid, index, affdesc, icookie);
+}
+
+/**
+ * msi_device_domain_alloc_wired - Allocate a "wired" interrupt on @domain
+ * @domain: The domain to allocate on
+ * @hwirq: The hardware interrupt number to allocate for
+ * @type: The interrupt type
+ *
+ * This weirdness supports wire to MSI controllers like MBIGEN.
+ *
+ * @hwirq is the hardware interrupt number which is handed in from
+ * irq_create_fwspec_mapping(). As the wire to MSI domain is sparse, but
+ * sized in firmware, the hardware interrupt number cannot be used as MSI
+ * index. For the underlying irq chip the MSI index is irrelevant and
+ * all it needs is the hardware interrupt number.
+ *
+ * To handle this the MSI index is allocated with MSI_ANY_INDEX and the
+ * hardware interrupt number is stored along with the type information in
+ * msi_desc::cookie so the underlying interrupt chip and domain code can
+ * retrieve it.
+ *
+ * Return: The Linux interrupt number (> 0) or an error code
+ */
+int msi_device_domain_alloc_wired(struct irq_domain *domain, unsigned int hwirq,
+ unsigned int type)
+{
+ unsigned int domid = MSI_DEFAULT_DOMAIN;
+ union msi_instance_cookie icookie = { };
+ struct device *dev = domain->dev;
+ struct msi_map map = { };
+
+ if (WARN_ON_ONCE(!dev || domain->bus_token != DOMAIN_BUS_WIRED_TO_MSI))
+ return -EINVAL;
+
+ icookie.value = ((u64)type << 32) | hwirq;
+
+ guard(msi_descs_lock)(dev);
+ if (WARN_ON_ONCE(msi_get_device_domain(dev, domid) != domain))
+ map.index = -EINVAL;
+ else
+ map = __msi_domain_alloc_irq_at(dev, domid, MSI_ANY_INDEX, NULL, &icookie);
+ return map.index >= 0 ? map.virq : map.index;
+}
+
+static void __msi_domain_free_irqs(struct device *dev, struct irq_domain *domain,
+ struct msi_ctrl *ctrl)
+{
+ struct xarray *xa = &dev->msi.data->__domains[ctrl->domid].store;
struct msi_domain_info *info = domain->host_data;
struct irq_data *irqd;
struct msi_desc *desc;
+ unsigned long idx;
int i;
- /* Only handle MSI entries which have an interrupt associated */
- msi_for_each_desc(desc, dev, MSI_DESC_ASSOCIATED) {
+ xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) {
+ /* Only handle MSI entries which have an interrupt associated */
+ if (!msi_desc_match(desc, MSI_DESC_ASSOCIATED))
+ continue;
+
/* Make sure all interrupts are deactivated */
for (i = 0; i < desc->nvec_used; i++) {
irqd = irq_domain_get_irq_data(domain, desc->irq + i);
@@ -982,45 +1623,119 @@ void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
}
}
-static void msi_domain_free_msi_descs(struct msi_domain_info *info,
- struct device *dev)
+static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl)
{
+ struct msi_domain_info *info;
+ struct msi_domain_ops *ops;
+ struct irq_domain *domain;
+
+ if (!msi_ctrl_valid(dev, ctrl))
+ return;
+
+ domain = msi_get_device_domain(dev, ctrl->domid);
+ if (!domain)
+ return;
+
+ info = domain->host_data;
+ ops = info->ops;
+
+ if (ops->domain_free_irqs)
+ ops->domain_free_irqs(domain, dev);
+ else
+ __msi_domain_free_irqs(dev, domain, ctrl);
+
if (info->flags & MSI_FLAG_FREE_MSI_DESCS)
- msi_free_msi_descs(dev);
+ msi_domain_free_descs(dev, ctrl);
}
/**
- * msi_domain_free_irqs_descs_locked - Free interrupts from a MSI interrupt @domain associated to @dev
- * @domain: The domain to managing the interrupts
+ * msi_domain_free_irqs_range_locked - Free a range of interrupts from a MSI interrupt domain
+ * associated to @dev with msi_lock held
* @dev: Pointer to device struct of the device for which the interrupts
- * are free
+ * are freed
+ * @domid: Id of the interrupt domain to operate on
+ * @first: First index to free (inclusive)
+ * @last: Last index to free (inclusive)
+ */
+void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
+{
+ struct msi_ctrl ctrl = {
+ .domid = domid,
+ .first = first,
+ .last = last,
+ };
+ msi_domain_free_locked(dev, &ctrl);
+}
+
+/**
+ * msi_domain_free_irqs_range - Free a range of interrupts from a MSI interrupt domain
+ * associated to @dev
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are freed
+ * @domid: Id of the interrupt domain to operate on
+ * @first: First index to free (inclusive)
+ * @last: Last index to free (inclusive)
+ */
+void msi_domain_free_irqs_range(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
+{
+ guard(msi_descs_lock)(dev);
+ msi_domain_free_irqs_range_locked(dev, domid, first, last);
+}
+EXPORT_SYMBOL_GPL(msi_domain_free_irqs_all);
+
+/**
+ * msi_domain_free_irqs_all_locked - Free all interrupts from a MSI interrupt domain
+ * associated to a device
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are freed
+ * @domid: The id of the domain to operate on
*
* Must be invoked from within a msi_lock_descs() / msi_unlock_descs()
* pair. Use this for MSI irqdomains which implement their own vector
* allocation.
*/
-void msi_domain_free_irqs_descs_locked(struct irq_domain *domain, struct device *dev)
+void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid)
{
- struct msi_domain_info *info = domain->host_data;
- struct msi_domain_ops *ops = info->ops;
-
- lockdep_assert_held(&dev->msi.data->mutex);
-
- ops->domain_free_irqs(domain, dev);
- msi_domain_free_msi_descs(info, dev);
+ msi_domain_free_irqs_range_locked(dev, domid, 0,
+ msi_domain_get_hwsize(dev, domid) - 1);
}
/**
- * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated to @dev
- * @domain: The domain to managing the interrupts
+ * msi_domain_free_irqs_all - Free all interrupts from a MSI interrupt domain
+ * associated to a device
* @dev: Pointer to device struct of the device for which the interrupts
- * are free
+ * are freed
+ * @domid: The id of the domain to operate on
*/
-void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+void msi_domain_free_irqs_all(struct device *dev, unsigned int domid)
{
- msi_lock_descs(dev);
- msi_domain_free_irqs_descs_locked(domain, dev);
- msi_unlock_descs(dev);
+ guard(msi_descs_lock)(dev);
+ msi_domain_free_irqs_all_locked(dev, domid);
+}
+
+/**
+ * msi_device_domain_free_wired - Free a wired interrupt in @domain
+ * @domain: The domain to free the interrupt on
+ * @virq: The Linux interrupt number to free
+ *
+ * This is the counterpart of msi_device_domain_alloc_wired() for the
+ * weird wired to MSI converting domains.
+ */
+void msi_device_domain_free_wired(struct irq_domain *domain, unsigned int virq)
+{
+ struct msi_desc *desc = irq_get_msi_desc(virq);
+ struct device *dev = domain->dev;
+
+ if (WARN_ON_ONCE(!dev || !desc || domain->bus_token != DOMAIN_BUS_WIRED_TO_MSI))
+ return;
+
+ guard(msi_descs_lock)(dev);
+ if (WARN_ON_ONCE(msi_get_device_domain(dev, MSI_DEFAULT_DOMAIN) != domain))
+ return;
+ msi_domain_free_irqs_range_locked(dev, MSI_DEFAULT_DOMAIN, desc->msi_index,
+ desc->msi_index);
}
/**
@@ -1034,4 +1749,29 @@ struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain)
return (struct msi_domain_info *)domain->host_data;
}
-#endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */
+/**
+ * msi_device_has_isolated_msi - True if the device has isolated MSI
+ * @dev: The device to check
+ *
+ * Isolated MSI means that HW modeled by an irq_domain on the path from the
+ * initiating device to the CPU will validate that the MSI message specifies an
+ * interrupt number that the device is authorized to trigger. This must block
+ * devices from triggering interrupts they are not authorized to trigger.
+ * Currently authorization means the MSI vector is one assigned to the device.
+ *
+ * This is interesting for securing VFIO use cases where a rouge MSI (eg created
+ * by abusing a normal PCI MemWr DMA) must not allow the VFIO userspace to
+ * impact outside its security domain, eg userspace triggering interrupts on
+ * kernel drivers, a VM triggering interrupts on the hypervisor, or a VM
+ * triggering interrupts on another VM.
+ */
+bool msi_device_has_isolated_msi(struct device *dev)
+{
+ struct irq_domain *domain = dev_get_msi_domain(dev);
+
+ for (; domain; domain = domain->parent)
+ if (domain->flags & IRQ_DOMAIN_FLAG_ISOLATED_MSI)
+ return true;
+ return arch_is_isolated_msi();
+}
+EXPORT_SYMBOL_GPL(msi_device_has_isolated_msi);
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index ca71123a6130..99ff65466d87 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -13,17 +13,13 @@
#include "internals.h"
-bool irq_pm_check_wakeup(struct irq_desc *desc)
+void irq_pm_handle_wakeup(struct irq_desc *desc)
{
- if (irqd_is_wakeup_armed(&desc->irq_data)) {
- irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
- desc->istate |= IRQS_SUSPENDED | IRQS_PENDING;
- desc->depth++;
- irq_disable(desc);
- pm_system_irq_wakeup(irq_desc_get_irq(desc));
- return true;
- }
- return false;
+ irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
+ desc->istate |= IRQS_SUSPENDED | IRQS_PENDING;
+ desc->depth++;
+ irq_disable(desc);
+ pm_system_irq_wakeup(irq_desc_get_irq(desc));
}
/*
@@ -46,8 +42,7 @@ void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action)
desc->cond_suspend_depth++;
WARN_ON_ONCE(desc->no_suspend_depth &&
- (desc->no_suspend_depth +
- desc->cond_suspend_depth) != desc->nr_actions);
+ (desc->no_suspend_depth + desc->cond_suspend_depth) != desc->nr_actions);
}
/*
@@ -134,20 +129,17 @@ void suspend_device_irqs(void)
int irq;
for_each_irq_desc(irq, desc) {
- unsigned long flags;
bool sync;
if (irq_settings_is_nested_thread(desc))
continue;
- raw_spin_lock_irqsave(&desc->lock, flags);
- sync = suspend_device_irq(desc);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ scoped_guard(raw_spinlock_irqsave, &desc->lock)
+ sync = suspend_device_irq(desc);
if (sync)
synchronize_irq(irq);
}
}
-EXPORT_SYMBOL_GPL(suspend_device_irqs);
static void resume_irq(struct irq_desc *desc)
{
@@ -187,18 +179,15 @@ static void resume_irqs(bool want_early)
int irq;
for_each_irq_desc(irq, desc) {
- unsigned long flags;
- bool is_early = desc->action &&
- desc->action->flags & IRQF_EARLY_RESUME;
+ bool is_early = desc->action && desc->action->flags & IRQF_EARLY_RESUME;
if (!is_early && want_early)
continue;
if (irq_settings_is_nested_thread(desc))
continue;
- raw_spin_lock_irqsave(&desc->lock, flags);
+ guard(raw_spinlock_irqsave)(&desc->lock);
resume_irq(desc);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
}
}
@@ -208,41 +197,40 @@ static void resume_irqs(bool want_early)
*/
void rearm_wake_irq(unsigned int irq)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
+ scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) {
+ struct irq_desc *desc = scoped_irqdesc;
- if (!desc)
- return;
-
- if (!(desc->istate & IRQS_SUSPENDED) ||
- !irqd_is_wakeup_set(&desc->irq_data))
- goto unlock;
+ if (!(desc->istate & IRQS_SUSPENDED) || !irqd_is_wakeup_set(&desc->irq_data))
+ return;
- desc->istate &= ~IRQS_SUSPENDED;
- irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED);
- __enable_irq(desc);
-
-unlock:
- irq_put_desc_busunlock(desc, flags);
+ desc->istate &= ~IRQS_SUSPENDED;
+ irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED);
+ __enable_irq(desc);
+ }
}
/**
* irq_pm_syscore_resume - enable interrupt lines early
+ * @data: syscore context
*
* Enable all interrupt lines with %IRQF_EARLY_RESUME set.
*/
-static void irq_pm_syscore_resume(void)
+static void irq_pm_syscore_resume(void *data)
{
resume_irqs(true);
}
-static struct syscore_ops irq_pm_syscore_ops = {
+static const struct syscore_ops irq_pm_syscore_ops = {
.resume = irq_pm_syscore_resume,
};
+static struct syscore irq_pm_syscore = {
+ .ops = &irq_pm_syscore_ops,
+};
+
static int __init irq_pm_init_ops(void)
{
- register_syscore_ops(&irq_pm_syscore_ops);
+ register_syscore(&irq_pm_syscore);
return 0;
}
@@ -259,4 +247,3 @@ void resume_device_irqs(void)
{
resume_irqs(false);
}
-EXPORT_SYMBOL_GPL(resume_device_irqs);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 623b8136e9af..77258eafbf63 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -48,14 +48,14 @@ static int show_irq_affinity(int type, struct seq_file *m)
struct irq_desc *desc = irq_to_desc((long)m->private);
const struct cpumask *mask;
+ guard(raw_spinlock_irq)(&desc->lock);
+
switch (type) {
case AFFINITY:
case AFFINITY_LIST:
mask = desc->irq_common_data.affinity;
-#ifdef CONFIG_GENERIC_PENDING_IRQ
- if (irqd_is_setaffinity_pending(&desc->irq_data))
- mask = desc->pending_mask;
-#endif
+ if (irq_move_pending(&desc->irq_data))
+ mask = irq_desc_get_pending_mask(desc);
break;
case EFFECTIVE:
case EFFECTIVE_LIST:
@@ -83,20 +83,18 @@ static int show_irq_affinity(int type, struct seq_file *m)
static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
{
struct irq_desc *desc = irq_to_desc((long)m->private);
- unsigned long flags;
cpumask_var_t mask;
if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
- raw_spin_lock_irqsave(&desc->lock, flags);
- if (desc->affinity_hint)
- cpumask_copy(mask, desc->affinity_hint);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ scoped_guard(raw_spinlock_irq, &desc->lock) {
+ if (desc->affinity_hint)
+ cpumask_copy(mask, desc->affinity_hint);
+ }
seq_printf(m, "%*pb\n", cpumask_pr_args(mask));
free_cpumask_var(mask);
-
return 0;
}
@@ -142,7 +140,7 @@ static ssize_t write_irq_affinity(int type, struct file *file,
int err;
if (!irq_can_set_affinity_usr(irq) || no_irq_affinity)
- return -EIO;
+ return -EPERM;
if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
return -ENOMEM;
@@ -297,32 +295,26 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v)
#define MAX_NAMELEN 128
-static int name_unique(unsigned int irq, struct irqaction *new_action)
+static bool name_unique(unsigned int irq, struct irqaction *new_action)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irqaction *action;
- unsigned long flags;
- int ret = 1;
- raw_spin_lock_irqsave(&desc->lock, flags);
+ guard(raw_spinlock_irq)(&desc->lock);
for_each_action_of_desc(desc, action) {
if ((action != new_action) && action->name &&
- !strcmp(new_action->name, action->name)) {
- ret = 0;
- break;
- }
+ !strcmp(new_action->name, action->name))
+ return false;
}
- raw_spin_unlock_irqrestore(&desc->lock, flags);
- return ret;
+ return true;
}
void register_handler_proc(unsigned int irq, struct irqaction *action)
{
- char name [MAX_NAMELEN];
+ char name[MAX_NAMELEN];
struct irq_desc *desc = irq_to_desc(irq);
- if (!desc->dir || action->dir || !action->name ||
- !name_unique(irq, action))
+ if (!desc->dir || action->dir || !action->name || !name_unique(irq, action))
return;
snprintf(name, MAX_NAMELEN, "%s", action->name);
@@ -349,45 +341,45 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
* added, not when the descriptor is created, so multiple
* tasks might try to register at the same time.
*/
- mutex_lock(&register_lock);
+ guard(mutex)(&register_lock);
if (desc->dir)
- goto out_unlock;
-
- sprintf(name, "%d", irq);
+ return;
/* create /proc/irq/1234 */
+ sprintf(name, "%u", irq);
desc->dir = proc_mkdir(name, root_irq_dir);
if (!desc->dir)
- goto out_unlock;
+ return;
#ifdef CONFIG_SMP
+ umode_t umode = S_IRUGO;
+
+ if (irq_can_set_affinity_usr(desc->irq_data.irq))
+ umode |= S_IWUSR;
+
/* create /proc/irq/<irq>/smp_affinity */
- proc_create_data("smp_affinity", 0644, desc->dir,
- &irq_affinity_proc_ops, irqp);
+ proc_create_data("smp_affinity", umode, desc->dir, &irq_affinity_proc_ops, irqp);
/* create /proc/irq/<irq>/affinity_hint */
proc_create_single_data("affinity_hint", 0444, desc->dir,
- irq_affinity_hint_proc_show, irqp);
+ irq_affinity_hint_proc_show, irqp);
/* create /proc/irq/<irq>/smp_affinity_list */
- proc_create_data("smp_affinity_list", 0644, desc->dir,
+ proc_create_data("smp_affinity_list", umode, desc->dir,
&irq_affinity_list_proc_ops, irqp);
- proc_create_single_data("node", 0444, desc->dir, irq_node_proc_show,
- irqp);
+ proc_create_single_data("node", 0444, desc->dir, irq_node_proc_show, irqp);
# ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
proc_create_single_data("effective_affinity", 0444, desc->dir,
- irq_effective_aff_proc_show, irqp);
+ irq_effective_aff_proc_show, irqp);
proc_create_single_data("effective_affinity_list", 0444, desc->dir,
- irq_effective_aff_list_proc_show, irqp);
+ irq_effective_aff_list_proc_show, irqp);
# endif
#endif
proc_create_single_data("spurious", 0444, desc->dir,
- irq_spurious_proc_show, (void *)(long)irq);
+ irq_spurious_proc_show, (void *)(long)irq);
-out_unlock:
- mutex_unlock(&register_lock);
}
void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
@@ -454,14 +446,14 @@ int __weak arch_show_interrupts(struct seq_file *p, int prec)
}
#ifndef ACTUAL_NR_IRQS
-# define ACTUAL_NR_IRQS nr_irqs
+# define ACTUAL_NR_IRQS irq_get_nr_irqs()
#endif
int show_interrupts(struct seq_file *p, void *v)
{
+ const unsigned int nr_irqs = irq_get_nr_irqs();
static int prec;
- unsigned long flags, any_count = 0;
int i = *(loff_t *) v, j;
struct irqaction *action;
struct irq_desc *desc;
@@ -483,34 +475,32 @@ int show_interrupts(struct seq_file *p, void *v)
seq_putc(p, '\n');
}
- rcu_read_lock();
+ guard(rcu)();
desc = irq_to_desc(i);
if (!desc || irq_settings_is_hidden(desc))
- goto outsparse;
+ return 0;
- if (desc->kstat_irqs) {
- for_each_online_cpu(j)
- any_count |= data_race(*per_cpu_ptr(desc->kstat_irqs, j));
- }
+ if (!desc->action || irq_desc_is_chained(desc) || !desc->kstat_irqs)
+ return 0;
- if ((!desc->action || irq_desc_is_chained(desc)) && !any_count)
- goto outsparse;
+ seq_printf(p, "%*d:", prec, i);
+ for_each_online_cpu(j) {
+ unsigned int cnt = desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, j) : 0;
- seq_printf(p, "%*d: ", prec, i);
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", desc->kstat_irqs ?
- *per_cpu_ptr(desc->kstat_irqs, j) : 0);
+ seq_put_decimal_ull_width(p, " ", cnt, 10);
+ }
+ seq_putc(p, ' ');
- raw_spin_lock_irqsave(&desc->lock, flags);
+ guard(raw_spinlock_irq)(&desc->lock);
if (desc->irq_data.chip) {
if (desc->irq_data.chip->irq_print_chip)
desc->irq_data.chip->irq_print_chip(&desc->irq_data, p);
else if (desc->irq_data.chip->name)
- seq_printf(p, " %8s", desc->irq_data.chip->name);
+ seq_printf(p, "%8s", desc->irq_data.chip->name);
else
- seq_printf(p, " %8s", "-");
+ seq_printf(p, "%8s", "-");
} else {
- seq_printf(p, " %8s", "None");
+ seq_printf(p, "%8s", "None");
}
if (desc->irq_data.domain)
seq_printf(p, " %*lu", prec, desc->irq_data.hwirq);
@@ -530,9 +520,6 @@ int show_interrupts(struct seq_file *p, void *v)
}
seq_putc(p, '\n');
- raw_spin_unlock_irqrestore(&desc->lock, flags);
-outsparse:
- rcu_read_unlock();
return 0;
}
#endif
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 0c46e9fe3a89..ca9cc1b806a9 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -21,26 +21,25 @@
#ifdef CONFIG_HARDIRQS_SW_RESEND
-/* Bitmap to handle software resend of interrupts: */
-static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS);
+/* hlist_head to handle software resend of interrupts: */
+static HLIST_HEAD(irq_resend_list);
+static DEFINE_RAW_SPINLOCK(irq_resend_lock);
/*
* Run software resends of IRQ's
*/
static void resend_irqs(struct tasklet_struct *unused)
{
- struct irq_desc *desc;
- int irq;
+ guard(raw_spinlock_irq)(&irq_resend_lock);
+ while (!hlist_empty(&irq_resend_list)) {
+ struct irq_desc *desc;
- while (!bitmap_empty(irqs_resend, nr_irqs)) {
- irq = find_first_bit(irqs_resend, nr_irqs);
- clear_bit(irq, irqs_resend);
- desc = irq_to_desc(irq);
- if (!desc)
- continue;
- local_irq_disable();
+ desc = hlist_entry(irq_resend_list.first, struct irq_desc, resend_node);
+ hlist_del_init(&desc->resend_node);
+
+ raw_spin_unlock(&irq_resend_lock);
desc->handle_irq(desc);
- local_irq_enable();
+ raw_spin_lock(&irq_resend_lock);
}
}
@@ -49,13 +48,11 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs);
static int irq_sw_resend(struct irq_desc *desc)
{
- unsigned int irq = irq_desc_get_irq(desc);
-
/*
* Validate whether this interrupt can be safely injected from
* non interrupt context
*/
- if (handle_enforce_irqctx(&desc->irq_data))
+ if (irqd_is_handle_enforce_irqctx(&desc->irq_data))
return -EINVAL;
/*
@@ -70,16 +67,35 @@ static int irq_sw_resend(struct irq_desc *desc)
*/
if (!desc->parent_irq)
return -EINVAL;
- irq = desc->parent_irq;
+
+ desc = irq_to_desc(desc->parent_irq);
+ if (!desc)
+ return -EINVAL;
}
- /* Set it pending and activate the softirq: */
- set_bit(irq, irqs_resend);
+ /* Add to resend_list and activate the softirq: */
+ scoped_guard(raw_spinlock, &irq_resend_lock) {
+ if (hlist_unhashed(&desc->resend_node))
+ hlist_add_head(&desc->resend_node, &irq_resend_list);
+ }
tasklet_schedule(&resend_tasklet);
return 0;
}
+void clear_irq_resend(struct irq_desc *desc)
+{
+ guard(raw_spinlock)(&irq_resend_lock);
+ hlist_del_init(&desc->resend_node);
+}
+
+void irq_resend_init(struct irq_desc *desc)
+{
+ INIT_HLIST_NODE(&desc->resend_node);
+}
#else
+void clear_irq_resend(struct irq_desc *desc) {}
+void irq_resend_init(struct irq_desc *desc) {}
+
static int irq_sw_resend(struct irq_desc *desc)
{
return -EINVAL;
@@ -154,30 +170,24 @@ int check_irq_resend(struct irq_desc *desc, bool inject)
*/
int irq_inject_interrupt(unsigned int irq)
{
- struct irq_desc *desc;
- unsigned long flags;
- int err;
+ int err = -EINVAL;
/* Try the state injection hardware interface first */
if (!irq_set_irqchip_state(irq, IRQCHIP_STATE_PENDING, true))
return 0;
/* That failed, try via the resend mechanism */
- desc = irq_get_desc_buslock(irq, &flags, 0);
- if (!desc)
- return -EINVAL;
-
- /*
- * Only try to inject when the interrupt is:
- * - not NMI type
- * - activated
- */
- if ((desc->istate & IRQS_NMI) || !irqd_is_activated(&desc->irq_data))
- err = -EINVAL;
- else
- err = check_irq_resend(desc, true);
+ scoped_irqdesc_get_and_buslock(irq, 0) {
+ struct irq_desc *desc = scoped_irqdesc;
- irq_put_desc_busunlock(desc, flags);
+ /*
+ * Only try to inject when the interrupt is:
+ * - not NMI type
+ * - activated
+ */
+ if (!irq_is_nmi(desc) && irqd_is_activated(&desc->irq_data))
+ err = check_irq_resend(desc, true);
+ }
return err;
}
EXPORT_SYMBOL_GPL(irq_inject_interrupt);
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 7b7efb1a114b..00b3bd127692 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -11,7 +11,6 @@ enum {
_IRQ_NOREQUEST = IRQ_NOREQUEST,
_IRQ_NOTHREAD = IRQ_NOTHREAD,
_IRQ_NOAUTOEN = IRQ_NOAUTOEN,
- _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT,
_IRQ_NO_BALANCING = IRQ_NO_BALANCING,
_IRQ_NESTED_THREAD = IRQ_NESTED_THREAD,
_IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID,
@@ -142,11 +141,6 @@ static inline void irq_settings_set_noprobe(struct irq_desc *desc)
desc->status_use_accessors |= _IRQ_NOPROBE;
}
-static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc)
-{
- return desc->status_use_accessors & _IRQ_MOVE_PCNTXT;
-}
-
static inline bool irq_settings_can_autoenable(struct irq_desc *desc)
{
return !(desc->status_use_accessors & _IRQ_NOAUTOEN);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 02b2daf07441..73280ccb74b0 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -19,77 +19,41 @@ static int irqfixup __read_mostly;
#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
static void poll_spurious_irqs(struct timer_list *unused);
static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs);
-static int irq_poll_cpu;
+int irq_poll_cpu;
static atomic_t irq_poll_active;
/*
- * We wait here for a poller to finish.
- *
- * If the poll runs on this CPU, then we yell loudly and return
- * false. That will leave the interrupt line disabled in the worst
- * case, but it should never happen.
- *
- * We wait until the poller is done and then recheck disabled and
- * action (about to be disabled). Only if it's still active, we return
- * true and let the handler run.
- */
-bool irq_wait_for_poll(struct irq_desc *desc)
- __must_hold(&desc->lock)
-{
- if (WARN_ONCE(irq_poll_cpu == smp_processor_id(),
- "irq poll in progress on cpu %d for irq %d\n",
- smp_processor_id(), desc->irq_data.irq))
- return false;
-
-#ifdef CONFIG_SMP
- do {
- raw_spin_unlock(&desc->lock);
- while (irqd_irq_inprogress(&desc->irq_data))
- cpu_relax();
- raw_spin_lock(&desc->lock);
- } while (irqd_irq_inprogress(&desc->irq_data));
- /* Might have been disabled in meantime */
- return !irqd_irq_disabled(&desc->irq_data) && desc->action;
-#else
- return false;
-#endif
-}
-
-
-/*
* Recovery handler for misrouted interrupts.
*/
-static int try_one_irq(struct irq_desc *desc, bool force)
+static bool try_one_irq(struct irq_desc *desc, bool force)
{
- irqreturn_t ret = IRQ_NONE;
struct irqaction *action;
+ bool ret = false;
- raw_spin_lock(&desc->lock);
+ guard(raw_spinlock)(&desc->lock);
/*
* PER_CPU, nested thread interrupts and interrupts explicitly
* marked polled are excluded from polling.
*/
- if (irq_settings_is_per_cpu(desc) ||
- irq_settings_is_nested_thread(desc) ||
+ if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc) ||
irq_settings_is_polled(desc))
- goto out;
+ return false;
/*
* Do not poll disabled interrupts unless the spurious
* disabled poller asks explicitly.
*/
if (irqd_irq_disabled(&desc->irq_data) && !force)
- goto out;
+ return false;
/*
* All handlers must agree on IRQF_SHARED, so we test just the
* first.
*/
action = desc->action;
- if (!action || !(action->flags & IRQF_SHARED) ||
- (action->flags & __IRQF_TIMER))
- goto out;
+ if (!action || !(action->flags & IRQF_SHARED) || (action->flags & __IRQF_TIMER))
+ return false;
/* Already running on another processor */
if (irqd_irq_inprogress(&desc->irq_data)) {
@@ -98,21 +62,19 @@ static int try_one_irq(struct irq_desc *desc, bool force)
* CPU to go looking for our mystery interrupt too
*/
desc->istate |= IRQS_PENDING;
- goto out;
+ return false;
}
/* Mark it poll in progress */
desc->istate |= IRQS_POLL_INPROGRESS;
do {
if (handle_irq_event(desc) == IRQ_HANDLED)
- ret = IRQ_HANDLED;
+ ret = true;
/* Make sure that there is still a valid action */
action = desc->action;
} while ((desc->istate & IRQS_PENDING) && action);
desc->istate &= ~IRQS_POLL_INPROGRESS;
-out:
- raw_spin_unlock(&desc->lock);
- return ret == IRQ_HANDLED;
+ return ret;
}
static int misrouted_irq(int irq)
@@ -157,8 +119,7 @@ static void poll_spurious_irqs(struct timer_list *unused)
continue;
/* Racy but it doesn't matter */
- state = desc->istate;
- barrier();
+ state = READ_ONCE(desc->istate);
if (!(state & IRQS_SPURIOUS_DISABLED))
continue;
@@ -168,8 +129,7 @@ static void poll_spurious_irqs(struct timer_list *unused)
}
out:
atomic_dec(&irq_poll_active);
- mod_timer(&poll_spurious_irq_timer,
- jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
+ mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
}
static inline int bad_action_ret(irqreturn_t action_ret)
@@ -193,17 +153,13 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
{
unsigned int irq = irq_desc_get_irq(desc);
struct irqaction *action;
- unsigned long flags;
- if (bad_action_ret(action_ret)) {
- printk(KERN_ERR "irq event %d: bogus return value %x\n",
- irq, action_ret);
- } else {
- printk(KERN_ERR "irq %d: nobody cared (try booting with "
- "the \"irqpoll\" option)\n", irq);
- }
+ if (bad_action_ret(action_ret))
+ pr_err("irq event %d: bogus return value %x\n", irq, action_ret);
+ else
+ pr_err("irq %d: nobody cared (try booting with the \"irqpoll\" option)\n", irq);
dump_stack();
- printk(KERN_ERR "handlers:\n");
+ pr_err("handlers:\n");
/*
* We need to take desc->lock here. note_interrupt() is called
@@ -211,15 +167,13 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
* with something else removing an action. It's ok to take
* desc->lock here. See synchronize_irq().
*/
- raw_spin_lock_irqsave(&desc->lock, flags);
+ guard(raw_spinlock_irqsave)(&desc->lock);
for_each_action_of_desc(desc, action) {
- printk(KERN_ERR "[<%p>] %ps", action->handler, action->handler);
+ pr_err("[<%p>] %ps", action->handler, action->handler);
if (action->thread_fn)
- printk(KERN_CONT " threaded [<%p>] %ps",
- action->thread_fn, action->thread_fn);
- printk(KERN_CONT "\n");
+ pr_cont(" threaded [<%p>] %ps", action->thread_fn, action->thread_fn);
+ pr_cont("\n");
}
- raw_spin_unlock_irqrestore(&desc->lock, flags);
}
static void report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
@@ -232,18 +186,17 @@ static void report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
}
}
-static inline int
-try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
- irqreturn_t action_ret)
+static inline bool try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
+ irqreturn_t action_ret)
{
struct irqaction *action;
if (!irqfixup)
- return 0;
+ return false;
/* We didn't actually handle the IRQ - see if it was misrouted? */
if (action_ret == IRQ_NONE)
- return 1;
+ return true;
/*
* But for 'irqfixup == 2' we also do it for handled interrupts if
@@ -251,19 +204,16 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
* traditional PC timer interrupt.. Legacy)
*/
if (irqfixup < 2)
- return 0;
+ return false;
if (!irq)
- return 1;
+ return true;
/*
* Since we don't get the descriptor lock, "action" can
- * change under us. We don't really care, but we don't
- * want to follow a NULL pointer. So tell the compiler to
- * just load it once by using a barrier.
+ * change under us.
*/
- action = desc->action;
- barrier();
+ action = READ_ONCE(desc->action);
return action && (action->flags & IRQF_IRQPOLL);
}
@@ -273,8 +223,7 @@ void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
{
unsigned int irq;
- if (desc->istate & IRQS_POLL_INPROGRESS ||
- irq_settings_is_polled(desc))
+ if (desc->istate & IRQS_POLL_INPROGRESS || irq_settings_is_polled(desc))
return;
if (bad_action_ret(action_ret)) {
@@ -420,13 +369,12 @@ void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
/*
* Now kill the IRQ
*/
- printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
+ pr_emerg("Disabling IRQ #%d\n", irq);
desc->istate |= IRQS_SPURIOUS_DISABLED;
desc->depth++;
irq_disable(desc);
- mod_timer(&poll_spurious_irq_timer,
- jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
+ mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
}
desc->irqs_unhandled = 0;
}
@@ -436,11 +384,9 @@ bool noirqdebug __read_mostly;
int noirqdebug_setup(char *str)
{
noirqdebug = 1;
- printk(KERN_INFO "IRQ lockup detection disabled\n");
-
+ pr_info("IRQ lockup detection disabled\n");
return 1;
}
-
__setup("noirqdebug", noirqdebug_setup);
module_param(noirqdebug, bool, 0644);
MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
@@ -452,12 +398,10 @@ static int __init irqfixup_setup(char *str)
return 1;
}
irqfixup = 1;
- printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
- printk(KERN_WARNING "This may impact system performance.\n");
-
+ pr_warn("Misrouted IRQ fixup support enabled.\n");
+ pr_warn("This may impact system performance.\n");
return 1;
}
-
__setup("irqfixup", irqfixup_setup);
module_param(irqfixup, int, 0644);
@@ -468,11 +412,8 @@ static int __init irqpoll_setup(char *str)
return 1;
}
irqfixup = 2;
- printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
- "enabled\n");
- printk(KERN_WARNING "This may significantly impact system "
- "performance\n");
+ pr_warn("Misrouted IRQ fixup and polling support enabled\n");
+ pr_warn("This may significantly impact system performance\n");
return 1;
}
-
__setup("irqpoll", irqpoll_setup);
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index c43e2ac2f8de..4b7315e99bd6 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -509,6 +509,7 @@ static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts)
/**
* irq_timings_next_event - Return when the next event is supposed to arrive
+ * @now: current time
*
* During the last busy cycle, the number of interrupts is incremented
* and stored in the irq_timings structure. This information is
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index f7df715ec28e..73f7e1fd4ab4 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -22,6 +22,8 @@
#include <asm/processor.h>
#include <linux/kasan.h>
+#include <trace/events/ipi.h>
+
static DEFINE_PER_CPU(struct llist_head, raised_list);
static DEFINE_PER_CPU(struct llist_head, lazy_list);
static DEFINE_PER_CPU(struct task_struct *, irq_workd);
@@ -74,6 +76,14 @@ void __weak arch_irq_work_raise(void)
*/
}
+static __always_inline void irq_work_raise(struct irq_work *work)
+{
+ if (trace_ipi_send_cpu_enabled() && arch_irq_work_has_interrupt())
+ trace_ipi_send_cpu(smp_processor_id(), _RET_IP_, work->func);
+
+ arch_irq_work_raise();
+}
+
/* Enqueue on current CPU, work must already be claimed and preempt disabled */
static void __irq_work_queue_local(struct irq_work *work)
{
@@ -99,7 +109,7 @@ static void __irq_work_queue_local(struct irq_work *work)
/* If the work is "lazy", handle it from next tick if any */
if (!lazy_work || tick_nohz_tick_stopped())
- arch_irq_work_raise();
+ irq_work_raise(work);
}
/* Enqueue the irq work @work on the current CPU */
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index b156e152d6b4..7cb19e601426 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -113,51 +113,84 @@ int static_key_count(struct static_key *key)
}
EXPORT_SYMBOL_GPL(static_key_count);
-void static_key_slow_inc_cpuslocked(struct static_key *key)
+/*
+ * static_key_fast_inc_not_disabled - adds a user for a static key
+ * @key: static key that must be already enabled
+ *
+ * The caller must make sure that the static key can't get disabled while
+ * in this function. It doesn't patch jump labels, only adds a user to
+ * an already enabled static key.
+ *
+ * Returns true if the increment was done. Unlike refcount_t the ref counter
+ * is not saturated, but will fail to increment on overflow.
+ */
+bool static_key_fast_inc_not_disabled(struct static_key *key)
{
- int v, v1;
+ int v;
STATIC_KEY_CHECK_USE(key);
+ /*
+ * Negative key->enabled has a special meaning: it sends
+ * static_key_slow_inc/dec() down the slow path, and it is non-zero
+ * so it counts as "enabled" in jump_label_update().
+ *
+ * The INT_MAX overflow condition is either used by the networking
+ * code to reset or detected in the slow path of
+ * static_key_slow_inc_cpuslocked().
+ */
+ v = atomic_read(&key->enabled);
+ do {
+ if (v <= 0 || v == INT_MAX)
+ return false;
+ } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v + 1)));
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(static_key_fast_inc_not_disabled);
+
+bool static_key_slow_inc_cpuslocked(struct static_key *key)
+{
lockdep_assert_cpus_held();
/*
- * Careful if we get concurrent static_key_slow_inc() calls;
+ * Careful if we get concurrent static_key_slow_inc/dec() calls;
* later calls must wait for the first one to _finish_ the
* jump_label_update() process. At the same time, however,
* the jump_label_update() call below wants to see
* static_key_enabled(&key) for jumps to be updated properly.
- *
- * So give a special meaning to negative key->enabled: it sends
- * static_key_slow_inc() down the slow path, and it is non-zero
- * so it counts as "enabled" in jump_label_update(). Note that
- * atomic_inc_unless_negative() checks >= 0, so roll our own.
*/
- for (v = atomic_read(&key->enabled); v > 0; v = v1) {
- v1 = atomic_cmpxchg(&key->enabled, v, v + 1);
- if (likely(v1 == v))
- return;
- }
+ if (static_key_fast_inc_not_disabled(key))
+ return true;
- jump_label_lock();
- if (atomic_read(&key->enabled) == 0) {
- atomic_set(&key->enabled, -1);
+ guard(mutex)(&jump_label_mutex);
+ /* Try to mark it as 'enabling in progress. */
+ if (!atomic_cmpxchg(&key->enabled, 0, -1)) {
jump_label_update(key);
/*
- * Ensure that if the above cmpxchg loop observes our positive
- * value, it must also observe all the text changes.
+ * Ensure that when static_key_fast_inc_not_disabled() or
+ * static_key_dec_not_one() observe the positive value,
+ * they must also observe all the text changes.
*/
atomic_set_release(&key->enabled, 1);
} else {
- atomic_inc(&key->enabled);
+ /*
+ * While holding the mutex this should never observe
+ * anything else than a value >= 1 and succeed
+ */
+ if (WARN_ON_ONCE(!static_key_fast_inc_not_disabled(key)))
+ return false;
}
- jump_label_unlock();
+ return true;
}
-void static_key_slow_inc(struct static_key *key)
+bool static_key_slow_inc(struct static_key *key)
{
+ bool ret;
+
cpus_read_lock();
- static_key_slow_inc_cpuslocked(key);
+ ret = static_key_slow_inc_cpuslocked(key);
cpus_read_unlock();
+ return ret;
}
EXPORT_SYMBOL_GPL(static_key_slow_inc);
@@ -203,7 +236,7 @@ void static_key_disable_cpuslocked(struct static_key *key)
}
jump_label_lock();
- if (atomic_cmpxchg(&key->enabled, 1, 0))
+ if (atomic_cmpxchg(&key->enabled, 1, 0) == 1)
jump_label_update(key);
jump_label_unlock();
}
@@ -217,36 +250,69 @@ void static_key_disable(struct static_key *key)
}
EXPORT_SYMBOL_GPL(static_key_disable);
-static bool static_key_slow_try_dec(struct static_key *key)
+static bool static_key_dec_not_one(struct static_key *key)
{
- int val;
-
- val = atomic_fetch_add_unless(&key->enabled, -1, 1);
- if (val == 1)
- return false;
+ int v;
/*
- * The negative count check is valid even when a negative
- * key->enabled is in use by static_key_slow_inc(); a
- * __static_key_slow_dec() before the first static_key_slow_inc()
- * returns is unbalanced, because all other static_key_slow_inc()
- * instances block while the update is in progress.
+ * Go into the slow path if key::enabled is less than or equal than
+ * one. One is valid to shut down the key, anything less than one
+ * is an imbalance, which is handled at the call site.
+ *
+ * That includes the special case of '-1' which is set in
+ * static_key_slow_inc_cpuslocked(), but that's harmless as it is
+ * fully serialized in the slow path below. By the time this task
+ * acquires the jump label lock the value is back to one and the
+ * retry under the lock must succeed.
*/
- WARN(val < 0, "jump label: negative count!\n");
+ v = atomic_read(&key->enabled);
+ do {
+ /*
+ * Warn about the '-1' case though; since that means a
+ * decrement is concurrent with a first (0->1) increment. IOW
+ * people are trying to disable something that wasn't yet fully
+ * enabled. This suggests an ordering problem on the user side.
+ */
+ WARN_ON_ONCE(v < 0);
+
+ /*
+ * Warn about underflow, and lie about success in an attempt to
+ * not make things worse.
+ */
+ if (WARN_ON_ONCE(v == 0))
+ return true;
+
+ if (v <= 1)
+ return false;
+ } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v - 1)));
+
return true;
}
static void __static_key_slow_dec_cpuslocked(struct static_key *key)
{
lockdep_assert_cpus_held();
+ int val;
- if (static_key_slow_try_dec(key))
+ if (static_key_dec_not_one(key))
+ return;
+
+ guard(mutex)(&jump_label_mutex);
+ val = atomic_read(&key->enabled);
+ /*
+ * It should be impossible to observe -1 with jump_label_mutex held,
+ * see static_key_slow_inc_cpuslocked().
+ */
+ if (WARN_ON_ONCE(val == -1))
+ return;
+ /*
+ * Cannot already be 0, something went sideways.
+ */
+ if (WARN_ON_ONCE(val == 0))
return;
- jump_label_lock();
if (atomic_dec_and_test(&key->enabled))
jump_label_update(key);
- jump_label_unlock();
}
static void __static_key_slow_dec(struct static_key *key)
@@ -283,7 +349,7 @@ void __static_key_slow_dec_deferred(struct static_key *key,
{
STATIC_KEY_CHECK_USE(key);
- if (static_key_slow_try_dec(key))
+ if (static_key_dec_not_one(key))
return;
schedule_delayed_work(work, timeout);
@@ -332,17 +398,13 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
return 0;
}
-/*
- * Update code which is definitely not currently executing.
- * Architectures which need heavyweight synchronization to modify
- * running code can override this to make the non-live update case
- * cheaper.
- */
-void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry,
- enum jump_label_type type)
+#ifndef arch_jump_label_transform_static
+static void arch_jump_label_transform_static(struct jump_entry *entry,
+ enum jump_label_type type)
{
- arch_jump_label_transform(entry, type);
+ /* nothing to do on most architectures */
}
+#endif
static inline struct jump_entry *static_key_entries(struct static_key *key)
{
@@ -506,9 +568,48 @@ void __init jump_label_init(void)
cpus_read_unlock();
}
+static inline bool static_key_sealed(struct static_key *key)
+{
+ return (key->type & JUMP_TYPE_LINKED) && !(key->type & ~JUMP_TYPE_MASK);
+}
+
+static inline void static_key_seal(struct static_key *key)
+{
+ unsigned long type = key->type & JUMP_TYPE_TRUE;
+ key->type = JUMP_TYPE_LINKED | type;
+}
+
+void jump_label_init_ro(void)
+{
+ struct jump_entry *iter_start = __start___jump_table;
+ struct jump_entry *iter_stop = __stop___jump_table;
+ struct jump_entry *iter;
+
+ if (WARN_ON_ONCE(!static_key_initialized))
+ return;
+
+ cpus_read_lock();
+ jump_label_lock();
+
+ for (iter = iter_start; iter < iter_stop; iter++) {
+ struct static_key *iterk = jump_entry_key(iter);
+
+ if (!is_kernel_ro_after_init((unsigned long)iterk))
+ continue;
+
+ if (static_key_sealed(iterk))
+ continue;
+
+ static_key_seal(iterk);
+ }
+
+ jump_label_unlock();
+ cpus_read_unlock();
+}
+
#ifdef CONFIG_MODULES
-static enum jump_label_type jump_label_init_type(struct jump_entry *entry)
+enum jump_label_type jump_label_init_type(struct jump_entry *entry)
{
struct static_key *key = jump_entry_key(entry);
bool type = static_key_type(key);
@@ -552,13 +653,12 @@ static int __jump_label_mod_text_reserved(void *start, void *end)
struct module *mod;
int ret;
- preempt_disable();
- mod = __module_text_address((unsigned long)start);
- WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
- if (!try_module_get(mod))
- mod = NULL;
- preempt_enable();
-
+ scoped_guard(rcu) {
+ mod = __module_text_address((unsigned long)start);
+ WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
+ if (!try_module_get(mod))
+ mod = NULL;
+ }
if (!mod)
return 0;
@@ -596,31 +696,6 @@ static void __jump_label_mod_update(struct static_key *key)
}
}
-/***
- * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop()
- * @mod: module to patch
- *
- * Allow for run-time selection of the optimal nops. Before the module
- * loads patch these with arch_get_jump_label_nop(), which is specified by
- * the arch specific jump label code.
- */
-void jump_label_apply_nops(struct module *mod)
-{
- struct jump_entry *iter_start = mod->jump_entries;
- struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
- struct jump_entry *iter;
-
- /* if the module doesn't have jump label entries, just return */
- if (iter_start == iter_stop)
- return;
-
- for (iter = iter_start; iter < iter_stop; iter++) {
- /* Only write NOPs for arch_branch_static(). */
- if (jump_label_init_type(iter) == JUMP_LABEL_NOP)
- arch_jump_label_transform_static(iter, JUMP_LABEL_NOP);
- }
-}
-
static int jump_label_add_module(struct module *mod)
{
struct jump_entry *iter_start = mod->jump_entries;
@@ -651,6 +726,15 @@ static int jump_label_add_module(struct module *mod)
static_key_set_entries(key, iter);
continue;
}
+
+ /*
+ * If the key was sealed at init, then there's no need to keep a
+ * reference to its module entries - just patch them now and be
+ * done with it.
+ */
+ if (static_key_sealed(key))
+ goto do_poke;
+
jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL);
if (!jlm)
return -ENOMEM;
@@ -661,9 +745,9 @@ static int jump_label_add_module(struct module *mod)
kfree(jlm);
return -ENOMEM;
}
- preempt_disable();
- jlm2->mod = __module_address((unsigned long)key);
- preempt_enable();
+ scoped_guard(rcu)
+ jlm2->mod = __module_address((unsigned long)key);
+
jlm2->entries = static_key_entries(key);
jlm2->next = NULL;
static_key_set_mod(key, jlm2);
@@ -676,6 +760,7 @@ static int jump_label_add_module(struct module *mod)
static_key_set_linked(key);
/* Only update if we've changed from our initial state */
+do_poke:
if (jump_label_type(iter) != jump_label_init_type(iter))
__jump_label_update(key, iter, iter_stop, true);
}
@@ -700,6 +785,10 @@ static void jump_label_del_module(struct module *mod)
if (within_module((unsigned long)key, mod))
continue;
+ /* No @jlm allocated because key was sealed at init. */
+ if (static_key_sealed(key))
+ continue;
+
/* No memory during module load */
if (WARN_ON(!static_key_linked(key)))
continue;
@@ -816,13 +905,13 @@ static void jump_label_update(struct static_key *key)
return;
}
- preempt_disable();
- mod = __module_address((unsigned long)key);
- if (mod) {
- stop = mod->jump_entries + mod->num_jump_entries;
- init = mod->state == MODULE_STATE_COMING;
+ scoped_guard(rcu) {
+ mod = __module_address((unsigned long)key);
+ if (mod) {
+ stop = mod->jump_entries + mod->num_jump_entries;
+ init = mod->state == MODULE_STATE_COMING;
+ }
}
- preempt_enable();
#endif
entry = static_key_entries(key);
/* if there are no users, entry can be NULL */
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 951c93216fc4..049e296f586c 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -29,29 +29,10 @@
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/kernel.h>
+#include <linux/bsearch.h>
+#include <linux/btf_ids.h>
-/*
- * These will be re-linked against their real values
- * during the second link stage.
- */
-extern const unsigned long kallsyms_addresses[] __weak;
-extern const int kallsyms_offsets[] __weak;
-extern const u8 kallsyms_names[] __weak;
-
-/*
- * Tell the compiler that the count isn't in the small data section if the arch
- * has one (eg: FRV).
- */
-extern const unsigned int kallsyms_num_syms
-__section(".rodata") __attribute__((weak));
-
-extern const unsigned long kallsyms_relative_base
-__section(".rodata") __attribute__((weak));
-
-extern const char kallsyms_token_table[] __weak;
-extern const u16 kallsyms_token_index[] __weak;
-
-extern const unsigned int kallsyms_markers[] __weak;
+#include "kallsyms_internal.h"
/*
* Expand a compressed symbol data into the resulting uncompressed string,
@@ -69,12 +50,20 @@ static unsigned int kallsyms_expand_symbol(unsigned int off,
data = &kallsyms_names[off];
len = *data;
data++;
+ off++;
+
+ /* If MSB is 1, it is a "big" symbol, so needs an additional byte. */
+ if ((len & 0x80) != 0) {
+ len = (len & 0x7F) | (*data << 7);
+ data++;
+ off++;
+ }
/*
* Update the offset to return the offset for the next symbol on
* the compressed stream.
*/
- off += len + 1;
+ off += len;
/*
* For every byte on the compressed symbol data, copy the table
@@ -114,8 +103,11 @@ static char kallsyms_get_symbol_type(unsigned int off)
{
/*
* Get just the first code, look it up in the token table,
- * and return the first char from this token.
+ * and return the first char from this token. If MSB of length
+ * is 1, it is a "big" symbol, so needs an additional byte.
*/
+ if (kallsyms_names[off] & 0x80)
+ off++;
return kallsyms_token_table[kallsyms_token_index[kallsyms_names[off + 1]]];
}
@@ -127,7 +119,7 @@ static char kallsyms_get_symbol_type(unsigned int off)
static unsigned int get_symbol_offset(unsigned long pos)
{
const u8 *name;
- int i;
+ int i, len;
/*
* Use the closest marker we have. We have markers every 256 positions,
@@ -141,96 +133,116 @@ static unsigned int get_symbol_offset(unsigned long pos)
* so we just need to add the len to the current pointer for every
* symbol we wish to skip.
*/
- for (i = 0; i < (pos & 0xFF); i++)
- name = name + (*name) + 1;
+ for (i = 0; i < (pos & 0xFF); i++) {
+ len = *name;
+
+ /*
+ * If MSB is 1, it is a "big" symbol, so we need to look into
+ * the next byte (and skip it, too).
+ */
+ if ((len & 0x80) != 0)
+ len = ((len & 0x7F) | (name[1] << 7)) + 1;
+
+ name = name + len + 1;
+ }
return name - kallsyms_names;
}
-static unsigned long kallsyms_sym_address(int idx)
+unsigned long kallsyms_sym_address(int idx)
{
- if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE))
- return kallsyms_addresses[idx];
+ /* values are unsigned offsets */
+ return kallsyms_relative_base + (u32)kallsyms_offsets[idx];
+}
- /* values are unsigned offsets if --absolute-percpu is not in effect */
- if (!IS_ENABLED(CONFIG_KALLSYMS_ABSOLUTE_PERCPU))
- return kallsyms_relative_base + (u32)kallsyms_offsets[idx];
+static unsigned int get_symbol_seq(int index)
+{
+ unsigned int i, seq = 0;
- /* ...otherwise, positive offsets are absolute values */
- if (kallsyms_offsets[idx] >= 0)
- return kallsyms_offsets[idx];
+ for (i = 0; i < 3; i++)
+ seq = (seq << 8) | kallsyms_seqs_of_names[3 * index + i];
- /* ...and negative offsets are relative to kallsyms_relative_base - 1 */
- return kallsyms_relative_base - 1 - kallsyms_offsets[idx];
+ return seq;
}
-static bool cleanup_symbol_name(char *s)
+static int kallsyms_lookup_names(const char *name,
+ unsigned int *start,
+ unsigned int *end)
{
- char *res;
+ int ret;
+ int low, mid, high;
+ unsigned int seq, off;
+ char namebuf[KSYM_NAME_LEN];
- if (!IS_ENABLED(CONFIG_LTO_CLANG))
- return false;
+ low = 0;
+ high = kallsyms_num_syms - 1;
- /*
- * LLVM appends various suffixes for local functions and variables that
- * must be promoted to global scope as part of LTO. This can break
- * hooking of static functions with kprobes. '.' is not a valid
- * character in an identifier in C. Suffixes observed:
- * - foo.llvm.[0-9a-f]+
- * - foo.[0-9a-f]+
- * - foo.[0-9a-f]+.cfi_jt
- */
- res = strchr(s, '.');
- if (res) {
- *res = '\0';
- return true;
+ while (low <= high) {
+ mid = low + (high - low) / 2;
+ seq = get_symbol_seq(mid);
+ off = get_symbol_offset(seq);
+ kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
+ ret = strcmp(name, namebuf);
+ if (ret > 0)
+ low = mid + 1;
+ else if (ret < 0)
+ high = mid - 1;
+ else
+ break;
}
- if (!IS_ENABLED(CONFIG_CFI_CLANG) ||
- !IS_ENABLED(CONFIG_LTO_CLANG_THIN) ||
- CONFIG_CLANG_VERSION >= 130000)
- return false;
+ if (low > high)
+ return -ESRCH;
- /*
- * Prior to LLVM 13, the following suffixes were observed when thinLTO
- * and CFI are both enabled:
- * - foo$[0-9]+
- */
- res = strrchr(s, '$');
- if (res) {
- *res = '\0';
- return true;
+ low = mid;
+ while (low) {
+ seq = get_symbol_seq(low - 1);
+ off = get_symbol_offset(seq);
+ kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
+ if (strcmp(name, namebuf))
+ break;
+ low--;
+ }
+ *start = low;
+
+ if (end) {
+ high = mid;
+ while (high < kallsyms_num_syms - 1) {
+ seq = get_symbol_seq(high + 1);
+ off = get_symbol_offset(seq);
+ kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
+ if (strcmp(name, namebuf))
+ break;
+ high++;
+ }
+ *end = high;
}
- return false;
+ return 0;
}
/* Lookup the address for this symbol. Returns 0 if not found. */
unsigned long kallsyms_lookup_name(const char *name)
{
- char namebuf[KSYM_NAME_LEN];
- unsigned long i;
- unsigned int off;
+ int ret;
+ unsigned int i;
- for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
- off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
+ /* Skip the search for empty string. */
+ if (!*name)
+ return 0;
- if (strcmp(namebuf, name) == 0)
- return kallsyms_sym_address(i);
+ ret = kallsyms_lookup_names(name, &i, NULL);
+ if (!ret)
+ return kallsyms_sym_address(get_symbol_seq(i));
- if (cleanup_symbol_name(namebuf) && strcmp(namebuf, name) == 0)
- return kallsyms_sym_address(i);
- }
return module_kallsyms_lookup_name(name);
}
-#ifdef CONFIG_LIVEPATCH
/*
* Iterate over all symbols in vmlinux. For symbols from modules use
* module_kallsyms_on_each_symbol instead.
*/
-int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
- unsigned long),
+int kallsyms_on_each_symbol(int (*fn)(void *, const char *, unsigned long),
void *data)
{
char namebuf[KSYM_NAME_LEN];
@@ -240,14 +252,31 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
- ret = fn(data, namebuf, NULL, kallsyms_sym_address(i));
+ ret = fn(data, namebuf, kallsyms_sym_address(i));
if (ret != 0)
return ret;
cond_resched();
}
return 0;
}
-#endif /* CONFIG_LIVEPATCH */
+
+int kallsyms_on_each_match_symbol(int (*fn)(void *, unsigned long),
+ const char *name, void *data)
+{
+ int ret;
+ unsigned int i, start, end;
+
+ ret = kallsyms_lookup_names(name, &start, &end);
+ if (ret)
+ return 0;
+
+ for (i = start; !ret && i <= end; i++) {
+ ret = fn(data, kallsyms_sym_address(get_symbol_seq(i)));
+ cond_resched();
+ }
+
+ return ret;
+}
static unsigned long get_symbol_pos(unsigned long addr,
unsigned long *symbolsize,
@@ -256,13 +285,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
unsigned long symbol_start = 0, symbol_end = 0;
unsigned long i, low, high, mid;
- /* This kernel should never had been booted. */
- if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE))
- BUG_ON(!kallsyms_addresses);
- else
- BUG_ON(!kallsyms_offsets);
-
- /* Do a binary search on the sorted kallsyms_addresses array. */
+ /* Do a binary search on the sorted kallsyms_offsets array. */
low = 0;
high = kallsyms_num_syms;
@@ -325,12 +348,12 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
!!__bpf_address_lookup(addr, symbolsize, offset, namebuf);
}
-static const char *kallsyms_lookup_buildid(unsigned long addr,
+static int kallsyms_lookup_buildid(unsigned long addr,
unsigned long *symbolsize,
unsigned long *offset, char **modname,
const unsigned char **modbuildid, char *namebuf)
{
- const char *ret;
+ int ret;
namebuf[KSYM_NAME_LEN - 1] = 0;
namebuf[0] = 0;
@@ -347,8 +370,7 @@ static const char *kallsyms_lookup_buildid(unsigned long addr,
if (modbuildid)
*modbuildid = NULL;
- ret = namebuf;
- goto found;
+ return strlen(namebuf);
}
/* See if it's in a module or a BPF JITed image. */
@@ -362,8 +384,6 @@ static const char *kallsyms_lookup_buildid(unsigned long addr,
ret = ftrace_mod_address_lookup(addr, symbolsize,
offset, modname, namebuf);
-found:
- cleanup_symbol_name(namebuf);
return ret;
}
@@ -379,14 +399,17 @@ const char *kallsyms_lookup(unsigned long addr,
unsigned long *offset,
char **modname, char *namebuf)
{
- return kallsyms_lookup_buildid(addr, symbolsize, offset, modname,
- NULL, namebuf);
+ int ret = kallsyms_lookup_buildid(addr, symbolsize, offset, modname,
+ NULL, namebuf);
+
+ if (!ret)
+ return NULL;
+
+ return namebuf;
}
int lookup_symbol_name(unsigned long addr, char *symname)
{
- int res;
-
symname[0] = '\0';
symname[KSYM_NAME_LEN - 1] = '\0';
@@ -397,44 +420,10 @@ int lookup_symbol_name(unsigned long addr, char *symname)
/* Grab name */
kallsyms_expand_symbol(get_symbol_offset(pos),
symname, KSYM_NAME_LEN);
- goto found;
- }
- /* See if it's in a module. */
- res = lookup_module_symbol_name(addr, symname);
- if (res)
- return res;
-
-found:
- cleanup_symbol_name(symname);
- return 0;
-}
-
-int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
- unsigned long *offset, char *modname, char *name)
-{
- int res;
-
- name[0] = '\0';
- name[KSYM_NAME_LEN - 1] = '\0';
-
- if (is_ksym_addr(addr)) {
- unsigned long pos;
-
- pos = get_symbol_pos(addr, size, offset);
- /* Grab name */
- kallsyms_expand_symbol(get_symbol_offset(pos),
- name, KSYM_NAME_LEN);
- modname[0] = '\0';
- goto found;
+ return 0;
}
/* See if it's in a module. */
- res = lookup_module_symbol_attrs(addr, size, offset, modname, name);
- if (res)
- return res;
-
-found:
- cleanup_symbol_name(name);
- return 0;
+ return lookup_module_symbol_name(addr, symname);
}
/* Look up a kernel symbol and return it in a text buffer. */
@@ -443,19 +432,15 @@ static int __sprint_symbol(char *buffer, unsigned long address,
{
char *modname;
const unsigned char *buildid;
- const char *name;
unsigned long offset, size;
int len;
address += symbol_offset;
- name = kallsyms_lookup_buildid(address, &size, &offset, &modname, &buildid,
+ len = kallsyms_lookup_buildid(address, &size, &offset, &modname, &buildid,
buffer);
- if (!name)
+ if (!len)
return sprintf(buffer, "0x%lx", address - symbol_offset);
- if (name != buffer)
- strcpy(buffer, name);
- len = strlen(buffer);
offset -= symbol_offset;
if (add_offset)
@@ -571,7 +556,6 @@ int sprint_backtrace_build_id(char *buffer, unsigned long address)
/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */
struct kallsym_iter {
loff_t pos;
- loff_t pos_arch_end;
loff_t pos_mod_end;
loff_t pos_ftrace_mod_end;
loff_t pos_bpf_end;
@@ -584,29 +568,9 @@ struct kallsym_iter {
int show_value;
};
-int __weak arch_get_kallsym(unsigned int symnum, unsigned long *value,
- char *type, char *name)
-{
- return -EINVAL;
-}
-
-static int get_ksymbol_arch(struct kallsym_iter *iter)
-{
- int ret = arch_get_kallsym(iter->pos - kallsyms_num_syms,
- &iter->value, &iter->type,
- iter->name);
-
- if (ret < 0) {
- iter->pos_arch_end = iter->pos;
- return 0;
- }
-
- return 1;
-}
-
static int get_ksymbol_mod(struct kallsym_iter *iter)
{
- int ret = module_get_kallsym(iter->pos - iter->pos_arch_end,
+ int ret = module_get_kallsym(iter->pos - kallsyms_num_syms,
&iter->value, &iter->type,
iter->name, iter->module_name,
&iter->exported);
@@ -641,7 +605,7 @@ static int get_ksymbol_bpf(struct kallsym_iter *iter)
{
int ret;
- strlcpy(iter->module_name, "bpf", MODULE_NAME_LEN);
+ strscpy(iter->module_name, "bpf", MODULE_NAME_LEN);
iter->exported = 0;
ret = bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end,
&iter->value, &iter->type,
@@ -661,7 +625,7 @@ static int get_ksymbol_bpf(struct kallsym_iter *iter)
*/
static int get_ksymbol_kprobe(struct kallsym_iter *iter)
{
- strlcpy(iter->module_name, "__builtin__kprobes", MODULE_NAME_LEN);
+ strscpy(iter->module_name, "__builtin__kprobes", MODULE_NAME_LEN);
iter->exported = 0;
return kprobe_get_kallsym(iter->pos - iter->pos_bpf_end,
&iter->value, &iter->type,
@@ -689,7 +653,6 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos)
iter->nameoff = get_symbol_offset(new_pos);
iter->pos = new_pos;
if (new_pos == 0) {
- iter->pos_arch_end = 0;
iter->pos_mod_end = 0;
iter->pos_ftrace_mod_end = 0;
iter->pos_bpf_end = 0;
@@ -705,10 +668,6 @@ static int update_iter_mod(struct kallsym_iter *iter, loff_t pos)
{
iter->pos = pos;
- if ((!iter->pos_arch_end || iter->pos_arch_end > pos) &&
- get_ksymbol_arch(iter))
- return 1;
-
if ((!iter->pos_mod_end || iter->pos_mod_end > pos) &&
get_ksymbol_mod(iter))
return 1;
@@ -796,41 +755,95 @@ static const struct seq_operations kallsyms_op = {
.show = s_show
};
-static inline int kallsyms_for_perf(void)
+#ifdef CONFIG_BPF_SYSCALL
+
+struct bpf_iter__ksym {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct kallsym_iter *, ksym);
+};
+
+static int ksym_prog_seq_show(struct seq_file *m, bool in_stop)
{
-#ifdef CONFIG_PERF_EVENTS
- extern int sysctl_perf_event_paranoid;
- if (sysctl_perf_event_paranoid <= 1)
- return 1;
-#endif
+ struct bpf_iter__ksym ctx;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+
+ meta.seq = m;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (!prog)
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.ksym = m ? m->private : NULL;
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int bpf_iter_ksym_seq_show(struct seq_file *m, void *p)
+{
+ return ksym_prog_seq_show(m, false);
+}
+
+static void bpf_iter_ksym_seq_stop(struct seq_file *m, void *p)
+{
+ if (!p)
+ (void) ksym_prog_seq_show(m, true);
+ else
+ s_stop(m, p);
+}
+
+static const struct seq_operations bpf_iter_ksym_ops = {
+ .start = s_start,
+ .next = s_next,
+ .stop = bpf_iter_ksym_seq_stop,
+ .show = bpf_iter_ksym_seq_show,
+};
+
+static int bpf_iter_ksym_init(void *priv_data, struct bpf_iter_aux_info *aux)
+{
+ struct kallsym_iter *iter = priv_data;
+
+ reset_iter(iter, 0);
+
+ /* cache here as in kallsyms_open() case; use current process
+ * credentials to tell BPF iterators if values should be shown.
+ */
+ iter->show_value = kallsyms_show_value(current_cred());
+
return 0;
}
-/*
- * We show kallsyms information even to normal users if we've enabled
- * kernel profiling and are explicitly not paranoid (so kptr_restrict
- * is clear, and sysctl_perf_event_paranoid isn't set).
- *
- * Otherwise, require CAP_SYSLOG (assuming kptr_restrict isn't set to
- * block even that).
- */
-bool kallsyms_show_value(const struct cred *cred)
-{
- switch (kptr_restrict) {
- case 0:
- if (kallsyms_for_perf())
- return true;
- fallthrough;
- case 1:
- if (security_capable(cred, &init_user_ns, CAP_SYSLOG,
- CAP_OPT_NOAUDIT) == 0)
- return true;
- fallthrough;
- default:
- return false;
- }
+DEFINE_BPF_ITER_FUNC(ksym, struct bpf_iter_meta *meta, struct kallsym_iter *ksym)
+
+static const struct bpf_iter_seq_info ksym_iter_seq_info = {
+ .seq_ops = &bpf_iter_ksym_ops,
+ .init_seq_private = bpf_iter_ksym_init,
+ .fini_seq_private = NULL,
+ .seq_priv_size = sizeof(struct kallsym_iter),
+};
+
+static struct bpf_iter_reg ksym_iter_reg_info = {
+ .target = "ksym",
+ .feature = BPF_ITER_RESCHED,
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__ksym, ksym),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+ .seq_info = &ksym_iter_seq_info,
+};
+
+BTF_ID_LIST_SINGLE(btf_ksym_iter_id, struct, kallsym_iter)
+
+static int __init bpf_ksym_iter_register(void)
+{
+ ksym_iter_reg_info.ctx_arg_info[0].btf_id = *btf_ksym_iter_id;
+ return bpf_iter_reg_target(&ksym_iter_reg_info);
}
+late_initcall(bpf_ksym_iter_register);
+
+#endif /* CONFIG_BPF_SYSCALL */
+
static int kallsyms_open(struct inode *inode, struct file *file)
{
/*
diff --git a/kernel/kallsyms_internal.h b/kernel/kallsyms_internal.h
new file mode 100644
index 000000000000..9633782f8250
--- /dev/null
+++ b/kernel/kallsyms_internal.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef LINUX_KALLSYMS_INTERNAL_H_
+#define LINUX_KALLSYMS_INTERNAL_H_
+
+#include <linux/types.h>
+
+extern const int kallsyms_offsets[];
+extern const u8 kallsyms_names[];
+
+extern const unsigned int kallsyms_num_syms;
+extern const unsigned long kallsyms_relative_base;
+
+extern const char kallsyms_token_table[];
+extern const u16 kallsyms_token_index[];
+
+extern const unsigned int kallsyms_markers[];
+extern const u8 kallsyms_seqs_of_names[];
+
+#endif // LINUX_KALLSYMS_INTERNAL_H_
diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c
new file mode 100644
index 000000000000..2b082a7e24a2
--- /dev/null
+++ b/kernel/kallsyms_selftest.c
@@ -0,0 +1,446 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Test the function and performance of kallsyms
+ *
+ * Copyright (C) Huawei Technologies Co., Ltd., 2022
+ *
+ * Authors: Zhen Lei <thunder.leizhen@huawei.com> Huawei
+ */
+
+#define pr_fmt(fmt) "kallsyms_selftest: " fmt
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/random.h>
+#include <linux/sched/clock.h>
+#include <linux/kthread.h>
+#include <linux/vmalloc.h>
+
+#include "kallsyms_internal.h"
+#include "kallsyms_selftest.h"
+
+
+#define MAX_NUM_OF_RECORDS 64
+
+struct test_stat {
+ int min;
+ int max;
+ int save_cnt;
+ int real_cnt;
+ int perf;
+ u64 sum;
+ char *name;
+ unsigned long addr;
+ unsigned long addrs[MAX_NUM_OF_RECORDS];
+};
+
+struct test_item {
+ char *name;
+ unsigned long addr;
+};
+
+#define ITEM_FUNC(s) \
+ { \
+ .name = #s, \
+ .addr = (unsigned long)s, \
+ }
+
+#define ITEM_DATA(s) \
+ { \
+ .name = #s, \
+ .addr = (unsigned long)&s, \
+ }
+
+
+static int kallsyms_test_var_bss_static;
+static int kallsyms_test_var_data_static = 1;
+int kallsyms_test_var_bss;
+int kallsyms_test_var_data = 1;
+
+static int kallsyms_test_func_static(void)
+{
+ kallsyms_test_var_bss_static++;
+ kallsyms_test_var_data_static++;
+
+ return 0;
+}
+
+int kallsyms_test_func(void)
+{
+ return kallsyms_test_func_static();
+}
+
+__weak int kallsyms_test_func_weak(void)
+{
+ kallsyms_test_var_bss++;
+ kallsyms_test_var_data++;
+ return 0;
+}
+
+static struct test_item test_items[] = {
+ ITEM_FUNC(kallsyms_test_func_static),
+ ITEM_FUNC(kallsyms_test_func),
+ ITEM_FUNC(kallsyms_test_func_weak),
+ ITEM_FUNC(vmalloc_noprof),
+ ITEM_FUNC(vfree),
+#ifdef CONFIG_KALLSYMS_ALL
+ ITEM_DATA(kallsyms_test_var_bss_static),
+ ITEM_DATA(kallsyms_test_var_data_static),
+ ITEM_DATA(kallsyms_test_var_bss),
+ ITEM_DATA(kallsyms_test_var_data),
+#endif
+};
+
+static char stub_name[KSYM_NAME_LEN];
+
+static int stat_symbol_len(void *data, const char *name, unsigned long addr)
+{
+ *(u32 *)data += strlen(name);
+
+ return 0;
+}
+
+static void test_kallsyms_compression_ratio(void)
+{
+ u32 pos, off, len, num;
+ u32 ratio, total_size, total_len = 0;
+
+ kallsyms_on_each_symbol(stat_symbol_len, &total_len);
+
+ /*
+ * A symbol name cannot start with a number. This stub name helps us
+ * traverse the entire symbol table without finding a match. It's used
+ * for subsequent performance tests, and its length is the average
+ * length of all symbol names.
+ */
+ memset(stub_name, '4', sizeof(stub_name));
+ pos = total_len / kallsyms_num_syms;
+ stub_name[pos] = 0;
+
+ pos = 0;
+ num = 0;
+ off = 0;
+ while (pos < kallsyms_num_syms) {
+ len = kallsyms_names[off];
+ num++;
+ off++;
+ pos++;
+ if ((len & 0x80) != 0) {
+ len = (len & 0x7f) | (kallsyms_names[off] << 7);
+ num++;
+ off++;
+ }
+ off += len;
+ }
+
+ /*
+ * 1. The length fields is not counted
+ * 2. The memory occupied by array kallsyms_token_table[] and
+ * kallsyms_token_index[] needs to be counted.
+ */
+ total_size = off - num;
+ pos = kallsyms_token_index[0xff];
+ total_size += pos + strlen(&kallsyms_token_table[pos]) + 1;
+ total_size += 0x100 * sizeof(u16);
+
+ pr_info(" ---------------------------------------------------------\n");
+ pr_info("| nr_symbols | compressed size | original size | ratio(%%) |\n");
+ pr_info("|---------------------------------------------------------|\n");
+ ratio = (u32)div_u64(10000ULL * total_size, total_len);
+ pr_info("| %10d | %10d | %10d | %2d.%-2d |\n",
+ kallsyms_num_syms, total_size, total_len, ratio / 100, ratio % 100);
+ pr_info(" ---------------------------------------------------------\n");
+}
+
+static int lookup_name(void *data, const char *name, unsigned long addr)
+{
+ u64 t0, t1, t;
+ struct test_stat *stat = (struct test_stat *)data;
+
+ t0 = ktime_get_ns();
+ (void)kallsyms_lookup_name(name);
+ t1 = ktime_get_ns();
+
+ t = t1 - t0;
+ if (t < stat->min)
+ stat->min = t;
+
+ if (t > stat->max)
+ stat->max = t;
+
+ stat->real_cnt++;
+ stat->sum += t;
+
+ return 0;
+}
+
+static void test_perf_kallsyms_lookup_name(void)
+{
+ struct test_stat stat;
+
+ memset(&stat, 0, sizeof(stat));
+ stat.min = INT_MAX;
+ kallsyms_on_each_symbol(lookup_name, &stat);
+ pr_info("kallsyms_lookup_name() looked up %d symbols\n", stat.real_cnt);
+ pr_info("The time spent on each symbol is (ns): min=%d, max=%d, avg=%lld\n",
+ stat.min, stat.max, div_u64(stat.sum, stat.real_cnt));
+}
+
+static int find_symbol(void *data, const char *name, unsigned long addr)
+{
+ struct test_stat *stat = (struct test_stat *)data;
+
+ if (!strcmp(name, stat->name)) {
+ stat->real_cnt++;
+ stat->addr = addr;
+
+ if (stat->save_cnt < MAX_NUM_OF_RECORDS) {
+ stat->addrs[stat->save_cnt] = addr;
+ stat->save_cnt++;
+ }
+
+ if (stat->real_cnt == stat->max)
+ return 1;
+ }
+
+ return 0;
+}
+
+static void test_perf_kallsyms_on_each_symbol(void)
+{
+ u64 t0, t1;
+ struct test_stat stat;
+
+ memset(&stat, 0, sizeof(stat));
+ stat.max = INT_MAX;
+ stat.name = stub_name;
+ stat.perf = 1;
+ t0 = ktime_get_ns();
+ kallsyms_on_each_symbol(find_symbol, &stat);
+ t1 = ktime_get_ns();
+ pr_info("kallsyms_on_each_symbol() traverse all: %lld ns\n", t1 - t0);
+}
+
+static int match_symbol(void *data, unsigned long addr)
+{
+ struct test_stat *stat = (struct test_stat *)data;
+
+ stat->real_cnt++;
+ stat->addr = addr;
+
+ if (stat->save_cnt < MAX_NUM_OF_RECORDS) {
+ stat->addrs[stat->save_cnt] = addr;
+ stat->save_cnt++;
+ }
+
+ if (stat->real_cnt == stat->max)
+ return 1;
+
+ return 0;
+}
+
+static void test_perf_kallsyms_on_each_match_symbol(void)
+{
+ u64 t0, t1;
+ struct test_stat stat;
+
+ memset(&stat, 0, sizeof(stat));
+ stat.max = INT_MAX;
+ stat.name = stub_name;
+ t0 = ktime_get_ns();
+ kallsyms_on_each_match_symbol(match_symbol, stat.name, &stat);
+ t1 = ktime_get_ns();
+ pr_info("kallsyms_on_each_match_symbol() traverse all: %lld ns\n", t1 - t0);
+}
+
+static int test_kallsyms_basic_function(void)
+{
+ int i, j, ret;
+ int next = 0, nr_failed = 0;
+ char *prefix;
+ unsigned short rand;
+ unsigned long addr, lookup_addr;
+ char namebuf[KSYM_NAME_LEN];
+ struct test_stat *stat, *stat2;
+
+ stat = kmalloc_array(2, sizeof(*stat), GFP_KERNEL);
+ if (!stat)
+ return -ENOMEM;
+ stat2 = stat + 1;
+
+ prefix = "kallsyms_lookup_name() for";
+ for (i = 0; i < ARRAY_SIZE(test_items); i++) {
+ addr = kallsyms_lookup_name(test_items[i].name);
+ if (addr != test_items[i].addr) {
+ nr_failed++;
+ pr_info("%s %s failed: addr=%lx, expect %lx\n",
+ prefix, test_items[i].name, addr, test_items[i].addr);
+ }
+ }
+
+ prefix = "kallsyms_on_each_symbol() for";
+ for (i = 0; i < ARRAY_SIZE(test_items); i++) {
+ memset(stat, 0, sizeof(*stat));
+ stat->max = INT_MAX;
+ stat->name = test_items[i].name;
+ kallsyms_on_each_symbol(find_symbol, stat);
+ if (stat->addr != test_items[i].addr || stat->real_cnt != 1) {
+ nr_failed++;
+ pr_info("%s %s failed: count=%d, addr=%lx, expect %lx\n",
+ prefix, test_items[i].name,
+ stat->real_cnt, stat->addr, test_items[i].addr);
+ }
+ }
+
+ prefix = "kallsyms_on_each_match_symbol() for";
+ for (i = 0; i < ARRAY_SIZE(test_items); i++) {
+ memset(stat, 0, sizeof(*stat));
+ stat->max = INT_MAX;
+ stat->name = test_items[i].name;
+ kallsyms_on_each_match_symbol(match_symbol, test_items[i].name, stat);
+ if (stat->addr != test_items[i].addr || stat->real_cnt != 1) {
+ nr_failed++;
+ pr_info("%s %s failed: count=%d, addr=%lx, expect %lx\n",
+ prefix, test_items[i].name,
+ stat->real_cnt, stat->addr, test_items[i].addr);
+ }
+ }
+
+ if (nr_failed) {
+ kfree(stat);
+ return -ESRCH;
+ }
+
+ for (i = 0; i < kallsyms_num_syms; i++) {
+ addr = kallsyms_sym_address(i);
+ if (!is_ksym_addr(addr))
+ continue;
+
+ ret = lookup_symbol_name(addr, namebuf);
+ if (unlikely(ret)) {
+ namebuf[0] = 0;
+ pr_info("%d: lookup_symbol_name(%lx) failed\n", i, addr);
+ goto failed;
+ }
+
+ lookup_addr = kallsyms_lookup_name(namebuf);
+
+ memset(stat, 0, sizeof(*stat));
+ stat->max = INT_MAX;
+ kallsyms_on_each_match_symbol(match_symbol, namebuf, stat);
+
+ /*
+ * kallsyms_on_each_symbol() is too slow, randomly select some
+ * symbols for test.
+ */
+ if (i >= next) {
+ memset(stat2, 0, sizeof(*stat2));
+ stat2->max = INT_MAX;
+ stat2->name = namebuf;
+ kallsyms_on_each_symbol(find_symbol, stat2);
+
+ /*
+ * kallsyms_on_each_symbol() and kallsyms_on_each_match_symbol()
+ * need to get the same traversal result.
+ */
+ if (stat->addr != stat2->addr ||
+ stat->real_cnt != stat2->real_cnt ||
+ memcmp(stat->addrs, stat2->addrs,
+ stat->save_cnt * sizeof(stat->addrs[0]))) {
+ pr_info("%s: mismatch between kallsyms_on_each_symbol() and kallsyms_on_each_match_symbol()\n",
+ namebuf);
+ goto failed;
+ }
+
+ /*
+ * The average of random increments is 128, that is, one of
+ * them is tested every 128 symbols.
+ */
+ get_random_bytes(&rand, sizeof(rand));
+ next = i + (rand & 0xff) + 1;
+ }
+
+ /* Need to be found at least once */
+ if (!stat->real_cnt) {
+ pr_info("%s: Never found\n", namebuf);
+ goto failed;
+ }
+
+ /*
+ * kallsyms_lookup_name() returns the address of the first
+ * symbol found and cannot be NULL.
+ */
+ if (!lookup_addr) {
+ pr_info("%s: NULL lookup_addr?!\n", namebuf);
+ goto failed;
+ }
+ if (lookup_addr != stat->addrs[0]) {
+ pr_info("%s: lookup_addr != stat->addrs[0]\n", namebuf);
+ goto failed;
+ }
+
+ /*
+ * If the addresses of all matching symbols are recorded, the
+ * target address needs to be exist.
+ */
+ if (stat->real_cnt <= MAX_NUM_OF_RECORDS) {
+ for (j = 0; j < stat->save_cnt; j++) {
+ if (stat->addrs[j] == addr)
+ break;
+ }
+
+ if (j == stat->save_cnt) {
+ pr_info("%s: j == save_cnt?!\n", namebuf);
+ goto failed;
+ }
+ }
+ }
+
+ kfree(stat);
+
+ return 0;
+
+failed:
+ pr_info("Test for %dth symbol failed: (%s) addr=%lx", i, namebuf, addr);
+ kfree(stat);
+ return -ESRCH;
+}
+
+static int test_entry(void *p)
+{
+ int ret;
+
+ do {
+ schedule_timeout(5 * HZ);
+ } while (system_state != SYSTEM_RUNNING);
+
+ pr_info("start\n");
+ ret = test_kallsyms_basic_function();
+ if (ret) {
+ pr_info("abort\n");
+ return 0;
+ }
+
+ test_kallsyms_compression_ratio();
+ test_perf_kallsyms_lookup_name();
+ test_perf_kallsyms_on_each_symbol();
+ test_perf_kallsyms_on_each_match_symbol();
+ pr_info("finish\n");
+
+ return 0;
+}
+
+static int __init kallsyms_test_init(void)
+{
+ struct task_struct *t;
+
+ t = kthread_run_on_cpu(test_entry, NULL, 0, "kallsyms_test");
+ if (IS_ERR(t)) {
+ pr_info("Create kallsyms selftest task failed\n");
+ return PTR_ERR(t);
+ }
+
+ return 0;
+}
+late_initcall(kallsyms_test_init);
diff --git a/kernel/kallsyms_selftest.h b/kernel/kallsyms_selftest.h
new file mode 100644
index 000000000000..c0ca548e2a22
--- /dev/null
+++ b/kernel/kallsyms_selftest.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef LINUX_KALLSYMS_SELFTEST_H_
+#define LINUX_KALLSYMS_SELFTEST_H_
+
+#include <linux/types.h>
+
+extern int kallsyms_test_var_bss;
+extern int kallsyms_test_var_data;
+
+extern int kallsyms_test_func(void);
+extern int kallsyms_test_func_weak(void);
+
+#endif // LINUX_KALLSYMS_SELFTEST_H_
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index 5353edfad8e1..7c1a65bd5f8d 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -63,9 +63,9 @@ get_file_raw_ptr(struct task_struct *task, unsigned int idx)
{
struct file *file;
- rcu_read_lock();
- file = task_lookup_fd_rcu(task, idx);
- rcu_read_unlock();
+ file = fget_task(task, idx);
+ if (file)
+ fput(file);
return file;
}
@@ -145,7 +145,7 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
*/
task1 = find_task_by_vpid(pid1);
task2 = find_task_by_vpid(pid2);
- if (!task1 || !task2)
+ if (unlikely(!task1 || !task2))
goto err_no_task;
get_task_struct(task1);
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 36ca640c4f8e..6563141f5de9 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -11,6 +11,8 @@
#include <linux/fs.h>
#include <linux/hashtable.h>
#include <linux/init.h>
+#include <linux/jiffies.h>
+#include <linux/kmsan-checks.h>
#include <linux/mm.h>
#include <linux/preempt.h>
#include <linux/printk.h>
@@ -152,6 +154,21 @@ static void kcov_remote_area_put(struct kcov_remote_area *area,
INIT_LIST_HEAD(&area->list);
area->size = size;
list_add(&area->list, &kcov_remote_areas);
+ /*
+ * KMSAN doesn't instrument this file, so it may not know area->list
+ * is initialized. Unpoison it explicitly to avoid reports in
+ * kcov_remote_area_get().
+ */
+ kmsan_unpoison_memory(&area->list, sizeof(area->list));
+}
+
+/*
+ * Unlike in_serving_softirq(), this function returns false when called during
+ * a hardirq or an NMI that happened in the softirq context.
+ */
+static __always_inline bool in_softirq_really(void)
+{
+ return in_serving_softirq() && !in_hardirq() && !in_nmi();
}
static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t)
@@ -163,7 +180,7 @@ static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_stru
* so we ignore code executed in interrupts, unless we are in a remote
* coverage collection section in a softirq.
*/
- if (!in_task() && !(in_serving_softirq() && t->kcov_softirq))
+ if (!in_task() && !(in_softirq_really() && t->kcov_softirq))
return false;
mode = READ_ONCE(t->kcov_mode);
/*
@@ -204,8 +221,16 @@ void notrace __sanitizer_cov_trace_pc(void)
/* The first 64-bit word is the number of subsequent PCs. */
pos = READ_ONCE(area[0]) + 1;
if (likely(pos < t->kcov_size)) {
- area[pos] = ip;
+ /* Previously we write pc before updating pos. However, some
+ * early interrupt code could bypass check_kcov_mode() check
+ * and invoke __sanitizer_cov_trace_pc(). If such interrupt is
+ * raised between writing pc and updating pos, the pc could be
+ * overitten by the recursive __sanitizer_cov_trace_pc().
+ * Update pos before writing pc to avoid such interleaving.
+ */
WRITE_ONCE(area[0], pos);
+ barrier();
+ area[pos] = ip;
}
}
EXPORT_SYMBOL(__sanitizer_cov_trace_pc);
@@ -236,11 +261,13 @@ static void notrace write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip)
start_index = 1 + count * KCOV_WORDS_PER_CMP;
end_pos = (start_index + KCOV_WORDS_PER_CMP) * sizeof(u64);
if (likely(end_pos <= max_pos)) {
+ /* See comment in __sanitizer_cov_trace_pc(). */
+ WRITE_ONCE(area[0], count + 1);
+ barrier();
area[start_index] = type;
area[start_index + 1] = arg1;
area[start_index + 2] = arg2;
area[start_index + 3] = ip;
- WRITE_ONCE(area[0], count + 1);
}
}
@@ -262,7 +289,7 @@ void notrace __sanitizer_cov_trace_cmp4(u32 arg1, u32 arg2)
}
EXPORT_SYMBOL(__sanitizer_cov_trace_cmp4);
-void notrace __sanitizer_cov_trace_cmp8(u64 arg1, u64 arg2)
+void notrace __sanitizer_cov_trace_cmp8(kcov_u64 arg1, kcov_u64 arg2)
{
write_comp_data(KCOV_CMP_SIZE(3), arg1, arg2, _RET_IP_);
}
@@ -289,16 +316,17 @@ void notrace __sanitizer_cov_trace_const_cmp4(u32 arg1, u32 arg2)
}
EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp4);
-void notrace __sanitizer_cov_trace_const_cmp8(u64 arg1, u64 arg2)
+void notrace __sanitizer_cov_trace_const_cmp8(kcov_u64 arg1, kcov_u64 arg2)
{
write_comp_data(KCOV_CMP_SIZE(3) | KCOV_CMP_CONST, arg1, arg2,
_RET_IP_);
}
EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp8);
-void notrace __sanitizer_cov_trace_switch(u64 val, u64 *cases)
+void notrace __sanitizer_cov_trace_switch(kcov_u64 val, void *arg)
{
u64 i;
+ u64 *cases = arg;
u64 count = cases[0];
u64 size = cases[1];
u64 type = KCOV_CMP_CONST;
@@ -459,37 +487,31 @@ void kcov_task_exit(struct task_struct *t)
static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
{
int res = 0;
- void *area;
struct kcov *kcov = vma->vm_file->private_data;
unsigned long size, off;
struct page *page;
unsigned long flags;
- area = vmalloc_user(vma->vm_end - vma->vm_start);
- if (!area)
- return -ENOMEM;
-
spin_lock_irqsave(&kcov->lock, flags);
size = kcov->size * sizeof(unsigned long);
- if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 ||
+ if (kcov->area == NULL || vma->vm_pgoff != 0 ||
vma->vm_end - vma->vm_start != size) {
res = -EINVAL;
goto exit;
}
- if (!kcov->area) {
- kcov->area = area;
- vma->vm_flags |= VM_DONTEXPAND;
- spin_unlock_irqrestore(&kcov->lock, flags);
- for (off = 0; off < size; off += PAGE_SIZE) {
- page = vmalloc_to_page(kcov->area + off);
- if (vm_insert_page(vma, vma->vm_start + off, page))
- WARN_ONCE(1, "vm_insert_page() failed");
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ vm_flags_set(vma, VM_DONTEXPAND);
+ for (off = 0; off < size; off += PAGE_SIZE) {
+ page = vmalloc_to_page(kcov->area + off);
+ res = vm_insert_page(vma, vma->vm_start + off, page);
+ if (res) {
+ pr_warn_once("kcov: vm_insert_page() failed\n");
+ return res;
}
- return 0;
}
+ return 0;
exit:
spin_unlock_irqrestore(&kcov->lock, flags);
- vfree(area);
return res;
}
@@ -530,7 +552,7 @@ static int kcov_get_mode(unsigned long arg)
/*
* Fault in a lazily-faulted vmalloc area before it can be used by
- * __santizer_cov_trace_pc(), to avoid recursion issues if any code on the
+ * __sanitizer_cov_trace_pc(), to avoid recursion issues if any code on the
* vmalloc fault handling path is instrumented.
*/
static void kcov_fault_in_area(struct kcov *kcov)
@@ -564,31 +586,12 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
unsigned long arg)
{
struct task_struct *t;
- unsigned long size, unused;
+ unsigned long flags, unused;
int mode, i;
struct kcov_remote_arg *remote_arg;
struct kcov_remote *remote;
- unsigned long flags;
switch (cmd) {
- case KCOV_INIT_TRACE:
- /*
- * Enable kcov in trace mode and setup buffer size.
- * Must happen before anything else.
- */
- if (kcov->mode != KCOV_MODE_DISABLED)
- return -EBUSY;
- /*
- * Size must be at least 2 to hold current position and one PC.
- * Later we allocate size * sizeof(unsigned long) memory,
- * that must not overflow.
- */
- size = arg;
- if (size < 2 || size > INT_MAX / sizeof(unsigned long))
- return -EINVAL;
- kcov->size = size;
- kcov->mode = KCOV_MODE_INIT;
- return 0;
case KCOV_ENABLE:
/*
* Enable coverage for the current task.
@@ -634,10 +637,12 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
mode = kcov_get_mode(remote_arg->trace_mode);
if (mode < 0)
return mode;
- if (remote_arg->area_size > LONG_MAX / sizeof(unsigned long))
+ if ((unsigned long)remote_arg->area_size >
+ LONG_MAX / sizeof(unsigned long))
return -EINVAL;
kcov->mode = mode;
t->kcov = kcov;
+ t->kcov_mode = KCOV_MODE_REMOTE;
kcov->t = t;
kcov->remote = true;
kcov->remote_size = remote_arg->area_size;
@@ -692,9 +697,37 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
struct kcov_remote_arg *remote_arg = NULL;
unsigned int remote_num_handles;
unsigned long remote_arg_size;
- unsigned long flags;
+ unsigned long size, flags;
+ void *area;
- if (cmd == KCOV_REMOTE_ENABLE) {
+ kcov = filep->private_data;
+ switch (cmd) {
+ case KCOV_INIT_TRACE:
+ /*
+ * Enable kcov in trace mode and setup buffer size.
+ * Must happen before anything else.
+ *
+ * First check the size argument - it must be at least 2
+ * to hold the current position and one PC.
+ */
+ size = arg;
+ if (size < 2 || size > INT_MAX / sizeof(unsigned long))
+ return -EINVAL;
+ area = vmalloc_user(size * sizeof(unsigned long));
+ if (area == NULL)
+ return -ENOMEM;
+ spin_lock_irqsave(&kcov->lock, flags);
+ if (kcov->mode != KCOV_MODE_DISABLED) {
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ vfree(area);
+ return -EBUSY;
+ }
+ kcov->area = area;
+ kcov->size = size;
+ kcov->mode = KCOV_MODE_INIT;
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ return 0;
+ case KCOV_REMOTE_ENABLE:
if (get_user(remote_num_handles, (unsigned __user *)(arg +
offsetof(struct kcov_remote_arg, num_handles))))
return -EFAULT;
@@ -710,16 +743,18 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
return -EINVAL;
}
arg = (unsigned long)remote_arg;
+ fallthrough;
+ default:
+ /*
+ * All other commands can be normally executed under a spin lock, so we
+ * obtain and release it here in order to simplify kcov_ioctl_locked().
+ */
+ spin_lock_irqsave(&kcov->lock, flags);
+ res = kcov_ioctl_locked(kcov, cmd, arg);
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ kfree(remote_arg);
+ return res;
}
-
- kcov = filep->private_data;
- spin_lock_irqsave(&kcov->lock, flags);
- res = kcov_ioctl_locked(kcov, cmd, arg);
- spin_unlock_irqrestore(&kcov->lock, flags);
-
- kfree(remote_arg);
-
- return res;
}
static const struct file_operations kcov_fops = {
@@ -824,7 +859,7 @@ void kcov_remote_start(u64 handle)
if (WARN_ON(!kcov_check_handle(handle, true, true, true)))
return;
- if (!in_task() && !in_serving_softirq())
+ if (!in_task() && !in_softirq_really())
return;
local_lock_irqsave(&kcov_percpu_data.lock, flags);
@@ -943,6 +978,15 @@ static void kcov_move_area(enum kcov_mode mode, void *dst_area,
memcpy(dst_entries, src_entries, bytes_to_move);
entries_moved = bytes_to_move >> entry_size_log;
+ /*
+ * A write memory barrier is required here, to ensure
+ * that the writes from the memcpy() are visible before
+ * the count is updated. Without this, it is possible for
+ * a user to observe a new count value but stale
+ * coverage data.
+ */
+ smp_wmb();
+
switch (mode) {
case KCOV_MODE_TRACE_PC:
WRITE_ONCE(*(unsigned long *)dst_area, dst_len + entries_moved);
@@ -966,7 +1010,7 @@ void kcov_remote_stop(void)
int sequence;
unsigned long flags;
- if (!in_task() && !in_serving_softirq())
+ if (!in_task() && !in_softirq_really())
return;
local_lock_irqsave(&kcov_percpu_data.lock, flags);
@@ -1033,6 +1077,32 @@ u64 kcov_common_handle(void)
}
EXPORT_SYMBOL(kcov_common_handle);
+#ifdef CONFIG_KCOV_SELFTEST
+static void __init selftest(void)
+{
+ unsigned long start;
+
+ pr_err("running self test\n");
+ /*
+ * Test that interrupts don't produce spurious coverage.
+ * The coverage callback filters out interrupt code, but only
+ * after the handler updates preempt count. Some code periodically
+ * leaks out of that section and leads to spurious coverage.
+ * It's hard to call the actual interrupt handler directly,
+ * so we just loop here for a bit waiting for a timer interrupt.
+ * We set kcov_mode to enable tracing, but don't setup the area,
+ * so any attempt to trace will crash. Note: we must not call any
+ * potentially traced functions in this region.
+ */
+ start = jiffies;
+ current->kcov_mode = KCOV_MODE_TRACE_PC;
+ while ((jiffies - start) * MSEC_PER_SEC / HZ < 300)
+ ;
+ current->kcov_mode = 0;
+ pr_err("done running self test\n");
+}
+#endif
+
static int __init kcov_init(void)
{
int cpu;
@@ -1052,6 +1122,10 @@ static int __init kcov_init(void)
*/
debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops);
+#ifdef CONFIG_KCOV_SELFTEST
+ selftest();
+#endif
+
return 0;
}
diff --git a/kernel/kcsan/.kunitconfig b/kernel/kcsan/.kunitconfig
new file mode 100644
index 000000000000..e82f0f52ab0a
--- /dev/null
+++ b/kernel/kcsan/.kunitconfig
@@ -0,0 +1,24 @@
+# Note that the KCSAN tests need to run on an SMP setup.
+# Under kunit_tool, this can be done by using the --qemu_args
+# option to configure a machine with several cores. For example:
+# ./tools/testing/kunit/kunit.py run --kunitconfig=kernel/kcsan \
+# --arch=x86_64 --qemu_args="-smp 8"
+
+CONFIG_KUNIT=y
+
+CONFIG_DEBUG_KERNEL=y
+
+# Need some level of concurrency to test a concurrency sanitizer.
+CONFIG_SMP=y
+
+CONFIG_KCSAN=y
+CONFIG_KCSAN_KUNIT_TEST=y
+
+# Set these if you want to run test_barrier_nothreads
+#CONFIG_KCSAN_STRICT=y
+#CONFIG_KCSAN_WEAK_MEMORY=y
+
+# This prevents the test from timing out on many setups. Feel free to remove
+# (or alter) this, in conjunction with setting a different test timeout with,
+# for example, the --timeout kunit_tool option.
+CONFIG_KCSAN_REPORT_ONCE_IN_MS=100
diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile
index 4f35d1bced6a..a45f3dfc8d14 100644
--- a/kernel/kcsan/Makefile
+++ b/kernel/kcsan/Makefile
@@ -16,5 +16,6 @@ obj-y := core.o debugfs.o report.o
KCSAN_INSTRUMENT_BARRIERS_selftest.o := y
obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o
-CFLAGS_kcsan_test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer
+CFLAGS_kcsan_test.o := $(CFLAGS_KCSAN) -fno-omit-frame-pointer
+CFLAGS_kcsan_test.o += $(DISABLE_STRUCTLEAK_PLUGIN)
obj-$(CONFIG_KCSAN_KUNIT_TEST) += kcsan_test.o
diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
index fe12dfe254ec..8a7baf4e332e 100644
--- a/kernel/kcsan/core.c
+++ b/kernel/kcsan/core.c
@@ -14,10 +14,12 @@
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/list.h>
+#include <linux/minmax.h>
#include <linux/moduleparam.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/sched.h>
+#include <linux/string.h>
#include <linux/uaccess.h>
#include "encoding.h"
@@ -335,11 +337,20 @@ static void delay_access(int type)
*/
static __always_inline u64 read_instrumented_memory(const volatile void *ptr, size_t size)
{
+ /*
+ * In the below we don't necessarily need the read of the location to
+ * be atomic, and we don't use READ_ONCE(), since all we need for race
+ * detection is to observe 2 different values.
+ *
+ * Furthermore, on certain architectures (such as arm64), READ_ONCE()
+ * may turn into more complex instructions than a plain load that cannot
+ * do unaligned accesses.
+ */
switch (size) {
- case 1: return READ_ONCE(*(const u8 *)ptr);
- case 2: return READ_ONCE(*(const u16 *)ptr);
- case 4: return READ_ONCE(*(const u32 *)ptr);
- case 8: return READ_ONCE(*(const u64 *)ptr);
+ case 1: return *(const volatile u8 *)ptr;
+ case 2: return *(const volatile u16 *)ptr;
+ case 4: return *(const volatile u32 *)ptr;
+ case 8: return *(const volatile u64 *)ptr;
default: return 0; /* Ignore; we do not diff the values. */
}
}
@@ -1259,7 +1270,9 @@ static __always_inline void kcsan_atomic_builtin_memorder(int memorder)
DEFINE_TSAN_ATOMIC_OPS(8);
DEFINE_TSAN_ATOMIC_OPS(16);
DEFINE_TSAN_ATOMIC_OPS(32);
+#ifdef CONFIG_64BIT
DEFINE_TSAN_ATOMIC_OPS(64);
+#endif
void __tsan_atomic_thread_fence(int memorder);
void __tsan_atomic_thread_fence(int memorder)
@@ -1308,3 +1321,51 @@ noinline void __tsan_atomic_signal_fence(int memorder)
}
}
EXPORT_SYMBOL(__tsan_atomic_signal_fence);
+
+#ifdef __HAVE_ARCH_MEMSET
+void *__tsan_memset(void *s, int c, size_t count);
+noinline void *__tsan_memset(void *s, int c, size_t count)
+{
+ /*
+ * Instead of not setting up watchpoints where accessed size is greater
+ * than MAX_ENCODABLE_SIZE, truncate checked size to MAX_ENCODABLE_SIZE.
+ */
+ size_t check_len = min_t(size_t, count, MAX_ENCODABLE_SIZE);
+
+ check_access(s, check_len, KCSAN_ACCESS_WRITE, _RET_IP_);
+ return memset(s, c, count);
+}
+#else
+void *__tsan_memset(void *s, int c, size_t count) __alias(memset);
+#endif
+EXPORT_SYMBOL(__tsan_memset);
+
+#ifdef __HAVE_ARCH_MEMMOVE
+void *__tsan_memmove(void *dst, const void *src, size_t len);
+noinline void *__tsan_memmove(void *dst, const void *src, size_t len)
+{
+ size_t check_len = min_t(size_t, len, MAX_ENCODABLE_SIZE);
+
+ check_access(dst, check_len, KCSAN_ACCESS_WRITE, _RET_IP_);
+ check_access(src, check_len, 0, _RET_IP_);
+ return memmove(dst, src, len);
+}
+#else
+void *__tsan_memmove(void *dst, const void *src, size_t len) __alias(memmove);
+#endif
+EXPORT_SYMBOL(__tsan_memmove);
+
+#ifdef __HAVE_ARCH_MEMCPY
+void *__tsan_memcpy(void *dst, const void *src, size_t len);
+noinline void *__tsan_memcpy(void *dst, const void *src, size_t len)
+{
+ size_t check_len = min_t(size_t, len, MAX_ENCODABLE_SIZE);
+
+ check_access(dst, check_len, KCSAN_ACCESS_WRITE, _RET_IP_);
+ check_access(src, check_len, 0, _RET_IP_);
+ return memcpy(dst, src, len);
+}
+#else
+void *__tsan_memcpy(void *dst, const void *src, size_t len) __alias(memcpy);
+#endif
+EXPORT_SYMBOL(__tsan_memcpy);
diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c
index 1d1d1b0e4248..2af39ba5b70b 100644
--- a/kernel/kcsan/debugfs.c
+++ b/kernel/kcsan/debugfs.c
@@ -46,14 +46,8 @@ static struct {
int used; /* number of elements used */
bool sorted; /* if elements are sorted */
bool whitelist; /* if list is a blacklist or whitelist */
-} report_filterlist = {
- .addrs = NULL,
- .size = 8, /* small initial size */
- .used = 0,
- .sorted = false,
- .whitelist = false, /* default is blacklist */
-};
-static DEFINE_SPINLOCK(report_filterlist_lock);
+} report_filterlist;
+static DEFINE_RAW_SPINLOCK(report_filterlist_lock);
/*
* The microbenchmark allows benchmarking KCSAN core runtime only. To run
@@ -110,7 +104,7 @@ bool kcsan_skip_report_debugfs(unsigned long func_addr)
return false;
func_addr -= offset; /* Get function start */
- spin_lock_irqsave(&report_filterlist_lock, flags);
+ raw_spin_lock_irqsave(&report_filterlist_lock, flags);
if (report_filterlist.used == 0)
goto out;
@@ -127,7 +121,7 @@ bool kcsan_skip_report_debugfs(unsigned long func_addr)
ret = !ret;
out:
- spin_unlock_irqrestore(&report_filterlist_lock, flags);
+ raw_spin_unlock_irqrestore(&report_filterlist_lock, flags);
return ret;
}
@@ -135,9 +129,9 @@ static void set_report_filterlist_whitelist(bool whitelist)
{
unsigned long flags;
- spin_lock_irqsave(&report_filterlist_lock, flags);
+ raw_spin_lock_irqsave(&report_filterlist_lock, flags);
report_filterlist.whitelist = whitelist;
- spin_unlock_irqrestore(&report_filterlist_lock, flags);
+ raw_spin_unlock_irqrestore(&report_filterlist_lock, flags);
}
/* Returns 0 on success, error-code otherwise. */
@@ -145,6 +139,9 @@ static ssize_t insert_report_filterlist(const char *func)
{
unsigned long flags;
unsigned long addr = kallsyms_lookup_name(func);
+ unsigned long *delay_free = NULL;
+ unsigned long *new_addrs = NULL;
+ size_t new_size = 0;
ssize_t ret = 0;
if (!addr) {
@@ -152,42 +149,42 @@ static ssize_t insert_report_filterlist(const char *func)
return -ENOENT;
}
- spin_lock_irqsave(&report_filterlist_lock, flags);
+retry_alloc:
+ /*
+ * Check if we need an allocation, and re-validate under the lock. Since
+ * the report_filterlist_lock is a raw, cannot allocate under the lock.
+ */
+ if (data_race(report_filterlist.used == report_filterlist.size)) {
+ new_size = (report_filterlist.size ?: 4) * 2;
+ delay_free = new_addrs = kmalloc_array(new_size, sizeof(unsigned long), GFP_KERNEL);
+ if (!new_addrs)
+ return -ENOMEM;
+ }
- if (report_filterlist.addrs == NULL) {
- /* initial allocation */
- report_filterlist.addrs =
- kmalloc_array(report_filterlist.size,
- sizeof(unsigned long), GFP_ATOMIC);
- if (report_filterlist.addrs == NULL) {
- ret = -ENOMEM;
- goto out;
- }
- } else if (report_filterlist.used == report_filterlist.size) {
- /* resize filterlist */
- size_t new_size = report_filterlist.size * 2;
- unsigned long *new_addrs =
- krealloc(report_filterlist.addrs,
- new_size * sizeof(unsigned long), GFP_ATOMIC);
-
- if (new_addrs == NULL) {
- /* leave filterlist itself untouched */
- ret = -ENOMEM;
- goto out;
+ raw_spin_lock_irqsave(&report_filterlist_lock, flags);
+ if (report_filterlist.used == report_filterlist.size) {
+ /* Check we pre-allocated enough, and retry if not. */
+ if (report_filterlist.used >= new_size) {
+ raw_spin_unlock_irqrestore(&report_filterlist_lock, flags);
+ kfree(new_addrs); /* kfree(NULL) is safe */
+ delay_free = new_addrs = NULL;
+ goto retry_alloc;
}
+ if (report_filterlist.used)
+ memcpy(new_addrs, report_filterlist.addrs, report_filterlist.used * sizeof(unsigned long));
+ delay_free = report_filterlist.addrs; /* free the old list */
+ report_filterlist.addrs = new_addrs; /* switch to the new list */
report_filterlist.size = new_size;
- report_filterlist.addrs = new_addrs;
}
/* Note: deduplicating should be done in userspace. */
- report_filterlist.addrs[report_filterlist.used++] =
- kallsyms_lookup_name(func);
+ report_filterlist.addrs[report_filterlist.used++] = addr;
report_filterlist.sorted = false;
-out:
- spin_unlock_irqrestore(&report_filterlist_lock, flags);
+ raw_spin_unlock_irqrestore(&report_filterlist_lock, flags);
+ kfree(delay_free);
return ret;
}
@@ -204,13 +201,13 @@ static int show_info(struct seq_file *file, void *v)
}
/* show filter functions, and filter type */
- spin_lock_irqsave(&report_filterlist_lock, flags);
+ raw_spin_lock_irqsave(&report_filterlist_lock, flags);
seq_printf(file, "\n%s functions: %s\n",
report_filterlist.whitelist ? "whitelisted" : "blacklisted",
report_filterlist.used == 0 ? "none" : "");
for (i = 0; i < report_filterlist.used; ++i)
seq_printf(file, " %ps\n", (void *)report_filterlist.addrs[i]);
- spin_unlock_irqrestore(&report_filterlist_lock, flags);
+ raw_spin_unlock_irqrestore(&report_filterlist_lock, flags);
return 0;
}
@@ -225,7 +222,7 @@ debugfs_write(struct file *file, const char __user *buf, size_t count, loff_t *o
{
char kbuf[KSYM_NAME_LEN];
char *arg;
- int read_len = count < (sizeof(kbuf) - 1) ? count : (sizeof(kbuf) - 1);
+ const size_t read_len = min(count, sizeof(kbuf) - 1);
if (copy_from_user(kbuf, buf, read_len))
return -EFAULT;
diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
index a36fca063a73..219d22857c98 100644
--- a/kernel/kcsan/kcsan_test.c
+++ b/kernel/kcsan/kcsan_test.c
@@ -125,7 +125,7 @@ static void probe_console(void *ignore, const char *buf, size_t len)
goto out;
/* No second line of interest. */
- strcpy(observed.lines[nlines++], "<none>");
+ strscpy(observed.lines[nlines++], "<none>");
}
}
@@ -159,7 +159,7 @@ static bool __report_matches(const struct expect_report *r)
const bool is_assert = (r->access[0].type | r->access[1].type) & KCSAN_ACCESS_ASSERT;
bool ret = false;
unsigned long flags;
- typeof(observed.lines) expect;
+ typeof(*observed.lines) *expect;
const char *end;
char *cur;
int i;
@@ -168,6 +168,10 @@ static bool __report_matches(const struct expect_report *r)
if (!report_available())
return false;
+ expect = kmalloc(sizeof(observed.lines), GFP_KERNEL);
+ if (WARN_ON(!expect))
+ return false;
+
/* Generate expected report contents. */
/* Title */
@@ -227,7 +231,7 @@ static bool __report_matches(const struct expect_report *r)
if (!r->access[1].fn) {
/* Dummy string if no second access is available. */
- strcpy(cur, "<none>");
+ strscpy(expect[2], "<none>");
break;
}
}
@@ -253,6 +257,7 @@ static bool __report_matches(const struct expect_report *r)
strstr(observed.lines[2], expect[1])));
out:
spin_unlock_irqrestore(&observed.lock, flags);
+ kfree(expect);
return ret;
}
@@ -299,6 +304,7 @@ static long test_array[3 * PAGE_SIZE / sizeof(long)];
static struct {
long val[8];
} test_struct;
+static long __data_racy test_data_racy;
static DEFINE_SEQLOCK(test_seqlock);
static DEFINE_SPINLOCK(test_spinlock);
static DEFINE_MUTEX(test_mutex);
@@ -353,6 +359,8 @@ static noinline void test_kernel_write_uninstrumented(void) { test_var++; }
static noinline void test_kernel_data_race(void) { data_race(test_var++); }
+static noinline void test_kernel_data_racy_qualifier(void) { test_data_racy++; }
+
static noinline void test_kernel_assert_writer(void)
{
ASSERT_EXCLUSIVE_WRITER(test_var);
@@ -525,7 +533,7 @@ static void test_barrier_nothreads(struct kunit *test)
struct kcsan_scoped_access *reorder_access = NULL;
#endif
arch_spinlock_t arch_spinlock = __ARCH_SPIN_LOCK_UNLOCKED;
- atomic_t dummy;
+ atomic_t dummy = ATOMIC_INIT(0);
KCSAN_TEST_REQUIRES(test, reorder_access != NULL);
KCSAN_TEST_REQUIRES(test, IS_ENABLED(CONFIG_SMP));
@@ -694,12 +702,9 @@ static void test_barrier_nothreads(struct kunit *test)
KCSAN_EXPECT_RW_BARRIER(spin_unlock(&test_spinlock), true);
KCSAN_EXPECT_RW_BARRIER(mutex_lock(&test_mutex), false);
KCSAN_EXPECT_RW_BARRIER(mutex_unlock(&test_mutex), true);
-
-#ifdef clear_bit_unlock_is_negative_byte
- KCSAN_EXPECT_READ_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true);
- KCSAN_EXPECT_WRITE_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true);
- KCSAN_EXPECT_RW_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true);
-#endif
+ KCSAN_EXPECT_READ_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true);
+ KCSAN_EXPECT_WRITE_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true);
+ KCSAN_EXPECT_RW_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true);
kcsan_nestable_atomic_end();
}
@@ -1007,6 +1012,19 @@ static void test_data_race(struct kunit *test)
KUNIT_EXPECT_FALSE(test, match_never);
}
+/* Test the __data_racy type qualifier. */
+__no_kcsan
+static void test_data_racy_qualifier(struct kunit *test)
+{
+ bool match_never = false;
+
+ begin_test_checks(test_kernel_data_racy_qualifier, test_kernel_data_racy_qualifier);
+ do {
+ match_never = report_available();
+ } while (!end_test_checks(match_never));
+ KUNIT_EXPECT_FALSE(test, match_never);
+}
+
__no_kcsan
static void test_assert_exclusive_writer(struct kunit *test)
{
@@ -1365,7 +1383,7 @@ static void test_atomic_builtins_missing_barrier(struct kunit *test)
* The thread counts are chosen to cover potentially interesting boundaries and
* corner cases (2 to 5), and then stress the system with larger counts.
*/
-static const void *nthreads_gen_params(const void *prev, char *desc)
+static const void *nthreads_gen_params(struct kunit *test, const void *prev, char *desc)
{
long nthreads = (long)prev;
@@ -1380,13 +1398,14 @@ static const void *nthreads_gen_params(const void *prev, char *desc)
else
nthreads *= 2;
- if (!IS_ENABLED(CONFIG_PREEMPT) || !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) {
+ if (!preempt_model_preemptible() ||
+ !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) {
/*
* Without any preemption, keep 2 CPUs free for other tasks, one
* of which is the main test case function checking for
* completion or failure.
*/
- const long min_unused_cpus = IS_ENABLED(CONFIG_PREEMPT_NONE) ? 2 : 0;
+ const long min_unused_cpus = preempt_model_none() ? 2 : 0;
const long min_required_cpus = 2 + min_unused_cpus;
if (num_online_cpus() < min_required_cpus) {
@@ -1421,6 +1440,7 @@ static struct kunit_case kcsan_test_cases[] = {
KCSAN_KUNIT_CASE(test_read_plain_atomic_rmw),
KCSAN_KUNIT_CASE(test_zero_size_access),
KCSAN_KUNIT_CASE(test_data_race),
+ KCSAN_KUNIT_CASE(test_data_racy_qualifier),
KCSAN_KUNIT_CASE(test_assert_exclusive_writer),
KCSAN_KUNIT_CASE(test_assert_exclusive_access),
KCSAN_KUNIT_CASE(test_assert_exclusive_access_writer),
@@ -1480,8 +1500,8 @@ static int access_thread(void *arg)
func();
}
} while (!torture_must_stop());
- del_timer_sync(&timer);
- destroy_timer_on_stack(&timer);
+ timer_delete_sync(&timer);
+ timer_destroy_on_stack(&timer);
torture_kthread_stopping("access_thread");
return 0;
@@ -1565,53 +1585,41 @@ static void test_exit(struct kunit *test)
torture_cleanup_end();
}
-static struct kunit_suite kcsan_test_suite = {
- .name = "kcsan",
- .test_cases = kcsan_test_cases,
- .init = test_init,
- .exit = test_exit,
-};
-static struct kunit_suite *kcsan_test_suites[] = { &kcsan_test_suite, NULL };
-
__no_kcsan
-static void register_tracepoints(struct tracepoint *tp, void *ignore)
+static void register_tracepoints(void)
{
- check_trace_callback_type_console(probe_console);
- if (!strcmp(tp->name, "console"))
- WARN_ON(tracepoint_probe_register(tp, probe_console, NULL));
+ register_trace_console(probe_console, NULL);
}
__no_kcsan
-static void unregister_tracepoints(struct tracepoint *tp, void *ignore)
+static void unregister_tracepoints(void)
{
- if (!strcmp(tp->name, "console"))
- tracepoint_probe_unregister(tp, probe_console, NULL);
+ unregister_trace_console(probe_console, NULL);
}
-/*
- * We only want to do tracepoints setup and teardown once, therefore we have to
- * customize the init and exit functions and cannot rely on kunit_test_suite().
- */
-static int __init kcsan_test_init(void)
+static int kcsan_suite_init(struct kunit_suite *suite)
{
- /*
- * Because we want to be able to build the test as a module, we need to
- * iterate through all known tracepoints, since the static registration
- * won't work here.
- */
- for_each_kernel_tracepoint(register_tracepoints, NULL);
- return __kunit_test_suites_init(kcsan_test_suites);
+ register_tracepoints();
+ return 0;
}
-static void kcsan_test_exit(void)
+static void kcsan_suite_exit(struct kunit_suite *suite)
{
- __kunit_test_suites_exit(kcsan_test_suites);
- for_each_kernel_tracepoint(unregister_tracepoints, NULL);
+ unregister_tracepoints();
tracepoint_synchronize_unregister();
}
-late_initcall_sync(kcsan_test_init);
-module_exit(kcsan_test_exit);
+static struct kunit_suite kcsan_test_suite = {
+ .name = "kcsan",
+ .test_cases = kcsan_test_cases,
+ .init = test_init,
+ .exit = test_exit,
+ .suite_init = kcsan_suite_init,
+ .suite_exit = kcsan_suite_exit,
+};
+
+kunit_test_suites(&kcsan_test_suite);
+MODULE_DESCRIPTION("KCSAN test suite");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Marco Elver <elver@google.com>");
diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c
index 67794404042a..e95ce7d7a76e 100644
--- a/kernel/kcsan/report.c
+++ b/kernel/kcsan/report.c
@@ -492,8 +492,7 @@ static void print_report(enum kcsan_value_change value_change,
dump_stack_print_info(KERN_DEFAULT);
pr_err("==================================================================\n");
- if (panic_on_warn)
- panic("panic_on_warn set ...\n");
+ check_panic_on_warn("KCSAN");
}
static void release_report(unsigned long *flags, struct other_info *other_info)
diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c
index 75712959c84e..84a1200271af 100644
--- a/kernel/kcsan/selftest.c
+++ b/kernel/kcsan/selftest.c
@@ -22,13 +22,6 @@
#define ITERS_PER_TEST 2000
-/* Test requirements. */
-static bool __init test_requires(void)
-{
- /* random should be initialized for the below tests */
- return prandom_u32() + prandom_u32() != 0;
-}
-
/*
* Test watchpoint encode and decode: check that encoding some access's info,
* and then subsequent decode preserves the access's info.
@@ -38,15 +31,15 @@ static bool __init test_encode_decode(void)
int i;
for (i = 0; i < ITERS_PER_TEST; ++i) {
- size_t size = prandom_u32_max(MAX_ENCODABLE_SIZE) + 1;
- bool is_write = !!prandom_u32_max(2);
+ size_t size = get_random_u32_inclusive(1, MAX_ENCODABLE_SIZE);
+ bool is_write = !!get_random_u32_below(2);
unsigned long verif_masked_addr;
long encoded_watchpoint;
bool verif_is_write;
unsigned long addr;
size_t verif_size;
- prandom_bytes(&addr, sizeof(addr));
+ get_random_bytes(&addr, sizeof(addr));
if (addr < PAGE_SIZE)
addr = PAGE_SIZE;
@@ -234,12 +227,9 @@ static bool __init test_barrier(void)
KCSAN_CHECK_RW_BARRIER(arch_spin_unlock(&arch_spinlock));
spin_lock(&test_spinlock);
KCSAN_CHECK_RW_BARRIER(spin_unlock(&test_spinlock));
-
-#ifdef clear_bit_unlock_is_negative_byte
- KCSAN_CHECK_RW_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var));
- KCSAN_CHECK_READ_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var));
- KCSAN_CHECK_WRITE_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var));
-#endif
+ KCSAN_CHECK_RW_BARRIER(xor_unlock_is_negative_byte(1, &test_var));
+ KCSAN_CHECK_READ_BARRIER(xor_unlock_is_negative_byte(1, &test_var));
+ KCSAN_CHECK_WRITE_BARRIER(xor_unlock_is_negative_byte(1, &test_var));
kcsan_nestable_atomic_end();
return ret;
@@ -259,7 +249,6 @@ static int __init kcsan_selftest(void)
pr_err("selftest: " #do_test " failed"); \
} while (0)
- RUN_TEST(test_requires);
RUN_TEST(test_encode_decode);
RUN_TEST(test_matching_access);
RUN_TEST(test_barrier);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index b5e40f069768..28008e3d462e 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -28,12 +28,14 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
struct kimage *image;
bool kexec_on_panic = flags & KEXEC_ON_CRASH;
+#ifdef CONFIG_CRASH_DUMP
if (kexec_on_panic) {
/* Verify we have a valid entry point */
if ((entry < phys_to_boot_phys(crashk_res.start)) ||
(entry > phys_to_boot_phys(crashk_res.end)))
return -EADDRNOTAVAIL;
}
+#endif
/* Allocate and initialize a controlling structure */
image = do_kimage_alloc_init();
@@ -44,11 +46,13 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
image->nr_segments = nr_segments;
memcpy(image->segment, segments, nr_segments * sizeof(*segments));
+#ifdef CONFIG_CRASH_DUMP
if (kexec_on_panic) {
/* Enable special crash kernel control page alloc policy. */
image->control_page = crashk_res.start;
image->type = KEXEC_TYPE_CRASH;
}
+#endif
ret = sanity_check_segment_list(image);
if (ret)
@@ -93,22 +97,20 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
/*
* Because we write directly to the reserved memory region when loading
- * crash kernels we need a mutex here to prevent multiple crash kernels
- * from attempting to load simultaneously, and to prevent a crash kernel
- * from loading over the top of a in use crash kernel.
- *
- * KISS: always take the mutex.
+ * crash kernels we need a serialization here to prevent multiple crash
+ * kernels from attempting to load simultaneously.
*/
- if (!mutex_trylock(&kexec_mutex))
+ if (!kexec_trylock())
return -EBUSY;
+#ifdef CONFIG_CRASH_DUMP
if (flags & KEXEC_ON_CRASH) {
dest_image = &kexec_crash_image;
if (kexec_crash_image)
arch_kexec_unprotect_crashkres();
- } else {
+ } else
+#endif
dest_image = &kexec_image;
- }
if (nr_segments == 0) {
/* Uninstall image */
@@ -132,6 +134,11 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
if (flags & KEXEC_PRESERVE_CONTEXT)
image->preserve_context = 1;
+#ifdef CONFIG_CRASH_HOTPLUG
+ if ((flags & KEXEC_ON_CRASH) && arch_crash_hotplug_support(image, flags))
+ image->hotplug_support = 1;
+#endif
+
ret = machine_kexec_prepare(image);
if (ret)
goto out;
@@ -145,7 +152,7 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
goto out;
for (i = 0; i < nr_segments; i++) {
- ret = kimage_load_segment(image, &image->segment[i]);
+ ret = kimage_load_segment(image, i);
if (ret)
goto out;
}
@@ -160,12 +167,14 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
image = xchg(dest_image, image);
out:
+#ifdef CONFIG_CRASH_DUMP
if ((flags & KEXEC_ON_CRASH) && kexec_crash_image)
arch_kexec_protect_crashkres();
+#endif
kimage_free(image);
out_unlock:
- mutex_unlock(&kexec_mutex);
+ kexec_unlock();
return ret;
}
@@ -193,10 +202,12 @@ out_unlock:
static inline int kexec_load_check(unsigned long nr_segments,
unsigned long flags)
{
+ int image_type = (flags & KEXEC_ON_CRASH) ?
+ KEXEC_TYPE_CRASH : KEXEC_TYPE_DEFAULT;
int result;
/* We only trust the superuser with rebooting the system. */
- if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
+ if (!kexec_load_permitted(image_type))
return -EPERM;
/* Permit LSMs and IMA to fail the kexec */
@@ -243,7 +254,7 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
return -EINVAL;
- ksegments = memdup_user(segments, nr_segments * sizeof(ksegments[0]));
+ ksegments = memdup_array_user(segments, nr_segments, sizeof(ksegments[0]));
if (IS_ERR(ksegments))
return PTR_ERR(ksegments);
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 68480f731192..0f92acdd354d 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -6,6 +6,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/btf.h>
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
@@ -14,6 +15,7 @@
#include <linux/kexec.h>
#include <linux/mutex.h>
#include <linux/list.h>
+#include <linux/liveupdate.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
#include <linux/reboot.h>
@@ -39,6 +41,8 @@
#include <linux/hugetlb.h>
#include <linux/objtool.h>
#include <linux/kmsg_dump.h>
+#include <linux/dma-map-ops.h>
+#include <linux/sysfs.h>
#include <asm/page.h>
#include <asm/sections.h>
@@ -46,54 +50,12 @@
#include <crypto/hash.h>
#include "kexec_internal.h"
-DEFINE_MUTEX(kexec_mutex);
-
-/* Per cpu memory for storing cpu states in case of system crash. */
-note_buf_t __percpu *crash_notes;
+atomic_t __kexec_lock = ATOMIC_INIT(0);
/* Flag to indicate we are going to kexec a new kernel */
bool kexec_in_progress = false;
-
-/* Location of the reserved area for the crash kernel */
-struct resource crashk_res = {
- .name = "Crash kernel",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
- .desc = IORES_DESC_CRASH_KERNEL
-};
-struct resource crashk_low_res = {
- .name = "Crash kernel",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
- .desc = IORES_DESC_CRASH_KERNEL
-};
-
-int kexec_should_crash(struct task_struct *p)
-{
- /*
- * If crash_kexec_post_notifiers is enabled, don't run
- * crash_kexec() here yet, which must be run after panic
- * notifiers in panic().
- */
- if (crash_kexec_post_notifiers)
- return 0;
- /*
- * There are 4 panic() calls in make_task_dead() path, each of which
- * corresponds to each of these 4 conditions.
- */
- if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
- return 1;
- return 0;
-}
-
-int kexec_crash_loaded(void)
-{
- return !!kexec_crash_image;
-}
-EXPORT_SYMBOL_GPL(kexec_crash_loaded);
+bool kexec_file_dbg_print;
/*
* When kexec transitions to the new kernel there is a one-to-one
@@ -226,6 +188,7 @@ int sanity_check_segment_list(struct kimage *image)
if (total_pages > nr_pages / 2)
return -EINVAL;
+#ifdef CONFIG_CRASH_DUMP
/*
* Verify we have good destination addresses. Normally
* the caller is responsible for making certain we don't
@@ -248,6 +211,17 @@ int sanity_check_segment_list(struct kimage *image)
return -EADDRNOTAVAIL;
}
}
+#endif
+
+ /*
+ * The destination addresses are searched from system RAM rather than
+ * being allocated from the buddy allocator, so they are not guaranteed
+ * to be accepted by the current kernel. Accept the destination
+ * addresses before kexec swaps their content with the segments' source
+ * pages to avoid accessing memory before it is accepted.
+ */
+ for (i = 0; i < nr_segments; i++)
+ accept_memory(image->segment[i].mem, image->segment[i].memsz);
return 0;
}
@@ -261,7 +235,6 @@ struct kimage *do_kimage_alloc_init(void)
if (!image)
return NULL;
- image->head = 0;
image->entry = &image->head;
image->last_entry = &image->head;
image->control_page = ~0; /* By default this does not apply */
@@ -276,6 +249,12 @@ struct kimage *do_kimage_alloc_init(void)
/* Initialize the list of unusable pages */
INIT_LIST_HEAD(&image->unusable_pages);
+#ifdef CONFIG_CRASH_HOTPLUG
+ image->hp_action = KEXEC_CRASH_HP_NONE;
+ image->elfcorehdr_index = -1;
+ image->elfcorehdr_updated = false;
+#endif
+
return image;
}
@@ -289,8 +268,8 @@ int kimage_is_destination_range(struct kimage *image,
unsigned long mstart, mend;
mstart = image->segment[i].mem;
- mend = mstart + image->segment[i].memsz;
- if ((end > mstart) && (start < mend))
+ mend = mstart + image->segment[i].memsz - 1;
+ if ((end >= mstart) && (start <= mend))
return 1;
}
@@ -383,7 +362,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
pfn = page_to_boot_pfn(pages);
epfn = pfn + count;
addr = pfn << PAGE_SHIFT;
- eaddr = epfn << PAGE_SHIFT;
+ eaddr = (epfn << PAGE_SHIFT) - 1;
if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
kimage_is_destination_range(image, addr, eaddr)) {
list_add(&pages->lru, &extra_pages);
@@ -414,6 +393,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
return pages;
}
+#ifdef CONFIG_CRASH_DUMP
static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
unsigned int order)
{
@@ -443,7 +423,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
pages = NULL;
size = (1 << order) << PAGE_SHIFT;
- hole_start = (image->control_page + (size - 1)) & ~(size - 1);
+ hole_start = ALIGN(image->control_page, size);
hole_end = hole_start + size - 1;
while (hole_end <= crashk_res.end) {
unsigned long i;
@@ -460,7 +440,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
mend = mstart + image->segment[i].memsz - 1;
if ((hole_end >= mstart) && (hole_start <= mend)) {
/* Advance the hole to the end of the segment */
- hole_start = (mend + (size - 1)) & ~(size - 1);
+ hole_start = ALIGN(mend, size);
hole_end = hole_start + size - 1;
break;
}
@@ -468,7 +448,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
/* If I don't overlap any segments I have found my hole! */
if (i == image->nr_segments) {
pages = pfn_to_page(hole_start >> PAGE_SHIFT);
- image->control_page = hole_end;
+ image->control_page = hole_end + 1;
break;
}
}
@@ -479,6 +459,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
return pages;
}
+#endif
struct page *kimage_alloc_control_pages(struct kimage *image,
@@ -490,48 +471,16 @@ struct page *kimage_alloc_control_pages(struct kimage *image,
case KEXEC_TYPE_DEFAULT:
pages = kimage_alloc_normal_control_pages(image, order);
break;
+#ifdef CONFIG_CRASH_DUMP
case KEXEC_TYPE_CRASH:
pages = kimage_alloc_crash_control_pages(image, order);
break;
+#endif
}
return pages;
}
-int kimage_crash_copy_vmcoreinfo(struct kimage *image)
-{
- struct page *vmcoreinfo_page;
- void *safecopy;
-
- if (image->type != KEXEC_TYPE_CRASH)
- return 0;
-
- /*
- * For kdump, allocate one vmcoreinfo safe copy from the
- * crash memory. as we have arch_kexec_protect_crashkres()
- * after kexec syscall, we naturally protect it from write
- * (even read) access under kernel direct mapping. But on
- * the other hand, we still need to operate it when crash
- * happens to generate vmcoreinfo note, hereby we rely on
- * vmap for this purpose.
- */
- vmcoreinfo_page = kimage_alloc_control_pages(image, 0);
- if (!vmcoreinfo_page) {
- pr_warn("Could not allocate vmcoreinfo buffer\n");
- return -ENOMEM;
- }
- safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL);
- if (!safecopy) {
- pr_warn("Could not vmap vmcoreinfo buffer\n");
- return -ENOMEM;
- }
-
- image->vmcoreinfo_data_copy = safecopy;
- crash_update_vmcoreinfo_safecopy(safecopy);
-
- return 0;
-}
-
static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
{
if (*image->entry != 0)
@@ -561,23 +510,17 @@ static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
static int kimage_set_destination(struct kimage *image,
unsigned long destination)
{
- int result;
-
destination &= PAGE_MASK;
- result = kimage_add_entry(image, destination | IND_DESTINATION);
- return result;
+ return kimage_add_entry(image, destination | IND_DESTINATION);
}
static int kimage_add_page(struct kimage *image, unsigned long page)
{
- int result;
-
page &= PAGE_MASK;
- result = kimage_add_entry(image, page | IND_SOURCE);
- return result;
+ return kimage_add_entry(image, page | IND_SOURCE);
}
@@ -591,11 +534,6 @@ static void kimage_free_extra_pages(struct kimage *image)
}
-int __weak machine_kexec_post_load(struct kimage *image)
-{
- return 0;
-}
-
void kimage_terminate(struct kimage *image)
{
if (*image->entry != 0)
@@ -617,6 +555,24 @@ static void kimage_free_entry(kimage_entry_t entry)
kimage_free_pages(page);
}
+static void kimage_free_cma(struct kimage *image)
+{
+ unsigned long i;
+
+ for (i = 0; i < image->nr_segments; i++) {
+ struct page *cma = image->segment_cma[i];
+ u32 nr_pages = image->segment[i].memsz >> PAGE_SHIFT;
+
+ if (!cma)
+ continue;
+
+ arch_kexec_pre_free_pages(page_address(cma), nr_pages);
+ dma_release_from_contiguous(NULL, cma, nr_pages);
+ image->segment_cma[i] = NULL;
+ }
+
+}
+
void kimage_free(struct kimage *image)
{
kimage_entry_t *ptr, entry;
@@ -625,10 +581,12 @@ void kimage_free(struct kimage *image)
if (!image)
return;
+#ifdef CONFIG_CRASH_DUMP
if (image->vmcoreinfo_data_copy) {
crash_update_vmcoreinfo_safecopy(NULL);
vunmap(image->vmcoreinfo_data_copy);
}
+#endif
kimage_free_extra_pages(image);
for_each_kimage_entry(image, ptr, entry) {
@@ -653,6 +611,9 @@ void kimage_free(struct kimage *image)
/* Free the kexec control pages... */
kimage_free_page_list(&image->control_pages);
+ /* Free CMA allocations */
+ kimage_free_cma(image);
+
/*
* Free up any temporary buffers allocated. This might hit if
* error occurred much later after buffer allocation.
@@ -740,7 +701,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
/* If the page is not a destination page use it */
if (!kimage_is_destination_range(image, addr,
- addr + PAGE_SIZE))
+ addr + PAGE_SIZE - 1))
break;
/*
@@ -768,7 +729,6 @@ static struct page *kimage_alloc_page(struct kimage *image,
kimage_free_pages(old_page);
continue;
}
- addr = old_addr;
page = old_page;
break;
}
@@ -779,16 +739,70 @@ static struct page *kimage_alloc_page(struct kimage *image,
return page;
}
-static int kimage_load_normal_segment(struct kimage *image,
- struct kexec_segment *segment)
+static int kimage_load_cma_segment(struct kimage *image, int idx)
{
+ struct kexec_segment *segment = &image->segment[idx];
+ struct page *cma = image->segment_cma[idx];
+ char *ptr = page_address(cma);
+ size_t ubytes, mbytes;
+ int result = 0;
+ unsigned char __user *buf = NULL;
+ unsigned char *kbuf = NULL;
+
+ if (image->file_mode)
+ kbuf = segment->kbuf;
+ else
+ buf = segment->buf;
+ ubytes = segment->bufsz;
+ mbytes = segment->memsz;
+
+ /* Then copy from source buffer to the CMA one */
+ while (mbytes) {
+ size_t uchunk, mchunk;
+
+ mchunk = min_t(size_t, mbytes, PAGE_SIZE);
+ uchunk = min(ubytes, mchunk);
+
+ if (uchunk) {
+ /* For file based kexec, source pages are in kernel memory */
+ if (image->file_mode)
+ memcpy(ptr, kbuf, uchunk);
+ else
+ result = copy_from_user(ptr, buf, uchunk);
+ ubytes -= uchunk;
+ if (image->file_mode)
+ kbuf += uchunk;
+ else
+ buf += uchunk;
+ }
+
+ if (result) {
+ result = -EFAULT;
+ goto out;
+ }
+
+ ptr += mchunk;
+ mbytes -= mchunk;
+
+ cond_resched();
+ }
+
+ /* Clear any remainder */
+ memset(ptr, 0, mbytes);
+
+out:
+ return result;
+}
+
+static int kimage_load_normal_segment(struct kimage *image, int idx)
+{
+ struct kexec_segment *segment = &image->segment[idx];
unsigned long maddr;
size_t ubytes, mbytes;
int result;
unsigned char __user *buf = NULL;
unsigned char *kbuf = NULL;
- result = 0;
if (image->file_mode)
kbuf = segment->kbuf;
else
@@ -797,6 +811,9 @@ static int kimage_load_normal_segment(struct kimage *image,
mbytes = segment->memsz;
maddr = segment->mem;
+ if (image->segment_cma[idx])
+ return kimage_load_cma_segment(image, idx);
+
result = kimage_set_destination(image, maddr);
if (result < 0)
goto out;
@@ -816,30 +833,30 @@ static int kimage_load_normal_segment(struct kimage *image,
if (result < 0)
goto out;
- ptr = kmap(page);
+ ptr = kmap_local_page(page);
/* Start with a clear page */
clear_page(ptr);
- ptr += maddr & ~PAGE_MASK;
- mchunk = min_t(size_t, mbytes,
- PAGE_SIZE - (maddr & ~PAGE_MASK));
+ mchunk = min_t(size_t, mbytes, PAGE_SIZE);
uchunk = min(ubytes, mchunk);
- /* For file based kexec, source pages are in kernel memory */
- if (image->file_mode)
- memcpy(ptr, kbuf, uchunk);
- else
- result = copy_from_user(ptr, buf, uchunk);
- kunmap(page);
+ if (uchunk) {
+ /* For file based kexec, source pages are in kernel memory */
+ if (image->file_mode)
+ memcpy(ptr, kbuf, uchunk);
+ else
+ result = copy_from_user(ptr, buf, uchunk);
+ ubytes -= uchunk;
+ if (image->file_mode)
+ kbuf += uchunk;
+ else
+ buf += uchunk;
+ }
+ kunmap_local(ptr);
if (result) {
result = -EFAULT;
goto out;
}
- ubytes -= uchunk;
maddr += mchunk;
- if (image->file_mode)
- kbuf += mchunk;
- else
- buf += mchunk;
mbytes -= mchunk;
cond_resched();
@@ -848,13 +865,14 @@ out:
return result;
}
-static int kimage_load_crash_segment(struct kimage *image,
- struct kexec_segment *segment)
+#ifdef CONFIG_CRASH_DUMP
+static int kimage_load_crash_segment(struct kimage *image, int idx)
{
/* For crash dumps kernels we simply copy the data from
* user space to it's destination.
* We do things a page at a time for the sake of kmap.
*/
+ struct kexec_segment *segment = &image->segment[idx];
unsigned long maddr;
size_t ubytes, mbytes;
int result;
@@ -880,34 +898,34 @@ static int kimage_load_crash_segment(struct kimage *image,
goto out;
}
arch_kexec_post_alloc_pages(page_address(page), 1, 0);
- ptr = kmap(page);
- ptr += maddr & ~PAGE_MASK;
- mchunk = min_t(size_t, mbytes,
- PAGE_SIZE - (maddr & ~PAGE_MASK));
+ ptr = kmap_local_page(page);
+ mchunk = min_t(size_t, mbytes, PAGE_SIZE);
uchunk = min(ubytes, mchunk);
if (mchunk > uchunk) {
/* Zero the trailing part of the page */
memset(ptr + uchunk, 0, mchunk - uchunk);
}
- /* For file based kexec, source pages are in kernel memory */
- if (image->file_mode)
- memcpy(ptr, kbuf, uchunk);
- else
- result = copy_from_user(ptr, buf, uchunk);
+ if (uchunk) {
+ /* For file based kexec, source pages are in kernel memory */
+ if (image->file_mode)
+ memcpy(ptr, kbuf, uchunk);
+ else
+ result = copy_from_user(ptr, buf, uchunk);
+ ubytes -= uchunk;
+ if (image->file_mode)
+ kbuf += uchunk;
+ else
+ buf += uchunk;
+ }
kexec_flush_icache_page(page);
- kunmap(page);
+ kunmap_local(ptr);
arch_kexec_pre_free_pages(page_address(page), 1);
if (result) {
result = -EFAULT;
goto out;
}
- ubytes -= uchunk;
maddr += mchunk;
- if (image->file_mode)
- kbuf += mchunk;
- else
- buf += mchunk;
mbytes -= mchunk;
cond_resched();
@@ -915,208 +933,196 @@ static int kimage_load_crash_segment(struct kimage *image,
out:
return result;
}
+#endif
-int kimage_load_segment(struct kimage *image,
- struct kexec_segment *segment)
+int kimage_load_segment(struct kimage *image, int idx)
{
int result = -ENOMEM;
switch (image->type) {
case KEXEC_TYPE_DEFAULT:
- result = kimage_load_normal_segment(image, segment);
+ result = kimage_load_normal_segment(image, idx);
break;
+#ifdef CONFIG_CRASH_DUMP
case KEXEC_TYPE_CRASH:
- result = kimage_load_crash_segment(image, segment);
+ result = kimage_load_crash_segment(image, idx);
break;
+#endif
}
return result;
}
-struct kimage *kexec_image;
-struct kimage *kexec_crash_image;
-int kexec_load_disabled;
-
-/*
- * No panic_cpu check version of crash_kexec(). This function is called
- * only when panic_cpu holds the current CPU number; this is the only CPU
- * which processes crash_kexec routines.
- */
-void __noclone __crash_kexec(struct pt_regs *regs)
+void *kimage_map_segment(struct kimage *image,
+ unsigned long addr, unsigned long size)
{
- /* Take the kexec_mutex here to prevent sys_kexec_load
- * running on one cpu from replacing the crash kernel
- * we are using after a panic on a different cpu.
- *
- * If the crash kernel was not located in a fixed area
- * of memory the xchg(&kexec_crash_image) would be
- * sufficient. But since I reuse the memory...
- */
- if (mutex_trylock(&kexec_mutex)) {
- if (kexec_crash_image) {
- struct pt_regs fixed_regs;
-
- crash_setup_regs(&fixed_regs, regs);
- crash_save_vmcoreinfo();
- machine_crash_shutdown(&fixed_regs);
- machine_kexec(kexec_crash_image);
- }
- mutex_unlock(&kexec_mutex);
- }
-}
-STACK_FRAME_NON_STANDARD(__crash_kexec);
-
-void crash_kexec(struct pt_regs *regs)
-{
- int old_cpu, this_cpu;
+ unsigned long src_page_addr, dest_page_addr = 0;
+ unsigned long eaddr = addr + size;
+ kimage_entry_t *ptr, entry;
+ struct page **src_pages;
+ unsigned int npages;
+ void *vaddr = NULL;
+ int i;
/*
- * Only one CPU is allowed to execute the crash_kexec() code as with
- * panic(). Otherwise parallel calls of panic() and crash_kexec()
- * may stop each other. To exclude them, we use panic_cpu here too.
+ * Collect the source pages and map them in a contiguous VA range.
*/
- this_cpu = raw_smp_processor_id();
- old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
- if (old_cpu == PANIC_CPU_INVALID) {
- /* This is the 1st CPU which comes here, so go ahead. */
- __crash_kexec(regs);
+ npages = PFN_UP(eaddr) - PFN_DOWN(addr);
+ src_pages = kmalloc_array(npages, sizeof(*src_pages), GFP_KERNEL);
+ if (!src_pages) {
+ pr_err("Could not allocate ima pages array.\n");
+ return NULL;
+ }
- /*
- * Reset panic_cpu to allow another panic()/crash_kexec()
- * call.
- */
- atomic_set(&panic_cpu, PANIC_CPU_INVALID);
+ i = 0;
+ for_each_kimage_entry(image, ptr, entry) {
+ if (entry & IND_DESTINATION) {
+ dest_page_addr = entry & PAGE_MASK;
+ } else if (entry & IND_SOURCE) {
+ if (dest_page_addr >= addr && dest_page_addr < eaddr) {
+ src_page_addr = entry & PAGE_MASK;
+ src_pages[i++] =
+ virt_to_page(__va(src_page_addr));
+ if (i == npages)
+ break;
+ dest_page_addr += PAGE_SIZE;
+ }
+ }
}
-}
-size_t crash_get_memory_size(void)
-{
- size_t size = 0;
+ /* Sanity check. */
+ WARN_ON(i < npages);
- mutex_lock(&kexec_mutex);
- if (crashk_res.end != crashk_res.start)
- size = resource_size(&crashk_res);
- mutex_unlock(&kexec_mutex);
- return size;
-}
+ vaddr = vmap(src_pages, npages, VM_MAP, PAGE_KERNEL);
+ kfree(src_pages);
-void __weak crash_free_reserved_phys_range(unsigned long begin,
- unsigned long end)
-{
- unsigned long addr;
+ if (!vaddr)
+ pr_err("Could not map ima buffer.\n");
- for (addr = begin; addr < end; addr += PAGE_SIZE)
- free_reserved_page(boot_pfn_to_page(addr >> PAGE_SHIFT));
+ return vaddr;
}
-int crash_shrink_memory(unsigned long new_size)
+void kimage_unmap_segment(void *segment_buffer)
{
- int ret = 0;
- unsigned long start, end;
- unsigned long old_size;
- struct resource *ram_res;
-
- mutex_lock(&kexec_mutex);
+ vunmap(segment_buffer);
+}
- if (kexec_crash_image) {
- ret = -ENOENT;
- goto unlock;
- }
- start = crashk_res.start;
- end = crashk_res.end;
- old_size = (end == 0) ? 0 : end - start + 1;
- if (new_size >= old_size) {
- ret = (new_size == old_size) ? 0 : -EINVAL;
- goto unlock;
- }
+struct kexec_load_limit {
+ /* Mutex protects the limit count. */
+ struct mutex mutex;
+ int limit;
+};
- ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
- if (!ram_res) {
- ret = -ENOMEM;
- goto unlock;
- }
+static struct kexec_load_limit load_limit_reboot = {
+ .mutex = __MUTEX_INITIALIZER(load_limit_reboot.mutex),
+ .limit = -1,
+};
- start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
- end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
+static struct kexec_load_limit load_limit_panic = {
+ .mutex = __MUTEX_INITIALIZER(load_limit_panic.mutex),
+ .limit = -1,
+};
- crash_free_reserved_phys_range(end, crashk_res.end);
+struct kimage *kexec_image;
+struct kimage *kexec_crash_image;
+static int kexec_load_disabled;
- if ((start == end) && (crashk_res.parent != NULL))
- release_resource(&crashk_res);
+#ifdef CONFIG_SYSCTL
+static int kexec_limit_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct kexec_load_limit *limit = table->data;
+ int val;
+ struct ctl_table tmp = {
+ .data = &val,
+ .maxlen = sizeof(val),
+ .mode = table->mode,
+ };
+ int ret;
+
+ if (write) {
+ ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+ if (ret)
+ return ret;
+
+ if (val < 0)
+ return -EINVAL;
- ram_res->start = end;
- ram_res->end = crashk_res.end;
- ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
- ram_res->name = "System RAM";
+ mutex_lock(&limit->mutex);
+ if (limit->limit != -1 && val >= limit->limit)
+ ret = -EINVAL;
+ else
+ limit->limit = val;
+ mutex_unlock(&limit->mutex);
- crashk_res.end = end - 1;
+ return ret;
+ }
- insert_resource(&iomem_resource, ram_res);
+ mutex_lock(&limit->mutex);
+ val = limit->limit;
+ mutex_unlock(&limit->mutex);
-unlock:
- mutex_unlock(&kexec_mutex);
- return ret;
+ return proc_dointvec(&tmp, write, buffer, lenp, ppos);
}
-void crash_save_cpu(struct pt_regs *regs, int cpu)
-{
- struct elf_prstatus prstatus;
- u32 *buf;
-
- if ((cpu < 0) || (cpu >= nr_cpu_ids))
- return;
+static const struct ctl_table kexec_core_sysctls[] = {
+ {
+ .procname = "kexec_load_disabled",
+ .data = &kexec_load_disabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ /* only handle a transition from default "0" to "1" */
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "kexec_load_limit_panic",
+ .data = &load_limit_panic,
+ .mode = 0644,
+ .proc_handler = kexec_limit_handler,
+ },
+ {
+ .procname = "kexec_load_limit_reboot",
+ .data = &load_limit_reboot,
+ .mode = 0644,
+ .proc_handler = kexec_limit_handler,
+ },
+};
- /* Using ELF notes here is opportunistic.
- * I need a well defined structure format
- * for the data I pass, and I need tags
- * on the data to indicate what information I have
- * squirrelled away. ELF notes happen to provide
- * all of that, so there is no need to invent something new.
- */
- buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
- if (!buf)
- return;
- memset(&prstatus, 0, sizeof(prstatus));
- prstatus.common.pr_pid = current->pid;
- elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
- buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
- &prstatus, sizeof(prstatus));
- final_note(buf);
+static int __init kexec_core_sysctl_init(void)
+{
+ register_sysctl_init("kernel", kexec_core_sysctls);
+ return 0;
}
+late_initcall(kexec_core_sysctl_init);
+#endif
-static int __init crash_notes_memory_init(void)
+bool kexec_load_permitted(int kexec_image_type)
{
- /* Allocate memory for saving cpu registers. */
- size_t size, align;
-
- /*
- * crash_notes could be allocated across 2 vmalloc pages when percpu
- * is vmalloc based . vmalloc doesn't guarantee 2 continuous vmalloc
- * pages are also on 2 continuous physical pages. In this case the
- * 2nd part of crash_notes in 2nd page could be lost since only the
- * starting address and size of crash_notes are exported through sysfs.
- * Here round up the size of crash_notes to the nearest power of two
- * and pass it to __alloc_percpu as align value. This can make sure
- * crash_notes is allocated inside one physical page.
- */
- size = sizeof(note_buf_t);
- align = min(roundup_pow_of_two(sizeof(note_buf_t)), PAGE_SIZE);
+ struct kexec_load_limit *limit;
/*
- * Break compile if size is bigger than PAGE_SIZE since crash_notes
- * definitely will be in 2 pages with that.
+ * Only the superuser can use the kexec syscall and if it has not
+ * been disabled.
*/
- BUILD_BUG_ON(size > PAGE_SIZE);
-
- crash_notes = __alloc_percpu(size, align);
- if (!crash_notes) {
- pr_warn("Memory allocation for saving cpu register states failed\n");
- return -ENOMEM;
+ if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
+ return false;
+
+ /* Check limit counter and decrease it.*/
+ limit = (kexec_image_type == KEXEC_TYPE_CRASH) ?
+ &load_limit_panic : &load_limit_reboot;
+ mutex_lock(&limit->mutex);
+ if (!limit->limit) {
+ mutex_unlock(&limit->mutex);
+ return false;
}
- return 0;
-}
-subsys_initcall(crash_notes_memory_init);
+ if (limit->limit != -1)
+ limit->limit--;
+ mutex_unlock(&limit->mutex);
+ return true;
+}
/*
* Move into place and start executing a preloaded standalone
@@ -1126,31 +1132,39 @@ int kernel_kexec(void)
{
int error = 0;
- if (!mutex_trylock(&kexec_mutex))
+ if (!kexec_trylock())
return -EBUSY;
if (!kexec_image) {
error = -EINVAL;
goto Unlock;
}
+ error = liveupdate_reboot();
+ if (error)
+ goto Unlock;
+
#ifdef CONFIG_KEXEC_JUMP
if (kexec_image->preserve_context) {
+ /*
+ * This flow is analogous to hibernation flows that occur
+ * before creating an image and before jumping from the
+ * restore kernel to the image one, so it uses the same
+ * device callbacks as those two flows.
+ */
pm_prepare_console();
error = freeze_processes();
if (error) {
error = -EBUSY;
goto Restore_console;
}
- suspend_console();
+ console_suspend_all();
error = dpm_suspend_start(PMSG_FREEZE);
if (error)
- goto Resume_console;
- /* At this point, dpm_suspend_start() has been called,
- * but *not* dpm_suspend_end(). We *must* call
- * dpm_suspend_end() now. Otherwise, drivers for
- * some devices (e.g. interrupt controllers) become
- * desynchronized with the actual state of the
- * hardware at resume time, and evil weirdness ensues.
+ goto Resume_devices;
+ /*
+ * dpm_suspend_end() must be called after dpm_suspend_start()
+ * to complete the transition, like in the hibernation flows
+ * mentioned above.
*/
error = dpm_suspend_end(PMSG_FREEZE);
if (error)
@@ -1168,6 +1182,7 @@ int kernel_kexec(void)
kexec_in_progress = true;
kernel_restart_prepare("kexec reboot");
migrate_to_reboot_cpu();
+ syscore_shutdown();
/*
* migrate_to_reboot_cpu() disables CPU hotplug assuming that
@@ -1185,6 +1200,13 @@ int kernel_kexec(void)
#ifdef CONFIG_KEXEC_JUMP
if (kexec_image->preserve_context) {
+ /*
+ * This flow is analogous to hibernation flows that occur after
+ * creating an image and after the image kernel has got control
+ * back, and in case the devices have been reset or otherwise
+ * manipulated in the meantime, it uses the device callbacks
+ * used by the latter.
+ */
syscore_resume();
Enable_irqs:
local_irq_enable();
@@ -1193,8 +1215,7 @@ int kernel_kexec(void)
dpm_resume_start(PMSG_RESTORE);
Resume_devices:
dpm_resume_end(PMSG_RESTORE);
- Resume_console:
- resume_console();
+ console_resume_all();
thaw_processes();
Restore_console:
pm_restore_console();
@@ -1202,19 +1223,146 @@ int kernel_kexec(void)
#endif
Unlock:
- mutex_unlock(&kexec_mutex);
+ kexec_unlock();
return error;
}
-/*
- * Protection mechanism for crashkernel reserved memory after
- * the kdump kernel is loaded.
- *
- * Provide an empty default implementation here -- architecture
- * code may override this
- */
-void __weak arch_kexec_protect_crashkres(void)
-{}
+static ssize_t loaded_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", !!kexec_image);
+}
+static struct kobj_attribute loaded_attr = __ATTR_RO(loaded);
+
+#ifdef CONFIG_CRASH_DUMP
+static ssize_t crash_loaded_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", kexec_crash_loaded());
+}
+static struct kobj_attribute crash_loaded_attr = __ATTR_RO(crash_loaded);
+
+#ifdef CONFIG_CRASH_RESERVE
+static ssize_t crash_cma_ranges_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+
+ ssize_t len = 0;
+ int i;
+
+ for (i = 0; i < crashk_cma_cnt; ++i) {
+ len += sysfs_emit_at(buf, len, "%08llx-%08llx\n",
+ crashk_cma_ranges[i].start,
+ crashk_cma_ranges[i].end);
+ }
+ return len;
+}
+static struct kobj_attribute crash_cma_ranges_attr = __ATTR_RO(crash_cma_ranges);
+#endif
+
+static ssize_t crash_size_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ ssize_t size = crash_get_memory_size();
+
+ if (size < 0)
+ return size;
+
+ return sysfs_emit(buf, "%zd\n", size);
+}
+static ssize_t crash_size_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long cnt;
+ int ret;
+
+ if (kstrtoul(buf, 0, &cnt))
+ return -EINVAL;
+
+ ret = crash_shrink_memory(cnt);
+ return ret < 0 ? ret : count;
+}
+static struct kobj_attribute crash_size_attr = __ATTR_RW(crash_size);
+
+#ifdef CONFIG_CRASH_HOTPLUG
+static ssize_t crash_elfcorehdr_size_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ unsigned int sz = crash_get_elfcorehdr_size();
+
+ return sysfs_emit(buf, "%u\n", sz);
+}
+static struct kobj_attribute crash_elfcorehdr_size_attr = __ATTR_RO(crash_elfcorehdr_size);
+
+#endif /* CONFIG_CRASH_HOTPLUG */
+#endif /* CONFIG_CRASH_DUMP */
+
+static struct attribute *kexec_attrs[] = {
+ &loaded_attr.attr,
+#ifdef CONFIG_CRASH_DUMP
+ &crash_loaded_attr.attr,
+ &crash_size_attr.attr,
+#ifdef CONFIG_CRASH_RESERVE
+ &crash_cma_ranges_attr.attr,
+#endif
+#ifdef CONFIG_CRASH_HOTPLUG
+ &crash_elfcorehdr_size_attr.attr,
+#endif
+#endif
+ NULL
+};
+
+struct kexec_link_entry {
+ const char *target;
+ const char *name;
+};
+
+static struct kexec_link_entry kexec_links[] = {
+ { "loaded", "kexec_loaded" },
+#ifdef CONFIG_CRASH_DUMP
+ { "crash_loaded", "kexec_crash_loaded" },
+ { "crash_size", "kexec_crash_size" },
+#ifdef CONFIG_CRASH_RESERVE
+ {"crash_cma_ranges", "kexec_crash_cma_ranges"},
+#endif
+#ifdef CONFIG_CRASH_HOTPLUG
+ { "crash_elfcorehdr_size", "crash_elfcorehdr_size" },
+#endif
+#endif
+};
+
+static struct kobject *kexec_kobj;
+ATTRIBUTE_GROUPS(kexec);
+
+static int __init init_kexec_sysctl(void)
+{
+ int error;
+ int i;
+
+ kexec_kobj = kobject_create_and_add("kexec", kernel_kobj);
+ if (!kexec_kobj) {
+ pr_err("failed to create kexec kobject\n");
+ return -ENOMEM;
+ }
+
+ error = sysfs_create_groups(kexec_kobj, kexec_groups);
+ if (error)
+ goto kset_exit;
+
+ for (i = 0; i < ARRAY_SIZE(kexec_links); i++) {
+ error = compat_only_sysfs_link_entry_to_kobj(kernel_kobj, kexec_kobj,
+ kexec_links[i].target,
+ kexec_links[i].name);
+ if (error)
+ pr_err("Unable to create %s symlink (%d)", kexec_links[i].name, error);
+ }
+
+ return 0;
+
+kset_exit:
+ kobject_put(kexec_kobj);
+ return error;
+}
-void __weak arch_kexec_unprotect_crashkres(void)
-{}
+subsys_initcall(init_kexec_sysctl);
diff --git a/kernel/kexec_elf.c b/kernel/kexec_elf.c
index d3689632e8b9..3a5c25b2adc9 100644
--- a/kernel/kexec_elf.c
+++ b/kernel/kexec_elf.c
@@ -390,7 +390,7 @@ int kexec_elf_load(struct kimage *image, struct elfhdr *ehdr,
struct kexec_buf *kbuf,
unsigned long *lowest_load_addr)
{
- unsigned long lowest_addr = UINT_MAX;
+ unsigned long lowest_addr = ULONG_MAX;
int ret;
size_t i;
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 8347fc158d2b..eb62a9794242 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -19,7 +19,6 @@
#include <linux/list.h>
#include <linux/fs.h>
#include <linux/ima.h>
-#include <crypto/hash.h>
#include <crypto/sha2.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
@@ -27,10 +26,38 @@
#include <linux/kernel_read_file.h>
#include <linux/syscalls.h>
#include <linux/vmalloc.h>
+#include <linux/dma-map-ops.h>
#include "kexec_internal.h"
+#ifdef CONFIG_KEXEC_SIG
+static bool sig_enforce = IS_ENABLED(CONFIG_KEXEC_SIG_FORCE);
+
+void set_kexec_sig_enforced(void)
+{
+ sig_enforce = true;
+}
+#endif
+
+#ifdef CONFIG_IMA_KEXEC
+static bool check_ima_segment_index(struct kimage *image, int i)
+{
+ if (image->is_ima_segment_index_set && i == image->ima_segment_index)
+ return true;
+ else
+ return false;
+}
+#else
+static bool check_ima_segment_index(struct kimage *image, int i)
+{
+ return false;
+}
+#endif
+
static int kexec_calculate_store_digests(struct kimage *image);
+/* Maximum size in bytes for kernel/initrd files. */
+#define KEXEC_FILE_SIZE_MAX min_t(s64, 4LL << 30, SSIZE_MAX)
+
/*
* Currently this is the only default function that is exported as some
* architectures need it to do additional handlings.
@@ -53,13 +80,6 @@ int kexec_image_probe_default(struct kimage *image, void *buf,
return ret;
}
-/* Architectures can provide this probe function */
-int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
- unsigned long buf_len)
-{
- return kexec_image_probe_default(image, buf, buf_len);
-}
-
static void *kexec_image_load_default(struct kimage *image)
{
if (!image->fops || !image->fops->load)
@@ -71,11 +91,6 @@ static void *kexec_image_load_default(struct kimage *image)
image->cmdline_buf_len);
}
-void * __weak arch_kexec_kernel_image_load(struct kimage *image)
-{
- return kexec_image_load_default(image);
-}
-
int kexec_image_post_load_cleanup_default(struct kimage *image)
{
if (!image->fops || !image->fops->cleanup)
@@ -84,64 +99,6 @@ int kexec_image_post_load_cleanup_default(struct kimage *image)
return image->fops->cleanup(image->image_loader_data);
}
-int __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
-{
- return kexec_image_post_load_cleanup_default(image);
-}
-
-#ifdef CONFIG_KEXEC_SIG
-static int kexec_image_verify_sig_default(struct kimage *image, void *buf,
- unsigned long buf_len)
-{
- if (!image->fops || !image->fops->verify_sig) {
- pr_debug("kernel loader does not support signature verification.\n");
- return -EKEYREJECTED;
- }
-
- return image->fops->verify_sig(buf, buf_len);
-}
-
-int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
- unsigned long buf_len)
-{
- return kexec_image_verify_sig_default(image, buf, buf_len);
-}
-#endif
-
-/*
- * arch_kexec_apply_relocations_add - apply relocations of type RELA
- * @pi: Purgatory to be relocated.
- * @section: Section relocations applying to.
- * @relsec: Section containing RELAs.
- * @symtab: Corresponding symtab.
- *
- * Return: 0 on success, negative errno on error.
- */
-int __weak
-arch_kexec_apply_relocations_add(struct purgatory_info *pi, Elf_Shdr *section,
- const Elf_Shdr *relsec, const Elf_Shdr *symtab)
-{
- pr_err("RELA relocation unsupported.\n");
- return -ENOEXEC;
-}
-
-/*
- * arch_kexec_apply_relocations - apply relocations of type REL
- * @pi: Purgatory to be relocated.
- * @section: Section relocations applying to.
- * @relsec: Section containing RELs.
- * @symtab: Corresponding symtab.
- *
- * Return: 0 on success, negative errno on error.
- */
-int __weak
-arch_kexec_apply_relocations(struct purgatory_info *pi, Elf_Shdr *section,
- const Elf_Shdr *relsec, const Elf_Shdr *symtab)
-{
- pr_err("REL relocation unsupported.\n");
- return -ENOEXEC;
-}
-
/*
* Free up memory used by kernel, initrd, and command line. This is temporary
* memory allocation which is not needed any more after these buffers have
@@ -181,19 +138,49 @@ void kimage_file_post_load_cleanup(struct kimage *image)
*/
kfree(image->image_loader_data);
image->image_loader_data = NULL;
+
+ kexec_file_dbg_print = false;
}
#ifdef CONFIG_KEXEC_SIG
+#ifdef CONFIG_SIGNED_PE_FILE_VERIFICATION
+int kexec_kernel_verify_pe_sig(const char *kernel, unsigned long kernel_len)
+{
+ int ret;
+
+ ret = verify_pefile_signature(kernel, kernel_len,
+ VERIFY_USE_SECONDARY_KEYRING,
+ VERIFYING_KEXEC_PE_SIGNATURE);
+ if (ret == -ENOKEY && IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING)) {
+ ret = verify_pefile_signature(kernel, kernel_len,
+ VERIFY_USE_PLATFORM_KEYRING,
+ VERIFYING_KEXEC_PE_SIGNATURE);
+ }
+ return ret;
+}
+#endif
+
+static int kexec_image_verify_sig(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ if (!image->fops || !image->fops->verify_sig) {
+ pr_debug("kernel loader does not support signature verification.\n");
+ return -EKEYREJECTED;
+ }
+
+ return image->fops->verify_sig(buf, buf_len);
+}
+
static int
kimage_validate_signature(struct kimage *image)
{
int ret;
- ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
- image->kernel_buf_len);
+ ret = kexec_image_verify_sig(image, image->kernel_buf,
+ image->kernel_buf_len);
if (ret) {
- if (IS_ENABLED(CONFIG_KEXEC_SIG_FORCE)) {
+ if (sig_enforce) {
pr_notice("Enforced kernel signature verification failed (%d).\n", ret);
return ret;
}
@@ -214,6 +201,15 @@ kimage_validate_signature(struct kimage *image)
}
#endif
+static int kexec_post_load(struct kimage *image, unsigned long flags)
+{
+#ifdef CONFIG_IMA_KEXEC
+ if (!(flags & KEXEC_FILE_ON_CRASH))
+ ima_kexec_post_load(image);
+#endif
+ return machine_kexec_post_load(image);
+}
+
/*
* In file mode list of segments is prepared by kernel. Copy relevant
* data from user space, do error checking, prepare segment list
@@ -223,14 +219,17 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
const char __user *cmdline_ptr,
unsigned long cmdline_len, unsigned flags)
{
- int ret;
+ ssize_t ret;
void *ldata;
ret = kernel_read_file_from_fd(kernel_fd, 0, &image->kernel_buf,
- INT_MAX, NULL, READING_KEXEC_IMAGE);
+ KEXEC_FILE_SIZE_MAX, NULL,
+ READING_KEXEC_IMAGE);
if (ret < 0)
return ret;
image->kernel_buf_len = ret;
+ kexec_dprintk("kernel: %p kernel_size: %#lx\n",
+ image->kernel_buf, image->kernel_buf_len);
/* Call arch image probe handlers */
ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
@@ -247,7 +246,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
/* It is possible that there no initramfs is being loaded */
if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
ret = kernel_read_file_from_fd(initrd_fd, 0, &image->initrd_buf,
- INT_MAX, NULL,
+ KEXEC_FILE_SIZE_MAX, NULL,
READING_KEXEC_INITRAMFS);
if (ret < 0)
goto out;
@@ -255,6 +254,9 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
ret = 0;
}
+ image->no_cma = !!(flags & KEXEC_FILE_NO_CMA);
+ image->force_dtb = flags & KEXEC_FILE_FORCE_DTB;
+
if (cmdline_len) {
image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len);
if (IS_ERR(image->cmdline_buf)) {
@@ -278,8 +280,13 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
/* IMA needs to pass the measurement list to the next kernel. */
ima_add_kexec_buffer(image);
- /* Call arch image load handlers */
- ldata = arch_kexec_kernel_image_load(image);
+ /* If KHO is active, add its images to the list */
+ ret = kho_fill_kimage(image);
+ if (ret)
+ goto out;
+
+ /* Call image load handler */
+ ldata = kexec_image_load_default(image);
if (IS_ERR(ldata)) {
ret = PTR_ERR(ldata);
@@ -307,13 +314,16 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
if (!image)
return -ENOMEM;
+ kexec_file_dbg_print = !!(flags & KEXEC_FILE_DEBUG);
image->file_mode = 1;
+#ifdef CONFIG_CRASH_DUMP
if (kexec_on_panic) {
/* Enable special crash kernel control page alloc policy. */
image->control_page = crashk_res.start;
image->type = KEXEC_TYPE_CRASH;
}
+#endif
ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
cmdline_ptr, cmdline_len, flags);
@@ -355,11 +365,13 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
unsigned long, cmdline_len, const char __user *, cmdline_ptr,
unsigned long, flags)
{
- int ret = 0, i;
+ int image_type = (flags & KEXEC_FILE_ON_CRASH) ?
+ KEXEC_TYPE_CRASH : KEXEC_TYPE_DEFAULT;
struct kimage **dest_image, *image;
+ int ret = 0, i;
/* We only trust the superuser with rebooting the system. */
- if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
+ if (!kexec_load_permitted(image_type))
return -EPERM;
/* Make sure we have a legal set of flags */
@@ -368,15 +380,17 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
image = NULL;
- if (!mutex_trylock(&kexec_mutex))
+ if (!kexec_trylock())
return -EBUSY;
- dest_image = &kexec_image;
- if (flags & KEXEC_FILE_ON_CRASH) {
+#ifdef CONFIG_CRASH_DUMP
+ if (image_type == KEXEC_TYPE_CRASH) {
dest_image = &kexec_crash_image;
if (kexec_crash_image)
arch_kexec_unprotect_crashkres();
- }
+ } else
+#endif
+ dest_image = &kexec_image;
if (flags & KEXEC_FILE_UNLOAD)
goto exchange;
@@ -394,6 +408,11 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
if (ret)
goto out;
+#ifdef CONFIG_CRASH_HOTPLUG
+ if ((flags & KEXEC_FILE_ON_CRASH) && arch_crash_hotplug_support(image, flags))
+ image->hotplug_support = 1;
+#endif
+
ret = machine_kexec_prepare(image);
if (ret)
goto out;
@@ -410,25 +429,28 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
if (ret)
goto out;
+ kexec_dprintk("nr_segments = %lu\n", image->nr_segments);
for (i = 0; i < image->nr_segments; i++) {
struct kexec_segment *ksegment;
ksegment = &image->segment[i];
- pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
- i, ksegment->buf, ksegment->bufsz, ksegment->mem,
- ksegment->memsz);
+ kexec_dprintk("segment[%d]: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
+ i, ksegment->buf, ksegment->bufsz, ksegment->mem,
+ ksegment->memsz);
- ret = kimage_load_segment(image, &image->segment[i]);
+ ret = kimage_load_segment(image, i);
if (ret)
goto out;
}
kimage_terminate(image);
- ret = machine_kexec_post_load(image);
+ ret = kexec_post_load(image, flags);
if (ret)
goto out;
+ kexec_dprintk("kexec_file_load: type:%u, start:0x%lx head:0x%lx flags:0x%lx\n",
+ image->type, image->start, image->head, flags);
/*
* Free up any temporary buffers allocated which are not needed
* after image has been loaded
@@ -437,10 +459,12 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
exchange:
image = xchg(dest_image, image);
out:
+#ifdef CONFIG_CRASH_DUMP
if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image)
arch_kexec_protect_crashkres();
+#endif
- mutex_unlock(&kexec_mutex);
+ kexec_unlock();
kimage_free(image);
return ret;
}
@@ -452,11 +476,12 @@ static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
unsigned long temp_start, temp_end;
temp_end = min(end, kbuf->buf_max);
- temp_start = temp_end - kbuf->memsz;
+ temp_start = temp_end - kbuf->memsz + 1;
+ kexec_random_range_start(temp_start, temp_end, kbuf, &temp_start);
do {
/* align down start */
- temp_start = temp_start & (~(kbuf->buf_align - 1));
+ temp_start = ALIGN_DOWN(temp_start, kbuf->buf_align);
if (temp_start < start || temp_start < kbuf->buf_min)
return 0;
@@ -472,6 +497,12 @@ static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
continue;
}
+ /* Make sure this does not conflict with exclude range */
+ if (arch_check_excluded_range(image, temp_start, temp_end)) {
+ temp_start = temp_start - PAGE_SIZE;
+ continue;
+ }
+
/* We found a suitable memory range */
break;
} while (1);
@@ -491,6 +522,8 @@ static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
temp_start = max(start, kbuf->buf_min);
+ kexec_random_range_start(temp_start, end, kbuf, &temp_start);
+
do {
temp_start = ALIGN(temp_start, kbuf->buf_align);
temp_end = temp_start + kbuf->memsz - 1;
@@ -506,6 +539,12 @@ static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
continue;
}
+ /* Make sure this does not conflict with exclude range */
+ if (arch_check_excluded_range(image, temp_start, temp_end)) {
+ temp_start = temp_start + PAGE_SIZE;
+ continue;
+ }
+
/* We found a suitable memory range */
break;
} while (1);
@@ -553,8 +592,10 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf,
phys_addr_t mstart, mend;
struct resource res = { };
+#ifdef CONFIG_CRASH_DUMP
if (kbuf->image->type == KEXEC_TYPE_CRASH)
return func(&crashk_res, kbuf);
+#endif
/*
* Using MEMBLOCK_NONE will properly skip MEMBLOCK_DRIVER_MANAGED. See
@@ -613,15 +654,56 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf,
static int kexec_walk_resources(struct kexec_buf *kbuf,
int (*func)(struct resource *, void *))
{
+#ifdef CONFIG_CRASH_DUMP
if (kbuf->image->type == KEXEC_TYPE_CRASH)
return walk_iomem_res_desc(crashk_res.desc,
IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
crashk_res.start, crashk_res.end,
kbuf, func);
+#endif
+ if (kbuf->top_down)
+ return walk_system_ram_res_rev(0, ULONG_MAX, kbuf, func);
else
return walk_system_ram_res(0, ULONG_MAX, kbuf, func);
}
+static int kexec_alloc_contig(struct kexec_buf *kbuf)
+{
+ size_t nr_pages = kbuf->memsz >> PAGE_SHIFT;
+ unsigned long mem;
+ struct page *p;
+
+ /* User space disabled CMA allocations, bail out. */
+ if (kbuf->image->no_cma)
+ return -EPERM;
+
+ /* Skip CMA logic for crash kernel */
+ if (kbuf->image->type == KEXEC_TYPE_CRASH)
+ return -EPERM;
+
+ p = dma_alloc_from_contiguous(NULL, nr_pages, get_order(kbuf->buf_align), true);
+ if (!p)
+ return -ENOMEM;
+
+ pr_debug("allocated %zu DMA pages at 0x%lx", nr_pages, page_to_boot_pfn(p));
+
+ mem = page_to_boot_pfn(p) << PAGE_SHIFT;
+
+ if (kimage_is_destination_range(kbuf->image, mem, mem + kbuf->memsz)) {
+ /* Our region is already in use by a statically defined one. Bail out. */
+ pr_debug("CMA overlaps existing mem: 0x%lx+0x%lx\n", mem, kbuf->memsz);
+ dma_release_from_contiguous(NULL, p, nr_pages);
+ return -EBUSY;
+ }
+
+ kbuf->mem = page_to_boot_pfn(p) << PAGE_SHIFT;
+ kbuf->cma = p;
+
+ arch_kexec_post_alloc_pages(page_address(p), (int)nr_pages, 0);
+
+ return 0;
+}
+
/**
* kexec_locate_mem_hole - find free memory for the purgatory or the next kernel
* @kbuf: Parameters for the memory search.
@@ -638,6 +720,21 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN)
return 0;
+ /*
+ * If KHO is active, only use KHO scratch memory. All other memory
+ * could potentially be handed over.
+ */
+ ret = kho_locate_mem_hole(kbuf, locate_mem_hole_callback);
+ if (ret <= 0)
+ return ret;
+
+ /*
+ * Try to find a free physically contiguous block of memory first. With that, we
+ * can avoid any copying at kexec time.
+ */
+ if (!kexec_alloc_contig(kbuf))
+ return 0;
+
if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
ret = kexec_walk_resources(kbuf, locate_mem_hole_callback);
else
@@ -647,23 +744,10 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
}
/**
- * arch_kexec_locate_mem_hole - Find free memory to place the segments.
- * @kbuf: Parameters for the memory search.
- *
- * On success, kbuf->mem will have the start address of the memory region found.
- *
- * Return: 0 on success, negative errno on error.
- */
-int __weak arch_kexec_locate_mem_hole(struct kexec_buf *kbuf)
-{
- return kexec_locate_mem_hole(kbuf);
-}
-
-/**
* kexec_add_buffer - place a buffer in a kexec segment
* @kbuf: Buffer contents and memory parameters.
*
- * This function assumes that kexec_mutex is held.
+ * This function assumes that kexec_lock is held.
* On successful return, @kbuf->mem will have the physical address of
* the buffer in memory.
*
@@ -696,6 +780,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf)
/* Ensure minimum alignment needed for segments. */
kbuf->memsz = ALIGN(kbuf->memsz, PAGE_SIZE);
kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE);
+ kbuf->cma = NULL;
/* Walk the RAM ranges and allocate a suitable range for the buffer */
ret = arch_kexec_locate_mem_hole(kbuf);
@@ -708,6 +793,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf)
ksegment->bufsz = kbuf->bufsz;
ksegment->mem = kbuf->mem;
ksegment->memsz = kbuf->memsz;
+ kbuf->image->segment_cma[kbuf->image->nr_segments] = kbuf->cma;
kbuf->image->nr_segments++;
return 0;
}
@@ -715,56 +801,36 @@ int kexec_add_buffer(struct kexec_buf *kbuf)
/* Calculate and store the digest of segments */
static int kexec_calculate_store_digests(struct kimage *image)
{
- struct crypto_shash *tfm;
- struct shash_desc *desc;
+ struct sha256_ctx sctx;
int ret = 0, i, j, zero_buf_sz, sha_region_sz;
- size_t desc_size, nullsz;
- char *digest;
+ size_t nullsz;
+ u8 digest[SHA256_DIGEST_SIZE];
void *zero_buf;
struct kexec_sha_region *sha_regions;
struct purgatory_info *pi = &image->purgatory_info;
- if (!IS_ENABLED(CONFIG_ARCH_HAS_KEXEC_PURGATORY))
+ if (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY))
return 0;
zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
zero_buf_sz = PAGE_SIZE;
- tfm = crypto_alloc_shash("sha256", 0, 0);
- if (IS_ERR(tfm)) {
- ret = PTR_ERR(tfm);
- goto out;
- }
-
- desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
- desc = kzalloc(desc_size, GFP_KERNEL);
- if (!desc) {
- ret = -ENOMEM;
- goto out_free_tfm;
- }
-
sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
sha_regions = vzalloc(sha_region_sz);
- if (!sha_regions) {
- ret = -ENOMEM;
- goto out_free_desc;
- }
-
- desc->tfm = tfm;
-
- ret = crypto_shash_init(desc);
- if (ret < 0)
- goto out_free_sha_regions;
+ if (!sha_regions)
+ return -ENOMEM;
- digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
- if (!digest) {
- ret = -ENOMEM;
- goto out_free_sha_regions;
- }
+ sha256_init(&sctx);
for (j = i = 0; i < image->nr_segments; i++) {
struct kexec_segment *ksegment;
+#ifdef CONFIG_CRASH_HOTPLUG
+ /* Exclude elfcorehdr segment to allow future changes via hotplug */
+ if (i == image->elfcorehdr_index)
+ continue;
+#endif
+
ksegment = &image->segment[i];
/*
* Skip purgatory as it will be modified once we put digest
@@ -773,10 +839,14 @@ static int kexec_calculate_store_digests(struct kimage *image)
if (ksegment->kbuf == pi->purgatory_buf)
continue;
- ret = crypto_shash_update(desc, ksegment->kbuf,
- ksegment->bufsz);
- if (ret)
- break;
+ /*
+ * Skip the segment if ima_segment_index is set and matches
+ * the current index
+ */
+ if (check_ima_segment_index(image, i))
+ continue;
+
+ sha256_update(&sctx, ksegment->kbuf, ksegment->bufsz);
/*
* Assume rest of the buffer is filled with zero and
@@ -788,48 +858,30 @@ static int kexec_calculate_store_digests(struct kimage *image)
if (bytes > zero_buf_sz)
bytes = zero_buf_sz;
- ret = crypto_shash_update(desc, zero_buf, bytes);
- if (ret)
- break;
+ sha256_update(&sctx, zero_buf, bytes);
nullsz -= bytes;
}
- if (ret)
- break;
-
sha_regions[j].start = ksegment->mem;
sha_regions[j].len = ksegment->memsz;
j++;
}
- if (!ret) {
- ret = crypto_shash_final(desc, digest);
- if (ret)
- goto out_free_digest;
- ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha_regions",
- sha_regions, sha_region_sz, 0);
- if (ret)
- goto out_free_digest;
+ sha256_final(&sctx, digest);
- ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha256_digest",
- digest, SHA256_DIGEST_SIZE, 0);
- if (ret)
- goto out_free_digest;
- }
+ ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha_regions",
+ sha_regions, sha_region_sz, 0);
+ if (ret)
+ goto out_free_sha_regions;
-out_free_digest:
- kfree(digest);
+ ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha256_digest",
+ digest, SHA256_DIGEST_SIZE, 0);
out_free_sha_regions:
vfree(sha_regions);
-out_free_desc:
- kfree(desc);
-out_free_tfm:
- kfree(tfm);
-out:
return ret;
}
-#ifdef CONFIG_ARCH_HAS_KEXEC_PURGATORY
+#ifdef CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY
/*
* kexec_purgatory_setup_kbuf - prepare buffer to load purgatory.
* @pi: Purgatory to be loaded.
@@ -906,6 +958,7 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi,
{
unsigned long bss_addr;
unsigned long offset;
+ size_t sechdrs_size;
Elf_Shdr *sechdrs;
int i;
@@ -913,11 +966,11 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi,
* The section headers in kexec_purgatory are read-only. In order to
* have them modifiable make a temporary copy.
*/
- sechdrs = vzalloc(array_size(sizeof(Elf_Shdr), pi->ehdr->e_shnum));
+ sechdrs_size = array_size(sizeof(Elf_Shdr), pi->ehdr->e_shnum);
+ sechdrs = vzalloc(sechdrs_size);
if (!sechdrs)
return -ENOMEM;
- memcpy(sechdrs, (void *)pi->ehdr + pi->ehdr->e_shoff,
- pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+ memcpy(sechdrs, (void *)pi->ehdr + pi->ehdr->e_shoff, sechdrs_size);
pi->sechdrs = sechdrs;
offset = 0;
@@ -940,10 +993,22 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi,
}
offset = ALIGN(offset, align);
+
+ /*
+ * Check if the segment contains the entry point, if so,
+ * calculate the value of image->start based on it.
+ * If the compiler has produced more than one .text section
+ * (Eg: .text.hot), they are generally after the main .text
+ * section, and they shall not be used to calculate
+ * image->start. So do not re-calculate image->start if it
+ * is not set to the initial value, and warn the user so they
+ * have a chance to fix their purgatory's linker script.
+ */
if (sechdrs[i].sh_flags & SHF_EXECINSTR &&
pi->ehdr->e_entry >= sechdrs[i].sh_addr &&
pi->ehdr->e_entry < (sechdrs[i].sh_addr
- + sechdrs[i].sh_size)) {
+ + sechdrs[i].sh_size) &&
+ !WARN_ON(kbuf->image->start != pi->ehdr->e_entry)) {
kbuf->image->start -= sechdrs[i].sh_addr;
kbuf->image->start += kbuf->mem + offset;
}
@@ -1176,185 +1241,4 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
return 0;
}
-#endif /* CONFIG_ARCH_HAS_KEXEC_PURGATORY */
-
-int crash_exclude_mem_range(struct crash_mem *mem,
- unsigned long long mstart, unsigned long long mend)
-{
- int i, j;
- unsigned long long start, end, p_start, p_end;
- struct crash_mem_range temp_range = {0, 0};
-
- for (i = 0; i < mem->nr_ranges; i++) {
- start = mem->ranges[i].start;
- end = mem->ranges[i].end;
- p_start = mstart;
- p_end = mend;
-
- if (mstart > end || mend < start)
- continue;
-
- /* Truncate any area outside of range */
- if (mstart < start)
- p_start = start;
- if (mend > end)
- p_end = end;
-
- /* Found completely overlapping range */
- if (p_start == start && p_end == end) {
- mem->ranges[i].start = 0;
- mem->ranges[i].end = 0;
- if (i < mem->nr_ranges - 1) {
- /* Shift rest of the ranges to left */
- for (j = i; j < mem->nr_ranges - 1; j++) {
- mem->ranges[j].start =
- mem->ranges[j+1].start;
- mem->ranges[j].end =
- mem->ranges[j+1].end;
- }
-
- /*
- * Continue to check if there are another overlapping ranges
- * from the current position because of shifting the above
- * mem ranges.
- */
- i--;
- mem->nr_ranges--;
- continue;
- }
- mem->nr_ranges--;
- return 0;
- }
-
- if (p_start > start && p_end < end) {
- /* Split original range */
- mem->ranges[i].end = p_start - 1;
- temp_range.start = p_end + 1;
- temp_range.end = end;
- } else if (p_start != start)
- mem->ranges[i].end = p_start - 1;
- else
- mem->ranges[i].start = p_end + 1;
- break;
- }
-
- /* If a split happened, add the split to array */
- if (!temp_range.end)
- return 0;
-
- /* Split happened */
- if (i == mem->max_nr_ranges - 1)
- return -ENOMEM;
-
- /* Location where new range should go */
- j = i + 1;
- if (j < mem->nr_ranges) {
- /* Move over all ranges one slot towards the end */
- for (i = mem->nr_ranges - 1; i >= j; i--)
- mem->ranges[i + 1] = mem->ranges[i];
- }
-
- mem->ranges[j].start = temp_range.start;
- mem->ranges[j].end = temp_range.end;
- mem->nr_ranges++;
- return 0;
-}
-
-int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
- void **addr, unsigned long *sz)
-{
- Elf64_Ehdr *ehdr;
- Elf64_Phdr *phdr;
- unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz;
- unsigned char *buf;
- unsigned int cpu, i;
- unsigned long long notes_addr;
- unsigned long mstart, mend;
-
- /* extra phdr for vmcoreinfo ELF note */
- nr_phdr = nr_cpus + 1;
- nr_phdr += mem->nr_ranges;
-
- /*
- * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping
- * area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64).
- * I think this is required by tools like gdb. So same physical
- * memory will be mapped in two ELF headers. One will contain kernel
- * text virtual addresses and other will have __va(physical) addresses.
- */
-
- nr_phdr++;
- elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr);
- elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN);
-
- buf = vzalloc(elf_sz);
- if (!buf)
- return -ENOMEM;
-
- ehdr = (Elf64_Ehdr *)buf;
- phdr = (Elf64_Phdr *)(ehdr + 1);
- memcpy(ehdr->e_ident, ELFMAG, SELFMAG);
- ehdr->e_ident[EI_CLASS] = ELFCLASS64;
- ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
- ehdr->e_ident[EI_VERSION] = EV_CURRENT;
- ehdr->e_ident[EI_OSABI] = ELF_OSABI;
- memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD);
- ehdr->e_type = ET_CORE;
- ehdr->e_machine = ELF_ARCH;
- ehdr->e_version = EV_CURRENT;
- ehdr->e_phoff = sizeof(Elf64_Ehdr);
- ehdr->e_ehsize = sizeof(Elf64_Ehdr);
- ehdr->e_phentsize = sizeof(Elf64_Phdr);
-
- /* Prepare one phdr of type PT_NOTE for each present CPU */
- for_each_present_cpu(cpu) {
- phdr->p_type = PT_NOTE;
- notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu));
- phdr->p_offset = phdr->p_paddr = notes_addr;
- phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t);
- (ehdr->e_phnum)++;
- phdr++;
- }
-
- /* Prepare one PT_NOTE header for vmcoreinfo */
- phdr->p_type = PT_NOTE;
- phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
- phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE;
- (ehdr->e_phnum)++;
- phdr++;
-
- /* Prepare PT_LOAD type program header for kernel text region */
- if (kernel_map) {
- phdr->p_type = PT_LOAD;
- phdr->p_flags = PF_R|PF_W|PF_X;
- phdr->p_vaddr = (unsigned long) _text;
- phdr->p_filesz = phdr->p_memsz = _end - _text;
- phdr->p_offset = phdr->p_paddr = __pa_symbol(_text);
- ehdr->e_phnum++;
- phdr++;
- }
-
- /* Go through all the ranges in mem->ranges[] and prepare phdr */
- for (i = 0; i < mem->nr_ranges; i++) {
- mstart = mem->ranges[i].start;
- mend = mem->ranges[i].end;
-
- phdr->p_type = PT_LOAD;
- phdr->p_flags = PF_R|PF_W|PF_X;
- phdr->p_offset = mstart;
-
- phdr->p_paddr = mstart;
- phdr->p_vaddr = (unsigned long) __va(mstart);
- phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
- phdr->p_align = 0;
- ehdr->e_phnum++;
- pr_debug("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
- phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
- ehdr->e_phnum, phdr->p_offset);
- phdr++;
- }
-
- *addr = buf;
- *sz = elf_sz;
- return 0;
-}
+#endif /* CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY */
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index 48aaf2ac0d0d..228bb88c018b 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -4,16 +4,32 @@
#include <linux/kexec.h>
+struct kexec_segment;
+
struct kimage *do_kimage_alloc_init(void);
int sanity_check_segment_list(struct kimage *image);
void kimage_free_page_list(struct list_head *list);
void kimage_free(struct kimage *image);
-int kimage_load_segment(struct kimage *image, struct kexec_segment *segment);
+int kimage_load_segment(struct kimage *image, int idx);
void kimage_terminate(struct kimage *image);
int kimage_is_destination_range(struct kimage *image,
unsigned long start, unsigned long end);
-extern struct mutex kexec_mutex;
+/*
+ * Whatever is used to serialize accesses to the kexec_crash_image needs to be
+ * NMI safe, as __crash_kexec() can happen during nmi_panic(), so here we use a
+ * "simple" atomic variable that is acquired with a cmpxchg().
+ */
+extern atomic_t __kexec_lock;
+static inline bool kexec_trylock(void)
+{
+ int old = 0;
+ return atomic_try_cmpxchg_acquire(&__kexec_lock, &old, 1);
+}
+static inline void kexec_unlock(void)
+{
+ atomic_set_release(&__kexec_lock, 0);
+}
#ifdef CONFIG_KEXEC_FILE
#include <linux/purgatory.h>
@@ -23,4 +39,20 @@ extern size_t kexec_purgatory_size;
#else /* CONFIG_KEXEC_FILE */
static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
#endif /* CONFIG_KEXEC_FILE */
+
+struct kexec_buf;
+
+#ifdef CONFIG_KEXEC_HANDOVER
+int kho_locate_mem_hole(struct kexec_buf *kbuf,
+ int (*func)(struct resource *, void *));
+int kho_fill_kimage(struct kimage *image);
+#else
+static inline int kho_locate_mem_hole(struct kexec_buf *kbuf,
+ int (*func)(struct resource *, void *))
+{
+ return 1;
+}
+
+static inline int kho_fill_kimage(struct kimage *image) { return 0; }
+#endif /* CONFIG_KEXEC_HANDOVER */
#endif /* LINUX_KEXEC_INTERNAL_H */
diff --git a/kernel/kheaders.c b/kernel/kheaders.c
index 8f69772af77b..378088b07f46 100644
--- a/kernel/kheaders.c
+++ b/kernel/kheaders.c
@@ -26,30 +26,17 @@ asm (
" .popsection \n"
);
-extern char kernel_headers_data;
-extern char kernel_headers_data_end;
+extern char kernel_headers_data[];
+extern char kernel_headers_data_end[];
-static ssize_t
-ikheaders_read(struct file *file, struct kobject *kobj,
- struct bin_attribute *bin_attr,
- char *buf, loff_t off, size_t len)
-{
- memcpy(buf, &kernel_headers_data + off, len);
- return len;
-}
-
-static struct bin_attribute kheaders_attr __ro_after_init = {
- .attr = {
- .name = "kheaders.tar.xz",
- .mode = 0444,
- },
- .read = &ikheaders_read,
-};
+static struct bin_attribute kheaders_attr __ro_after_init =
+ __BIN_ATTR_SIMPLE_RO(kheaders.tar.xz, 0444);
static int __init ikheaders_init(void)
{
- kheaders_attr.size = (&kernel_headers_data_end -
- &kernel_headers_data);
+ kheaders_attr.private = kernel_headers_data;
+ kheaders_attr.size = (kernel_headers_data_end -
+ kernel_headers_data);
return sysfs_create_bin_file(kernel_kobj, &kheaders_attr);
}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 94cab8c9ce56..ab8f9fc1f0d1 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -26,7 +26,6 @@
#include <linux/slab.h>
#include <linux/stddef.h>
#include <linux/export.h>
-#include <linux/moduleloader.h>
#include <linux/kallsyms.h>
#include <linux/freezer.h>
#include <linux/seq_file.h>
@@ -39,6 +38,8 @@
#include <linux/jump_label.h>
#include <linux/static_call.h>
#include <linux/perf_event.h>
+#include <linux/execmem.h>
+#include <linux/cleanup.h>
#include <asm/sections.h>
#include <asm/cacheflush.h>
@@ -95,10 +96,6 @@ struct kprobe_insn_page {
char slot_used[];
};
-#define KPROBE_INSN_PAGE_SIZE(slots) \
- (offsetof(struct kprobe_insn_page, slot_used) + \
- (sizeof(char) * (slots)))
-
static int slots_per_page(struct kprobe_insn_cache *c)
{
return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t));
@@ -113,17 +110,17 @@ enum kprobe_slot_state {
void __weak *alloc_insn_page(void)
{
/*
- * Use module_alloc() so this page is within +/- 2GB of where the
+ * Use execmem_alloc() so this page is within +/- 2GB of where the
* kernel image and loaded module images reside. This is required
* for most of the architectures.
* (e.g. x86-64 needs this to handle the %rip-relative fixups.)
*/
- return module_alloc(PAGE_SIZE);
+ return execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
}
static void free_insn_page(void *page)
{
- module_memfree(page);
+ execmem_free(page);
}
struct kprobe_insn_cache kprobe_insn_slots = {
@@ -138,51 +135,49 @@ struct kprobe_insn_cache kprobe_insn_slots = {
static int collect_garbage_slots(struct kprobe_insn_cache *c);
/**
- * __get_insn_slot() - Find a slot on an executable page for an instruction.
- * We allocate an executable page if there's no room on existing ones.
+ * __get_insn_slot - Find a slot on an executable page for an instruction.
+ * @c: Pointer to kprobe instruction cache
+ *
+ * Description: Locates available slot on existing executable pages,
+ * allocates an executable page if there's no room on existing ones.
+ * Return: Pointer to instruction slot on success, NULL on failure.
*/
kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
{
struct kprobe_insn_page *kip;
- kprobe_opcode_t *slot = NULL;
/* Since the slot array is not protected by rcu, we need a mutex */
- mutex_lock(&c->mutex);
- retry:
- rcu_read_lock();
- list_for_each_entry_rcu(kip, &c->pages, list) {
- if (kip->nused < slots_per_page(c)) {
- int i;
-
- for (i = 0; i < slots_per_page(c); i++) {
- if (kip->slot_used[i] == SLOT_CLEAN) {
- kip->slot_used[i] = SLOT_USED;
- kip->nused++;
- slot = kip->insns + (i * c->insn_size);
- rcu_read_unlock();
- goto out;
+ guard(mutex)(&c->mutex);
+ do {
+ guard(rcu)();
+ list_for_each_entry_rcu(kip, &c->pages, list) {
+ if (kip->nused < slots_per_page(c)) {
+ int i;
+
+ for (i = 0; i < slots_per_page(c); i++) {
+ if (kip->slot_used[i] == SLOT_CLEAN) {
+ kip->slot_used[i] = SLOT_USED;
+ kip->nused++;
+ return kip->insns + (i * c->insn_size);
+ }
}
+ /* kip->nused is broken. Fix it. */
+ kip->nused = slots_per_page(c);
+ WARN_ON(1);
}
- /* kip->nused is broken. Fix it. */
- kip->nused = slots_per_page(c);
- WARN_ON(1);
}
- }
- rcu_read_unlock();
-
/* If there are any garbage slots, collect it and try again. */
- if (c->nr_garbage && collect_garbage_slots(c) == 0)
- goto retry;
+ } while (c->nr_garbage && collect_garbage_slots(c) == 0);
/* All out of space. Need to allocate a new page. */
- kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL);
+ kip = kmalloc(struct_size(kip, slot_used, slots_per_page(c)), GFP_KERNEL);
if (!kip)
- goto out;
+ return NULL;
kip->insns = c->alloc();
if (!kip->insns) {
kfree(kip);
- goto out;
+ return NULL;
}
INIT_LIST_HEAD(&kip->list);
memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c));
@@ -191,14 +186,12 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
kip->ngarbage = 0;
kip->cache = c;
list_add_rcu(&kip->list, &c->pages);
- slot = kip->insns;
/* Record the perf ksymbol register event after adding the page */
perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, (unsigned long)kip->insns,
PAGE_SIZE, false, c->sym);
-out:
- mutex_unlock(&c->mutex);
- return slot;
+
+ return kip->insns;
}
/* Return true if all garbages are collected, otherwise false. */
@@ -206,29 +199,29 @@ static bool collect_one_slot(struct kprobe_insn_page *kip, int idx)
{
kip->slot_used[idx] = SLOT_CLEAN;
kip->nused--;
- if (kip->nused == 0) {
+ if (kip->nused != 0)
+ return false;
+
+ /*
+ * Page is no longer in use. Free it unless
+ * it's the last one. We keep the last one
+ * so as not to have to set it up again the
+ * next time somebody inserts a probe.
+ */
+ if (!list_is_singular(&kip->list)) {
/*
- * Page is no longer in use. Free it unless
- * it's the last one. We keep the last one
- * so as not to have to set it up again the
- * next time somebody inserts a probe.
+ * Record perf ksymbol unregister event before removing
+ * the page.
*/
- if (!list_is_singular(&kip->list)) {
- /*
- * Record perf ksymbol unregister event before removing
- * the page.
- */
- perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
- (unsigned long)kip->insns, PAGE_SIZE, true,
- kip->cache->sym);
- list_del_rcu(&kip->list);
- synchronize_rcu();
- kip->cache->free(kip->insns);
- kfree(kip);
- }
- return true;
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
+ (unsigned long)kip->insns, PAGE_SIZE, true,
+ kip->cache->sym);
+ list_del_rcu(&kip->list);
+ synchronize_rcu();
+ kip->cache->free(kip->insns);
+ kfree(kip);
}
- return false;
+ return true;
}
static int collect_garbage_slots(struct kprobe_insn_cache *c)
@@ -253,25 +246,35 @@ static int collect_garbage_slots(struct kprobe_insn_cache *c)
return 0;
}
-void __free_insn_slot(struct kprobe_insn_cache *c,
- kprobe_opcode_t *slot, int dirty)
+static long __find_insn_page(struct kprobe_insn_cache *c,
+ kprobe_opcode_t *slot, struct kprobe_insn_page **pkip)
{
- struct kprobe_insn_page *kip;
+ struct kprobe_insn_page *kip = NULL;
long idx;
- mutex_lock(&c->mutex);
- rcu_read_lock();
+ guard(rcu)();
list_for_each_entry_rcu(kip, &c->pages, list) {
idx = ((long)slot - (long)kip->insns) /
(c->insn_size * sizeof(kprobe_opcode_t));
- if (idx >= 0 && idx < slots_per_page(c))
- goto out;
+ if (idx >= 0 && idx < slots_per_page(c)) {
+ *pkip = kip;
+ return idx;
+ }
}
/* Could not find this slot. */
WARN_ON(1);
- kip = NULL;
-out:
- rcu_read_unlock();
+ *pkip = NULL;
+ return -1;
+}
+
+void __free_insn_slot(struct kprobe_insn_cache *c,
+ kprobe_opcode_t *slot, int dirty)
+{
+ struct kprobe_insn_page *kip = NULL;
+ long idx;
+
+ guard(mutex)(&c->mutex);
+ idx = __find_insn_page(c, slot, &kip);
/* Mark and sweep: this may sleep */
if (kip) {
/* Check double free */
@@ -285,7 +288,6 @@ out:
collect_one_slot(kip, idx);
}
}
- mutex_unlock(&c->mutex);
}
/*
@@ -353,8 +355,8 @@ struct kprobe_insn_cache kprobe_optinsn_slots = {
/* .insn_size is initialized later */
.nr_garbage = 0,
};
-#endif
-#endif
+#endif /* CONFIG_OPTPROBES */
+#endif /* __ARCH_WANT_KPROBES_INSN_SLOT */
/* We have preemption disabled.. so it is safe to use __ versions */
static inline void set_kprobe_instance(struct kprobe *kp)
@@ -458,7 +460,7 @@ static inline int kprobe_optready(struct kprobe *p)
}
/* Return true if the kprobe is disarmed. Note: p must be on hash list */
-static inline bool kprobe_disarmed(struct kprobe *p)
+bool kprobe_disarmed(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -555,17 +557,15 @@ static void do_unoptimize_kprobes(void)
/* See comment in do_optimize_kprobes() */
lockdep_assert_cpus_held();
- /* Unoptimization must be done anytime */
- if (list_empty(&unoptimizing_list))
- return;
+ if (!list_empty(&unoptimizing_list))
+ arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
- arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
- /* Loop on 'freeing_list' for disarming */
+ /* Loop on 'freeing_list' for disarming and removing from kprobe hash list */
list_for_each_entry_safe(op, tmp, &freeing_list, list) {
/* Switching from detour code to origin */
op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
- /* Disarm probes if marked disabled */
- if (kprobe_disabled(&op->kp))
+ /* Disarm probes if marked disabled and not gone */
+ if (kprobe_disabled(&op->kp) && !kprobe_gone(&op->kp))
arch_disarm_kprobe(&op->kp);
if (kprobe_unused(&op->kp)) {
/*
@@ -606,47 +606,43 @@ static void kick_kprobe_optimizer(void)
/* Kprobe jump optimizer */
static void kprobe_optimizer(struct work_struct *work)
{
- mutex_lock(&kprobe_mutex);
- cpus_read_lock();
- mutex_lock(&text_mutex);
+ guard(mutex)(&kprobe_mutex);
- /*
- * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
- * kprobes before waiting for quiesence period.
- */
- do_unoptimize_kprobes();
+ scoped_guard(cpus_read_lock) {
+ guard(mutex)(&text_mutex);
- /*
- * Step 2: Wait for quiesence period to ensure all potentially
- * preempted tasks to have normally scheduled. Because optprobe
- * may modify multiple instructions, there is a chance that Nth
- * instruction is preempted. In that case, such tasks can return
- * to 2nd-Nth byte of jump instruction. This wait is for avoiding it.
- * Note that on non-preemptive kernel, this is transparently converted
- * to synchronoze_sched() to wait for all interrupts to have completed.
- */
- synchronize_rcu_tasks();
+ /*
+ * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
+ * kprobes before waiting for quiesence period.
+ */
+ do_unoptimize_kprobes();
- /* Step 3: Optimize kprobes after quiesence period */
- do_optimize_kprobes();
+ /*
+ * Step 2: Wait for quiesence period to ensure all potentially
+ * preempted tasks to have normally scheduled. Because optprobe
+ * may modify multiple instructions, there is a chance that Nth
+ * instruction is preempted. In that case, such tasks can return
+ * to 2nd-Nth byte of jump instruction. This wait is for avoiding it.
+ * Note that on non-preemptive kernel, this is transparently converted
+ * to synchronoze_sched() to wait for all interrupts to have completed.
+ */
+ synchronize_rcu_tasks();
- /* Step 4: Free cleaned kprobes after quiesence period */
- do_free_cleaned_kprobes();
+ /* Step 3: Optimize kprobes after quiesence period */
+ do_optimize_kprobes();
- mutex_unlock(&text_mutex);
- cpus_read_unlock();
+ /* Step 4: Free cleaned kprobes after quiesence period */
+ do_free_cleaned_kprobes();
+ }
/* Step 5: Kick optimizer again if needed */
if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
kick_kprobe_optimizer();
-
- mutex_unlock(&kprobe_mutex);
}
-/* Wait for completing optimization and unoptimization */
-void wait_for_kprobe_optimizer(void)
+static void wait_for_kprobe_optimizer_locked(void)
{
- mutex_lock(&kprobe_mutex);
+ lockdep_assert_held(&kprobe_mutex);
while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) {
mutex_unlock(&kprobe_mutex);
@@ -658,11 +654,17 @@ void wait_for_kprobe_optimizer(void)
mutex_lock(&kprobe_mutex);
}
+}
- mutex_unlock(&kprobe_mutex);
+/* Wait for completing optimization and unoptimization */
+void wait_for_kprobe_optimizer(void)
+{
+ guard(mutex)(&kprobe_mutex);
+
+ wait_for_kprobe_optimizer_locked();
}
-static bool optprobe_queued_unopt(struct optimized_kprobe *op)
+bool optprobe_queued_unopt(struct optimized_kprobe *op)
{
struct optimized_kprobe *_op;
@@ -797,14 +799,13 @@ static void kill_optimized_kprobe(struct kprobe *p)
op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
if (kprobe_unused(p)) {
- /* Enqueue if it is unused */
- list_add(&op->list, &freeing_list);
/*
- * Remove unused probes from the hash list. After waiting
- * for synchronization, this probe is reclaimed.
- * (reclaiming is done by do_free_cleaned_kprobes().)
+ * Unused kprobe is on unoptimizing or freeing list. We move it
+ * to freeing_list and let the kprobe_optimizer() remove it from
+ * the kprobe hash list and free it.
*/
- hlist_del_rcu(&op->kp.hlist);
+ if (optprobe_queued_unopt(op))
+ list_move(&op->list, &freeing_list);
}
/* Don't touch the code, because it is already freed. */
@@ -859,29 +860,24 @@ static void try_to_optimize_kprobe(struct kprobe *p)
return;
/* For preparing optimization, jump_label_text_reserved() is called. */
- cpus_read_lock();
- jump_label_lock();
- mutex_lock(&text_mutex);
+ guard(cpus_read_lock)();
+ guard(jump_label_lock)();
+ guard(mutex)(&text_mutex);
ap = alloc_aggr_kprobe(p);
if (!ap)
- goto out;
+ return;
op = container_of(ap, struct optimized_kprobe, kp);
if (!arch_prepared_optinsn(&op->optinsn)) {
/* If failed to setup optimizing, fallback to kprobe. */
arch_remove_optimized_kprobe(op);
kfree(op);
- goto out;
+ return;
}
init_aggr_kprobe(ap, p);
optimize_kprobe(ap); /* This just kicks optimizer thread. */
-
-out:
- mutex_unlock(&text_mutex);
- jump_label_unlock();
- cpus_read_unlock();
}
static void optimize_all_kprobes(void)
@@ -890,10 +886,10 @@ static void optimize_all_kprobes(void)
struct kprobe *p;
unsigned int i;
- mutex_lock(&kprobe_mutex);
+ guard(mutex)(&kprobe_mutex);
/* If optimization is already allowed, just return. */
if (kprobes_allow_optimization)
- goto out;
+ return;
cpus_read_lock();
kprobes_allow_optimization = true;
@@ -905,8 +901,6 @@ static void optimize_all_kprobes(void)
}
cpus_read_unlock();
pr_info("kprobe jump-optimization is enabled. All kprobes are optimized if possible.\n");
-out:
- mutex_unlock(&kprobe_mutex);
}
#ifdef CONFIG_SYSCTL
@@ -916,12 +910,10 @@ static void unoptimize_all_kprobes(void)
struct kprobe *p;
unsigned int i;
- mutex_lock(&kprobe_mutex);
+ guard(mutex)(&kprobe_mutex);
/* If optimization is already prohibited, just return. */
- if (!kprobes_allow_optimization) {
- mutex_unlock(&kprobe_mutex);
+ if (!kprobes_allow_optimization)
return;
- }
cpus_read_lock();
kprobes_allow_optimization = false;
@@ -933,22 +925,20 @@ static void unoptimize_all_kprobes(void)
}
}
cpus_read_unlock();
- mutex_unlock(&kprobe_mutex);
-
/* Wait for unoptimizing completion. */
- wait_for_kprobe_optimizer();
+ wait_for_kprobe_optimizer_locked();
pr_info("kprobe jump-optimization is disabled. All kprobes are based on software breakpoint.\n");
}
static DEFINE_MUTEX(kprobe_sysctl_mutex);
static int sysctl_kprobes_optimization;
-static int proc_kprobes_optimization_handler(struct ctl_table *table,
+static int proc_kprobes_optimization_handler(const struct ctl_table *table,
int write, void *buffer,
size_t *length, loff_t *ppos)
{
int ret;
- mutex_lock(&kprobe_sysctl_mutex);
+ guard(mutex)(&kprobe_sysctl_mutex);
sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;
ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
@@ -956,12 +946,11 @@ static int proc_kprobes_optimization_handler(struct ctl_table *table,
optimize_all_kprobes();
else
unoptimize_all_kprobes();
- mutex_unlock(&kprobe_sysctl_mutex);
return ret;
}
-static struct ctl_table kprobe_sysctls[] = {
+static const struct ctl_table kprobe_sysctls[] = {
{
.procname = "kprobes-optimization",
.data = &sysctl_kprobes_optimization,
@@ -971,7 +960,6 @@ static struct ctl_table kprobe_sysctls[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
- {}
};
static void __init kprobe_sysctls_init(void)
@@ -1032,7 +1020,8 @@ static void __disarm_kprobe(struct kprobe *p, bool reopt)
#define __arm_kprobe(p) arch_arm_kprobe(p)
#define __disarm_kprobe(p, o) arch_disarm_kprobe(p)
#define kprobe_disarmed(p) kprobe_disabled(p)
-#define wait_for_kprobe_optimizer() do {} while (0)
+#define wait_for_kprobe_optimizer_locked() \
+ lockdep_assert_held(&kprobe_mutex)
static int reuse_unused_kprobe(struct kprobe *ap)
{
@@ -1071,11 +1060,12 @@ static struct ftrace_ops kprobe_ipmodify_ops __read_mostly = {
static int kprobe_ipmodify_enabled;
static int kprobe_ftrace_enabled;
+bool kprobe_ftrace_disabled;
static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
int *cnt)
{
- int ret = 0;
+ int ret;
lockdep_assert_held(&kprobe_mutex);
@@ -1085,20 +1075,18 @@ static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
if (*cnt == 0) {
ret = register_ftrace_function(ops);
- if (WARN(ret < 0, "Failed to register kprobe-ftrace (error %d)\n", ret))
- goto err_ftrace;
+ if (WARN(ret < 0, "Failed to register kprobe-ftrace (error %d)\n", ret)) {
+ /*
+ * At this point, sinec ops is not registered, we should be sefe from
+ * registering empty filter.
+ */
+ ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0);
+ return ret;
+ }
}
(*cnt)++;
return ret;
-
-err_ftrace:
- /*
- * At this point, sinec ops is not registered, we should be sefe from
- * registering empty filter.
- */
- ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0);
- return ret;
}
static int arm_kprobe_ftrace(struct kprobe *p)
@@ -1113,7 +1101,7 @@ static int arm_kprobe_ftrace(struct kprobe *p)
static int __disarm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
int *cnt)
{
- int ret = 0;
+ int ret;
lockdep_assert_held(&kprobe_mutex);
@@ -1139,6 +1127,11 @@ static int disarm_kprobe_ftrace(struct kprobe *p)
ipmodify ? &kprobe_ipmodify_ops : &kprobe_ftrace_ops,
ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled);
}
+
+void kprobe_ftrace_kill(void)
+{
+ kprobe_ftrace_disabled = true;
+}
#else /* !CONFIG_KPROBES_ON_FTRACE */
static inline int arm_kprobe_ftrace(struct kprobe *p)
{
@@ -1165,12 +1158,9 @@ static int arm_kprobe(struct kprobe *kp)
if (unlikely(kprobe_ftrace(kp)))
return arm_kprobe_ftrace(kp);
- cpus_read_lock();
- mutex_lock(&text_mutex);
+ guard(cpus_read_lock)();
+ guard(mutex)(&text_mutex);
__arm_kprobe(kp);
- mutex_unlock(&text_mutex);
- cpus_read_unlock();
-
return 0;
}
@@ -1179,12 +1169,9 @@ static int disarm_kprobe(struct kprobe *kp, bool reopt)
if (unlikely(kprobe_ftrace(kp)))
return disarm_kprobe_ftrace(kp);
- cpus_read_lock();
- mutex_lock(&text_mutex);
+ guard(cpus_read_lock)();
+ guard(mutex)(&text_mutex);
__disarm_kprobe(kp, reopt);
- mutex_unlock(&text_mutex);
- cpus_read_unlock();
-
return 0;
}
@@ -1237,27 +1224,6 @@ void kprobes_inc_nmissed_count(struct kprobe *p)
}
NOKPROBE_SYMBOL(kprobes_inc_nmissed_count);
-static void free_rp_inst_rcu(struct rcu_head *head)
-{
- struct kretprobe_instance *ri = container_of(head, struct kretprobe_instance, rcu);
-
- if (refcount_dec_and_test(&ri->rph->ref))
- kfree(ri->rph);
- kfree(ri);
-}
-NOKPROBE_SYMBOL(free_rp_inst_rcu);
-
-static void recycle_rp_inst(struct kretprobe_instance *ri)
-{
- struct kretprobe *rp = get_kretprobe(ri);
-
- if (likely(rp))
- freelist_add(&ri->freelist, &rp->freelist);
- else
- call_rcu(&ri->rcu, free_rp_inst_rcu);
-}
-NOKPROBE_SYMBOL(recycle_rp_inst);
-
static struct kprobe kprobe_busy = {
.addr = (void *) get_kprobe,
};
@@ -1278,56 +1244,6 @@ void kprobe_busy_end(void)
preempt_enable();
}
-/*
- * This function is called from delayed_put_task_struct() when a task is
- * dead and cleaned up to recycle any kretprobe instances associated with
- * this task. These left over instances represent probed functions that
- * have been called but will never return.
- */
-void kprobe_flush_task(struct task_struct *tk)
-{
- struct kretprobe_instance *ri;
- struct llist_node *node;
-
- /* Early boot, not yet initialized. */
- if (unlikely(!kprobes_initialized))
- return;
-
- kprobe_busy_begin();
-
- node = __llist_del_all(&tk->kretprobe_instances);
- while (node) {
- ri = container_of(node, struct kretprobe_instance, llist);
- node = node->next;
-
- recycle_rp_inst(ri);
- }
-
- kprobe_busy_end();
-}
-NOKPROBE_SYMBOL(kprobe_flush_task);
-
-static inline void free_rp_inst(struct kretprobe *rp)
-{
- struct kretprobe_instance *ri;
- struct freelist_node *node;
- int count = 0;
-
- node = rp->freelist.head;
- while (node) {
- ri = container_of(node, struct kretprobe_instance, freelist);
- node = node->next;
-
- kfree(ri);
- count++;
- }
-
- if (refcount_sub_and_test(count, &rp->rph->ref)) {
- kfree(rp->rph);
- rp->rph = NULL;
- }
-}
-
/* Add the new probe to 'ap->list'. */
static int add_new_kprobe(struct kprobe *ap, struct kprobe *p)
{
@@ -1372,62 +1288,55 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p)
int ret = 0;
struct kprobe *ap = orig_p;
- cpus_read_lock();
-
- /* For preparing optimization, jump_label_text_reserved() is called */
- jump_label_lock();
- mutex_lock(&text_mutex);
-
- if (!kprobe_aggrprobe(orig_p)) {
- /* If 'orig_p' is not an 'aggr_kprobe', create new one. */
- ap = alloc_aggr_kprobe(orig_p);
- if (!ap) {
- ret = -ENOMEM;
- goto out;
+ scoped_guard(cpus_read_lock) {
+ /* For preparing optimization, jump_label_text_reserved() is called */
+ guard(jump_label_lock)();
+ guard(mutex)(&text_mutex);
+
+ if (!kprobe_aggrprobe(orig_p)) {
+ /* If 'orig_p' is not an 'aggr_kprobe', create new one. */
+ ap = alloc_aggr_kprobe(orig_p);
+ if (!ap)
+ return -ENOMEM;
+ init_aggr_kprobe(ap, orig_p);
+ } else if (kprobe_unused(ap)) {
+ /* This probe is going to die. Rescue it */
+ ret = reuse_unused_kprobe(ap);
+ if (ret)
+ return ret;
}
- init_aggr_kprobe(ap, orig_p);
- } else if (kprobe_unused(ap)) {
- /* This probe is going to die. Rescue it */
- ret = reuse_unused_kprobe(ap);
- if (ret)
- goto out;
- }
- if (kprobe_gone(ap)) {
- /*
- * Attempting to insert new probe at the same location that
- * had a probe in the module vaddr area which already
- * freed. So, the instruction slot has already been
- * released. We need a new slot for the new probe.
- */
- ret = arch_prepare_kprobe(ap);
- if (ret)
+ if (kprobe_gone(ap)) {
/*
- * Even if fail to allocate new slot, don't need to
- * free the 'ap'. It will be used next time, or
- * freed by unregister_kprobe().
+ * Attempting to insert new probe at the same location that
+ * had a probe in the module vaddr area which already
+ * freed. So, the instruction slot has already been
+ * released. We need a new slot for the new probe.
*/
- goto out;
-
- /* Prepare optimized instructions if possible. */
- prepare_optimized_kprobe(ap);
+ ret = arch_prepare_kprobe(ap);
+ if (ret)
+ /*
+ * Even if fail to allocate new slot, don't need to
+ * free the 'ap'. It will be used next time, or
+ * freed by unregister_kprobe().
+ */
+ return ret;
- /*
- * Clear gone flag to prevent allocating new slot again, and
- * set disabled flag because it is not armed yet.
- */
- ap->flags = (ap->flags & ~KPROBE_FLAG_GONE)
- | KPROBE_FLAG_DISABLED;
- }
+ /* Prepare optimized instructions if possible. */
+ prepare_optimized_kprobe(ap);
- /* Copy the insn slot of 'p' to 'ap'. */
- copy_kprobe(ap, p);
- ret = add_new_kprobe(ap, p);
+ /*
+ * Clear gone flag to prevent allocating new slot again, and
+ * set disabled flag because it is not armed yet.
+ */
+ ap->flags = (ap->flags & ~KPROBE_FLAG_GONE)
+ | KPROBE_FLAG_DISABLED;
+ }
-out:
- mutex_unlock(&text_mutex);
- jump_label_unlock();
- cpus_read_unlock();
+ /* Copy the insn slot of 'p' to 'ap'. */
+ copy_kprobe(ap, p);
+ ret = add_new_kprobe(ap, p);
+ }
if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) {
ap->flags &= ~KPROBE_FLAG_DISABLED;
@@ -1489,34 +1398,79 @@ bool within_kprobe_blacklist(unsigned long addr)
}
/*
+ * arch_adjust_kprobe_addr - adjust the address
+ * @addr: symbol base address
+ * @offset: offset within the symbol
+ * @on_func_entry: was this @addr+@offset on the function entry
+ *
+ * Typically returns @addr + @offset, except for special cases where the
+ * function might be prefixed by a CFI landing pad, in that case any offset
+ * inside the landing pad is mapped to the first 'real' instruction of the
+ * symbol.
+ *
+ * Specifically, for things like IBT/BTI, skip the resp. ENDBR/BTI.C
+ * instruction at +0.
+ */
+kprobe_opcode_t *__weak arch_adjust_kprobe_addr(unsigned long addr,
+ unsigned long offset,
+ bool *on_func_entry)
+{
+ *on_func_entry = !offset;
+ return (kprobe_opcode_t *)(addr + offset);
+}
+
+/*
* If 'symbol_name' is specified, look it up and add the 'offset'
* to it. This way, we can specify a relative address to a symbol.
* This returns encoded errors if it fails to look up symbol or invalid
* combination of parameters.
*/
-static kprobe_opcode_t *_kprobe_addr(kprobe_opcode_t *addr,
- const char *symbol_name, unsigned int offset)
+static kprobe_opcode_t *
+_kprobe_addr(kprobe_opcode_t *addr, const char *symbol_name,
+ unsigned long offset, bool *on_func_entry)
{
if ((symbol_name && addr) || (!symbol_name && !addr))
- goto invalid;
+ return ERR_PTR(-EINVAL);
if (symbol_name) {
+ /*
+ * Input: @sym + @offset
+ * Output: @addr + @offset
+ *
+ * NOTE: kprobe_lookup_name() does *NOT* fold the offset
+ * argument into it's output!
+ */
addr = kprobe_lookup_name(symbol_name, offset);
if (!addr)
return ERR_PTR(-ENOENT);
}
- addr = (kprobe_opcode_t *)(((char *)addr) + offset);
- if (addr)
- return addr;
+ /*
+ * So here we have @addr + @offset, displace it into a new
+ * @addr' + @offset' where @addr' is the symbol start address.
+ */
+ addr = (void *)addr + offset;
+ if (!kallsyms_lookup_size_offset((unsigned long)addr, NULL, &offset))
+ return ERR_PTR(-ENOENT);
+ addr = (void *)addr - offset;
+
+ /*
+ * Then ask the architecture to re-combine them, taking care of
+ * magical function entry details while telling us if this was indeed
+ * at the start of the function.
+ */
+ addr = arch_adjust_kprobe_addr((unsigned long)addr, offset, on_func_entry);
+ if (!addr)
+ return ERR_PTR(-EINVAL);
-invalid:
- return ERR_PTR(-EINVAL);
+ return addr;
}
static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
{
- return _kprobe_addr(p->addr, p->symbol_name, p->offset);
+ bool on_func_entry;
+
+ return _kprobe_addr(p->addr, p->symbol_name, p->offset, &on_func_entry);
}
/*
@@ -1533,15 +1487,15 @@ static struct kprobe *__get_valid_kprobe(struct kprobe *p)
if (unlikely(!ap))
return NULL;
- if (p != ap) {
- list_for_each_entry(list_p, &ap->list, list)
- if (list_p == p)
- /* kprobe p is a valid probe */
- goto valid;
- return NULL;
- }
-valid:
- return ap;
+ if (p == ap)
+ return ap;
+
+ list_for_each_entry(list_p, &ap->list, list)
+ if (list_p == p)
+ /* kprobe p is a valid probe */
+ return ap;
+
+ return NULL;
}
/*
@@ -1550,34 +1504,39 @@ valid:
*/
static inline int warn_kprobe_rereg(struct kprobe *p)
{
- int ret = 0;
+ guard(mutex)(&kprobe_mutex);
- mutex_lock(&kprobe_mutex);
if (WARN_ON_ONCE(__get_valid_kprobe(p)))
- ret = -EINVAL;
- mutex_unlock(&kprobe_mutex);
+ return -EINVAL;
- return ret;
+ return 0;
}
static int check_ftrace_location(struct kprobe *p)
{
- unsigned long ftrace_addr;
+ unsigned long addr = (unsigned long)p->addr;
- ftrace_addr = ftrace_location((unsigned long)p->addr);
- if (ftrace_addr) {
+ if (ftrace_location(addr) == addr) {
#ifdef CONFIG_KPROBES_ON_FTRACE
- /* Given address is not on the instruction boundary */
- if ((unsigned long)p->addr != ftrace_addr)
- return -EILSEQ;
p->flags |= KPROBE_FLAG_FTRACE;
-#else /* !CONFIG_KPROBES_ON_FTRACE */
+#else
return -EINVAL;
#endif
}
return 0;
}
+static bool is_cfi_preamble_symbol(unsigned long addr)
+{
+ char symbuf[KSYM_NAME_LEN];
+
+ if (lookup_symbol_name(addr, symbuf))
+ return false;
+
+ return str_has_prefix(symbuf, "__cfi_") ||
+ str_has_prefix(symbuf, "__pfx_");
+}
+
static int check_kprobe_address_safe(struct kprobe *p,
struct module **probed_mod)
{
@@ -1586,92 +1545,70 @@ static int check_kprobe_address_safe(struct kprobe *p,
ret = check_ftrace_location(p);
if (ret)
return ret;
- jump_label_lock();
- preempt_disable();
- /* Ensure it is not in reserved area nor out of text */
- if (!kernel_text_address((unsigned long) p->addr) ||
- within_kprobe_blacklist((unsigned long) p->addr) ||
- jump_label_text_reserved(p->addr, p->addr) ||
- static_call_text_reserved(p->addr, p->addr) ||
- find_bug((unsigned long)p->addr)) {
- ret = -EINVAL;
- goto out;
- }
+ guard(jump_label_lock)();
+
+ /* Ensure the address is in a text area, and find a module if exists. */
+ *probed_mod = NULL;
+ if (!core_kernel_text((unsigned long) p->addr)) {
+ guard(rcu)();
+ *probed_mod = __module_text_address((unsigned long) p->addr);
+ if (!(*probed_mod))
+ return -EINVAL;
- /* Check if 'p' is probing a module. */
- *probed_mod = __module_text_address((unsigned long) p->addr);
- if (*probed_mod) {
/*
* We must hold a refcount of the probed module while updating
* its code to prohibit unexpected unloading.
*/
- if (unlikely(!try_module_get(*probed_mod))) {
- ret = -ENOENT;
- goto out;
- }
+ if (unlikely(!try_module_get(*probed_mod)))
+ return -ENOENT;
+ }
+ /* Ensure it is not in reserved area. */
+ if (in_gate_area_no_mm((unsigned long) p->addr) ||
+ within_kprobe_blacklist((unsigned long) p->addr) ||
+ jump_label_text_reserved(p->addr, p->addr) ||
+ static_call_text_reserved(p->addr, p->addr) ||
+ find_bug((unsigned long)p->addr) ||
+ is_cfi_preamble_symbol((unsigned long)p->addr)) {
+ module_put(*probed_mod);
+ return -EINVAL;
+ }
+ /* Get module refcount and reject __init functions for loaded modules. */
+ if (IS_ENABLED(CONFIG_MODULES) && *probed_mod) {
/*
* If the module freed '.init.text', we couldn't insert
* kprobes in there.
*/
if (within_module_init((unsigned long)p->addr, *probed_mod) &&
- (*probed_mod)->state != MODULE_STATE_COMING) {
+ !module_is_coming(*probed_mod)) {
module_put(*probed_mod);
- *probed_mod = NULL;
- ret = -ENOENT;
+ return -ENOENT;
}
}
-out:
- preempt_enable();
- jump_label_unlock();
- return ret;
+ return 0;
}
-int register_kprobe(struct kprobe *p)
+static int __register_kprobe(struct kprobe *p)
{
int ret;
struct kprobe *old_p;
- struct module *probed_mod;
- kprobe_opcode_t *addr;
-
- /* Adjust probe address from symbol */
- addr = kprobe_addr(p);
- if (IS_ERR(addr))
- return PTR_ERR(addr);
- p->addr = addr;
-
- ret = warn_kprobe_rereg(p);
- if (ret)
- return ret;
-
- /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
- p->flags &= KPROBE_FLAG_DISABLED;
- p->nmissed = 0;
- INIT_LIST_HEAD(&p->list);
- ret = check_kprobe_address_safe(p, &probed_mod);
- if (ret)
- return ret;
-
- mutex_lock(&kprobe_mutex);
+ guard(mutex)(&kprobe_mutex);
old_p = get_kprobe(p->addr);
- if (old_p) {
+ if (old_p)
/* Since this may unoptimize 'old_p', locking 'text_mutex'. */
- ret = register_aggr_kprobe(old_p, p);
- goto out;
- }
+ return register_aggr_kprobe(old_p, p);
- cpus_read_lock();
- /* Prevent text modification */
- mutex_lock(&text_mutex);
- ret = prepare_kprobe(p);
- mutex_unlock(&text_mutex);
- cpus_read_unlock();
- if (ret)
- goto out;
+ scoped_guard(cpus_read_lock) {
+ /* Prevent text modification */
+ guard(mutex)(&text_mutex);
+ ret = prepare_kprobe(p);
+ if (ret)
+ return ret;
+ }
INIT_HLIST_NODE(&p->hlist);
hlist_add_head_rcu(&p->hlist,
@@ -1682,14 +1619,43 @@ int register_kprobe(struct kprobe *p)
if (ret) {
hlist_del_rcu(&p->hlist);
synchronize_rcu();
- goto out;
}
}
/* Try to optimize kprobe */
try_to_optimize_kprobe(p);
-out:
- mutex_unlock(&kprobe_mutex);
+ return 0;
+}
+
+int register_kprobe(struct kprobe *p)
+{
+ int ret;
+ struct module *probed_mod;
+ kprobe_opcode_t *addr;
+ bool on_func_entry;
+
+ /* Canonicalize probe address from symbol */
+ addr = _kprobe_addr(p->addr, p->symbol_name, p->offset, &on_func_entry);
+ if (IS_ERR(addr))
+ return PTR_ERR(addr);
+ p->addr = addr;
+
+ ret = warn_kprobe_rereg(p);
+ if (ret)
+ return ret;
+
+ /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
+ p->flags &= KPROBE_FLAG_DISABLED;
+ if (on_func_entry)
+ p->flags |= KPROBE_FLAG_ON_FUNC_ENTRY;
+ p->nmissed = 0;
+ INIT_LIST_HEAD(&p->list);
+
+ ret = check_kprobe_address_safe(p, &probed_mod);
+ if (ret)
+ return ret;
+
+ ret = __register_kprobe(p);
if (probed_mod)
module_put(probed_mod);
@@ -1728,27 +1694,29 @@ static struct kprobe *__disable_kprobe(struct kprobe *p)
if (unlikely(orig_p == NULL))
return ERR_PTR(-EINVAL);
- if (!kprobe_disabled(p)) {
- /* Disable probe if it is a child probe */
- if (p != orig_p)
- p->flags |= KPROBE_FLAG_DISABLED;
+ if (kprobe_disabled(p))
+ return orig_p;
- /* Try to disarm and disable this/parent probe */
- if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
- /*
- * If 'kprobes_all_disarmed' is set, 'orig_p'
- * should have already been disarmed, so
- * skip unneed disarming process.
- */
- if (!kprobes_all_disarmed) {
- ret = disarm_kprobe(orig_p, true);
- if (ret) {
- p->flags &= ~KPROBE_FLAG_DISABLED;
- return ERR_PTR(ret);
- }
+ /* Disable probe if it is a child probe */
+ if (p != orig_p)
+ p->flags |= KPROBE_FLAG_DISABLED;
+
+ /* Try to disarm and disable this/parent probe */
+ if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
+ /*
+ * Don't be lazy here. Even if 'kprobes_all_disarmed'
+ * is false, 'orig_p' might not have been armed yet.
+ * Note arm_all_kprobes() __tries__ to arm all kprobes
+ * on the best effort basis.
+ */
+ if (!kprobes_all_disarmed && !kprobe_disabled(orig_p)) {
+ ret = disarm_kprobe(orig_p, true);
+ if (ret) {
+ p->flags &= ~KPROBE_FLAG_DISABLED;
+ return ERR_PTR(ret);
}
- orig_p->flags |= KPROBE_FLAG_DISABLED;
}
+ orig_p->flags |= KPROBE_FLAG_DISABLED;
}
return orig_p;
@@ -1766,49 +1734,54 @@ static int __unregister_kprobe_top(struct kprobe *p)
if (IS_ERR(ap))
return PTR_ERR(ap);
- if (ap == p)
- /*
- * This probe is an independent(and non-optimized) kprobe
- * (not an aggrprobe). Remove from the hash list.
- */
- goto disarmed;
+ WARN_ON(ap != p && !kprobe_aggrprobe(ap));
- /* Following process expects this probe is an aggrprobe */
- WARN_ON(!kprobe_aggrprobe(ap));
-
- if (list_is_singular(&ap->list) && kprobe_disarmed(ap))
+ /*
+ * If the probe is an independent(and non-optimized) kprobe
+ * (not an aggrprobe), the last kprobe on the aggrprobe, or
+ * kprobe is already disarmed, just remove from the hash list.
+ */
+ if (ap == p ||
+ (list_is_singular(&ap->list) && kprobe_disarmed(ap))) {
/*
* !disarmed could be happen if the probe is under delayed
* unoptimizing.
*/
- goto disarmed;
- else {
- /* If disabling probe has special handlers, update aggrprobe */
- if (p->post_handler && !kprobe_gone(p)) {
- list_for_each_entry(list_p, &ap->list, list) {
- if ((list_p != p) && (list_p->post_handler))
- goto noclean;
- }
- ap->post_handler = NULL;
+ hlist_del_rcu(&ap->hlist);
+ return 0;
+ }
+
+ /* If disabling probe has special handlers, update aggrprobe */
+ if (p->post_handler && !kprobe_gone(p)) {
+ list_for_each_entry(list_p, &ap->list, list) {
+ if ((list_p != p) && (list_p->post_handler))
+ break;
}
-noclean:
- /*
- * Remove from the aggrprobe: this path will do nothing in
- * __unregister_kprobe_bottom().
- */
- list_del_rcu(&p->list);
- if (!kprobe_disabled(ap) && !kprobes_all_disarmed)
+ /* No other probe has post_handler */
+ if (list_entry_is_head(list_p, &ap->list, list)) {
/*
- * Try to optimize this probe again, because post
- * handler may have been changed.
+ * For the kprobe-on-ftrace case, we keep the
+ * post_handler setting to identify this aggrprobe
+ * armed with kprobe_ipmodify_ops.
*/
- optimize_kprobe(ap);
+ if (!kprobe_ftrace(ap))
+ ap->post_handler = NULL;
+ }
}
- return 0;
-disarmed:
- hlist_del_rcu(&ap->hlist);
+ /*
+ * Remove from the aggrprobe: this path will do nothing in
+ * __unregister_kprobe_bottom().
+ */
+ list_del_rcu(&p->list);
+ if (!kprobe_disabled(ap) && !kprobes_all_disarmed)
+ /*
+ * Try to optimize this probe again, because post
+ * handler may have been changed.
+ */
+ optimize_kprobe(ap);
return 0;
+
}
static void __unregister_kprobe_bottom(struct kprobe *p)
@@ -1857,12 +1830,11 @@ void unregister_kprobes(struct kprobe **kps, int num)
if (num <= 0)
return;
- mutex_lock(&kprobe_mutex);
- for (i = 0; i < num; i++)
- if (__unregister_kprobe_top(kps[i]) < 0)
- kps[i]->addr = NULL;
- mutex_unlock(&kprobe_mutex);
-
+ scoped_guard(mutex, &kprobe_mutex) {
+ for (i = 0; i < num; i++)
+ if (__unregister_kprobe_top(kps[i]) < 0)
+ kps[i]->addr = NULL;
+ }
synchronize_rcu();
for (i = 0; i < num; i++)
if (kps[i]->addr)
@@ -1884,6 +1856,81 @@ static struct notifier_block kprobe_exceptions_nb = {
#ifdef CONFIG_KRETPROBES
+#if !defined(CONFIG_KRETPROBE_ON_RETHOOK)
+
+/* callbacks for objpool of kretprobe instances */
+static int kretprobe_init_inst(void *nod, void *context)
+{
+ struct kretprobe_instance *ri = nod;
+
+ ri->rph = context;
+ return 0;
+}
+static int kretprobe_fini_pool(struct objpool_head *head, void *context)
+{
+ kfree(context);
+ return 0;
+}
+
+static void free_rp_inst_rcu(struct rcu_head *head)
+{
+ struct kretprobe_instance *ri = container_of(head, struct kretprobe_instance, rcu);
+ struct kretprobe_holder *rph = ri->rph;
+
+ objpool_drop(ri, &rph->pool);
+}
+NOKPROBE_SYMBOL(free_rp_inst_rcu);
+
+static void recycle_rp_inst(struct kretprobe_instance *ri)
+{
+ struct kretprobe *rp = get_kretprobe(ri);
+
+ if (likely(rp))
+ objpool_push(ri, &rp->rph->pool);
+ else
+ call_rcu(&ri->rcu, free_rp_inst_rcu);
+}
+NOKPROBE_SYMBOL(recycle_rp_inst);
+
+/*
+ * This function is called from delayed_put_task_struct() when a task is
+ * dead and cleaned up to recycle any kretprobe instances associated with
+ * this task. These left over instances represent probed functions that
+ * have been called but will never return.
+ */
+void kprobe_flush_task(struct task_struct *tk)
+{
+ struct kretprobe_instance *ri;
+ struct llist_node *node;
+
+ /* Early boot, not yet initialized. */
+ if (unlikely(!kprobes_initialized))
+ return;
+
+ kprobe_busy_begin();
+
+ node = __llist_del_all(&tk->kretprobe_instances);
+ while (node) {
+ ri = container_of(node, struct kretprobe_instance, llist);
+ node = node->next;
+
+ recycle_rp_inst(ri);
+ }
+
+ kprobe_busy_end();
+}
+NOKPROBE_SYMBOL(kprobe_flush_task);
+
+static inline void free_rp_inst(struct kretprobe *rp)
+{
+ struct kretprobe_holder *rph = rp->rph;
+
+ if (!rph)
+ return;
+ rp->rph = NULL;
+ objpool_fini(&rph->pool);
+}
+
/* This assumes the 'tsk' is the current task or the is not running. */
static kprobe_opcode_t *__kretprobe_find_ret_addr(struct task_struct *tsk,
struct llist_node **cur)
@@ -1926,7 +1973,7 @@ NOKPROBE_SYMBOL(__kretprobe_find_ret_addr);
unsigned long kretprobe_find_ret_addr(struct task_struct *tsk, void *fp,
struct llist_node **cur)
{
- struct kretprobe_instance *ri = NULL;
+ struct kretprobe_instance *ri;
kprobe_opcode_t *ret;
if (WARN_ON_ONCE(!cur))
@@ -1955,9 +2002,9 @@ void __weak arch_kretprobe_fixup_return(struct pt_regs *regs,
unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs,
void *frame_pointer)
{
- kprobe_opcode_t *correct_ret_addr = NULL;
struct kretprobe_instance *ri = NULL;
struct llist_node *first, *node = NULL;
+ kprobe_opcode_t *correct_ret_addr;
struct kretprobe *rp;
/* Find correct address and all nodes for this frame. */
@@ -2023,19 +2070,17 @@ NOKPROBE_SYMBOL(__kretprobe_trampoline_handler)
static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
{
struct kretprobe *rp = container_of(p, struct kretprobe, kp);
+ struct kretprobe_holder *rph = rp->rph;
struct kretprobe_instance *ri;
- struct freelist_node *fn;
- fn = freelist_try_get(&rp->freelist);
- if (!fn) {
+ ri = objpool_pop(&rph->pool);
+ if (!ri) {
rp->nmissed++;
return 0;
}
- ri = container_of(fn, struct kretprobe_instance, freelist);
-
if (rp->entry_handler && rp->entry_handler(ri, regs)) {
- freelist_add(&ri->freelist, &rp->freelist);
+ objpool_push(ri, &rph->pool);
return 0;
}
@@ -2046,11 +2091,58 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
return 0;
}
NOKPROBE_SYMBOL(pre_handler_kretprobe);
+#else /* CONFIG_KRETPROBE_ON_RETHOOK */
+/*
+ * This kprobe pre_handler is registered with every kretprobe. When probe
+ * hits it will set up the return probe.
+ */
+static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
+{
+ struct kretprobe *rp = container_of(p, struct kretprobe, kp);
+ struct kretprobe_instance *ri;
+ struct rethook_node *rhn;
+
+ rhn = rethook_try_get(rp->rh);
+ if (!rhn) {
+ rp->nmissed++;
+ return 0;
+ }
+
+ ri = container_of(rhn, struct kretprobe_instance, node);
+
+ if (rp->entry_handler && rp->entry_handler(ri, regs))
+ rethook_recycle(rhn);
+ else
+ rethook_hook(rhn, regs, kprobe_ftrace(p));
+
+ return 0;
+}
+NOKPROBE_SYMBOL(pre_handler_kretprobe);
-bool __weak arch_kprobe_on_func_entry(unsigned long offset)
+static void kretprobe_rethook_handler(struct rethook_node *rh, void *data,
+ unsigned long ret_addr,
+ struct pt_regs *regs)
{
- return !offset;
+ struct kretprobe *rp = (struct kretprobe *)data;
+ struct kretprobe_instance *ri;
+ struct kprobe_ctlblk *kcb;
+
+ /* The data must NOT be null. This means rethook data structure is broken. */
+ if (WARN_ON_ONCE(!data) || !rp->handler)
+ return;
+
+ __this_cpu_write(current_kprobe, &rp->kp);
+ kcb = get_kprobe_ctlblk();
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+
+ ri = container_of(rh, struct kretprobe_instance, node);
+ rp->handler(ri, regs);
+
+ __this_cpu_write(current_kprobe, NULL);
}
+NOKPROBE_SYMBOL(kretprobe_rethook_handler);
+
+#endif /* !CONFIG_KRETPROBE_ON_RETHOOK */
/**
* kprobe_on_func_entry() -- check whether given address is function entry
@@ -2067,15 +2159,13 @@ bool __weak arch_kprobe_on_func_entry(unsigned long offset)
*/
int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
{
- kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset);
+ bool on_func_entry;
+ kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset, &on_func_entry);
if (IS_ERR(kp_addr))
return PTR_ERR(kp_addr);
- if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset))
- return -ENOENT;
-
- if (!arch_kprobe_on_func_entry(offset))
+ if (!on_func_entry)
return -EINVAL;
return 0;
@@ -2084,7 +2174,6 @@ int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long o
int register_kretprobe(struct kretprobe *rp)
{
int ret;
- struct kretprobe_instance *inst;
int i;
void *addr;
@@ -2114,37 +2203,42 @@ int register_kretprobe(struct kretprobe *rp)
rp->kp.post_handler = NULL;
/* Pre-allocate memory for max kretprobe instances */
- if (rp->maxactive <= 0) {
-#ifdef CONFIG_PREEMPTION
+ if (rp->maxactive <= 0)
rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
-#else
- rp->maxactive = num_possible_cpus();
-#endif
+
+#ifdef CONFIG_KRETPROBE_ON_RETHOOK
+ rp->rh = rethook_alloc((void *)rp, kretprobe_rethook_handler,
+ sizeof(struct kretprobe_instance) +
+ rp->data_size, rp->maxactive);
+ if (IS_ERR(rp->rh))
+ return PTR_ERR(rp->rh);
+
+ rp->nmissed = 0;
+ /* Establish function entry probe point */
+ ret = register_kprobe(&rp->kp);
+ if (ret != 0) {
+ rethook_free(rp->rh);
+ rp->rh = NULL;
}
- rp->freelist.head = NULL;
+#else /* !CONFIG_KRETPROBE_ON_RETHOOK */
rp->rph = kzalloc(sizeof(struct kretprobe_holder), GFP_KERNEL);
if (!rp->rph)
return -ENOMEM;
- rp->rph->rp = rp;
- for (i = 0; i < rp->maxactive; i++) {
- inst = kzalloc(sizeof(struct kretprobe_instance) +
- rp->data_size, GFP_KERNEL);
- if (inst == NULL) {
- refcount_set(&rp->rph->ref, i);
- free_rp_inst(rp);
- return -ENOMEM;
- }
- inst->rph = rp->rph;
- freelist_add(&inst->freelist, &rp->freelist);
+ if (objpool_init(&rp->rph->pool, rp->maxactive, rp->data_size +
+ sizeof(struct kretprobe_instance), GFP_KERNEL,
+ rp->rph, kretprobe_init_inst, kretprobe_fini_pool)) {
+ kfree(rp->rph);
+ rp->rph = NULL;
+ return -ENOMEM;
}
- refcount_set(&rp->rph->ref, i);
-
+ rcu_assign_pointer(rp->rph->rp, rp);
rp->nmissed = 0;
/* Establish function entry probe point */
ret = register_kprobe(&rp->kp);
if (ret != 0)
free_rp_inst(rp);
+#endif
return ret;
}
EXPORT_SYMBOL_GPL(register_kretprobe);
@@ -2179,19 +2273,25 @@ void unregister_kretprobes(struct kretprobe **rps, int num)
if (num <= 0)
return;
- mutex_lock(&kprobe_mutex);
for (i = 0; i < num; i++) {
+ guard(mutex)(&kprobe_mutex);
+
if (__unregister_kprobe_top(&rps[i]->kp) < 0)
rps[i]->kp.addr = NULL;
- rps[i]->rph->rp = NULL;
+#ifdef CONFIG_KRETPROBE_ON_RETHOOK
+ rethook_free(rps[i]->rh);
+#else
+ rcu_assign_pointer(rps[i]->rph->rp, NULL);
+#endif
}
- mutex_unlock(&kprobe_mutex);
synchronize_rcu();
for (i = 0; i < num; i++) {
if (rps[i]->kp.addr) {
__unregister_kprobe_bottom(&rps[i]->kp);
+#ifndef CONFIG_KRETPROBE_ON_RETHOOK
free_rp_inst(rps[i]);
+#endif
}
}
}
@@ -2235,6 +2335,14 @@ static void kill_kprobe(struct kprobe *p)
lockdep_assert_held(&kprobe_mutex);
+ /*
+ * The module is going away. We should disarm the kprobe which
+ * is using ftrace, because ftrace framework is still available at
+ * 'MODULE_STATE_GOING' notification.
+ */
+ if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed)
+ disarm_kprobe_ftrace(p);
+
p->flags |= KPROBE_FLAG_GONE;
if (kprobe_aggrprobe(p)) {
/*
@@ -2251,31 +2359,19 @@ static void kill_kprobe(struct kprobe *p)
* the original probed function (which will be freed soon) any more.
*/
arch_remove_kprobe(p);
-
- /*
- * The module is going away. We should disarm the kprobe which
- * is using ftrace, because ftrace framework is still available at
- * 'MODULE_STATE_GOING' notification.
- */
- if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed)
- disarm_kprobe_ftrace(p);
}
/* Disable one kprobe */
int disable_kprobe(struct kprobe *kp)
{
- int ret = 0;
struct kprobe *p;
- mutex_lock(&kprobe_mutex);
+ guard(mutex)(&kprobe_mutex);
/* Disable this kprobe */
p = __disable_kprobe(kp);
- if (IS_ERR(p))
- ret = PTR_ERR(p);
- mutex_unlock(&kprobe_mutex);
- return ret;
+ return IS_ERR(p) ? PTR_ERR(p) : 0;
}
EXPORT_SYMBOL_GPL(disable_kprobe);
@@ -2285,20 +2381,16 @@ int enable_kprobe(struct kprobe *kp)
int ret = 0;
struct kprobe *p;
- mutex_lock(&kprobe_mutex);
+ guard(mutex)(&kprobe_mutex);
/* Check whether specified probe is valid. */
p = __get_valid_kprobe(kp);
- if (unlikely(p == NULL)) {
- ret = -EINVAL;
- goto out;
- }
+ if (unlikely(p == NULL))
+ return -EINVAL;
- if (kprobe_gone(kp)) {
+ if (kprobe_gone(kp))
/* This kprobe has gone, we couldn't enable it. */
- ret = -EINVAL;
- goto out;
- }
+ return -EINVAL;
if (p != kp)
kp->flags &= ~KPROBE_FLAG_DISABLED;
@@ -2306,11 +2398,12 @@ int enable_kprobe(struct kprobe *kp)
if (!kprobes_all_disarmed && kprobe_disabled(p)) {
p->flags &= ~KPROBE_FLAG_DISABLED;
ret = arm_kprobe(p);
- if (ret)
+ if (ret) {
p->flags |= KPROBE_FLAG_DISABLED;
+ if (p != kp)
+ kp->flags |= KPROBE_FLAG_DISABLED;
+ }
}
-out:
- mutex_unlock(&kprobe_mutex);
return ret;
}
EXPORT_SYMBOL_GPL(enable_kprobe);
@@ -2359,24 +2452,6 @@ int kprobe_add_area_blacklist(unsigned long start, unsigned long end)
return 0;
}
-/* Remove all symbols in given area from kprobe blacklist */
-static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end)
-{
- struct kprobe_blacklist_entry *ent, *n;
-
- list_for_each_entry_safe(ent, n, &kprobe_blacklist, list) {
- if (ent->start_addr < start || ent->start_addr >= end)
- continue;
- list_del(&ent->list);
- kfree(ent);
- }
-}
-
-static void kprobe_remove_ksym_blacklist(unsigned long entry)
-{
- kprobe_remove_area_blacklist(entry, entry + 1);
-}
-
int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
char *type, char *sym)
{
@@ -2441,6 +2516,25 @@ static int __init populate_kprobe_blacklist(unsigned long *start,
return ret ? : arch_populate_kprobe_blacklist();
}
+#ifdef CONFIG_MODULES
+/* Remove all symbols in given area from kprobe blacklist */
+static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end)
+{
+ struct kprobe_blacklist_entry *ent, *n;
+
+ list_for_each_entry_safe(ent, n, &kprobe_blacklist, list) {
+ if (ent->start_addr < start || ent->start_addr >= end)
+ continue;
+ list_del(&ent->list);
+ kfree(ent);
+ }
+}
+
+static void kprobe_remove_ksym_blacklist(unsigned long entry)
+{
+ kprobe_remove_area_blacklist(entry, entry + 1);
+}
+
static void add_module_kprobe_blacklist(struct module *mod)
{
unsigned long start, end;
@@ -2497,11 +2591,11 @@ static int kprobes_module_callback(struct notifier_block *nb,
unsigned int i;
int checkcore = (val == MODULE_STATE_GOING);
- if (val == MODULE_STATE_COMING) {
- mutex_lock(&kprobe_mutex);
+ guard(mutex)(&kprobe_mutex);
+
+ if (val == MODULE_STATE_COMING)
add_module_kprobe_blacklist(mod);
- mutex_unlock(&kprobe_mutex);
- }
+
if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE)
return NOTIFY_DONE;
@@ -2511,7 +2605,6 @@ static int kprobes_module_callback(struct notifier_block *nb,
* notified, only '.init.text' section would be freed. We need to
* disable kprobes which have been inserted in the sections.
*/
- mutex_lock(&kprobe_mutex);
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
hlist_for_each_entry(p, head, hlist)
@@ -2534,7 +2627,6 @@ static int kprobes_module_callback(struct notifier_block *nb,
}
if (val == MODULE_STATE_GOING)
remove_module_kprobe_blacklist(mod);
- mutex_unlock(&kprobe_mutex);
return NOTIFY_DONE;
}
@@ -2543,6 +2635,17 @@ static struct notifier_block kprobe_module_nb = {
.priority = 0
};
+static int kprobe_register_module_notifier(void)
+{
+ return register_module_notifier(&kprobe_module_nb);
+}
+#else
+static int kprobe_register_module_notifier(void)
+{
+ return 0;
+}
+#endif /* CONFIG_MODULES */
+
void kprobe_free_init_mem(void)
{
void *start = (void *)(&__init_begin);
@@ -2551,7 +2654,7 @@ void kprobe_free_init_mem(void)
struct kprobe *p;
int i;
- mutex_lock(&kprobe_mutex);
+ guard(mutex)(&kprobe_mutex);
/* Kill all kprobes on initmem because the target code has been freed. */
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
@@ -2561,13 +2664,11 @@ void kprobe_free_init_mem(void)
kill_kprobe(p);
}
}
-
- mutex_unlock(&kprobe_mutex);
}
static int __init init_kprobes(void)
{
- int i, err = 0;
+ int i, err;
/* FIXME allocate the probe table, currently defined statically */
/* initialize all list heads */
@@ -2602,7 +2703,7 @@ static int __init init_kprobes(void)
if (!err)
err = register_die_notifier(&kprobe_exceptions_nb);
if (!err)
- err = register_module_notifier(&kprobe_module_nb);
+ err = kprobe_register_module_notifier();
kprobes_initialized = (err == 0);
kprobe_sysctls_init();
@@ -2679,7 +2780,7 @@ static int show_kprobe_addr(struct seq_file *pi, void *v)
{
struct hlist_head *head;
struct kprobe *p, *kp;
- const char *sym = NULL;
+ const char *sym;
unsigned int i = *(loff_t *) v;
unsigned long offset = 0;
char *modname, namebuf[KSYM_NAME_LEN];
@@ -2758,11 +2859,11 @@ static int arm_all_kprobes(void)
unsigned int i, total = 0, errors = 0;
int err, ret = 0;
- mutex_lock(&kprobe_mutex);
+ guard(mutex)(&kprobe_mutex);
/* If kprobes are armed, just return */
if (!kprobes_all_disarmed)
- goto already_enabled;
+ return 0;
/*
* optimize_kprobe() called by arm_kprobe() checks
@@ -2792,8 +2893,6 @@ static int arm_all_kprobes(void)
else
pr_info("Kprobes globally enabled\n");
-already_enabled:
- mutex_unlock(&kprobe_mutex);
return ret;
}
@@ -2804,13 +2903,11 @@ static int disarm_all_kprobes(void)
unsigned int i, total = 0, errors = 0;
int err, ret = 0;
- mutex_lock(&kprobe_mutex);
+ guard(mutex)(&kprobe_mutex);
/* If kprobes are already disarmed, just return */
- if (kprobes_all_disarmed) {
- mutex_unlock(&kprobe_mutex);
+ if (kprobes_all_disarmed)
return 0;
- }
kprobes_all_disarmed = true;
@@ -2835,11 +2932,8 @@ static int disarm_all_kprobes(void)
else
pr_info("Kprobes globally disabled\n");
- mutex_unlock(&kprobe_mutex);
-
/* Wait for disarming all kprobes by optimizer */
- wait_for_kprobe_optimizer();
-
+ wait_for_kprobe_optimizer_locked();
return ret;
}
diff --git a/kernel/kstack_erase.c b/kernel/kstack_erase.c
new file mode 100644
index 000000000000..d4449884084c
--- /dev/null
+++ b/kernel/kstack_erase.c
@@ -0,0 +1,177 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This code fills the used part of the kernel stack with a poison value
+ * before returning to userspace. It's part of the STACKLEAK feature
+ * ported from grsecurity/PaX.
+ *
+ * Author: Alexander Popov <alex.popov@linux.com>
+ *
+ * KSTACK_ERASE reduces the information which kernel stack leak bugs can
+ * reveal and blocks some uninitialized stack variable attacks.
+ */
+
+#include <linux/kstack_erase.h>
+#include <linux/kprobes.h>
+
+#ifdef CONFIG_KSTACK_ERASE_RUNTIME_DISABLE
+#include <linux/jump_label.h>
+#include <linux/string_choices.h>
+#include <linux/sysctl.h>
+#include <linux/init.h>
+
+static DEFINE_STATIC_KEY_FALSE(stack_erasing_bypass);
+
+#ifdef CONFIG_SYSCTL
+static int stack_erasing_sysctl(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret = 0;
+ int state = !static_branch_unlikely(&stack_erasing_bypass);
+ int prev_state = state;
+ struct ctl_table table_copy = *table;
+
+ table_copy.data = &state;
+ ret = proc_dointvec_minmax(&table_copy, write, buffer, lenp, ppos);
+ state = !!state;
+ if (ret || !write || state == prev_state)
+ return ret;
+
+ if (state)
+ static_branch_disable(&stack_erasing_bypass);
+ else
+ static_branch_enable(&stack_erasing_bypass);
+
+ pr_warn("stackleak: kernel stack erasing is %s\n",
+ str_enabled_disabled(state));
+ return ret;
+}
+static const struct ctl_table stackleak_sysctls[] = {
+ {
+ .procname = "stack_erasing",
+ .data = NULL,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = stack_erasing_sysctl,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+};
+
+static int __init stackleak_sysctls_init(void)
+{
+ register_sysctl_init("kernel", stackleak_sysctls);
+ return 0;
+}
+late_initcall(stackleak_sysctls_init);
+#endif /* CONFIG_SYSCTL */
+
+#define skip_erasing() static_branch_unlikely(&stack_erasing_bypass)
+#else
+#define skip_erasing() false
+#endif /* CONFIG_KSTACK_ERASE_RUNTIME_DISABLE */
+
+#ifndef __stackleak_poison
+static __always_inline void __stackleak_poison(unsigned long erase_low,
+ unsigned long erase_high,
+ unsigned long poison)
+{
+ while (erase_low < erase_high) {
+ *(unsigned long *)erase_low = poison;
+ erase_low += sizeof(unsigned long);
+ }
+}
+#endif
+
+static __always_inline void __stackleak_erase(bool on_task_stack)
+{
+ const unsigned long task_stack_low = stackleak_task_low_bound(current);
+ const unsigned long task_stack_high = stackleak_task_high_bound(current);
+ unsigned long erase_low, erase_high;
+
+ erase_low = stackleak_find_top_of_poison(task_stack_low,
+ current->lowest_stack);
+
+#ifdef CONFIG_KSTACK_ERASE_METRICS
+ current->prev_lowest_stack = erase_low;
+#endif
+
+ /*
+ * Write poison to the task's stack between 'erase_low' and
+ * 'erase_high'.
+ *
+ * If we're running on a different stack (e.g. an entry trampoline
+ * stack) we can erase everything below the pt_regs at the top of the
+ * task stack.
+ *
+ * If we're running on the task stack itself, we must not clobber any
+ * stack used by this function and its caller. We assume that this
+ * function has a fixed-size stack frame, and the current stack pointer
+ * doesn't change while we write poison.
+ */
+ if (on_task_stack)
+ erase_high = current_stack_pointer;
+ else
+ erase_high = task_stack_high;
+
+ __stackleak_poison(erase_low, erase_high, KSTACK_ERASE_POISON);
+
+ /* Reset the 'lowest_stack' value for the next syscall */
+ current->lowest_stack = task_stack_high;
+}
+
+/*
+ * Erase and poison the portion of the task stack used since the last erase.
+ * Can be called from the task stack or an entry stack when the task stack is
+ * no longer in use.
+ */
+asmlinkage void noinstr stackleak_erase(void)
+{
+ if (skip_erasing())
+ return;
+
+ __stackleak_erase(on_thread_stack());
+}
+
+/*
+ * Erase and poison the portion of the task stack used since the last erase.
+ * Can only be called from the task stack.
+ */
+asmlinkage void noinstr stackleak_erase_on_task_stack(void)
+{
+ if (skip_erasing())
+ return;
+
+ __stackleak_erase(true);
+}
+
+/*
+ * Erase and poison the portion of the task stack used since the last erase.
+ * Can only be called from a stack other than the task stack.
+ */
+asmlinkage void noinstr stackleak_erase_off_task_stack(void)
+{
+ if (skip_erasing())
+ return;
+
+ __stackleak_erase(false);
+}
+
+void __used __no_caller_saved_registers noinstr __sanitizer_cov_stack_depth(void)
+{
+ unsigned long sp = current_stack_pointer;
+
+ /*
+ * Having CONFIG_KSTACK_ERASE_TRACK_MIN_SIZE larger than
+ * KSTACK_ERASE_SEARCH_DEPTH makes the poison search in
+ * stackleak_erase() unreliable. Let's prevent that.
+ */
+ BUILD_BUG_ON(CONFIG_KSTACK_ERASE_TRACK_MIN_SIZE > KSTACK_ERASE_SEARCH_DEPTH);
+
+ /* 'lowest_stack' should be aligned on the register width boundary */
+ sp = ALIGN(sp, sizeof(unsigned long));
+ if (sp < current->lowest_stack &&
+ sp >= stackleak_task_low_bound(current)) {
+ current->lowest_stack = sp;
+ }
+}
+EXPORT_SYMBOL(__sanitizer_cov_stack_depth);
diff --git a/kernel/ksyms_common.c b/kernel/ksyms_common.c
new file mode 100644
index 000000000000..cf1a73cbf2f6
--- /dev/null
+++ b/kernel/ksyms_common.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * ksyms_common.c: A split of kernel/kallsyms.c
+ * Contains a few generic function definations independent of config KALLSYMS.
+ */
+#include <linux/kallsyms.h>
+#include <linux/security.h>
+
+static inline int kallsyms_for_perf(void)
+{
+#ifdef CONFIG_PERF_EVENTS
+ extern int sysctl_perf_event_paranoid;
+
+ if (sysctl_perf_event_paranoid <= 1)
+ return 1;
+#endif
+ return 0;
+}
+
+/*
+ * We show kallsyms information even to normal users if we've enabled
+ * kernel profiling and are explicitly not paranoid (so kptr_restrict
+ * is clear, and sysctl_perf_event_paranoid isn't set).
+ *
+ * Otherwise, require CAP_SYSLOG (assuming kptr_restrict isn't set to
+ * block even that).
+ */
+bool kallsyms_show_value(const struct cred *cred)
+{
+ switch (kptr_restrict) {
+ case 0:
+ if (kallsyms_for_perf())
+ return true;
+ fallthrough;
+ case 1:
+ if (security_capable(cred, &init_user_ns, CAP_SYSLOG,
+ CAP_OPT_NOAUDIT) == 0)
+ return true;
+ fallthrough;
+ default:
+ return false;
+ }
+}
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 35859da8bd4f..a9e6354d9e25 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -6,12 +6,13 @@
* Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org>
*/
+#include <asm/byteorder.h>
#include <linux/kobject.h>
#include <linux/string.h>
#include <linux/sysfs.h>
#include <linux/export.h>
#include <linux/init.h>
-#include <linux/kexec.h>
+#include <linux/vmcore_info.h>
#include <linux/profile.h>
#include <linux/stat.h>
#include <linux/sched.h>
@@ -20,27 +21,50 @@
#include <linux/rcupdate.h> /* rcu_expedited and rcu_normal */
+#if defined(__LITTLE_ENDIAN)
+#define CPU_BYTEORDER_STRING "little"
+#elif defined(__BIG_ENDIAN)
+#define CPU_BYTEORDER_STRING "big"
+#else
+#error Unknown byteorder
+#endif
+
#define KERNEL_ATTR_RO(_name) \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
#define KERNEL_ATTR_RW(_name) \
-static struct kobj_attribute _name##_attr = \
- __ATTR(_name, 0644, _name##_show, _name##_store)
+static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
/* current uevent sequence number */
static ssize_t uevent_seqnum_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%llu\n", (unsigned long long)uevent_seqnum);
+ return sysfs_emit(buf, "%llu\n", (u64)atomic64_read(&uevent_seqnum));
}
KERNEL_ATTR_RO(uevent_seqnum);
+/* cpu byteorder */
+static ssize_t cpu_byteorder_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%s\n", CPU_BYTEORDER_STRING);
+}
+KERNEL_ATTR_RO(cpu_byteorder);
+
+/* address bits */
+static ssize_t address_bits_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%zu\n", sizeof(void *) * 8 /* CHAR_BIT */);
+}
+KERNEL_ATTR_RO(address_bits);
+
#ifdef CONFIG_UEVENT_HELPER
/* uevent helper program, used during early boot */
static ssize_t uevent_helper_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%s\n", uevent_helper);
+ return sysfs_emit(buf, "%s\n", uevent_helper);
}
static ssize_t uevent_helper_store(struct kobject *kobj,
struct kobj_attribute *attr,
@@ -61,14 +85,21 @@ KERNEL_ATTR_RW(uevent_helper);
static ssize_t profiling_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", prof_on);
+ return sysfs_emit(buf, "%d\n", prof_on);
}
static ssize_t profiling_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
int ret;
+ static DEFINE_MUTEX(lock);
+ /*
+ * We need serialization, for profile_setup() initializes prof_on
+ * value and profile_init() must not reallocate prof_buffer after
+ * once allocated.
+ */
+ guard(mutex)(&lock);
if (prof_on)
return -EEXIST;
/*
@@ -88,61 +119,24 @@ static ssize_t profiling_store(struct kobject *kobj,
KERNEL_ATTR_RW(profiling);
#endif
-#ifdef CONFIG_KEXEC_CORE
-static ssize_t kexec_loaded_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- return sprintf(buf, "%d\n", !!kexec_image);
-}
-KERNEL_ATTR_RO(kexec_loaded);
-
-static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- return sprintf(buf, "%d\n", kexec_crash_loaded());
-}
-KERNEL_ATTR_RO(kexec_crash_loaded);
-
-static ssize_t kexec_crash_size_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- return sprintf(buf, "%zu\n", crash_get_memory_size());
-}
-static ssize_t kexec_crash_size_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
-{
- unsigned long cnt;
- int ret;
-
- if (kstrtoul(buf, 0, &cnt))
- return -EINVAL;
-
- ret = crash_shrink_memory(cnt);
- return ret < 0 ? ret : count;
-}
-KERNEL_ATTR_RW(kexec_crash_size);
-
-#endif /* CONFIG_KEXEC_CORE */
-
-#ifdef CONFIG_CRASH_CORE
+#ifdef CONFIG_VMCORE_INFO
static ssize_t vmcoreinfo_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
phys_addr_t vmcore_base = paddr_vmcoreinfo_note();
- return sprintf(buf, "%pa %x\n", &vmcore_base,
- (unsigned int)VMCOREINFO_NOTE_SIZE);
+ return sysfs_emit(buf, "%pa %x\n", &vmcore_base,
+ (unsigned int)VMCOREINFO_NOTE_SIZE);
}
KERNEL_ATTR_RO(vmcoreinfo);
-#endif /* CONFIG_CRASH_CORE */
+#endif /* CONFIG_VMCORE_INFO */
/* whether file capabilities are enabled */
static ssize_t fscaps_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", file_caps_enabled);
+ return sysfs_emit(buf, "%d\n", file_caps_enabled);
}
KERNEL_ATTR_RO(fscaps);
@@ -151,7 +145,7 @@ int rcu_expedited;
static ssize_t rcu_expedited_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", READ_ONCE(rcu_expedited));
+ return sysfs_emit(buf, "%d\n", READ_ONCE(rcu_expedited));
}
static ssize_t rcu_expedited_store(struct kobject *kobj,
struct kobj_attribute *attr,
@@ -168,7 +162,7 @@ int rcu_normal;
static ssize_t rcu_normal_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", READ_ONCE(rcu_normal));
+ return sysfs_emit(buf, "%d\n", READ_ONCE(rcu_normal));
}
static ssize_t rcu_normal_store(struct kobject *kobj,
struct kobj_attribute *attr,
@@ -185,25 +179,11 @@ KERNEL_ATTR_RW(rcu_normal);
/*
* Make /sys/kernel/notes give the raw contents of our kernel .notes section.
*/
-extern const void __start_notes __weak;
-extern const void __stop_notes __weak;
+extern const void __start_notes;
+extern const void __stop_notes;
#define notes_size (&__stop_notes - &__start_notes)
-static ssize_t notes_read(struct file *filp, struct kobject *kobj,
- struct bin_attribute *bin_attr,
- char *buf, loff_t off, size_t count)
-{
- memcpy(buf, &__start_notes + off, count);
- return count;
-}
-
-static struct bin_attribute notes_attr __ro_after_init = {
- .attr = {
- .name = "notes",
- .mode = S_IRUGO,
- },
- .read = &notes_read,
-};
+static __ro_after_init BIN_ATTR_SIMPLE_RO(notes);
struct kobject *kernel_kobj;
EXPORT_SYMBOL_GPL(kernel_kobj);
@@ -211,18 +191,15 @@ EXPORT_SYMBOL_GPL(kernel_kobj);
static struct attribute * kernel_attrs[] = {
&fscaps_attr.attr,
&uevent_seqnum_attr.attr,
+ &cpu_byteorder_attr.attr,
+ &address_bits_attr.attr,
#ifdef CONFIG_UEVENT_HELPER
&uevent_helper_attr.attr,
#endif
#ifdef CONFIG_PROFILING
&profiling_attr.attr,
#endif
-#ifdef CONFIG_KEXEC_CORE
- &kexec_loaded_attr.attr,
- &kexec_crash_loaded_attr.attr,
- &kexec_crash_size_attr.attr,
-#endif
-#ifdef CONFIG_CRASH_CORE
+#ifdef CONFIG_VMCORE_INFO
&vmcoreinfo_attr.attr,
#endif
#ifndef CONFIG_TINY_RCU
@@ -250,8 +227,9 @@ static int __init ksysfs_init(void)
goto kset_exit;
if (notes_size > 0) {
- notes_attr.size = notes_size;
- error = sysfs_create_bin_file(kernel_kobj, &notes_attr);
+ bin_attr_notes.private = (void *)&__start_notes;
+ bin_attr_notes.size = notes_size;
+ error = sysfs_create_bin_file(kernel_kobj, &bin_attr_notes);
if (error)
goto group_exit;
}
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 38c6dd822da8..99a3808d086f 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -35,9 +35,13 @@ static DEFINE_SPINLOCK(kthread_create_lock);
static LIST_HEAD(kthread_create_list);
struct task_struct *kthreadd_task;
+static LIST_HEAD(kthreads_hotplug);
+static DEFINE_MUTEX(kthreads_hotplug_lock);
+
struct kthread_create_info
{
/* Information passed to kthread() from kthreadd. */
+ char *full_name;
int (*threadfn)(void *data);
void *data;
int node;
@@ -52,10 +56,11 @@ struct kthread_create_info
struct kthread {
unsigned long flags;
unsigned int cpu;
+ unsigned int node;
+ int started;
int result;
int (*threadfn)(void *);
void *data;
- mm_segment_t oldfs;
struct completion parked;
struct completion exited;
#ifdef CONFIG_BLK_CGROUP
@@ -63,6 +68,9 @@ struct kthread {
#endif
/* To store the full name if task comm is truncated. */
char *full_name;
+ struct task_struct *task;
+ struct list_head hotplug_node;
+ struct cpumask *preferred_affinity;
};
enum KTHREAD_BITS {
@@ -80,13 +88,12 @@ static inline struct kthread *to_kthread(struct task_struct *k)
/*
* Variant of to_kthread() that doesn't assume @p is a kthread.
*
- * Per construction; when:
- *
- * (p->flags & PF_KTHREAD) && p->worker_private
+ * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will
+ * always remain a kthread. For kthreads p->worker_private always
+ * points to a struct kthread. For tasks that are not kthreads
+ * p->worker_private is used to point to other things.
*
- * the task is both a kthread and struct kthread is persistent. However
- * PF_KTHREAD on it's own is not, kernel_thread() can exec() (See umh.c and
- * begin_new_exec()).
+ * Return NULL for any task that is not a kthread.
*/
static inline struct kthread *__to_kthread(struct task_struct *p)
{
@@ -101,7 +108,7 @@ void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk)
struct kthread *kthread = to_kthread(tsk);
if (!kthread || !kthread->full_name) {
- __get_task_comm(buf, buf_size, tsk);
+ strscpy(buf, tsk->comm, buf_size);
return;
}
@@ -121,8 +128,11 @@ bool set_kthread_struct(struct task_struct *p)
init_completion(&kthread->exited);
init_completion(&kthread->parked);
+ INIT_LIST_HEAD(&kthread->hotplug_node);
p->vfork_done = &kthread->exited;
+ kthread->task = p;
+ kthread->node = tsk_fork_get_node(current);
p->worker_private = kthread;
return true;
}
@@ -159,11 +169,10 @@ bool kthread_should_stop(void)
}
EXPORT_SYMBOL(kthread_should_stop);
-bool __kthread_should_park(struct task_struct *k)
+static bool __kthread_should_park(struct task_struct *k)
{
return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags);
}
-EXPORT_SYMBOL_GPL(__kthread_should_park);
/**
* kthread_should_park - should this kthread park now?
@@ -182,6 +191,16 @@ bool kthread_should_park(void)
}
EXPORT_SYMBOL_GPL(kthread_should_park);
+bool kthread_should_stop_or_park(void)
+{
+ struct kthread *kthread = __to_kthread(current);
+
+ if (!kthread)
+ return false;
+
+ return kthread->flags & (BIT(KTHREAD_SHOULD_STOP) | BIT(KTHREAD_SHOULD_PARK));
+}
+
/**
* kthread_freezable_should_stop - should this freezable kthread return now?
* @was_frozen: optional out parameter, indicates whether %current was frozen
@@ -304,18 +323,29 @@ void __noreturn kthread_exit(long result)
{
struct kthread *kthread = to_kthread(current);
kthread->result = result;
+ if (!list_empty(&kthread->hotplug_node)) {
+ mutex_lock(&kthreads_hotplug_lock);
+ list_del(&kthread->hotplug_node);
+ mutex_unlock(&kthreads_hotplug_lock);
+
+ if (kthread->preferred_affinity) {
+ kfree(kthread->preferred_affinity);
+ kthread->preferred_affinity = NULL;
+ }
+ }
do_exit(0);
}
+EXPORT_SYMBOL(kthread_exit);
/**
* kthread_complete_and_exit - Exit the current kthread.
* @comp: Completion to complete
* @code: The integer value to return to kthread_stop().
*
- * If present complete @comp and the reuturn code to kthread_stop().
+ * If present, complete @comp and then return code to kthread_stop().
*
* A kernel thread whose module may be removed after the completion of
- * @comp can use this function exit safely.
+ * @comp can use this function to exit safely.
*
* Does not return.
*/
@@ -328,6 +358,56 @@ void __noreturn kthread_complete_and_exit(struct completion *comp, long code)
}
EXPORT_SYMBOL(kthread_complete_and_exit);
+static void kthread_fetch_affinity(struct kthread *kthread, struct cpumask *cpumask)
+{
+ const struct cpumask *pref;
+
+ if (kthread->preferred_affinity) {
+ pref = kthread->preferred_affinity;
+ } else {
+ if (WARN_ON_ONCE(kthread->node == NUMA_NO_NODE))
+ return;
+ pref = cpumask_of_node(kthread->node);
+ }
+
+ cpumask_and(cpumask, pref, housekeeping_cpumask(HK_TYPE_KTHREAD));
+ if (cpumask_empty(cpumask))
+ cpumask_copy(cpumask, housekeeping_cpumask(HK_TYPE_KTHREAD));
+}
+
+static void kthread_affine_node(void)
+{
+ struct kthread *kthread = to_kthread(current);
+ cpumask_var_t affinity;
+
+ WARN_ON_ONCE(kthread_is_per_cpu(current));
+
+ if (kthread->node == NUMA_NO_NODE) {
+ housekeeping_affine(current, HK_TYPE_KTHREAD);
+ } else {
+ if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ mutex_lock(&kthreads_hotplug_lock);
+ WARN_ON_ONCE(!list_empty(&kthread->hotplug_node));
+ list_add_tail(&kthread->hotplug_node, &kthreads_hotplug);
+ /*
+ * The node cpumask is racy when read from kthread() but:
+ * - a racing CPU going down will either fail on the subsequent
+ * call to set_cpus_allowed_ptr() or be migrated to housekeepers
+ * afterwards by the scheduler.
+ * - a racing CPU going up will be handled by kthreads_online_cpu()
+ */
+ kthread_fetch_affinity(kthread, affinity);
+ set_cpus_allowed_ptr(current, affinity);
+ mutex_unlock(&kthreads_hotplug_lock);
+
+ free_cpumask_var(affinity);
+ }
+}
+
static int kthread(void *_create)
{
static const struct sched_param param = { .sched_priority = 0 };
@@ -341,13 +421,15 @@ static int kthread(void *_create)
self = to_kthread(current);
- /* If user was SIGKILLed, I release the structure. */
+ /* Release the structure when caller killed by a fatal signal. */
done = xchg(&create->done, NULL);
if (!done) {
+ kfree(create->full_name);
kfree(create);
kthread_exit(-EINTR);
}
+ self->full_name = create->full_name;
self->threadfn = threadfn;
self->data = data;
@@ -356,7 +438,6 @@ static int kthread(void *_create)
* back to default in case they have been changed.
*/
sched_setscheduler_nocheck(current, SCHED_NORMAL, &param);
- set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_KTHREAD));
/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_UNINTERRUPTIBLE);
@@ -370,6 +451,11 @@ static int kthread(void *_create)
schedule_preempt_disabled();
preempt_enable();
+ self->started = 1;
+
+ if (!(current->flags & PF_NO_SETAFFINITY) && !self->preferred_affinity)
+ kthread_affine_node();
+
ret = -EINTR;
if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) {
cgroup_kthread_ready();
@@ -397,11 +483,13 @@ static void create_kthread(struct kthread_create_info *create)
current->pref_node_fork = create->node;
#endif
/* We want our own signal handler (we take no signals by default). */
- pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
+ pid = kernel_thread(kthread, create, create->full_name,
+ CLONE_FS | CLONE_FILES | SIGCHLD);
if (pid < 0) {
- /* If user was SIGKILLed, I release the structure. */
+ /* Release the structure when caller killed by a fatal signal. */
struct completion *done = xchg(&create->done, NULL);
+ kfree(create->full_name);
if (!done) {
kfree(create);
return;
@@ -428,6 +516,11 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
create->data = data;
create->node = node;
create->done = &done;
+ create->full_name = kvasprintf(GFP_KERNEL, namefmt, args);
+ if (!create->full_name) {
+ task = ERR_PTR(-ENOMEM);
+ goto free_create;
+ }
spin_lock(&kthread_create_lock);
list_add_tail(&create->list, &kthread_create_list);
@@ -441,9 +534,9 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
*/
if (unlikely(wait_for_completion_killable(&done))) {
/*
- * If I was SIGKILLed before kthreadd (or new kernel thread)
- * calls complete(), leave the cleanup of this structure to
- * that thread.
+ * If I was killed by a fatal signal before kthreadd (or new
+ * kernel thread) calls complete(), leave the cleanup of this
+ * structure to that thread.
*/
if (xchg(&create->done, NULL))
return ERR_PTR(-EINTR);
@@ -454,26 +547,7 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
wait_for_completion(&done);
}
task = create->result;
- if (!IS_ERR(task)) {
- char name[TASK_COMM_LEN];
- va_list aq;
- int len;
-
- /*
- * task is already visible to other tasks, so updating
- * COMM must be protected.
- */
- va_copy(aq, args);
- len = vsnprintf(name, sizeof(name), namefmt, aq);
- va_end(aq);
- if (len >= TASK_COMM_LEN) {
- struct kthread *kthread = to_kthread(task);
-
- /* leave it truncated when out of memory. */
- kthread->full_name = kvasprintf(GFP_KERNEL, namefmt, args);
- }
- set_task_comm(task, name);
- }
+free_create:
kfree(create);
return task;
}
@@ -519,18 +593,16 @@ EXPORT_SYMBOL(kthread_create_on_node);
static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state)
{
- unsigned long flags;
-
if (!wait_task_inactive(p, state)) {
WARN_ON(1);
return;
}
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock)
+ set_cpus_allowed_force(p, mask);
+
/* It's safe because the task is inactive. */
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- do_set_cpus_allowed(p, mask);
p->flags |= PF_NO_SETAFFINITY;
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}
static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state)
@@ -540,7 +612,9 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int
void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask)
{
+ struct kthread *kthread = to_kthread(p);
__kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE);
+ WARN_ON_ONCE(kthread->started);
}
/**
@@ -554,7 +628,9 @@ void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask)
*/
void kthread_bind(struct task_struct *p, unsigned int cpu)
{
+ struct kthread *kthread = to_kthread(p);
__kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE);
+ WARN_ON_ONCE(kthread->started);
}
EXPORT_SYMBOL(kthread_bind);
@@ -623,6 +699,8 @@ void kthread_unpark(struct task_struct *k)
{
struct kthread *kthread = to_kthread(k);
+ if (!test_bit(KTHREAD_SHOULD_PARK, &kthread->flags))
+ return;
/*
* Newly created kthread was parked when the CPU was offline.
* The binding was lost and we need to set it again.
@@ -705,6 +783,7 @@ int kthread_stop(struct task_struct *k)
kthread = to_kthread(k);
set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
kthread_unpark(k);
+ set_tsk_thread_flag(k, TIF_NOTIFY_SIGNAL);
wake_up_process(k);
wait_for_completion(&kthread->exited);
ret = kthread->result;
@@ -715,14 +794,33 @@ int kthread_stop(struct task_struct *k)
}
EXPORT_SYMBOL(kthread_stop);
+/**
+ * kthread_stop_put - stop a thread and put its task struct
+ * @k: thread created by kthread_create().
+ *
+ * Stops a thread created by kthread_create() and put its task_struct.
+ * Only use when holding an extra task struct reference obtained by
+ * calling get_task_struct().
+ */
+int kthread_stop_put(struct task_struct *k)
+{
+ int ret;
+
+ ret = kthread_stop(k);
+ put_task_struct(k);
+ return ret;
+}
+EXPORT_SYMBOL(kthread_stop_put);
+
int kthreadd(void *unused)
{
+ static const char comm[TASK_COMM_LEN] = "kthreadd";
struct task_struct *tsk = current;
/* Setup a clean context for our children to inherit. */
- set_task_comm(tsk, "kthreadd");
+ set_task_comm(tsk, comm);
ignore_signals(tsk);
- set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_FLAG_KTHREAD));
+ set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD));
set_mems_allowed(node_states[N_MEMORY]);
current->flags |= PF_NOFREEZE;
@@ -753,6 +851,90 @@ int kthreadd(void *unused)
return 0;
}
+int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
+{
+ struct kthread *kthread = to_kthread(p);
+ cpumask_var_t affinity;
+ int ret = 0;
+
+ if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ WARN_ON_ONCE(kthread->preferred_affinity);
+
+ if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
+ return -ENOMEM;
+
+ kthread->preferred_affinity = kzalloc(sizeof(struct cpumask), GFP_KERNEL);
+ if (!kthread->preferred_affinity) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ mutex_lock(&kthreads_hotplug_lock);
+ cpumask_copy(kthread->preferred_affinity, mask);
+ WARN_ON_ONCE(!list_empty(&kthread->hotplug_node));
+ list_add_tail(&kthread->hotplug_node, &kthreads_hotplug);
+ kthread_fetch_affinity(kthread, affinity);
+
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock)
+ set_cpus_allowed_force(p, affinity);
+
+ mutex_unlock(&kthreads_hotplug_lock);
+out:
+ free_cpumask_var(affinity);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kthread_affine_preferred);
+
+/*
+ * Re-affine kthreads according to their preferences
+ * and the newly online CPU. The CPU down part is handled
+ * by select_fallback_rq() which default re-affines to
+ * housekeepers from other nodes in case the preferred
+ * affinity doesn't apply anymore.
+ */
+static int kthreads_online_cpu(unsigned int cpu)
+{
+ cpumask_var_t affinity;
+ struct kthread *k;
+ int ret;
+
+ guard(mutex)(&kthreads_hotplug_lock);
+
+ if (list_empty(&kthreads_hotplug))
+ return 0;
+
+ if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
+ return -ENOMEM;
+
+ ret = 0;
+
+ list_for_each_entry(k, &kthreads_hotplug, hotplug_node) {
+ if (WARN_ON_ONCE((k->task->flags & PF_NO_SETAFFINITY) ||
+ kthread_is_per_cpu(k->task))) {
+ ret = -EINVAL;
+ continue;
+ }
+ kthread_fetch_affinity(k, affinity);
+ set_cpus_allowed_ptr(k->task, affinity);
+ }
+
+ free_cpumask_var(affinity);
+
+ return ret;
+}
+
+static int kthreads_init(void)
+{
+ return cpuhp_setup_state(CPUHP_AP_KTHREADS_ONLINE, "kthreads:online",
+ kthreads_online_cpu, NULL);
+}
+early_initcall(kthreads_init);
+
void __kthread_init_worker(struct kthread_worker *worker,
const char *name,
struct lock_class_key *key)
@@ -826,8 +1008,16 @@ repeat:
* event only cares about the address.
*/
trace_sched_kthread_work_execute_end(work, func);
- } else if (!freezing(current))
+ } else if (!freezing(current)) {
schedule();
+ } else {
+ /*
+ * Handle the case where the current remains
+ * TASK_INTERRUPTIBLE. try_to_freeze() expects
+ * the current to be TASK_RUNNING.
+ */
+ __set_current_state(TASK_RUNNING);
+ }
try_to_freeze();
cond_resched();
@@ -836,12 +1026,11 @@ repeat:
EXPORT_SYMBOL_GPL(kthread_worker_fn);
static __printf(3, 0) struct kthread_worker *
-__kthread_create_worker(int cpu, unsigned int flags,
- const char namefmt[], va_list args)
+__kthread_create_worker_on_node(unsigned int flags, int node,
+ const char namefmt[], va_list args)
{
struct kthread_worker *worker;
struct task_struct *task;
- int node = NUMA_NO_NODE;
worker = kzalloc(sizeof(*worker), GFP_KERNEL);
if (!worker)
@@ -849,20 +1038,14 @@ __kthread_create_worker(int cpu, unsigned int flags,
kthread_init_worker(worker);
- if (cpu >= 0)
- node = cpu_to_node(cpu);
-
task = __kthread_create_on_node(kthread_worker_fn, worker,
- node, namefmt, args);
+ node, namefmt, args);
if (IS_ERR(task))
goto fail_task;
- if (cpu >= 0)
- kthread_bind(task, cpu);
-
worker->flags = flags;
worker->task = task;
- wake_up_process(task);
+
return worker;
fail_task:
@@ -871,34 +1054,36 @@ fail_task:
}
/**
- * kthread_create_worker - create a kthread worker
+ * kthread_create_worker_on_node - create a kthread worker
* @flags: flags modifying the default behavior of the worker
+ * @node: task structure for the thread is allocated on this node
* @namefmt: printf-style name for the kthread worker (task).
*
* Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM)
* when the needed structures could not get allocated, and ERR_PTR(-EINTR)
- * when the worker was SIGKILLed.
+ * when the caller was killed by a fatal signal.
*/
struct kthread_worker *
-kthread_create_worker(unsigned int flags, const char namefmt[], ...)
+kthread_create_worker_on_node(unsigned int flags, int node, const char namefmt[], ...)
{
struct kthread_worker *worker;
va_list args;
va_start(args, namefmt);
- worker = __kthread_create_worker(-1, flags, namefmt, args);
+ worker = __kthread_create_worker_on_node(flags, node, namefmt, args);
va_end(args);
return worker;
}
-EXPORT_SYMBOL(kthread_create_worker);
+EXPORT_SYMBOL(kthread_create_worker_on_node);
/**
* kthread_create_worker_on_cpu - create a kthread worker and bind it
* to a given CPU and the associated NUMA node.
* @cpu: CPU number
* @flags: flags modifying the default behavior of the worker
- * @namefmt: printf-style name for the kthread worker (task).
+ * @namefmt: printf-style name for the thread. Format is restricted
+ * to "name.*%u". Code fills in cpu number.
*
* Use a valid CPU number if you want to bind the kthread worker
* to the given CPU and the associated NUMA node.
@@ -926,18 +1111,17 @@ EXPORT_SYMBOL(kthread_create_worker);
* Return:
* The pointer to the allocated worker on success, ERR_PTR(-ENOMEM)
* when the needed structures could not get allocated, and ERR_PTR(-EINTR)
- * when the worker was SIGKILLed.
+ * when the caller was killed by a fatal signal.
*/
struct kthread_worker *
kthread_create_worker_on_cpu(int cpu, unsigned int flags,
- const char namefmt[], ...)
+ const char namefmt[])
{
struct kthread_worker *worker;
- va_list args;
- va_start(args, namefmt);
- worker = __kthread_create_worker(cpu, flags, namefmt, args);
- va_end(args);
+ worker = kthread_create_worker_on_node(flags, cpu_to_node(cpu), namefmt, cpu);
+ if (!IS_ERR(worker))
+ kthread_bind(worker->task, cpu);
return worker;
}
@@ -986,7 +1170,7 @@ static void kthread_insert_work(struct kthread_worker *worker,
* @work: kthread_work to queue
*
* Queue @work to work processor @task for async execution. @task
- * must have been created with kthread_worker_create(). Returns %true
+ * must have been created with kthread_create_worker(). Returns %true
* if @work was successfully queued, %false if it was already pending.
*
* Reinitialize the work if it needs to be used by another worker.
@@ -1018,7 +1202,8 @@ EXPORT_SYMBOL_GPL(kthread_queue_work);
*/
void kthread_delayed_work_timer_fn(struct timer_list *t)
{
- struct kthread_delayed_work *dwork = from_timer(dwork, t, timer);
+ struct kthread_delayed_work *dwork = timer_container_of(dwork, t,
+ timer);
struct kthread_work *work = &dwork->work;
struct kthread_worker *worker = work->worker;
unsigned long flags;
@@ -1051,8 +1236,7 @@ static void __kthread_queue_delayed_work(struct kthread_worker *worker,
struct timer_list *timer = &dwork->timer;
struct kthread_work *work = &dwork->work;
- WARN_ON_FUNCTION_MISMATCH(timer->function,
- kthread_delayed_work_timer_fn);
+ WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn);
/*
* If @delay is 0, queue @dwork->work immediately. This is for
@@ -1174,14 +1358,14 @@ static void kthread_cancel_delayed_work_timer(struct kthread_work *work,
struct kthread_worker *worker = work->worker;
/*
- * del_timer_sync() must be called to make sure that the timer
+ * timer_delete_sync() must be called to make sure that the timer
* callback is not running. The lock must be temporary released
* to avoid a deadlock with the callback. In the meantime,
* any queuing is blocked by setting the canceling counter.
*/
work->canceling++;
raw_spin_unlock_irqrestore(&worker->lock, *flags);
- del_timer_sync(&dwork->timer);
+ timer_delete_sync(&dwork->timer);
raw_spin_lock_irqsave(&worker->lock, *flags);
work->canceling--;
}
@@ -1383,6 +1567,10 @@ EXPORT_SYMBOL_GPL(kthread_flush_worker);
* Flush and destroy @worker. The simple flush is enough because the kthread
* worker API is used only in trivial scenarios. There are no multi-step state
* machines needed.
+ *
+ * Note that this function is not responsible for handling delayed work, so
+ * caller should be responsible for queuing or canceling all delayed work items
+ * before invoke this function.
*/
void kthread_destroy_worker(struct kthread_worker *worker)
{
@@ -1394,6 +1582,7 @@ void kthread_destroy_worker(struct kthread_worker *worker)
kthread_flush_worker(worker);
kthread_stop(task);
+ WARN_ON(!list_empty(&worker->delayed_work_list));
WARN_ON(!list_empty(&worker->work_list));
kfree(worker);
}
@@ -1411,14 +1600,18 @@ void kthread_use_mm(struct mm_struct *mm)
WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
WARN_ON_ONCE(tsk->mm);
+ /*
+ * It is possible for mm to be the same as tsk->active_mm, but
+ * we must still mmgrab(mm) and mmdrop_lazy_tlb(active_mm),
+ * because these references are not equivalent.
+ */
+ mmgrab(mm);
+
task_lock(tsk);
/* Hold off tlb flush IPIs while switching mm's */
local_irq_disable();
active_mm = tsk->active_mm;
- if (active_mm != mm) {
- mmgrab(mm);
- tsk->active_mm = mm;
- }
+ tsk->active_mm = mm;
tsk->mm = mm;
membarrier_update_current_mm(mm);
switch_mm_irqs_off(active_mm, mm, tsk);
@@ -1435,14 +1628,9 @@ void kthread_use_mm(struct mm_struct *mm)
* memory barrier after storing to tsk->mm, before accessing
* user-space memory. A full memory barrier for membarrier
* {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by
- * mmdrop(), or explicitly with smp_mb().
+ * mmdrop_lazy_tlb().
*/
- if (active_mm != mm)
- mmdrop(active_mm);
- else
- smp_mb();
-
- to_kthread(tsk)->oldfs = force_uaccess_begin();
+ mmdrop_lazy_tlb(active_mm);
}
EXPORT_SYMBOL_GPL(kthread_use_mm);
@@ -1457,8 +1645,6 @@ void kthread_unuse_mm(struct mm_struct *mm)
WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
WARN_ON_ONCE(!tsk->mm);
- force_uaccess_end(to_kthread(tsk)->oldfs);
-
task_lock(tsk);
/*
* When a kthread stops operating on an address space, the loop
@@ -1468,14 +1654,16 @@ void kthread_unuse_mm(struct mm_struct *mm)
* clearing tsk->mm.
*/
smp_mb__after_spinlock();
- sync_mm_rss(mm);
local_irq_disable();
tsk->mm = NULL;
membarrier_update_current_mm(NULL);
+ mmgrab_lazy_tlb(mm);
/* active_mm is still 'mm' */
enter_lazy_tlb(mm, tsk);
local_irq_enable();
task_unlock(tsk);
+
+ mmdrop(mm);
}
EXPORT_SYMBOL_GPL(kthread_unuse_mm);
@@ -1527,5 +1715,4 @@ struct cgroup_subsys_state *kthread_blkcg(void)
}
return NULL;
}
-EXPORT_SYMBOL(kthread_blkcg);
#endif
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 166d7bf49666..d4281d1e13a6 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -55,6 +55,7 @@
#include <linux/sched/stat.h>
#include <linux/list.h>
#include <linux/stacktrace.h>
+#include <linux/sysctl.h>
static DEFINE_RAW_SPINLOCK(latency_lock);
@@ -63,6 +64,30 @@ static struct latency_record latency_record[MAXLR];
int latencytop_enabled;
+#ifdef CONFIG_SYSCTL
+static int sysctl_latencytop(const struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int err;
+
+ err = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (latencytop_enabled)
+ force_schedstat_enabled();
+
+ return err;
+}
+
+static const struct ctl_table latencytop_sysctl[] = {
+ {
+ .procname = "latencytop",
+ .data = &latencytop_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sysctl_latencytop,
+ },
+};
+#endif
+
void clear_tsk_latency_tracing(struct task_struct *p)
{
unsigned long flags;
@@ -86,7 +111,7 @@ static void __sched
account_global_scheduler_latency(struct task_struct *tsk,
struct latency_record *lat)
{
- int firstnonnull = MAXLR + 1;
+ int firstnonnull = MAXLR;
int i;
/* skip kernel threads for now */
@@ -124,7 +149,7 @@ account_global_scheduler_latency(struct task_struct *tsk,
}
i = firstnonnull;
- if (i >= MAXLR - 1)
+ if (i >= MAXLR)
return;
/* Allocted a new one: */
@@ -133,9 +158,9 @@ account_global_scheduler_latency(struct task_struct *tsk,
/**
* __account_scheduler_latency - record an occurred latency
- * @tsk - the task struct of the task hitting the latency
- * @usecs - the duration of the latency in microseconds
- * @inter - 1 if the sleep was interruptible, 0 if uninterruptible
+ * @tsk: the task struct of the task hitting the latency
+ * @usecs: the duration of the latency in microseconds
+ * @inter: 1 if the sleep was interruptible, 0 if uninterruptible
*
* This function is the main entry point for recording latency entries
* as called by the scheduler.
@@ -266,18 +291,9 @@ static const struct proc_ops lstats_proc_ops = {
static int __init init_lstats_procfs(void)
{
proc_create("latency_stats", 0644, NULL, &lstats_proc_ops);
+#ifdef CONFIG_SYSCTL
+ register_sysctl_init("kernel", latencytop_sysctl);
+#endif
return 0;
}
-
-int sysctl_latencytop(struct ctl_table *table, int write, void *buffer,
- size_t *lenp, loff_t *ppos)
-{
- int err;
-
- err = proc_dointvec(table, write, buffer, lenp, ppos);
- if (latencytop_enabled)
- force_schedstat_enabled();
-
- return err;
-}
device_initcall(init_lstats_procfs);
diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig
index 53d51ed619a3..4c0a9c18d0b2 100644
--- a/kernel/livepatch/Kconfig
+++ b/kernel/livepatch/Kconfig
@@ -18,3 +18,15 @@ config LIVEPATCH
module uses the interface provided by this option to register
a patch, causing calls to patched functions to be redirected
to new function code contained in the patch module.
+
+config HAVE_KLP_BUILD
+ bool
+ help
+ Arch supports klp-build
+
+config KLP_BUILD
+ def_bool y
+ depends on LIVEPATCH && HAVE_KLP_BUILD
+ select OBJTOOL
+ help
+ Enable klp-build support
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 585494ec464f..9917756dae46 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -33,6 +33,7 @@
*
* - klp_ftrace_handler()
* - klp_update_patch_state()
+ * - __klp_sched_try_switch()
*/
DEFINE_MUTEX(klp_mutex);
@@ -58,7 +59,7 @@ static void klp_find_object_module(struct klp_object *obj)
if (!klp_is_module(obj))
return;
- rcu_read_lock_sched();
+ guard(rcu)();
/*
* We do not want to block removal of patched modules and therefore
* we do not take a reference here. The patches are removed by
@@ -74,8 +75,6 @@ static void klp_find_object_module(struct klp_object *obj)
*/
if (mod && mod->klp_alive)
obj->mod = mod;
-
- rcu_read_unlock_sched();
}
static bool klp_initialized(void)
@@ -89,8 +88,14 @@ static struct klp_func *klp_find_func(struct klp_object *obj,
struct klp_func *func;
klp_for_each_func(obj, func) {
+ /*
+ * Besides identical old_sympos, also consider old_sympos
+ * of 0 and 1 are identical.
+ */
if ((strcmp(old_func->old_name, func->old_name) == 0) &&
- (old_func->old_sympos == func->old_sympos)) {
+ ((old_func->old_sympos == func->old_sympos) ||
+ (old_func->old_sympos == 0 && func->old_sympos == 1) ||
+ (old_func->old_sympos == 1 && func->old_sympos == 0))) {
return func;
}
}
@@ -118,27 +123,16 @@ static struct klp_object *klp_find_object(struct klp_patch *patch,
}
struct klp_find_arg {
- const char *objname;
const char *name;
unsigned long addr;
unsigned long count;
unsigned long pos;
};
-static int klp_find_callback(void *data, const char *name,
- struct module *mod, unsigned long addr)
+static int klp_match_callback(void *data, unsigned long addr)
{
struct klp_find_arg *args = data;
- if ((mod && !args->objname) || (!mod && args->objname))
- return 0;
-
- if (strcmp(args->name, name))
- return 0;
-
- if (args->objname && strcmp(args->objname, mod->name))
- return 0;
-
args->addr = addr;
args->count++;
@@ -153,11 +147,20 @@ static int klp_find_callback(void *data, const char *name,
return 0;
}
+static int klp_find_callback(void *data, const char *name, unsigned long addr)
+{
+ struct klp_find_arg *args = data;
+
+ if (strcmp(args->name, name))
+ return 0;
+
+ return klp_match_callback(data, addr);
+}
+
static int klp_find_object_symbol(const char *objname, const char *name,
unsigned long sympos, unsigned long *addr)
{
struct klp_find_arg args = {
- .objname = objname,
.name = name,
.addr = 0,
.count = 0,
@@ -165,9 +168,9 @@ static int klp_find_object_symbol(const char *objname, const char *name,
};
if (objname)
- module_kallsyms_on_each_symbol(klp_find_callback, &args);
+ module_kallsyms_on_each_symbol(objname, klp_find_callback, &args);
else
- kallsyms_on_each_symbol(klp_find_callback, &args);
+ kallsyms_on_each_match_symbol(klp_match_callback, name, &args);
/*
* Ensure an address was found. If sympos is 0, ensure symbol is unique;
@@ -190,7 +193,7 @@ static int klp_find_object_symbol(const char *objname, const char *name,
return -EINVAL;
}
-static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab,
+static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab,
unsigned int symndx, Elf_Shdr *relasec,
const char *sec_objname)
{
@@ -213,21 +216,21 @@ static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab,
* we use the smallest/strictest upper bound possible (56, based on
* the current definition of MODULE_NAME_LEN) to prevent overflows.
*/
- BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 128);
+ BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 512);
relas = (Elf_Rela *) relasec->sh_addr;
/* For each rela in this klp relocation section */
for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) {
- sym = (Elf64_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info);
+ sym = (Elf_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info);
if (sym->st_shndx != SHN_LIVEPATCH) {
- pr_err("symbol %s is not marked as a livepatch symbol\n",
- strtab + sym->st_name);
+ pr_err("symbol %s at rela sec %u idx %d is not marked as a livepatch symbol\n",
+ strtab + sym->st_name, symndx, i);
return -EINVAL;
}
/* Format: .klp.sym.sym_objname.sym_name,sympos */
cnt = sscanf(strtab + sym->st_name,
- ".klp.sym.%55[^.].%127[^,],%lu",
+ KLP_SYM_PREFIX "%55[^.].%511[^,],%lu",
sym_objname, sym_name, &sympos);
if (cnt != 3) {
pr_err("symbol %s has an incorrectly formatted name\n",
@@ -244,7 +247,7 @@ static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab,
* symbols are exported and normal relas can be used instead.
*/
if (!sec_vmlinux && sym_vmlinux) {
- pr_err("invalid access to vmlinux symbol '%s' from module-specific livepatch relocation section",
+ pr_err("invalid access to vmlinux symbol '%s' from module-specific livepatch relocation section\n",
sym_name);
return -EINVAL;
}
@@ -261,6 +264,14 @@ static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab,
return 0;
}
+void __weak clear_relocate_add(Elf_Shdr *sechdrs,
+ const char *strtab,
+ unsigned int symindex,
+ unsigned int relsec,
+ struct module *me)
+{
+}
+
/*
* At a high-level, there are two types of klp relocation sections: those which
* reference symbols which live in vmlinux; and those which reference symbols
@@ -284,10 +295,10 @@ static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab,
* the to-be-patched module to be loaded and patched sometime *after* the
* klp module is loaded.
*/
-int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
- const char *shstrtab, const char *strtab,
- unsigned int symndx, unsigned int secndx,
- const char *objname)
+static int klp_write_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
+ const char *shstrtab, const char *strtab,
+ unsigned int symndx, unsigned int secndx,
+ const char *objname, bool apply)
{
int cnt, ret;
char sec_objname[MODULE_NAME_LEN];
@@ -298,7 +309,7 @@ int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
* See comment in klp_resolve_symbols() for an explanation
* of the selected field width value.
*/
- cnt = sscanf(shstrtab + sec->sh_name, ".klp.rela.%55[^.]",
+ cnt = sscanf(shstrtab + sec->sh_name, KLP_RELOC_SEC_PREFIX "%55[^.]",
sec_objname);
if (cnt != 1) {
pr_err("section %s has an incorrectly formatted name\n",
@@ -309,11 +320,26 @@ int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
if (strcmp(objname ? objname : "vmlinux", sec_objname))
return 0;
- ret = klp_resolve_symbols(sechdrs, strtab, symndx, sec, sec_objname);
- if (ret)
- return ret;
+ if (apply) {
+ ret = klp_resolve_symbols(sechdrs, strtab, symndx,
+ sec, sec_objname);
+ if (ret)
+ return ret;
+
+ return apply_relocate_add(sechdrs, strtab, symndx, secndx, pmod);
+ }
- return apply_relocate_add(sechdrs, strtab, symndx, secndx, pmod);
+ clear_relocate_add(sechdrs, strtab, symndx, secndx, pmod);
+ return 0;
+}
+
+int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
+ const char *shstrtab, const char *strtab,
+ unsigned int symndx, unsigned int secndx,
+ const char *objname)
+{
+ return klp_write_section_relocs(pmod, sechdrs, shstrtab, strtab, symndx,
+ secndx, objname, true);
}
/*
@@ -324,7 +350,10 @@ int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
* /sys/kernel/livepatch/<patch>/enabled
* /sys/kernel/livepatch/<patch>/transition
* /sys/kernel/livepatch/<patch>/force
+ * /sys/kernel/livepatch/<patch>/replace
+ * /sys/kernel/livepatch/<patch>/stack_order
* /sys/kernel/livepatch/<patch>/<object>
+ * /sys/kernel/livepatch/<patch>/<object>/patched
* /sys/kernel/livepatch/<patch>/<object>/<function,sympos>
*/
static int __klp_disable_patch(struct klp_patch *patch);
@@ -378,7 +407,7 @@ static ssize_t enabled_show(struct kobject *kobj,
struct klp_patch *patch;
patch = container_of(kobj, struct klp_patch, kobj);
- return snprintf(buf, PAGE_SIZE-1, "%d\n", patch->enabled);
+ return sysfs_emit(buf, "%d\n", patch->enabled);
}
static ssize_t transition_show(struct kobject *kobj,
@@ -387,8 +416,7 @@ static ssize_t transition_show(struct kobject *kobj,
struct klp_patch *patch;
patch = container_of(kobj, struct klp_patch, kobj);
- return snprintf(buf, PAGE_SIZE-1, "%d\n",
- patch == klp_transition_patch);
+ return sysfs_emit(buf, "%d\n", patch == klp_transition_patch);
}
static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -420,17 +448,67 @@ static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr,
return count;
}
+static ssize_t replace_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct klp_patch *patch;
+
+ patch = container_of(kobj, struct klp_patch, kobj);
+ return sysfs_emit(buf, "%d\n", patch->replace);
+}
+
+static ssize_t stack_order_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct klp_patch *patch, *this_patch;
+ int stack_order = 0;
+
+ this_patch = container_of(kobj, struct klp_patch, kobj);
+
+ mutex_lock(&klp_mutex);
+
+ klp_for_each_patch(patch) {
+ stack_order++;
+ if (patch == this_patch)
+ break;
+ }
+
+ mutex_unlock(&klp_mutex);
+
+ return sysfs_emit(buf, "%d\n", stack_order);
+}
+
static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled);
static struct kobj_attribute transition_kobj_attr = __ATTR_RO(transition);
static struct kobj_attribute force_kobj_attr = __ATTR_WO(force);
+static struct kobj_attribute replace_kobj_attr = __ATTR_RO(replace);
+static struct kobj_attribute stack_order_kobj_attr = __ATTR_RO(stack_order);
static struct attribute *klp_patch_attrs[] = {
&enabled_kobj_attr.attr,
&transition_kobj_attr.attr,
&force_kobj_attr.attr,
+ &replace_kobj_attr.attr,
+ &stack_order_kobj_attr.attr,
NULL
};
ATTRIBUTE_GROUPS(klp_patch);
+static ssize_t patched_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct klp_object *obj;
+
+ obj = container_of(kobj, struct klp_object, kobj);
+ return sysfs_emit(buf, "%d\n", obj->patched);
+}
+
+static struct kobj_attribute patched_kobj_attr = __ATTR_RO(patched);
+static struct attribute *klp_object_attrs[] = {
+ &patched_kobj_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(klp_object);
+
static void klp_free_object_dynamic(struct klp_object *obj)
{
kfree(obj->name);
@@ -527,9 +605,12 @@ static int klp_add_object_nops(struct klp_patch *patch,
}
/*
- * Add 'nop' functions which simply return to the caller to run
- * the original function. The 'nop' functions are added to a
- * patch to facilitate a 'replace' mode.
+ * Add 'nop' functions which simply return to the caller to run the
+ * original function.
+ *
+ * They are added only when the atomic replace mode is used and only for
+ * functions which are currently livepatched but are no longer included
+ * in the new livepatch.
*/
static int klp_add_nops(struct klp_patch *patch)
{
@@ -557,7 +638,7 @@ static void klp_kobj_release_patch(struct kobject *kobj)
complete(&patch->finish);
}
-static struct kobj_type klp_ktype_patch = {
+static const struct kobj_type klp_ktype_patch = {
.release = klp_kobj_release_patch,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = klp_patch_groups,
@@ -573,9 +654,10 @@ static void klp_kobj_release_object(struct kobject *kobj)
klp_free_object_dynamic(obj);
}
-static struct kobj_type klp_ktype_object = {
+static const struct kobj_type klp_ktype_object = {
.release = klp_kobj_release_object,
.sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = klp_object_groups,
};
static void klp_kobj_release_func(struct kobject *kobj)
@@ -588,7 +670,7 @@ static void klp_kobj_release_func(struct kobject *kobj)
klp_free_func_nop(func);
}
-static struct kobj_type klp_ktype_func = {
+static const struct kobj_type klp_ktype_func = {
.release = klp_kobj_release_func,
.sysfs_ops = &kobj_sysfs_ops,
};
@@ -744,8 +826,9 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func)
func->old_sympos ? func->old_sympos : 1);
}
-static int klp_apply_object_relocs(struct klp_patch *patch,
- struct klp_object *obj)
+static int klp_write_object_relocs(struct klp_patch *patch,
+ struct klp_object *obj,
+ bool apply)
{
int i, ret;
struct klp_modinfo *info = patch->mod->klp_info;
@@ -756,10 +839,10 @@ static int klp_apply_object_relocs(struct klp_patch *patch,
if (!(sec->sh_flags & SHF_RELA_LIVEPATCH))
continue;
- ret = klp_apply_section_relocs(patch->mod, info->sechdrs,
+ ret = klp_write_section_relocs(patch->mod, info->sechdrs,
info->secstrings,
patch->mod->core_kallsyms.strtab,
- info->symndx, i, obj->name);
+ info->symndx, i, obj->name, apply);
if (ret)
return ret;
}
@@ -767,6 +850,18 @@ static int klp_apply_object_relocs(struct klp_patch *patch,
return 0;
}
+static int klp_apply_object_relocs(struct klp_patch *patch,
+ struct klp_object *obj)
+{
+ return klp_write_object_relocs(patch, obj, true);
+}
+
+static void klp_clear_object_relocs(struct klp_patch *patch,
+ struct klp_object *obj)
+{
+ klp_write_object_relocs(patch, obj, false);
+}
+
/* parts of the initialization that is done only when the object is loaded */
static int klp_init_object_loaded(struct klp_patch *patch,
struct klp_object *obj)
@@ -920,7 +1015,7 @@ static int __klp_disable_patch(struct klp_patch *patch)
if (klp_transition_patch)
return -EBUSY;
- klp_init_transition(patch, KLP_UNPATCHED);
+ klp_init_transition(patch, KLP_TRANSITION_UNPATCHED);
klp_for_each_object(patch, obj)
if (obj->patched)
@@ -955,7 +1050,7 @@ static int __klp_enable_patch(struct klp_patch *patch)
pr_notice("enabling patch '%s'\n", patch->mod->name);
- klp_init_transition(patch, KLP_PATCHED);
+ klp_init_transition(patch, KLP_TRANSITION_PATCHED);
/*
* Enforce the order of the func->transition writes in
@@ -1154,7 +1249,7 @@ static void klp_cleanup_module_patches_limited(struct module *mod,
klp_unpatch_object(obj);
klp_post_unpatch_callback(obj);
-
+ klp_clear_object_relocs(patch, obj);
klp_free_object_loaded(obj);
break;
}
@@ -1171,7 +1266,7 @@ int klp_module_coming(struct module *mod)
return -EINVAL;
if (!strcmp(mod->name, "vmlinux")) {
- pr_err("vmlinux.ko: invalid module name");
+ pr_err("vmlinux.ko: invalid module name\n");
return -EINVAL;
}
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
index fe316c021d73..90408500e5a3 100644
--- a/kernel/livepatch/patch.c
+++ b/kernel/livepatch/patch.c
@@ -95,9 +95,9 @@ static void notrace klp_ftrace_handler(unsigned long ip,
patch_state = current->patch_state;
- WARN_ON_ONCE(patch_state == KLP_UNDEFINED);
+ WARN_ON_ONCE(patch_state == KLP_TRANSITION_IDLE);
- if (patch_state == KLP_UNPATCHED) {
+ if (patch_state == KLP_TRANSITION_UNPATCHED) {
/*
* Use the previously patched version of the function.
* If no previous patches exist, continue with the
@@ -118,25 +118,12 @@ static void notrace klp_ftrace_handler(unsigned long ip,
if (func->nop)
goto unlock;
- klp_arch_set_pc(fregs, (unsigned long)func->new_func);
+ ftrace_regs_set_instruction_pointer(fregs, (unsigned long)func->new_func);
unlock:
ftrace_test_recursion_unlock(bit);
}
-/*
- * Convert a function address into the appropriate ftrace location.
- *
- * Usually this is just the address of the function, but on some architectures
- * it's more complicated so allow them to provide a custom behaviour.
- */
-#ifndef klp_get_ftrace_location
-static unsigned long klp_get_ftrace_location(unsigned long faddr)
-{
- return faddr;
-}
-#endif
-
static void klp_unpatch_func(struct klp_func *func)
{
struct klp_ops *ops;
@@ -153,8 +140,7 @@ static void klp_unpatch_func(struct klp_func *func)
if (list_is_singular(&ops->func_stack)) {
unsigned long ftrace_loc;
- ftrace_loc =
- klp_get_ftrace_location((unsigned long)func->old_func);
+ ftrace_loc = ftrace_location((unsigned long)func->old_func);
if (WARN_ON(!ftrace_loc))
return;
@@ -186,8 +172,7 @@ static int klp_patch_func(struct klp_func *func)
if (!ops) {
unsigned long ftrace_loc;
- ftrace_loc =
- klp_get_ftrace_location((unsigned long)func->old_func);
+ ftrace_loc = ftrace_location((unsigned long)func->old_func);
if (!ftrace_loc) {
pr_err("failed to find location for function '%s'\n",
func->old_name);
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index 5683ac0d2566..2351a19ac2a9 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -9,23 +9,35 @@
#include <linux/cpu.h>
#include <linux/stacktrace.h>
-#include <linux/tracehook.h>
+#include <linux/static_call.h>
#include "core.h"
#include "patch.h"
#include "transition.h"
#define MAX_STACK_ENTRIES 100
+static DEFINE_PER_CPU(unsigned long[MAX_STACK_ENTRIES], klp_stack_entries);
+
#define STACK_ERR_BUF_SIZE 128
#define SIGNALS_TIMEOUT 15
struct klp_patch *klp_transition_patch;
-static int klp_target_state = KLP_UNDEFINED;
+static int klp_target_state = KLP_TRANSITION_IDLE;
static unsigned int klp_signals_cnt;
/*
+ * When a livepatch is in progress, enable klp stack checking in
+ * schedule(). This helps CPU-bound kthreads get patched.
+ */
+
+DEFINE_STATIC_KEY_FALSE(klp_sched_try_switch_key);
+
+#define klp_resched_enable() static_branch_enable(&klp_sched_try_switch_key)
+#define klp_resched_disable() static_branch_disable(&klp_sched_try_switch_key)
+
+/*
* This work can be performed periodically to finish patching or unpatching any
* "straggler" tasks which failed to transition in the first attempt.
*/
@@ -75,16 +87,16 @@ static void klp_complete_transition(void)
pr_debug("'%s': completing %s transition\n",
klp_transition_patch->mod->name,
- klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
+ klp_target_state == KLP_TRANSITION_PATCHED ? "patching" : "unpatching");
- if (klp_transition_patch->replace && klp_target_state == KLP_PATCHED) {
+ if (klp_transition_patch->replace && klp_target_state == KLP_TRANSITION_PATCHED) {
klp_unpatch_replaced_patches(klp_transition_patch);
klp_discard_nops(klp_transition_patch);
}
- if (klp_target_state == KLP_UNPATCHED) {
+ if (klp_target_state == KLP_TRANSITION_UNPATCHED) {
/*
- * All tasks have transitioned to KLP_UNPATCHED so we can now
+ * All tasks have transitioned to KLP_TRANSITION_UNPATCHED so we can now
* remove the new functions from the func_stack.
*/
klp_unpatch_objects(klp_transition_patch);
@@ -102,36 +114,36 @@ static void klp_complete_transition(void)
klp_for_each_func(obj, func)
func->transition = false;
- /* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */
- if (klp_target_state == KLP_PATCHED)
+ /* Prevent klp_ftrace_handler() from seeing KLP_TRANSITION_IDLE state */
+ if (klp_target_state == KLP_TRANSITION_PATCHED)
klp_synchronize_transition();
read_lock(&tasklist_lock);
for_each_process_thread(g, task) {
WARN_ON_ONCE(test_tsk_thread_flag(task, TIF_PATCH_PENDING));
- task->patch_state = KLP_UNDEFINED;
+ task->patch_state = KLP_TRANSITION_IDLE;
}
read_unlock(&tasklist_lock);
for_each_possible_cpu(cpu) {
task = idle_task(cpu);
WARN_ON_ONCE(test_tsk_thread_flag(task, TIF_PATCH_PENDING));
- task->patch_state = KLP_UNDEFINED;
+ task->patch_state = KLP_TRANSITION_IDLE;
}
klp_for_each_object(klp_transition_patch, obj) {
if (!klp_is_object_loaded(obj))
continue;
- if (klp_target_state == KLP_PATCHED)
+ if (klp_target_state == KLP_TRANSITION_PATCHED)
klp_post_patch_callback(obj);
- else if (klp_target_state == KLP_UNPATCHED)
+ else if (klp_target_state == KLP_TRANSITION_UNPATCHED)
klp_post_unpatch_callback(obj);
}
pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name,
- klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
+ klp_target_state == KLP_TRANSITION_PATCHED ? "patching" : "unpatching");
- klp_target_state = KLP_UNDEFINED;
+ klp_target_state = KLP_TRANSITION_IDLE;
klp_transition_patch = NULL;
}
@@ -143,13 +155,13 @@ static void klp_complete_transition(void)
*/
void klp_cancel_transition(void)
{
- if (WARN_ON_ONCE(klp_target_state != KLP_PATCHED))
+ if (WARN_ON_ONCE(klp_target_state != KLP_TRANSITION_PATCHED))
return;
pr_debug("'%s': canceling patching transition, going to unpatch\n",
klp_transition_patch->mod->name);
- klp_target_state = KLP_UNPATCHED;
+ klp_target_state = KLP_TRANSITION_UNPATCHED;
klp_complete_transition();
}
@@ -173,8 +185,8 @@ void klp_update_patch_state(struct task_struct *task)
* barrier (smp_rmb) for two cases:
*
* 1) Enforce the order of the TIF_PATCH_PENDING read and the
- * klp_target_state read. The corresponding write barrier is in
- * klp_init_transition().
+ * klp_target_state read. The corresponding write barriers are in
+ * klp_init_transition() and klp_reverse_transition().
*
* 2) Enforce the order of the TIF_PATCH_PENDING read and a future read
* of func->transition, if klp_ftrace_handler() is called later on
@@ -197,36 +209,36 @@ static int klp_check_stack_func(struct klp_func *func, unsigned long *entries,
struct klp_ops *ops;
int i;
- for (i = 0; i < nr_entries; i++) {
- address = entries[i];
+ if (klp_target_state == KLP_TRANSITION_UNPATCHED) {
+ /*
+ * Check for the to-be-unpatched function
+ * (the func itself).
+ */
+ func_addr = (unsigned long)func->new_func;
+ func_size = func->new_size;
+ } else {
+ /*
+ * Check for the to-be-patched function
+ * (the previous func).
+ */
+ ops = klp_find_ops(func->old_func);
- if (klp_target_state == KLP_UNPATCHED) {
- /*
- * Check for the to-be-unpatched function
- * (the func itself).
- */
- func_addr = (unsigned long)func->new_func;
- func_size = func->new_size;
+ if (list_is_singular(&ops->func_stack)) {
+ /* original function */
+ func_addr = (unsigned long)func->old_func;
+ func_size = func->old_size;
} else {
- /*
- * Check for the to-be-patched function
- * (the previous func).
- */
- ops = klp_find_ops(func->old_func);
-
- if (list_is_singular(&ops->func_stack)) {
- /* original function */
- func_addr = (unsigned long)func->old_func;
- func_size = func->old_size;
- } else {
- /* previously patched function */
- struct klp_func *prev;
-
- prev = list_next_entry(func, stack_node);
- func_addr = (unsigned long)prev->new_func;
- func_size = prev->new_size;
- }
+ /* previously patched function */
+ struct klp_func *prev;
+
+ prev = list_next_entry(func, stack_node);
+ func_addr = (unsigned long)prev->new_func;
+ func_size = prev->new_size;
}
+ }
+
+ for (i = 0; i < nr_entries; i++) {
+ address = entries[i];
if (address >= func_addr && address < func_addr + func_size)
return -EAGAIN;
@@ -241,12 +253,15 @@ static int klp_check_stack_func(struct klp_func *func, unsigned long *entries,
*/
static int klp_check_stack(struct task_struct *task, const char **oldname)
{
- static unsigned long entries[MAX_STACK_ENTRIES];
+ unsigned long *entries = this_cpu_ptr(klp_stack_entries);
struct klp_object *obj;
struct klp_func *func;
int ret, nr_entries;
- ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries));
+ /* Protect 'klp_stack_entries' */
+ lockdep_assert_preemption_disabled();
+
+ ret = stack_trace_save_tsk_reliable(task, entries, MAX_STACK_ENTRIES);
if (ret < 0)
return -EINVAL;
nr_entries = ret;
@@ -308,7 +323,11 @@ static bool klp_try_switch_task(struct task_struct *task)
* functions. If all goes well, switch the task to the target patch
* state.
*/
- ret = task_call_func(task, klp_check_and_switch_task, &old_name);
+ if (task == current)
+ ret = klp_check_and_switch_task(current, &old_name);
+ else
+ ret = task_call_func(task, klp_check_and_switch_task, &old_name);
+
switch (ret) {
case 0: /* success */
break;
@@ -335,6 +354,32 @@ static bool klp_try_switch_task(struct task_struct *task)
return !ret;
}
+void __klp_sched_try_switch(void)
+{
+ /*
+ * This function is called from __schedule() while a context switch is
+ * about to happen. Preemption is already disabled and klp_mutex
+ * can't be acquired.
+ * Disabled preemption is used to prevent racing with other callers of
+ * klp_try_switch_task(). Thanks to task_call_func() they won't be
+ * able to switch to this task while it's running.
+ */
+ lockdep_assert_preemption_disabled();
+
+ if (likely(!klp_patch_pending(current)))
+ return;
+
+ /*
+ * Enforce the order of the TIF_PATCH_PENDING read above and the
+ * klp_target_state read in klp_try_switch_task(). The corresponding
+ * write barriers are in klp_init_transition() and
+ * klp_reverse_transition().
+ */
+ smp_rmb();
+
+ klp_try_switch_task(current);
+}
+
/*
* Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set.
* Kthreads with TIF_PATCH_PENDING set are woken up.
@@ -389,7 +434,7 @@ void klp_try_complete_transition(void)
struct klp_patch *patch;
bool complete = true;
- WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED);
+ WARN_ON_ONCE(klp_target_state == KLP_TRANSITION_IDLE);
/*
* Try to switch the tasks to the target patch state by walking their
@@ -441,7 +486,8 @@ void klp_try_complete_transition(void)
return;
}
- /* we're done, now cleanup the data structures */
+ /* Done! Now cleanup the data structures. */
+ klp_resched_disable();
patch = klp_transition_patch;
klp_complete_transition();
@@ -465,11 +511,11 @@ void klp_start_transition(void)
struct task_struct *g, *task;
unsigned int cpu;
- WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED);
+ WARN_ON_ONCE(klp_target_state == KLP_TRANSITION_IDLE);
pr_notice("'%s': starting %s transition\n",
klp_transition_patch->mod->name,
- klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
+ klp_target_state == KLP_TRANSITION_PATCHED ? "patching" : "unpatching");
/*
* Mark all normal tasks as needing a patch state update. They'll
@@ -493,6 +539,8 @@ void klp_start_transition(void)
set_tsk_thread_flag(task, TIF_PATCH_PENDING);
}
+ klp_resched_enable();
+
klp_signals_cnt = 0;
}
@@ -509,7 +557,7 @@ void klp_init_transition(struct klp_patch *patch, int state)
struct klp_func *func;
int initial_state = !state;
- WARN_ON_ONCE(klp_target_state != KLP_UNDEFINED);
+ WARN_ON_ONCE(klp_target_state != KLP_TRANSITION_IDLE);
klp_transition_patch = patch;
@@ -520,7 +568,7 @@ void klp_init_transition(struct klp_patch *patch, int state)
klp_target_state = state;
pr_debug("'%s': initializing %s transition\n", patch->mod->name,
- klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
+ klp_target_state == KLP_TRANSITION_PATCHED ? "patching" : "unpatching");
/*
* Initialize all tasks to the initial patch state to prepare them for
@@ -528,7 +576,7 @@ void klp_init_transition(struct klp_patch *patch, int state)
*/
read_lock(&tasklist_lock);
for_each_process_thread(g, task) {
- WARN_ON_ONCE(task->patch_state != KLP_UNDEFINED);
+ WARN_ON_ONCE(task->patch_state != KLP_TRANSITION_IDLE);
task->patch_state = initial_state;
}
read_unlock(&tasklist_lock);
@@ -538,18 +586,19 @@ void klp_init_transition(struct klp_patch *patch, int state)
*/
for_each_possible_cpu(cpu) {
task = idle_task(cpu);
- WARN_ON_ONCE(task->patch_state != KLP_UNDEFINED);
+ WARN_ON_ONCE(task->patch_state != KLP_TRANSITION_IDLE);
task->patch_state = initial_state;
}
/*
* Enforce the order of the task->patch_state initializations and the
* func->transition updates to ensure that klp_ftrace_handler() doesn't
- * see a func in transition with a task->patch_state of KLP_UNDEFINED.
+ * see a func in transition with a task->patch_state of KLP_TRANSITION_IDLE.
*
* Also enforce the order of the klp_target_state write and future
- * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() doesn't
- * set a task->patch_state to KLP_UNDEFINED.
+ * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() and
+ * __klp_sched_try_switch() don't set a task->patch_state to
+ * KLP_TRANSITION_IDLE.
*/
smp_wmb();
@@ -582,17 +631,13 @@ void klp_reverse_transition(void)
pr_debug("'%s': reversing transition from %s\n",
klp_transition_patch->mod->name,
- klp_target_state == KLP_PATCHED ? "patching to unpatching" :
+ klp_target_state == KLP_TRANSITION_PATCHED ? "patching to unpatching" :
"unpatching to patching");
- klp_transition_patch->enabled = !klp_transition_patch->enabled;
-
- klp_target_state = !klp_target_state;
-
/*
* Clear all TIF_PATCH_PENDING flags to prevent races caused by
- * klp_update_patch_state() running in parallel with
- * klp_start_transition().
+ * klp_update_patch_state() or __klp_sched_try_switch() running in
+ * parallel with the reverse transition.
*/
read_lock(&tasklist_lock);
for_each_process_thread(g, task)
@@ -602,18 +647,51 @@ void klp_reverse_transition(void)
for_each_possible_cpu(cpu)
clear_tsk_thread_flag(idle_task(cpu), TIF_PATCH_PENDING);
- /* Let any remaining calls to klp_update_patch_state() complete */
+ /*
+ * Make sure all existing invocations of klp_update_patch_state() and
+ * __klp_sched_try_switch() see the cleared TIF_PATCH_PENDING before
+ * starting the reverse transition.
+ */
klp_synchronize_transition();
+ /*
+ * All patching has stopped, now re-initialize the global variables to
+ * prepare for the reverse transition.
+ */
+ klp_transition_patch->enabled = !klp_transition_patch->enabled;
+ klp_target_state = !klp_target_state;
+
+ /*
+ * Enforce the order of the klp_target_state write and the
+ * TIF_PATCH_PENDING writes in klp_start_transition() to ensure
+ * klp_update_patch_state() and __klp_sched_try_switch() don't set
+ * task->patch_state to the wrong value.
+ */
+ smp_wmb();
+
klp_start_transition();
}
/* Called from copy_process() during fork */
void klp_copy_process(struct task_struct *child)
{
- child->patch_state = current->patch_state;
- /* TIF_PATCH_PENDING gets copied in setup_thread_stack() */
+ /*
+ * The parent process may have gone through a KLP transition since
+ * the thread flag was copied in setup_thread_stack earlier. Bring
+ * the task flag up to date with the parent here.
+ *
+ * The operation is serialized against all klp_*_transition()
+ * operations by the tasklist_lock. The only exceptions are
+ * klp_update_patch_state(current) and __klp_sched_try_switch(), but we
+ * cannot race with them because we are current.
+ */
+ if (test_tsk_thread_flag(current, TIF_PATCH_PENDING))
+ set_tsk_thread_flag(child, TIF_PATCH_PENDING);
+ else
+ clear_tsk_thread_flag(child, TIF_PATCH_PENDING);
+
+ child->patch_state = current->patch_state;
}
/*
@@ -641,6 +719,13 @@ void klp_force_transition(void)
for_each_possible_cpu(cpu)
klp_update_patch_state(idle_task(cpu));
- klp_for_each_patch(patch)
- patch->forced = true;
+ /* Set forced flag for patches being removed. */
+ if (klp_target_state == KLP_TRANSITION_UNPATCHED)
+ klp_transition_patch->forced = true;
+ else if (klp_transition_patch->replace) {
+ klp_for_each_patch(patch) {
+ if (patch != klp_transition_patch)
+ patch->forced = true;
+ }
+ }
}
diff --git a/kernel/liveupdate/Kconfig b/kernel/liveupdate/Kconfig
new file mode 100644
index 000000000000..9b2515f31afb
--- /dev/null
+++ b/kernel/liveupdate/Kconfig
@@ -0,0 +1,75 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+menu "Live Update and Kexec HandOver"
+ depends on !DEFERRED_STRUCT_PAGE_INIT
+
+config KEXEC_HANDOVER
+ bool "kexec handover"
+ depends on ARCH_SUPPORTS_KEXEC_HANDOVER && ARCH_SUPPORTS_KEXEC_FILE
+ depends on !DEFERRED_STRUCT_PAGE_INIT
+ select MEMBLOCK_KHO_SCRATCH
+ select KEXEC_FILE
+ select LIBFDT
+ select CMA
+ help
+ Allow kexec to hand over state across kernels by generating and
+ passing additional metadata to the target kernel. This is useful
+ to keep data or state alive across the kexec. For this to work,
+ both source and target kernels need to have this option enabled.
+
+config KEXEC_HANDOVER_DEBUG
+ bool "Enable Kexec Handover debug checks"
+ depends on KEXEC_HANDOVER
+ help
+ This option enables extra sanity checks for the Kexec Handover
+ subsystem. Since, KHO performance is crucial in live update
+ scenarios and the extra code might be adding overhead it is
+ only optionally enabled.
+
+config KEXEC_HANDOVER_DEBUGFS
+ bool "kexec handover debugfs interface"
+ default KEXEC_HANDOVER
+ depends on KEXEC_HANDOVER
+ select DEBUG_FS
+ help
+ Allow to control kexec handover device tree via debugfs
+ interface, i.e. finalize the state or aborting the finalization.
+ Also, enables inspecting the KHO fdt trees with the debugfs binary
+ blobs.
+
+config KEXEC_HANDOVER_ENABLE_DEFAULT
+ bool "Enable kexec handover by default"
+ depends on KEXEC_HANDOVER
+ help
+ Enable Kexec Handover by default. This avoids the need to
+ explicitly pass 'kho=on' on the kernel command line.
+
+ This is useful for systems where KHO is a prerequisite for other
+ features, such as Live Update, ensuring the mechanism is always
+ active.
+
+ The default behavior can still be overridden at boot time by
+ passing 'kho=off'.
+
+config LIVEUPDATE
+ bool "Live Update Orchestrator"
+ depends on KEXEC_HANDOVER
+ help
+ Enable the Live Update Orchestrator. Live Update is a mechanism,
+ typically based on kexec, that allows the kernel to be updated
+ while keeping selected devices operational across the transition.
+ These devices are intended to be reclaimed by the new kernel and
+ re-attached to their original workload without requiring a device
+ reset.
+
+ Ability to handover a device from current to the next kernel depends
+ on specific support within device drivers and related kernel
+ subsystems.
+
+ This feature primarily targets virtual machine hosts to quickly update
+ the kernel hypervisor with minimal disruption to the running virtual
+ machines.
+
+ If unsure, say N.
+
+endmenu
diff --git a/kernel/liveupdate/Makefile b/kernel/liveupdate/Makefile
new file mode 100644
index 000000000000..7cad2eece32d
--- /dev/null
+++ b/kernel/liveupdate/Makefile
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0
+
+luo-y := \
+ luo_core.o \
+ luo_file.o \
+ luo_session.o
+
+obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o
+obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o
+obj-$(CONFIG_KEXEC_HANDOVER_DEBUGFS) += kexec_handover_debugfs.o
+
+obj-$(CONFIG_LIVEUPDATE) += luo.o
diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
new file mode 100644
index 000000000000..9dc51fab604f
--- /dev/null
+++ b/kernel/liveupdate/kexec_handover.c
@@ -0,0 +1,1594 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * kexec_handover.c - kexec handover metadata processing
+ * Copyright (C) 2023 Alexander Graf <graf@amazon.com>
+ * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org>
+ * Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com>
+ * Copyright (C) 2025 Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+#define pr_fmt(fmt) "KHO: " fmt
+
+#include <linux/cleanup.h>
+#include <linux/cma.h>
+#include <linux/kmemleak.h>
+#include <linux/count_zeros.h>
+#include <linux/kexec.h>
+#include <linux/kexec_handover.h>
+#include <linux/libfdt.h>
+#include <linux/list.h>
+#include <linux/memblock.h>
+#include <linux/page-isolation.h>
+#include <linux/unaligned.h>
+#include <linux/vmalloc.h>
+
+#include <asm/early_ioremap.h>
+
+#include "kexec_handover_internal.h"
+/*
+ * KHO is tightly coupled with mm init and needs access to some of mm
+ * internal APIs.
+ */
+#include "../../mm/internal.h"
+#include "../kexec_internal.h"
+#include "kexec_handover_internal.h"
+
+#define KHO_FDT_COMPATIBLE "kho-v1"
+#define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map"
+#define PROP_SUB_FDT "fdt"
+
+#define KHO_PAGE_MAGIC 0x4b484f50U /* ASCII for 'KHOP' */
+
+/*
+ * KHO uses page->private, which is an unsigned long, to store page metadata.
+ * Use it to store both the magic and the order.
+ */
+union kho_page_info {
+ unsigned long page_private;
+ struct {
+ unsigned int order;
+ unsigned int magic;
+ };
+};
+
+static_assert(sizeof(union kho_page_info) == sizeof(((struct page *)0)->private));
+
+static bool kho_enable __ro_after_init = IS_ENABLED(CONFIG_KEXEC_HANDOVER_ENABLE_DEFAULT);
+
+bool kho_is_enabled(void)
+{
+ return kho_enable;
+}
+EXPORT_SYMBOL_GPL(kho_is_enabled);
+
+static int __init kho_parse_enable(char *p)
+{
+ return kstrtobool(p, &kho_enable);
+}
+early_param("kho", kho_parse_enable);
+
+/*
+ * Keep track of memory that is to be preserved across KHO.
+ *
+ * The serializing side uses two levels of xarrays to manage chunks of per-order
+ * PAGE_SIZE byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order
+ * of a 8TB system would fit inside a single 4096 byte bitmap. For order 0
+ * allocations each bitmap will cover 128M of address space. Thus, for 16G of
+ * memory at most 512K of bitmap memory will be needed for order 0.
+ *
+ * This approach is fully incremental, as the serialization progresses folios
+ * can continue be aggregated to the tracker. The final step, immediately prior
+ * to kexec would serialize the xarray information into a linked list for the
+ * successor kernel to parse.
+ */
+
+#define PRESERVE_BITS (PAGE_SIZE * 8)
+
+struct kho_mem_phys_bits {
+ DECLARE_BITMAP(preserve, PRESERVE_BITS);
+};
+
+static_assert(sizeof(struct kho_mem_phys_bits) == PAGE_SIZE);
+
+struct kho_mem_phys {
+ /*
+ * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized
+ * to order.
+ */
+ struct xarray phys_bits;
+};
+
+struct kho_mem_track {
+ /* Points to kho_mem_phys, each order gets its own bitmap tree */
+ struct xarray orders;
+};
+
+struct khoser_mem_chunk;
+
+struct kho_out {
+ void *fdt;
+ bool finalized;
+ struct mutex lock; /* protects KHO FDT finalization */
+
+ struct kho_mem_track track;
+ struct kho_debugfs dbg;
+};
+
+static struct kho_out kho_out = {
+ .lock = __MUTEX_INITIALIZER(kho_out.lock),
+ .track = {
+ .orders = XARRAY_INIT(kho_out.track.orders, 0),
+ },
+ .finalized = false,
+};
+
+static void *xa_load_or_alloc(struct xarray *xa, unsigned long index)
+{
+ void *res = xa_load(xa, index);
+
+ if (res)
+ return res;
+
+ void *elm __free(free_page) = (void *)get_zeroed_page(GFP_KERNEL);
+
+ if (!elm)
+ return ERR_PTR(-ENOMEM);
+
+ if (WARN_ON(kho_scratch_overlap(virt_to_phys(elm), PAGE_SIZE)))
+ return ERR_PTR(-EINVAL);
+
+ res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL);
+ if (xa_is_err(res))
+ return ERR_PTR(xa_err(res));
+ else if (res)
+ return res;
+
+ return no_free_ptr(elm);
+}
+
+static void __kho_unpreserve_order(struct kho_mem_track *track, unsigned long pfn,
+ unsigned int order)
+{
+ struct kho_mem_phys_bits *bits;
+ struct kho_mem_phys *physxa;
+ const unsigned long pfn_high = pfn >> order;
+
+ physxa = xa_load(&track->orders, order);
+ if (WARN_ON_ONCE(!physxa))
+ return;
+
+ bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
+ if (WARN_ON_ONCE(!bits))
+ return;
+
+ clear_bit(pfn_high % PRESERVE_BITS, bits->preserve);
+}
+
+static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn,
+ unsigned long end_pfn)
+{
+ unsigned int order;
+
+ while (pfn < end_pfn) {
+ order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
+
+ __kho_unpreserve_order(track, pfn, order);
+
+ pfn += 1 << order;
+ }
+}
+
+static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
+ unsigned int order)
+{
+ struct kho_mem_phys_bits *bits;
+ struct kho_mem_phys *physxa, *new_physxa;
+ const unsigned long pfn_high = pfn >> order;
+
+ might_sleep();
+ physxa = xa_load(&track->orders, order);
+ if (!physxa) {
+ int err;
+
+ new_physxa = kzalloc(sizeof(*physxa), GFP_KERNEL);
+ if (!new_physxa)
+ return -ENOMEM;
+
+ xa_init(&new_physxa->phys_bits);
+ physxa = xa_cmpxchg(&track->orders, order, NULL, new_physxa,
+ GFP_KERNEL);
+
+ err = xa_err(physxa);
+ if (err || physxa) {
+ xa_destroy(&new_physxa->phys_bits);
+ kfree(new_physxa);
+
+ if (err)
+ return err;
+ } else {
+ physxa = new_physxa;
+ }
+ }
+
+ bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
+ if (IS_ERR(bits))
+ return PTR_ERR(bits);
+
+ set_bit(pfn_high % PRESERVE_BITS, bits->preserve);
+
+ return 0;
+}
+
+static struct page *kho_restore_page(phys_addr_t phys, bool is_folio)
+{
+ struct page *page = pfn_to_online_page(PHYS_PFN(phys));
+ unsigned int nr_pages, ref_cnt;
+ union kho_page_info info;
+
+ if (!page)
+ return NULL;
+
+ info.page_private = page->private;
+ /*
+ * deserialize_bitmap() only sets the magic on the head page. This magic
+ * check also implicitly makes sure phys is order-aligned since for
+ * non-order-aligned phys addresses, magic will never be set.
+ */
+ if (WARN_ON_ONCE(info.magic != KHO_PAGE_MAGIC || info.order > MAX_PAGE_ORDER))
+ return NULL;
+ nr_pages = (1 << info.order);
+
+ /* Clear private to make sure later restores on this page error out. */
+ page->private = 0;
+ /* Head page gets refcount of 1. */
+ set_page_count(page, 1);
+
+ /*
+ * For higher order folios, tail pages get a page count of zero.
+ * For physically contiguous order-0 pages every pages gets a page
+ * count of 1
+ */
+ ref_cnt = is_folio ? 0 : 1;
+ for (unsigned int i = 1; i < nr_pages; i++)
+ set_page_count(page + i, ref_cnt);
+
+ if (is_folio && info.order)
+ prep_compound_page(page, info.order);
+
+ adjust_managed_page_count(page, nr_pages);
+ return page;
+}
+
+/**
+ * kho_restore_folio - recreates the folio from the preserved memory.
+ * @phys: physical address of the folio.
+ *
+ * Return: pointer to the struct folio on success, NULL on failure.
+ */
+struct folio *kho_restore_folio(phys_addr_t phys)
+{
+ struct page *page = kho_restore_page(phys, true);
+
+ return page ? page_folio(page) : NULL;
+}
+EXPORT_SYMBOL_GPL(kho_restore_folio);
+
+/**
+ * kho_restore_pages - restore list of contiguous order 0 pages.
+ * @phys: physical address of the first page.
+ * @nr_pages: number of pages.
+ *
+ * Restore a contiguous list of order 0 pages that was preserved with
+ * kho_preserve_pages().
+ *
+ * Return: 0 on success, error code on failure
+ */
+struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages)
+{
+ const unsigned long start_pfn = PHYS_PFN(phys);
+ const unsigned long end_pfn = start_pfn + nr_pages;
+ unsigned long pfn = start_pfn;
+
+ while (pfn < end_pfn) {
+ const unsigned int order =
+ min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
+ struct page *page = kho_restore_page(PFN_PHYS(pfn), false);
+
+ if (!page)
+ return NULL;
+ pfn += 1 << order;
+ }
+
+ return pfn_to_page(start_pfn);
+}
+EXPORT_SYMBOL_GPL(kho_restore_pages);
+
+/* Serialize and deserialize struct kho_mem_phys across kexec
+ *
+ * Record all the bitmaps in a linked list of pages for the next kernel to
+ * process. Each chunk holds bitmaps of the same order and each block of bitmaps
+ * starts at a given physical address. This allows the bitmaps to be sparse. The
+ * xarray is used to store them in a tree while building up the data structure,
+ * but the KHO successor kernel only needs to process them once in order.
+ *
+ * All of this memory is normal kmalloc() memory and is not marked for
+ * preservation. The successor kernel will remain isolated to the scratch space
+ * until it completes processing this list. Once processed all the memory
+ * storing these ranges will be marked as free.
+ */
+
+struct khoser_mem_bitmap_ptr {
+ phys_addr_t phys_start;
+ DECLARE_KHOSER_PTR(bitmap, struct kho_mem_phys_bits *);
+};
+
+struct khoser_mem_chunk_hdr {
+ DECLARE_KHOSER_PTR(next, struct khoser_mem_chunk *);
+ unsigned int order;
+ unsigned int num_elms;
+};
+
+#define KHOSER_BITMAP_SIZE \
+ ((PAGE_SIZE - sizeof(struct khoser_mem_chunk_hdr)) / \
+ sizeof(struct khoser_mem_bitmap_ptr))
+
+struct khoser_mem_chunk {
+ struct khoser_mem_chunk_hdr hdr;
+ struct khoser_mem_bitmap_ptr bitmaps[KHOSER_BITMAP_SIZE];
+};
+
+static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE);
+
+static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk,
+ unsigned long order)
+{
+ struct khoser_mem_chunk *chunk __free(free_page) = NULL;
+
+ chunk = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!chunk)
+ return ERR_PTR(-ENOMEM);
+
+ if (WARN_ON(kho_scratch_overlap(virt_to_phys(chunk), PAGE_SIZE)))
+ return ERR_PTR(-EINVAL);
+
+ chunk->hdr.order = order;
+ if (cur_chunk)
+ KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk);
+ return no_free_ptr(chunk);
+}
+
+static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk)
+{
+ struct khoser_mem_chunk *chunk = first_chunk;
+
+ while (chunk) {
+ struct khoser_mem_chunk *tmp = chunk;
+
+ chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
+ free_page((unsigned long)tmp);
+ }
+}
+
+/*
+ * Update memory map property, if old one is found discard it via
+ * kho_mem_ser_free().
+ */
+static void kho_update_memory_map(struct khoser_mem_chunk *first_chunk)
+{
+ void *ptr;
+ u64 phys;
+
+ ptr = fdt_getprop_w(kho_out.fdt, 0, PROP_PRESERVED_MEMORY_MAP, NULL);
+
+ /* Check and discard previous memory map */
+ phys = get_unaligned((u64 *)ptr);
+ if (phys)
+ kho_mem_ser_free((struct khoser_mem_chunk *)phys_to_virt(phys));
+
+ /* Update with the new value */
+ phys = first_chunk ? (u64)virt_to_phys(first_chunk) : 0;
+ put_unaligned(phys, (u64 *)ptr);
+}
+
+static int kho_mem_serialize(struct kho_out *kho_out)
+{
+ struct khoser_mem_chunk *first_chunk = NULL;
+ struct khoser_mem_chunk *chunk = NULL;
+ struct kho_mem_phys *physxa;
+ unsigned long order;
+ int err = -ENOMEM;
+
+ xa_for_each(&kho_out->track.orders, order, physxa) {
+ struct kho_mem_phys_bits *bits;
+ unsigned long phys;
+
+ chunk = new_chunk(chunk, order);
+ if (IS_ERR(chunk)) {
+ err = PTR_ERR(chunk);
+ goto err_free;
+ }
+
+ if (!first_chunk)
+ first_chunk = chunk;
+
+ xa_for_each(&physxa->phys_bits, phys, bits) {
+ struct khoser_mem_bitmap_ptr *elm;
+
+ if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) {
+ chunk = new_chunk(chunk, order);
+ if (IS_ERR(chunk)) {
+ err = PTR_ERR(chunk);
+ goto err_free;
+ }
+ }
+
+ elm = &chunk->bitmaps[chunk->hdr.num_elms];
+ chunk->hdr.num_elms++;
+ elm->phys_start = (phys * PRESERVE_BITS)
+ << (order + PAGE_SHIFT);
+ KHOSER_STORE_PTR(elm->bitmap, bits);
+ }
+ }
+
+ kho_update_memory_map(first_chunk);
+
+ return 0;
+
+err_free:
+ kho_mem_ser_free(first_chunk);
+ return err;
+}
+
+static void __init deserialize_bitmap(unsigned int order,
+ struct khoser_mem_bitmap_ptr *elm)
+{
+ struct kho_mem_phys_bits *bitmap = KHOSER_LOAD_PTR(elm->bitmap);
+ unsigned long bit;
+
+ for_each_set_bit(bit, bitmap->preserve, PRESERVE_BITS) {
+ int sz = 1 << (order + PAGE_SHIFT);
+ phys_addr_t phys =
+ elm->phys_start + (bit << (order + PAGE_SHIFT));
+ struct page *page = phys_to_page(phys);
+ union kho_page_info info;
+
+ memblock_reserve(phys, sz);
+ memblock_reserved_mark_noinit(phys, sz);
+ info.magic = KHO_PAGE_MAGIC;
+ info.order = order;
+ page->private = info.page_private;
+ }
+}
+
+/* Return true if memory was deserizlied */
+static bool __init kho_mem_deserialize(const void *fdt)
+{
+ struct khoser_mem_chunk *chunk;
+ const void *mem_ptr;
+ u64 mem;
+ int len;
+
+ mem_ptr = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len);
+ if (!mem_ptr || len != sizeof(u64)) {
+ pr_err("failed to get preserved memory bitmaps\n");
+ return false;
+ }
+
+ mem = get_unaligned((const u64 *)mem_ptr);
+ chunk = mem ? phys_to_virt(mem) : NULL;
+
+ /* No preserved physical pages were passed, no deserialization */
+ if (!chunk)
+ return false;
+
+ while (chunk) {
+ unsigned int i;
+
+ for (i = 0; i != chunk->hdr.num_elms; i++)
+ deserialize_bitmap(chunk->hdr.order,
+ &chunk->bitmaps[i]);
+ chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
+ }
+
+ return true;
+}
+
+/*
+ * With KHO enabled, memory can become fragmented because KHO regions may
+ * be anywhere in physical address space. The scratch regions give us a
+ * safe zones that we will never see KHO allocations from. This is where we
+ * can later safely load our new kexec images into and then use the scratch
+ * area for early allocations that happen before page allocator is
+ * initialized.
+ */
+struct kho_scratch *kho_scratch;
+unsigned int kho_scratch_cnt;
+
+/*
+ * The scratch areas are scaled by default as percent of memory allocated from
+ * memblock. A user can override the scale with command line parameter:
+ *
+ * kho_scratch=N%
+ *
+ * It is also possible to explicitly define size for a lowmem, a global and
+ * per-node scratch areas:
+ *
+ * kho_scratch=l[KMG],n[KMG],m[KMG]
+ *
+ * The explicit size definition takes precedence over scale definition.
+ */
+static unsigned int scratch_scale __initdata = 200;
+static phys_addr_t scratch_size_global __initdata;
+static phys_addr_t scratch_size_pernode __initdata;
+static phys_addr_t scratch_size_lowmem __initdata;
+
+static int __init kho_parse_scratch_size(char *p)
+{
+ size_t len;
+ unsigned long sizes[3];
+ size_t total_size = 0;
+ int i;
+
+ if (!p)
+ return -EINVAL;
+
+ len = strlen(p);
+ if (!len)
+ return -EINVAL;
+
+ /* parse nn% */
+ if (p[len - 1] == '%') {
+ /* unsigned int max is 4,294,967,295, 10 chars */
+ char s_scale[11] = {};
+ int ret = 0;
+
+ if (len > ARRAY_SIZE(s_scale))
+ return -EINVAL;
+
+ memcpy(s_scale, p, len - 1);
+ ret = kstrtouint(s_scale, 10, &scratch_scale);
+ if (!ret)
+ pr_notice("scratch scale is %d%%\n", scratch_scale);
+ return ret;
+ }
+
+ /* parse ll[KMG],mm[KMG],nn[KMG] */
+ for (i = 0; i < ARRAY_SIZE(sizes); i++) {
+ char *endp = p;
+
+ if (i > 0) {
+ if (*p != ',')
+ return -EINVAL;
+ p += 1;
+ }
+
+ sizes[i] = memparse(p, &endp);
+ if (endp == p)
+ return -EINVAL;
+ p = endp;
+ total_size += sizes[i];
+ }
+
+ if (!total_size)
+ return -EINVAL;
+
+ /* The string should be fully consumed by now. */
+ if (*p)
+ return -EINVAL;
+
+ scratch_size_lowmem = sizes[0];
+ scratch_size_global = sizes[1];
+ scratch_size_pernode = sizes[2];
+ scratch_scale = 0;
+
+ pr_notice("scratch areas: lowmem: %lluMiB global: %lluMiB pernode: %lldMiB\n",
+ (u64)(scratch_size_lowmem >> 20),
+ (u64)(scratch_size_global >> 20),
+ (u64)(scratch_size_pernode >> 20));
+
+ return 0;
+}
+early_param("kho_scratch", kho_parse_scratch_size);
+
+static void __init scratch_size_update(void)
+{
+ phys_addr_t size;
+
+ if (!scratch_scale)
+ return;
+
+ size = memblock_reserved_kern_size(ARCH_LOW_ADDRESS_LIMIT,
+ NUMA_NO_NODE);
+ size = size * scratch_scale / 100;
+ scratch_size_lowmem = round_up(size, CMA_MIN_ALIGNMENT_BYTES);
+
+ size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE,
+ NUMA_NO_NODE);
+ size = size * scratch_scale / 100 - scratch_size_lowmem;
+ scratch_size_global = round_up(size, CMA_MIN_ALIGNMENT_BYTES);
+}
+
+static phys_addr_t __init scratch_size_node(int nid)
+{
+ phys_addr_t size;
+
+ if (scratch_scale) {
+ size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE,
+ nid);
+ size = size * scratch_scale / 100;
+ } else {
+ size = scratch_size_pernode;
+ }
+
+ return round_up(size, CMA_MIN_ALIGNMENT_BYTES);
+}
+
+/**
+ * kho_reserve_scratch - Reserve a contiguous chunk of memory for kexec
+ *
+ * With KHO we can preserve arbitrary pages in the system. To ensure we still
+ * have a large contiguous region of memory when we search the physical address
+ * space for target memory, let's make sure we always have a large CMA region
+ * active. This CMA region will only be used for movable pages which are not a
+ * problem for us during KHO because we can just move them somewhere else.
+ */
+static void __init kho_reserve_scratch(void)
+{
+ phys_addr_t addr, size;
+ int nid, i = 0;
+
+ if (!kho_enable)
+ return;
+
+ scratch_size_update();
+
+ /* FIXME: deal with node hot-plug/remove */
+ kho_scratch_cnt = num_online_nodes() + 2;
+ size = kho_scratch_cnt * sizeof(*kho_scratch);
+ kho_scratch = memblock_alloc(size, PAGE_SIZE);
+ if (!kho_scratch)
+ goto err_disable_kho;
+
+ /*
+ * reserve scratch area in low memory for lowmem allocations in the
+ * next kernel
+ */
+ size = scratch_size_lowmem;
+ addr = memblock_phys_alloc_range(size, CMA_MIN_ALIGNMENT_BYTES, 0,
+ ARCH_LOW_ADDRESS_LIMIT);
+ if (!addr)
+ goto err_free_scratch_desc;
+
+ kho_scratch[i].addr = addr;
+ kho_scratch[i].size = size;
+ i++;
+
+ /* reserve large contiguous area for allocations without nid */
+ size = scratch_size_global;
+ addr = memblock_phys_alloc(size, CMA_MIN_ALIGNMENT_BYTES);
+ if (!addr)
+ goto err_free_scratch_areas;
+
+ kho_scratch[i].addr = addr;
+ kho_scratch[i].size = size;
+ i++;
+
+ for_each_online_node(nid) {
+ size = scratch_size_node(nid);
+ addr = memblock_alloc_range_nid(size, CMA_MIN_ALIGNMENT_BYTES,
+ 0, MEMBLOCK_ALLOC_ACCESSIBLE,
+ nid, true);
+ if (!addr)
+ goto err_free_scratch_areas;
+
+ kho_scratch[i].addr = addr;
+ kho_scratch[i].size = size;
+ i++;
+ }
+
+ return;
+
+err_free_scratch_areas:
+ for (i--; i >= 0; i--)
+ memblock_phys_free(kho_scratch[i].addr, kho_scratch[i].size);
+err_free_scratch_desc:
+ memblock_free(kho_scratch, kho_scratch_cnt * sizeof(*kho_scratch));
+err_disable_kho:
+ pr_warn("Failed to reserve scratch area, disabling kexec handover\n");
+ kho_enable = false;
+}
+
+/**
+ * kho_add_subtree - record the physical address of a sub FDT in KHO root tree.
+ * @name: name of the sub tree.
+ * @fdt: the sub tree blob.
+ *
+ * Creates a new child node named @name in KHO root FDT and records
+ * the physical address of @fdt. The pages of @fdt must also be preserved
+ * by KHO for the new kernel to retrieve it after kexec.
+ *
+ * A debugfs blob entry is also created at
+ * ``/sys/kernel/debug/kho/out/sub_fdts/@name`` when kernel is configured with
+ * CONFIG_KEXEC_HANDOVER_DEBUGFS
+ *
+ * Return: 0 on success, error code on failure
+ */
+int kho_add_subtree(const char *name, void *fdt)
+{
+ phys_addr_t phys = virt_to_phys(fdt);
+ void *root_fdt = kho_out.fdt;
+ int err = -ENOMEM;
+ int off, fdt_err;
+
+ guard(mutex)(&kho_out.lock);
+
+ fdt_err = fdt_open_into(root_fdt, root_fdt, PAGE_SIZE);
+ if (fdt_err < 0)
+ return err;
+
+ off = fdt_add_subnode(root_fdt, 0, name);
+ if (off < 0) {
+ if (off == -FDT_ERR_EXISTS)
+ err = -EEXIST;
+ goto out_pack;
+ }
+
+ err = fdt_setprop(root_fdt, off, PROP_SUB_FDT, &phys, sizeof(phys));
+ if (err < 0)
+ goto out_pack;
+
+ WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, name, fdt, false));
+
+out_pack:
+ fdt_pack(root_fdt);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(kho_add_subtree);
+
+void kho_remove_subtree(void *fdt)
+{
+ phys_addr_t target_phys = virt_to_phys(fdt);
+ void *root_fdt = kho_out.fdt;
+ int off;
+ int err;
+
+ guard(mutex)(&kho_out.lock);
+
+ err = fdt_open_into(root_fdt, root_fdt, PAGE_SIZE);
+ if (err < 0)
+ return;
+
+ for (off = fdt_first_subnode(root_fdt, 0); off >= 0;
+ off = fdt_next_subnode(root_fdt, off)) {
+ const u64 *val;
+ int len;
+
+ val = fdt_getprop(root_fdt, off, PROP_SUB_FDT, &len);
+ if (!val || len != sizeof(phys_addr_t))
+ continue;
+
+ if ((phys_addr_t)*val == target_phys) {
+ fdt_del_node(root_fdt, off);
+ kho_debugfs_fdt_remove(&kho_out.dbg, fdt);
+ break;
+ }
+ }
+
+ fdt_pack(root_fdt);
+}
+EXPORT_SYMBOL_GPL(kho_remove_subtree);
+
+/**
+ * kho_preserve_folio - preserve a folio across kexec.
+ * @folio: folio to preserve.
+ *
+ * Instructs KHO to preserve the whole folio across kexec. The order
+ * will be preserved as well.
+ *
+ * Return: 0 on success, error code on failure
+ */
+int kho_preserve_folio(struct folio *folio)
+{
+ const unsigned long pfn = folio_pfn(folio);
+ const unsigned int order = folio_order(folio);
+ struct kho_mem_track *track = &kho_out.track;
+
+ if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order)))
+ return -EINVAL;
+
+ return __kho_preserve_order(track, pfn, order);
+}
+EXPORT_SYMBOL_GPL(kho_preserve_folio);
+
+/**
+ * kho_unpreserve_folio - unpreserve a folio.
+ * @folio: folio to unpreserve.
+ *
+ * Instructs KHO to unpreserve a folio that was preserved by
+ * kho_preserve_folio() before. The provided @folio (pfn and order)
+ * must exactly match a previously preserved folio.
+ */
+void kho_unpreserve_folio(struct folio *folio)
+{
+ const unsigned long pfn = folio_pfn(folio);
+ const unsigned int order = folio_order(folio);
+ struct kho_mem_track *track = &kho_out.track;
+
+ __kho_unpreserve_order(track, pfn, order);
+}
+EXPORT_SYMBOL_GPL(kho_unpreserve_folio);
+
+/**
+ * kho_preserve_pages - preserve contiguous pages across kexec
+ * @page: first page in the list.
+ * @nr_pages: number of pages.
+ *
+ * Preserve a contiguous list of order 0 pages. Must be restored using
+ * kho_restore_pages() to ensure the pages are restored properly as order 0.
+ *
+ * Return: 0 on success, error code on failure
+ */
+int kho_preserve_pages(struct page *page, unsigned int nr_pages)
+{
+ struct kho_mem_track *track = &kho_out.track;
+ const unsigned long start_pfn = page_to_pfn(page);
+ const unsigned long end_pfn = start_pfn + nr_pages;
+ unsigned long pfn = start_pfn;
+ unsigned long failed_pfn = 0;
+ int err = 0;
+
+ if (WARN_ON(kho_scratch_overlap(start_pfn << PAGE_SHIFT,
+ nr_pages << PAGE_SHIFT))) {
+ return -EINVAL;
+ }
+
+ while (pfn < end_pfn) {
+ const unsigned int order =
+ min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
+
+ err = __kho_preserve_order(track, pfn, order);
+ if (err) {
+ failed_pfn = pfn;
+ break;
+ }
+
+ pfn += 1 << order;
+ }
+
+ if (err)
+ __kho_unpreserve(track, start_pfn, failed_pfn);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(kho_preserve_pages);
+
+/**
+ * kho_unpreserve_pages - unpreserve contiguous pages.
+ * @page: first page in the list.
+ * @nr_pages: number of pages.
+ *
+ * Instructs KHO to unpreserve @nr_pages contiguous pages starting from @page.
+ * This must be called with the same @page and @nr_pages as the corresponding
+ * kho_preserve_pages() call. Unpreserving arbitrary sub-ranges of larger
+ * preserved blocks is not supported.
+ */
+void kho_unpreserve_pages(struct page *page, unsigned int nr_pages)
+{
+ struct kho_mem_track *track = &kho_out.track;
+ const unsigned long start_pfn = page_to_pfn(page);
+ const unsigned long end_pfn = start_pfn + nr_pages;
+
+ __kho_unpreserve(track, start_pfn, end_pfn);
+}
+EXPORT_SYMBOL_GPL(kho_unpreserve_pages);
+
+struct kho_vmalloc_hdr {
+ DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *);
+};
+
+#define KHO_VMALLOC_SIZE \
+ ((PAGE_SIZE - sizeof(struct kho_vmalloc_hdr)) / \
+ sizeof(phys_addr_t))
+
+struct kho_vmalloc_chunk {
+ struct kho_vmalloc_hdr hdr;
+ phys_addr_t phys[KHO_VMALLOC_SIZE];
+};
+
+static_assert(sizeof(struct kho_vmalloc_chunk) == PAGE_SIZE);
+
+/* vmalloc flags KHO supports */
+#define KHO_VMALLOC_SUPPORTED_FLAGS (VM_ALLOC | VM_ALLOW_HUGE_VMAP)
+
+/* KHO internal flags for vmalloc preservations */
+#define KHO_VMALLOC_ALLOC 0x0001
+#define KHO_VMALLOC_HUGE_VMAP 0x0002
+
+static unsigned short vmalloc_flags_to_kho(unsigned int vm_flags)
+{
+ unsigned short kho_flags = 0;
+
+ if (vm_flags & VM_ALLOC)
+ kho_flags |= KHO_VMALLOC_ALLOC;
+ if (vm_flags & VM_ALLOW_HUGE_VMAP)
+ kho_flags |= KHO_VMALLOC_HUGE_VMAP;
+
+ return kho_flags;
+}
+
+static unsigned int kho_flags_to_vmalloc(unsigned short kho_flags)
+{
+ unsigned int vm_flags = 0;
+
+ if (kho_flags & KHO_VMALLOC_ALLOC)
+ vm_flags |= VM_ALLOC;
+ if (kho_flags & KHO_VMALLOC_HUGE_VMAP)
+ vm_flags |= VM_ALLOW_HUGE_VMAP;
+
+ return vm_flags;
+}
+
+static struct kho_vmalloc_chunk *new_vmalloc_chunk(struct kho_vmalloc_chunk *cur)
+{
+ struct kho_vmalloc_chunk *chunk;
+ int err;
+
+ chunk = (struct kho_vmalloc_chunk *)get_zeroed_page(GFP_KERNEL);
+ if (!chunk)
+ return NULL;
+
+ err = kho_preserve_pages(virt_to_page(chunk), 1);
+ if (err)
+ goto err_free;
+ if (cur)
+ KHOSER_STORE_PTR(cur->hdr.next, chunk);
+ return chunk;
+
+err_free:
+ free_page((unsigned long)chunk);
+ return NULL;
+}
+
+static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk,
+ unsigned short order)
+{
+ struct kho_mem_track *track = &kho_out.track;
+ unsigned long pfn = PHYS_PFN(virt_to_phys(chunk));
+
+ __kho_unpreserve(track, pfn, pfn + 1);
+
+ for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) {
+ pfn = PHYS_PFN(chunk->phys[i]);
+ __kho_unpreserve(track, pfn, pfn + (1 << order));
+ }
+}
+
+/**
+ * kho_preserve_vmalloc - preserve memory allocated with vmalloc() across kexec
+ * @ptr: pointer to the area in vmalloc address space
+ * @preservation: placeholder for preservation metadata
+ *
+ * Instructs KHO to preserve the area in vmalloc address space at @ptr. The
+ * physical pages mapped at @ptr will be preserved and on successful return
+ * @preservation will hold the physical address of a structure that describes
+ * the preservation.
+ *
+ * NOTE: The memory allocated with vmalloc_node() variants cannot be reliably
+ * restored on the same node
+ *
+ * Return: 0 on success, error code on failure
+ */
+int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation)
+{
+ struct kho_vmalloc_chunk *chunk;
+ struct vm_struct *vm = find_vm_area(ptr);
+ unsigned int order, flags, nr_contig_pages;
+ unsigned int idx = 0;
+ int err;
+
+ if (!vm)
+ return -EINVAL;
+
+ if (vm->flags & ~KHO_VMALLOC_SUPPORTED_FLAGS)
+ return -EOPNOTSUPP;
+
+ flags = vmalloc_flags_to_kho(vm->flags);
+ order = get_vm_area_page_order(vm);
+
+ chunk = new_vmalloc_chunk(NULL);
+ if (!chunk)
+ return -ENOMEM;
+ KHOSER_STORE_PTR(preservation->first, chunk);
+
+ nr_contig_pages = (1 << order);
+ for (int i = 0; i < vm->nr_pages; i += nr_contig_pages) {
+ phys_addr_t phys = page_to_phys(vm->pages[i]);
+
+ err = kho_preserve_pages(vm->pages[i], nr_contig_pages);
+ if (err)
+ goto err_free;
+
+ chunk->phys[idx++] = phys;
+ if (idx == ARRAY_SIZE(chunk->phys)) {
+ chunk = new_vmalloc_chunk(chunk);
+ if (!chunk)
+ goto err_free;
+ idx = 0;
+ }
+ }
+
+ preservation->total_pages = vm->nr_pages;
+ preservation->flags = flags;
+ preservation->order = order;
+
+ return 0;
+
+err_free:
+ kho_unpreserve_vmalloc(preservation);
+ return err;
+}
+EXPORT_SYMBOL_GPL(kho_preserve_vmalloc);
+
+/**
+ * kho_unpreserve_vmalloc - unpreserve memory allocated with vmalloc()
+ * @preservation: preservation metadata returned by kho_preserve_vmalloc()
+ *
+ * Instructs KHO to unpreserve the area in vmalloc address space that was
+ * previously preserved with kho_preserve_vmalloc().
+ */
+void kho_unpreserve_vmalloc(struct kho_vmalloc *preservation)
+{
+ struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first);
+
+ while (chunk) {
+ struct kho_vmalloc_chunk *tmp = chunk;
+
+ kho_vmalloc_unpreserve_chunk(chunk, preservation->order);
+
+ chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
+ free_page((unsigned long)tmp);
+ }
+}
+EXPORT_SYMBOL_GPL(kho_unpreserve_vmalloc);
+
+/**
+ * kho_restore_vmalloc - recreates and populates an area in vmalloc address
+ * space from the preserved memory.
+ * @preservation: preservation metadata.
+ *
+ * Recreates an area in vmalloc address space and populates it with memory that
+ * was preserved using kho_preserve_vmalloc().
+ *
+ * Return: pointer to the area in the vmalloc address space, NULL on failure.
+ */
+void *kho_restore_vmalloc(const struct kho_vmalloc *preservation)
+{
+ struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first);
+ unsigned int align, order, shift, vm_flags;
+ unsigned long total_pages, contig_pages;
+ unsigned long addr, size;
+ struct vm_struct *area;
+ struct page **pages;
+ unsigned int idx = 0;
+ int err;
+
+ vm_flags = kho_flags_to_vmalloc(preservation->flags);
+ if (vm_flags & ~KHO_VMALLOC_SUPPORTED_FLAGS)
+ return NULL;
+
+ total_pages = preservation->total_pages;
+ pages = kvmalloc_array(total_pages, sizeof(*pages), GFP_KERNEL);
+ if (!pages)
+ return NULL;
+ order = preservation->order;
+ contig_pages = (1 << order);
+ shift = PAGE_SHIFT + order;
+ align = 1 << shift;
+
+ while (chunk) {
+ struct page *page;
+
+ for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) {
+ phys_addr_t phys = chunk->phys[i];
+
+ if (idx + contig_pages > total_pages)
+ goto err_free_pages_array;
+
+ page = kho_restore_pages(phys, contig_pages);
+ if (!page)
+ goto err_free_pages_array;
+
+ for (int j = 0; j < contig_pages; j++)
+ pages[idx++] = page + j;
+
+ phys += contig_pages * PAGE_SIZE;
+ }
+
+ page = kho_restore_pages(virt_to_phys(chunk), 1);
+ if (!page)
+ goto err_free_pages_array;
+ chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
+ __free_page(page);
+ }
+
+ if (idx != total_pages)
+ goto err_free_pages_array;
+
+ area = __get_vm_area_node(total_pages * PAGE_SIZE, align, shift,
+ vm_flags, VMALLOC_START, VMALLOC_END,
+ NUMA_NO_NODE, GFP_KERNEL,
+ __builtin_return_address(0));
+ if (!area)
+ goto err_free_pages_array;
+
+ addr = (unsigned long)area->addr;
+ size = get_vm_area_size(area);
+ err = vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, shift);
+ if (err)
+ goto err_free_vm_area;
+
+ area->nr_pages = total_pages;
+ area->pages = pages;
+
+ return area->addr;
+
+err_free_vm_area:
+ free_vm_area(area);
+err_free_pages_array:
+ kvfree(pages);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(kho_restore_vmalloc);
+
+/**
+ * kho_alloc_preserve - Allocate, zero, and preserve memory.
+ * @size: The number of bytes to allocate.
+ *
+ * Allocates a physically contiguous block of zeroed pages that is large
+ * enough to hold @size bytes. The allocated memory is then registered with
+ * KHO for preservation across a kexec.
+ *
+ * Note: The actual allocated size will be rounded up to the nearest
+ * power-of-two page boundary.
+ *
+ * @return A virtual pointer to the allocated and preserved memory on success,
+ * or an ERR_PTR() encoded error on failure.
+ */
+void *kho_alloc_preserve(size_t size)
+{
+ struct folio *folio;
+ int order, ret;
+
+ if (!size)
+ return ERR_PTR(-EINVAL);
+
+ order = get_order(size);
+ if (order > MAX_PAGE_ORDER)
+ return ERR_PTR(-E2BIG);
+
+ folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, order);
+ if (!folio)
+ return ERR_PTR(-ENOMEM);
+
+ ret = kho_preserve_folio(folio);
+ if (ret) {
+ folio_put(folio);
+ return ERR_PTR(ret);
+ }
+
+ return folio_address(folio);
+}
+EXPORT_SYMBOL_GPL(kho_alloc_preserve);
+
+/**
+ * kho_unpreserve_free - Unpreserve and free memory.
+ * @mem: Pointer to the memory allocated by kho_alloc_preserve().
+ *
+ * Unregisters the memory from KHO preservation and frees the underlying
+ * pages back to the system. This function should be called to clean up
+ * memory allocated with kho_alloc_preserve().
+ */
+void kho_unpreserve_free(void *mem)
+{
+ struct folio *folio;
+
+ if (!mem)
+ return;
+
+ folio = virt_to_folio(mem);
+ kho_unpreserve_folio(folio);
+ folio_put(folio);
+}
+EXPORT_SYMBOL_GPL(kho_unpreserve_free);
+
+/**
+ * kho_restore_free - Restore and free memory after kexec.
+ * @mem: Pointer to the memory (in the new kernel's address space)
+ * that was allocated by the old kernel.
+ *
+ * This function is intended to be called in the new kernel (post-kexec)
+ * to take ownership of and free a memory region that was preserved by the
+ * old kernel using kho_alloc_preserve().
+ *
+ * It first restores the pages from KHO (using their physical address)
+ * and then frees the pages back to the new kernel's page allocator.
+ */
+void kho_restore_free(void *mem)
+{
+ struct folio *folio;
+
+ if (!mem)
+ return;
+
+ folio = kho_restore_folio(__pa(mem));
+ if (!WARN_ON(!folio))
+ folio_put(folio);
+}
+EXPORT_SYMBOL_GPL(kho_restore_free);
+
+int kho_finalize(void)
+{
+ int ret;
+
+ if (!kho_enable)
+ return -EOPNOTSUPP;
+
+ guard(mutex)(&kho_out.lock);
+ ret = kho_mem_serialize(&kho_out);
+ if (ret)
+ return ret;
+
+ kho_out.finalized = true;
+
+ return 0;
+}
+
+bool kho_finalized(void)
+{
+ guard(mutex)(&kho_out.lock);
+ return kho_out.finalized;
+}
+
+struct kho_in {
+ phys_addr_t fdt_phys;
+ phys_addr_t scratch_phys;
+ struct kho_debugfs dbg;
+};
+
+static struct kho_in kho_in = {
+};
+
+static const void *kho_get_fdt(void)
+{
+ return kho_in.fdt_phys ? phys_to_virt(kho_in.fdt_phys) : NULL;
+}
+
+/**
+ * is_kho_boot - check if current kernel was booted via KHO-enabled
+ * kexec
+ *
+ * This function checks if the current kernel was loaded through a kexec
+ * operation with KHO enabled, by verifying that a valid KHO FDT
+ * was passed.
+ *
+ * Note: This function returns reliable results only after
+ * kho_populate() has been called during early boot. Before that,
+ * it may return false even if KHO data is present.
+ *
+ * Return: true if booted via KHO-enabled kexec, false otherwise
+ */
+bool is_kho_boot(void)
+{
+ return !!kho_get_fdt();
+}
+EXPORT_SYMBOL_GPL(is_kho_boot);
+
+/**
+ * kho_retrieve_subtree - retrieve a preserved sub FDT by its name.
+ * @name: the name of the sub FDT passed to kho_add_subtree().
+ * @phys: if found, the physical address of the sub FDT is stored in @phys.
+ *
+ * Retrieve a preserved sub FDT named @name and store its physical
+ * address in @phys.
+ *
+ * Return: 0 on success, error code on failure
+ */
+int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
+{
+ const void *fdt = kho_get_fdt();
+ const u64 *val;
+ int offset, len;
+
+ if (!fdt)
+ return -ENOENT;
+
+ if (!phys)
+ return -EINVAL;
+
+ offset = fdt_subnode_offset(fdt, 0, name);
+ if (offset < 0)
+ return -ENOENT;
+
+ val = fdt_getprop(fdt, offset, PROP_SUB_FDT, &len);
+ if (!val || len != sizeof(*val))
+ return -EINVAL;
+
+ *phys = (phys_addr_t)*val;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kho_retrieve_subtree);
+
+static __init int kho_out_fdt_setup(void)
+{
+ void *root = kho_out.fdt;
+ u64 empty_mem_map = 0;
+ int err;
+
+ err = fdt_create(root, PAGE_SIZE);
+ err |= fdt_finish_reservemap(root);
+ err |= fdt_begin_node(root, "");
+ err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE);
+ err |= fdt_property(root, PROP_PRESERVED_MEMORY_MAP, &empty_mem_map,
+ sizeof(empty_mem_map));
+ err |= fdt_end_node(root);
+ err |= fdt_finish(root);
+
+ return err;
+}
+
+static __init int kho_init(void)
+{
+ const void *fdt = kho_get_fdt();
+ int err = 0;
+
+ if (!kho_enable)
+ return 0;
+
+ kho_out.fdt = kho_alloc_preserve(PAGE_SIZE);
+ if (IS_ERR(kho_out.fdt)) {
+ err = PTR_ERR(kho_out.fdt);
+ goto err_free_scratch;
+ }
+
+ err = kho_debugfs_init();
+ if (err)
+ goto err_free_fdt;
+
+ err = kho_out_debugfs_init(&kho_out.dbg);
+ if (err)
+ goto err_free_fdt;
+
+ err = kho_out_fdt_setup();
+ if (err)
+ goto err_free_fdt;
+
+ if (fdt) {
+ kho_in_debugfs_init(&kho_in.dbg, fdt);
+ return 0;
+ }
+
+ for (int i = 0; i < kho_scratch_cnt; i++) {
+ unsigned long base_pfn = PHYS_PFN(kho_scratch[i].addr);
+ unsigned long count = kho_scratch[i].size >> PAGE_SHIFT;
+ unsigned long pfn;
+
+ /*
+ * When debug_pagealloc is enabled, __free_pages() clears the
+ * corresponding PRESENT bit in the kernel page table.
+ * Subsequent kmemleak scans of these pages cause the
+ * non-PRESENT page faults.
+ * Mark scratch areas with kmemleak_ignore_phys() to exclude
+ * them from kmemleak scanning.
+ */
+ kmemleak_ignore_phys(kho_scratch[i].addr);
+ for (pfn = base_pfn; pfn < base_pfn + count;
+ pfn += pageblock_nr_pages)
+ init_cma_reserved_pageblock(pfn_to_page(pfn));
+ }
+
+ WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, "fdt",
+ kho_out.fdt, true));
+
+ return 0;
+
+err_free_fdt:
+ kho_unpreserve_free(kho_out.fdt);
+err_free_scratch:
+ kho_out.fdt = NULL;
+ for (int i = 0; i < kho_scratch_cnt; i++) {
+ void *start = __va(kho_scratch[i].addr);
+ void *end = start + kho_scratch[i].size;
+
+ free_reserved_area(start, end, -1, "");
+ }
+ kho_enable = false;
+ return err;
+}
+fs_initcall(kho_init);
+
+static void __init kho_release_scratch(void)
+{
+ phys_addr_t start, end;
+ u64 i;
+
+ memmap_init_kho_scratch_pages();
+
+ /*
+ * Mark scratch mem as CMA before we return it. That way we
+ * ensure that no kernel allocations happen on it. That means
+ * we can reuse it as scratch memory again later.
+ */
+ __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,
+ MEMBLOCK_KHO_SCRATCH, &start, &end, NULL) {
+ ulong start_pfn = pageblock_start_pfn(PFN_DOWN(start));
+ ulong end_pfn = pageblock_align(PFN_UP(end));
+ ulong pfn;
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages)
+ init_pageblock_migratetype(pfn_to_page(pfn),
+ MIGRATE_CMA, false);
+ }
+}
+
+void __init kho_memory_init(void)
+{
+ if (kho_in.scratch_phys) {
+ kho_scratch = phys_to_virt(kho_in.scratch_phys);
+ kho_release_scratch();
+
+ if (!kho_mem_deserialize(kho_get_fdt()))
+ kho_in.fdt_phys = 0;
+ } else {
+ kho_reserve_scratch();
+ }
+}
+
+void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
+ phys_addr_t scratch_phys, u64 scratch_len)
+{
+ void *fdt = NULL;
+ struct kho_scratch *scratch = NULL;
+ int err = 0;
+ unsigned int scratch_cnt = scratch_len / sizeof(*kho_scratch);
+
+ /* Validate the input FDT */
+ fdt = early_memremap(fdt_phys, fdt_len);
+ if (!fdt) {
+ pr_warn("setup: failed to memremap FDT (0x%llx)\n", fdt_phys);
+ err = -EFAULT;
+ goto out;
+ }
+ err = fdt_check_header(fdt);
+ if (err) {
+ pr_warn("setup: handover FDT (0x%llx) is invalid: %d\n",
+ fdt_phys, err);
+ err = -EINVAL;
+ goto out;
+ }
+ err = fdt_node_check_compatible(fdt, 0, KHO_FDT_COMPATIBLE);
+ if (err) {
+ pr_warn("setup: handover FDT (0x%llx) is incompatible with '%s': %d\n",
+ fdt_phys, KHO_FDT_COMPATIBLE, err);
+ err = -EINVAL;
+ goto out;
+ }
+
+ scratch = early_memremap(scratch_phys, scratch_len);
+ if (!scratch) {
+ pr_warn("setup: failed to memremap scratch (phys=0x%llx, len=%lld)\n",
+ scratch_phys, scratch_len);
+ err = -EFAULT;
+ goto out;
+ }
+
+ /*
+ * We pass a safe contiguous blocks of memory to use for early boot
+ * purporses from the previous kernel so that we can resize the
+ * memblock array as needed.
+ */
+ for (int i = 0; i < scratch_cnt; i++) {
+ struct kho_scratch *area = &scratch[i];
+ u64 size = area->size;
+
+ memblock_add(area->addr, size);
+ err = memblock_mark_kho_scratch(area->addr, size);
+ if (WARN_ON(err)) {
+ pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %pe",
+ &area->addr, &size, ERR_PTR(err));
+ goto out;
+ }
+ pr_debug("Marked 0x%pa+0x%pa as scratch", &area->addr, &size);
+ }
+
+ memblock_reserve(scratch_phys, scratch_len);
+
+ /*
+ * Now that we have a viable region of scratch memory, let's tell
+ * the memblocks allocator to only use that for any allocations.
+ * That way we ensure that nothing scribbles over in use data while
+ * we initialize the page tables which we will need to ingest all
+ * memory reservations from the previous kernel.
+ */
+ memblock_set_kho_scratch_only();
+
+ kho_in.fdt_phys = fdt_phys;
+ kho_in.scratch_phys = scratch_phys;
+ kho_scratch_cnt = scratch_cnt;
+ pr_info("found kexec handover data.\n");
+
+out:
+ if (fdt)
+ early_memunmap(fdt, fdt_len);
+ if (scratch)
+ early_memunmap(scratch, scratch_len);
+ if (err)
+ pr_warn("disabling KHO revival: %d\n", err);
+}
+
+/* Helper functions for kexec_file_load */
+
+int kho_fill_kimage(struct kimage *image)
+{
+ ssize_t scratch_size;
+ int err = 0;
+ struct kexec_buf scratch;
+
+ if (!kho_enable)
+ return 0;
+
+ image->kho.fdt = virt_to_phys(kho_out.fdt);
+
+ scratch_size = sizeof(*kho_scratch) * kho_scratch_cnt;
+ scratch = (struct kexec_buf){
+ .image = image,
+ .buffer = kho_scratch,
+ .bufsz = scratch_size,
+ .mem = KEXEC_BUF_MEM_UNKNOWN,
+ .memsz = scratch_size,
+ .buf_align = SZ_64K, /* Makes it easier to map */
+ .buf_max = ULONG_MAX,
+ .top_down = true,
+ };
+ err = kexec_add_buffer(&scratch);
+ if (err)
+ return err;
+ image->kho.scratch = &image->segment[image->nr_segments - 1];
+
+ return 0;
+}
+
+static int kho_walk_scratch(struct kexec_buf *kbuf,
+ int (*func)(struct resource *, void *))
+{
+ int ret = 0;
+ int i;
+
+ for (i = 0; i < kho_scratch_cnt; i++) {
+ struct resource res = {
+ .start = kho_scratch[i].addr,
+ .end = kho_scratch[i].addr + kho_scratch[i].size - 1,
+ };
+
+ /* Try to fit the kimage into our KHO scratch region */
+ ret = func(&res, kbuf);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+int kho_locate_mem_hole(struct kexec_buf *kbuf,
+ int (*func)(struct resource *, void *))
+{
+ int ret;
+
+ if (!kho_enable || kbuf->image->type == KEXEC_TYPE_CRASH)
+ return 1;
+
+ ret = kho_walk_scratch(kbuf, func);
+
+ return ret == 1 ? 0 : -EADDRNOTAVAIL;
+}
diff --git a/kernel/liveupdate/kexec_handover_debug.c b/kernel/liveupdate/kexec_handover_debug.c
new file mode 100644
index 000000000000..6efb696f5426
--- /dev/null
+++ b/kernel/liveupdate/kexec_handover_debug.c
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * kexec_handover_debug.c - kexec handover optional debug functionality
+ * Copyright (C) 2025 Google LLC, Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+#define pr_fmt(fmt) "KHO: " fmt
+
+#include "kexec_handover_internal.h"
+
+bool kho_scratch_overlap(phys_addr_t phys, size_t size)
+{
+ phys_addr_t scratch_start, scratch_end;
+ unsigned int i;
+
+ for (i = 0; i < kho_scratch_cnt; i++) {
+ scratch_start = kho_scratch[i].addr;
+ scratch_end = kho_scratch[i].addr + kho_scratch[i].size;
+
+ if (phys < scratch_end && (phys + size) > scratch_start)
+ return true;
+ }
+
+ return false;
+}
diff --git a/kernel/liveupdate/kexec_handover_debugfs.c b/kernel/liveupdate/kexec_handover_debugfs.c
new file mode 100644
index 000000000000..2abbf62ba942
--- /dev/null
+++ b/kernel/liveupdate/kexec_handover_debugfs.c
@@ -0,0 +1,221 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * kexec_handover_debugfs.c - kexec handover debugfs interfaces
+ * Copyright (C) 2023 Alexander Graf <graf@amazon.com>
+ * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org>
+ * Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com>
+ * Copyright (C) 2025 Google LLC, Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+#define pr_fmt(fmt) "KHO: " fmt
+
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/libfdt.h>
+#include <linux/mm.h>
+#include "kexec_handover_internal.h"
+
+static struct dentry *debugfs_root;
+
+struct fdt_debugfs {
+ struct list_head list;
+ struct debugfs_blob_wrapper wrapper;
+ struct dentry *file;
+};
+
+static int __kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir,
+ const char *name, const void *fdt)
+{
+ struct fdt_debugfs *f;
+ struct dentry *file;
+
+ f = kmalloc(sizeof(*f), GFP_KERNEL);
+ if (!f)
+ return -ENOMEM;
+
+ f->wrapper.data = (void *)fdt;
+ f->wrapper.size = fdt_totalsize(fdt);
+
+ file = debugfs_create_blob(name, 0400, dir, &f->wrapper);
+ if (IS_ERR(file)) {
+ kfree(f);
+ return PTR_ERR(file);
+ }
+
+ f->file = file;
+ list_add(&f->list, list);
+
+ return 0;
+}
+
+int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name,
+ const void *fdt, bool root)
+{
+ struct dentry *dir;
+
+ if (root)
+ dir = dbg->dir;
+ else
+ dir = dbg->sub_fdt_dir;
+
+ return __kho_debugfs_fdt_add(&dbg->fdt_list, dir, name, fdt);
+}
+
+void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt)
+{
+ struct fdt_debugfs *ff;
+
+ list_for_each_entry(ff, &dbg->fdt_list, list) {
+ if (ff->wrapper.data == fdt) {
+ debugfs_remove(ff->file);
+ list_del(&ff->list);
+ kfree(ff);
+ break;
+ }
+ }
+}
+
+static int kho_out_finalize_get(void *data, u64 *val)
+{
+ *val = kho_finalized();
+
+ return 0;
+}
+
+static int kho_out_finalize_set(void *data, u64 val)
+{
+ if (val)
+ return kho_finalize();
+ else
+ return -EINVAL;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(kho_out_finalize_fops, kho_out_finalize_get,
+ kho_out_finalize_set, "%llu\n");
+
+static int scratch_phys_show(struct seq_file *m, void *v)
+{
+ for (int i = 0; i < kho_scratch_cnt; i++)
+ seq_printf(m, "0x%llx\n", kho_scratch[i].addr);
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(scratch_phys);
+
+static int scratch_len_show(struct seq_file *m, void *v)
+{
+ for (int i = 0; i < kho_scratch_cnt; i++)
+ seq_printf(m, "0x%llx\n", kho_scratch[i].size);
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(scratch_len);
+
+__init void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt)
+{
+ struct dentry *dir, *sub_fdt_dir;
+ int err, child;
+
+ INIT_LIST_HEAD(&dbg->fdt_list);
+
+ dir = debugfs_create_dir("in", debugfs_root);
+ if (IS_ERR(dir)) {
+ err = PTR_ERR(dir);
+ goto err_out;
+ }
+
+ sub_fdt_dir = debugfs_create_dir("sub_fdts", dir);
+ if (IS_ERR(sub_fdt_dir)) {
+ err = PTR_ERR(sub_fdt_dir);
+ goto err_rmdir;
+ }
+
+ err = __kho_debugfs_fdt_add(&dbg->fdt_list, dir, "fdt", fdt);
+ if (err)
+ goto err_rmdir;
+
+ fdt_for_each_subnode(child, fdt, 0) {
+ int len = 0;
+ const char *name = fdt_get_name(fdt, child, NULL);
+ const u64 *fdt_phys;
+
+ fdt_phys = fdt_getprop(fdt, child, "fdt", &len);
+ if (!fdt_phys)
+ continue;
+ if (len != sizeof(*fdt_phys)) {
+ pr_warn("node %s prop fdt has invalid length: %d\n",
+ name, len);
+ continue;
+ }
+ err = __kho_debugfs_fdt_add(&dbg->fdt_list, sub_fdt_dir, name,
+ phys_to_virt(*fdt_phys));
+ if (err) {
+ pr_warn("failed to add fdt %s to debugfs: %pe\n", name,
+ ERR_PTR(err));
+ continue;
+ }
+ }
+
+ dbg->dir = dir;
+ dbg->sub_fdt_dir = sub_fdt_dir;
+
+ return;
+err_rmdir:
+ debugfs_remove_recursive(dir);
+err_out:
+ /*
+ * Failure to create /sys/kernel/debug/kho/in does not prevent
+ * reviving state from KHO and setting up KHO for the next
+ * kexec.
+ */
+ if (err) {
+ pr_err("failed exposing handover FDT in debugfs: %pe\n",
+ ERR_PTR(err));
+ }
+}
+
+__init int kho_out_debugfs_init(struct kho_debugfs *dbg)
+{
+ struct dentry *dir, *f, *sub_fdt_dir;
+
+ INIT_LIST_HEAD(&dbg->fdt_list);
+
+ dir = debugfs_create_dir("out", debugfs_root);
+ if (IS_ERR(dir))
+ return -ENOMEM;
+
+ sub_fdt_dir = debugfs_create_dir("sub_fdts", dir);
+ if (IS_ERR(sub_fdt_dir))
+ goto err_rmdir;
+
+ f = debugfs_create_file("scratch_phys", 0400, dir, NULL,
+ &scratch_phys_fops);
+ if (IS_ERR(f))
+ goto err_rmdir;
+
+ f = debugfs_create_file("scratch_len", 0400, dir, NULL,
+ &scratch_len_fops);
+ if (IS_ERR(f))
+ goto err_rmdir;
+
+ f = debugfs_create_file("finalize", 0600, dir, NULL,
+ &kho_out_finalize_fops);
+ if (IS_ERR(f))
+ goto err_rmdir;
+
+ dbg->dir = dir;
+ dbg->sub_fdt_dir = sub_fdt_dir;
+ return 0;
+
+err_rmdir:
+ debugfs_remove_recursive(dir);
+ return -ENOENT;
+}
+
+__init int kho_debugfs_init(void)
+{
+ debugfs_root = debugfs_create_dir("kho", NULL);
+ if (IS_ERR(debugfs_root))
+ return -ENOENT;
+ return 0;
+}
diff --git a/kernel/liveupdate/kexec_handover_internal.h b/kernel/liveupdate/kexec_handover_internal.h
new file mode 100644
index 000000000000..0202c85ad14f
--- /dev/null
+++ b/kernel/liveupdate/kexec_handover_internal.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef LINUX_KEXEC_HANDOVER_INTERNAL_H
+#define LINUX_KEXEC_HANDOVER_INTERNAL_H
+
+#include <linux/kexec_handover.h>
+#include <linux/list.h>
+#include <linux/types.h>
+
+#ifdef CONFIG_KEXEC_HANDOVER_DEBUGFS
+#include <linux/debugfs.h>
+
+struct kho_debugfs {
+ struct dentry *dir;
+ struct dentry *sub_fdt_dir;
+ struct list_head fdt_list;
+};
+
+#else
+struct kho_debugfs {};
+#endif
+
+extern struct kho_scratch *kho_scratch;
+extern unsigned int kho_scratch_cnt;
+
+bool kho_finalized(void);
+int kho_finalize(void);
+
+#ifdef CONFIG_KEXEC_HANDOVER_DEBUGFS
+int kho_debugfs_init(void);
+void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt);
+int kho_out_debugfs_init(struct kho_debugfs *dbg);
+int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name,
+ const void *fdt, bool root);
+void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt);
+#else
+static inline int kho_debugfs_init(void) { return 0; }
+static inline void kho_in_debugfs_init(struct kho_debugfs *dbg,
+ const void *fdt) { }
+static inline int kho_out_debugfs_init(struct kho_debugfs *dbg) { return 0; }
+static inline int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name,
+ const void *fdt, bool root) { return 0; }
+static inline void kho_debugfs_fdt_remove(struct kho_debugfs *dbg,
+ void *fdt) { }
+#endif /* CONFIG_KEXEC_HANDOVER_DEBUGFS */
+
+#ifdef CONFIG_KEXEC_HANDOVER_DEBUG
+bool kho_scratch_overlap(phys_addr_t phys, size_t size);
+#else
+static inline bool kho_scratch_overlap(phys_addr_t phys, size_t size)
+{
+ return false;
+}
+#endif /* CONFIG_KEXEC_HANDOVER_DEBUG */
+
+#endif /* LINUX_KEXEC_HANDOVER_INTERNAL_H */
diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c
new file mode 100644
index 000000000000..f7ecaf7740d1
--- /dev/null
+++ b/kernel/liveupdate/luo_core.c
@@ -0,0 +1,450 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+/**
+ * DOC: Live Update Orchestrator (LUO)
+ *
+ * Live Update is a specialized, kexec-based reboot process that allows a
+ * running kernel to be updated from one version to another while preserving
+ * the state of selected resources and keeping designated hardware devices
+ * operational. For these devices, DMA activity may continue throughout the
+ * kernel transition.
+ *
+ * While the primary use case driving this work is supporting live updates of
+ * the Linux kernel when it is used as a hypervisor in cloud environments, the
+ * LUO framework itself is designed to be workload-agnostic. Live Update
+ * facilitates a full kernel version upgrade for any type of system.
+ *
+ * For example, a non-hypervisor system running an in-memory cache like
+ * memcached with many gigabytes of data can use LUO. The userspace service
+ * can place its cache into a memfd, have its state preserved by LUO, and
+ * restore it immediately after the kernel kexec.
+ *
+ * Whether the system is running virtual machines, containers, a
+ * high-performance database, or networking services, LUO's primary goal is to
+ * enable a full kernel update by preserving critical userspace state and
+ * keeping essential devices operational.
+ *
+ * The core of LUO is a mechanism that tracks the progress of a live update,
+ * along with a callback API that allows other kernel subsystems to participate
+ * in the process. Example subsystems that can hook into LUO include: kvm,
+ * iommu, interrupts, vfio, participating filesystems, and memory management.
+ *
+ * LUO uses Kexec Handover to transfer memory state from the current kernel to
+ * the next kernel. For more details see
+ * Documentation/core-api/kho/concepts.rst.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/atomic.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/kexec_handover.h>
+#include <linux/kho/abi/luo.h>
+#include <linux/kobject.h>
+#include <linux/libfdt.h>
+#include <linux/liveupdate.h>
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/sizes.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
+
+#include "kexec_handover_internal.h"
+#include "luo_internal.h"
+
+static struct {
+ bool enabled;
+ void *fdt_out;
+ void *fdt_in;
+ u64 liveupdate_num;
+} luo_global;
+
+static int __init early_liveupdate_param(char *buf)
+{
+ return kstrtobool(buf, &luo_global.enabled);
+}
+early_param("liveupdate", early_liveupdate_param);
+
+static int __init luo_early_startup(void)
+{
+ phys_addr_t fdt_phys;
+ int err, ln_size;
+ const void *ptr;
+
+ if (!kho_is_enabled()) {
+ if (liveupdate_enabled())
+ pr_warn("Disabling liveupdate because KHO is disabled\n");
+ luo_global.enabled = false;
+ return 0;
+ }
+
+ /* Retrieve LUO subtree, and verify its format. */
+ err = kho_retrieve_subtree(LUO_FDT_KHO_ENTRY_NAME, &fdt_phys);
+ if (err) {
+ if (err != -ENOENT) {
+ pr_err("failed to retrieve FDT '%s' from KHO: %pe\n",
+ LUO_FDT_KHO_ENTRY_NAME, ERR_PTR(err));
+ return err;
+ }
+
+ return 0;
+ }
+
+ luo_global.fdt_in = phys_to_virt(fdt_phys);
+ err = fdt_node_check_compatible(luo_global.fdt_in, 0,
+ LUO_FDT_COMPATIBLE);
+ if (err) {
+ pr_err("FDT '%s' is incompatible with '%s' [%d]\n",
+ LUO_FDT_KHO_ENTRY_NAME, LUO_FDT_COMPATIBLE, err);
+
+ return -EINVAL;
+ }
+
+ ln_size = 0;
+ ptr = fdt_getprop(luo_global.fdt_in, 0, LUO_FDT_LIVEUPDATE_NUM,
+ &ln_size);
+ if (!ptr || ln_size != sizeof(luo_global.liveupdate_num)) {
+ pr_err("Unable to get live update number '%s' [%d]\n",
+ LUO_FDT_LIVEUPDATE_NUM, ln_size);
+
+ return -EINVAL;
+ }
+
+ luo_global.liveupdate_num = get_unaligned((u64 *)ptr);
+ pr_info("Retrieved live update data, liveupdate number: %lld\n",
+ luo_global.liveupdate_num);
+
+ err = luo_session_setup_incoming(luo_global.fdt_in);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static int __init liveupdate_early_init(void)
+{
+ int err;
+
+ err = luo_early_startup();
+ if (err) {
+ luo_global.enabled = false;
+ luo_restore_fail("The incoming tree failed to initialize properly [%pe], disabling live update\n",
+ ERR_PTR(err));
+ }
+
+ return err;
+}
+early_initcall(liveupdate_early_init);
+
+/* Called during boot to create outgoing LUO fdt tree */
+static int __init luo_fdt_setup(void)
+{
+ const u64 ln = luo_global.liveupdate_num + 1;
+ void *fdt_out;
+ int err;
+
+ fdt_out = kho_alloc_preserve(LUO_FDT_SIZE);
+ if (IS_ERR(fdt_out)) {
+ pr_err("failed to allocate/preserve FDT memory\n");
+ return PTR_ERR(fdt_out);
+ }
+
+ err = fdt_create(fdt_out, LUO_FDT_SIZE);
+ err |= fdt_finish_reservemap(fdt_out);
+ err |= fdt_begin_node(fdt_out, "");
+ err |= fdt_property_string(fdt_out, "compatible", LUO_FDT_COMPATIBLE);
+ err |= fdt_property(fdt_out, LUO_FDT_LIVEUPDATE_NUM, &ln, sizeof(ln));
+ err |= luo_session_setup_outgoing(fdt_out);
+ err |= fdt_end_node(fdt_out);
+ err |= fdt_finish(fdt_out);
+ if (err)
+ goto exit_free;
+
+ err = kho_add_subtree(LUO_FDT_KHO_ENTRY_NAME, fdt_out);
+ if (err)
+ goto exit_free;
+ luo_global.fdt_out = fdt_out;
+
+ return 0;
+
+exit_free:
+ kho_unpreserve_free(fdt_out);
+ pr_err("failed to prepare LUO FDT: %d\n", err);
+
+ return err;
+}
+
+/*
+ * late initcall because it initializes the outgoing tree that is needed only
+ * once userspace starts using /dev/liveupdate.
+ */
+static int __init luo_late_startup(void)
+{
+ int err;
+
+ if (!liveupdate_enabled())
+ return 0;
+
+ err = luo_fdt_setup();
+ if (err)
+ luo_global.enabled = false;
+
+ return err;
+}
+late_initcall(luo_late_startup);
+
+/* Public Functions */
+
+/**
+ * liveupdate_reboot() - Kernel reboot notifier for live update final
+ * serialization.
+ *
+ * This function is invoked directly from the reboot() syscall pathway
+ * if kexec is in progress.
+ *
+ * If any callback fails, this function aborts KHO, undoes the freeze()
+ * callbacks, and returns an error.
+ */
+int liveupdate_reboot(void)
+{
+ int err;
+
+ if (!liveupdate_enabled())
+ return 0;
+
+ err = luo_session_serialize();
+ if (err)
+ return err;
+
+ err = kho_finalize();
+ if (err) {
+ pr_err("kho_finalize failed %d\n", err);
+ /*
+ * kho_finalize() may return libfdt errors, to aboid passing to
+ * userspace unknown errors, change this to EAGAIN.
+ */
+ err = -EAGAIN;
+ }
+
+ return err;
+}
+
+/**
+ * liveupdate_enabled - Check if the live update feature is enabled.
+ *
+ * This function returns the state of the live update feature flag, which
+ * can be controlled via the ``liveupdate`` kernel command-line parameter.
+ *
+ * @return true if live update is enabled, false otherwise.
+ */
+bool liveupdate_enabled(void)
+{
+ return luo_global.enabled;
+}
+
+/**
+ * DOC: LUO ioctl Interface
+ *
+ * The IOCTL user-space control interface for the LUO subsystem.
+ * It registers a character device, typically found at ``/dev/liveupdate``,
+ * which allows a userspace agent to manage the LUO state machine and its
+ * associated resources, such as preservable file descriptors.
+ *
+ * To ensure that the state machine is controlled by a single entity, access
+ * to this device is exclusive: only one process is permitted to have
+ * ``/dev/liveupdate`` open at any given time. Subsequent open attempts will
+ * fail with -EBUSY until the first process closes its file descriptor.
+ * This singleton model simplifies state management by preventing conflicting
+ * commands from multiple userspace agents.
+ */
+
+struct luo_device_state {
+ struct miscdevice miscdev;
+ atomic_t in_use;
+};
+
+static int luo_ioctl_create_session(struct luo_ucmd *ucmd)
+{
+ struct liveupdate_ioctl_create_session *argp = ucmd->cmd;
+ struct file *file;
+ int err;
+
+ argp->fd = get_unused_fd_flags(O_CLOEXEC);
+ if (argp->fd < 0)
+ return argp->fd;
+
+ err = luo_session_create(argp->name, &file);
+ if (err)
+ goto err_put_fd;
+
+ err = luo_ucmd_respond(ucmd, sizeof(*argp));
+ if (err)
+ goto err_put_file;
+
+ fd_install(argp->fd, file);
+
+ return 0;
+
+err_put_file:
+ fput(file);
+err_put_fd:
+ put_unused_fd(argp->fd);
+
+ return err;
+}
+
+static int luo_ioctl_retrieve_session(struct luo_ucmd *ucmd)
+{
+ struct liveupdate_ioctl_retrieve_session *argp = ucmd->cmd;
+ struct file *file;
+ int err;
+
+ argp->fd = get_unused_fd_flags(O_CLOEXEC);
+ if (argp->fd < 0)
+ return argp->fd;
+
+ err = luo_session_retrieve(argp->name, &file);
+ if (err < 0)
+ goto err_put_fd;
+
+ err = luo_ucmd_respond(ucmd, sizeof(*argp));
+ if (err)
+ goto err_put_file;
+
+ fd_install(argp->fd, file);
+
+ return 0;
+
+err_put_file:
+ fput(file);
+err_put_fd:
+ put_unused_fd(argp->fd);
+
+ return err;
+}
+
+static int luo_open(struct inode *inodep, struct file *filep)
+{
+ struct luo_device_state *ldev = container_of(filep->private_data,
+ struct luo_device_state,
+ miscdev);
+
+ if (atomic_cmpxchg(&ldev->in_use, 0, 1))
+ return -EBUSY;
+
+ /* Always return -EIO to user if deserialization fail */
+ if (luo_session_deserialize()) {
+ atomic_set(&ldev->in_use, 0);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static int luo_release(struct inode *inodep, struct file *filep)
+{
+ struct luo_device_state *ldev = container_of(filep->private_data,
+ struct luo_device_state,
+ miscdev);
+ atomic_set(&ldev->in_use, 0);
+
+ return 0;
+}
+
+union ucmd_buffer {
+ struct liveupdate_ioctl_create_session create;
+ struct liveupdate_ioctl_retrieve_session retrieve;
+};
+
+struct luo_ioctl_op {
+ unsigned int size;
+ unsigned int min_size;
+ unsigned int ioctl_num;
+ int (*execute)(struct luo_ucmd *ucmd);
+};
+
+#define IOCTL_OP(_ioctl, _fn, _struct, _last) \
+ [_IOC_NR(_ioctl) - LIVEUPDATE_CMD_BASE] = { \
+ .size = sizeof(_struct) + \
+ BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) < \
+ sizeof(_struct)), \
+ .min_size = offsetofend(_struct, _last), \
+ .ioctl_num = _ioctl, \
+ .execute = _fn, \
+ }
+
+static const struct luo_ioctl_op luo_ioctl_ops[] = {
+ IOCTL_OP(LIVEUPDATE_IOCTL_CREATE_SESSION, luo_ioctl_create_session,
+ struct liveupdate_ioctl_create_session, name),
+ IOCTL_OP(LIVEUPDATE_IOCTL_RETRIEVE_SESSION, luo_ioctl_retrieve_session,
+ struct liveupdate_ioctl_retrieve_session, name),
+};
+
+static long luo_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
+{
+ const struct luo_ioctl_op *op;
+ struct luo_ucmd ucmd = {};
+ union ucmd_buffer buf;
+ unsigned int nr;
+ int err;
+
+ nr = _IOC_NR(cmd);
+ if (nr < LIVEUPDATE_CMD_BASE ||
+ (nr - LIVEUPDATE_CMD_BASE) >= ARRAY_SIZE(luo_ioctl_ops)) {
+ return -EINVAL;
+ }
+
+ ucmd.ubuffer = (void __user *)arg;
+ err = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer);
+ if (err)
+ return err;
+
+ op = &luo_ioctl_ops[nr - LIVEUPDATE_CMD_BASE];
+ if (op->ioctl_num != cmd)
+ return -ENOIOCTLCMD;
+ if (ucmd.user_size < op->min_size)
+ return -EINVAL;
+
+ ucmd.cmd = &buf;
+ err = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer,
+ ucmd.user_size);
+ if (err)
+ return err;
+
+ return op->execute(&ucmd);
+}
+
+static const struct file_operations luo_fops = {
+ .owner = THIS_MODULE,
+ .open = luo_open,
+ .release = luo_release,
+ .unlocked_ioctl = luo_ioctl,
+};
+
+static struct luo_device_state luo_dev = {
+ .miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "liveupdate",
+ .fops = &luo_fops,
+ },
+ .in_use = ATOMIC_INIT(0),
+};
+
+static int __init liveupdate_ioctl_init(void)
+{
+ if (!liveupdate_enabled())
+ return 0;
+
+ return misc_register(&luo_dev.miscdev);
+}
+late_initcall(liveupdate_ioctl_init);
diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c
new file mode 100644
index 000000000000..ddff87917b21
--- /dev/null
+++ b/kernel/liveupdate/luo_file.c
@@ -0,0 +1,889 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+/**
+ * DOC: LUO File Descriptors
+ *
+ * LUO provides the infrastructure to preserve specific, stateful file
+ * descriptors across a kexec-based live update. The primary goal is to allow
+ * workloads, such as virtual machines using vfio, memfd, or iommufd, to
+ * retain access to their essential resources without interruption.
+ *
+ * The framework is built around a callback-based handler model and a well-
+ * defined lifecycle for each preserved file.
+ *
+ * Handler Registration:
+ * Kernel modules responsible for a specific file type (e.g., memfd, vfio)
+ * register a &struct liveupdate_file_handler. This handler provides a set of
+ * callbacks that LUO invokes at different stages of the update process, most
+ * notably:
+ *
+ * - can_preserve(): A lightweight check to determine if the handler is
+ * compatible with a given 'struct file'.
+ * - preserve(): The heavyweight operation that saves the file's state and
+ * returns an opaque u64 handle. This is typically performed while the
+ * workload is still active to minimize the downtime during the
+ * actual reboot transition.
+ * - unpreserve(): Cleans up any resources allocated by .preserve(), called
+ * if the preservation process is aborted before the reboot (i.e. session is
+ * closed).
+ * - freeze(): A final pre-reboot opportunity to prepare the state for kexec.
+ * We are already in reboot syscall, and therefore userspace cannot mutate
+ * the file anymore.
+ * - unfreeze(): Undoes the actions of .freeze(), called if the live update
+ * is aborted after the freeze phase.
+ * - retrieve(): Reconstructs the file in the new kernel from the preserved
+ * handle.
+ * - finish(): Performs final check and cleanup in the new kernel. After
+ * succesul finish call, LUO gives up ownership to this file.
+ *
+ * File Preservation Lifecycle happy path:
+ *
+ * 1. Preserve (Normal Operation): A userspace agent preserves files one by one
+ * via an ioctl. For each file, luo_preserve_file() finds a compatible
+ * handler, calls its .preserve() operation, and creates an internal &struct
+ * luo_file to track the live state.
+ *
+ * 2. Freeze (Pre-Reboot): Just before the kexec, luo_file_freeze() is called.
+ * It iterates through all preserved files, calls their respective .freeze()
+ * operation, and serializes their final metadata (compatible string, token,
+ * and data handle) into a contiguous memory block for KHO.
+ *
+ * 3. Deserialize: After kexec, luo_file_deserialize() runs when session gets
+ * deserialized (which is when /dev/liveupdate is first opened). It reads the
+ * serialized data from the KHO memory region and reconstructs the in-memory
+ * list of &struct luo_file instances for the new kernel, linking them to
+ * their corresponding handlers.
+ *
+ * 4. Retrieve (New Kernel - Userspace Ready): The userspace agent can now
+ * restore file descriptors by providing a token. luo_retrieve_file()
+ * searches for the matching token, calls the handler's .retrieve() op to
+ * re-create the 'struct file', and returns a new FD. Files can be
+ * retrieved in ANY order.
+ *
+ * 5. Finish (New Kernel - Cleanup): Once a session retrival is complete,
+ * luo_file_finish() is called. It iterates through all files, invokes their
+ * .finish() operations for final cleanup, and releases all associated kernel
+ * resources.
+ *
+ * File Preservation Lifecycle unhappy paths:
+ *
+ * 1. Abort Before Reboot: If the userspace agent aborts the live update
+ * process before calling reboot (e.g., by closing the session file
+ * descriptor), the session's release handler calls
+ * luo_file_unpreserve_files(). This invokes the .unpreserve() callback on
+ * all preserved files, ensuring all allocated resources are cleaned up and
+ * returning the system to a clean state.
+ *
+ * 2. Freeze Failure: During the reboot() syscall, if any handler's .freeze()
+ * op fails, the .unfreeze() op is invoked on all previously *successful*
+ * freezes to roll back their state. The reboot() syscall then returns an
+ * error to userspace, canceling the live update.
+ *
+ * 3. Finish Failure: In the new kernel, if a handler's .finish() op fails,
+ * the luo_file_finish() operation is aborted. LUO retains ownership of
+ * all files within that session, including those that were not yet
+ * processed. The userspace agent can attempt to call the finish operation
+ * again later. If the issue cannot be resolved, these resources will be held
+ * by LUO until the next live update cycle, at which point they will be
+ * discarded.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cleanup.h>
+#include <linux/compiler.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/io.h>
+#include <linux/kexec_handover.h>
+#include <linux/kho/abi/luo.h>
+#include <linux/liveupdate.h>
+#include <linux/module.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include "luo_internal.h"
+
+static LIST_HEAD(luo_file_handler_list);
+
+/* 2 4K pages, give space for 128 files per file_set */
+#define LUO_FILE_PGCNT 2ul
+#define LUO_FILE_MAX \
+ ((LUO_FILE_PGCNT << PAGE_SHIFT) / sizeof(struct luo_file_ser))
+
+/**
+ * struct luo_file - Represents a single preserved file instance.
+ * @fh: Pointer to the &struct liveupdate_file_handler that manages
+ * this type of file.
+ * @file: Pointer to the kernel's &struct file that is being preserved.
+ * This is NULL in the new kernel until the file is successfully
+ * retrieved.
+ * @serialized_data: The opaque u64 handle to the serialized state of the file.
+ * This handle is passed back to the handler's .freeze(),
+ * .retrieve(), and .finish() callbacks, allowing it to track
+ * and update its serialized state across phases.
+ * @private_data: Pointer to the private data for the file used to hold runtime
+ * state that is not preserved. Set by the handler's .preserve()
+ * callback, and must be freed in the handler's .unpreserve()
+ * callback.
+ * @retrieved: A flag indicating whether a user/kernel in the new kernel has
+ * successfully called retrieve() on this file. This prevents
+ * multiple retrieval attempts.
+ * @mutex: A mutex that protects the fields of this specific instance
+ * (e.g., @retrieved, @file), ensuring that operations like
+ * retrieving or finishing a file are atomic.
+ * @list: The list_head linking this instance into its parent
+ * file_set's list of preserved files.
+ * @token: The user-provided unique token used to identify this file.
+ *
+ * This structure is the core in-kernel representation of a single file being
+ * managed through a live update. An instance is created by luo_preserve_file()
+ * to link a 'struct file' to its corresponding handler, a user-provided token,
+ * and the serialized state handle returned by the handler's .preserve()
+ * operation.
+ *
+ * These instances are tracked in a per-file_set list. The @serialized_data
+ * field, which holds a handle to the file's serialized state, may be updated
+ * during the .freeze() callback before being serialized for the next kernel.
+ * After reboot, these structures are recreated by luo_file_deserialize() and
+ * are finally cleaned up by luo_file_finish().
+ */
+struct luo_file {
+ struct liveupdate_file_handler *fh;
+ struct file *file;
+ u64 serialized_data;
+ void *private_data;
+ bool retrieved;
+ struct mutex mutex;
+ struct list_head list;
+ u64 token;
+};
+
+static int luo_alloc_files_mem(struct luo_file_set *file_set)
+{
+ size_t size;
+ void *mem;
+
+ if (file_set->files)
+ return 0;
+
+ WARN_ON_ONCE(file_set->count);
+
+ size = LUO_FILE_PGCNT << PAGE_SHIFT;
+ mem = kho_alloc_preserve(size);
+ if (IS_ERR(mem))
+ return PTR_ERR(mem);
+
+ file_set->files = mem;
+
+ return 0;
+}
+
+static void luo_free_files_mem(struct luo_file_set *file_set)
+{
+ /* If file_set has files, no need to free preservation memory */
+ if (file_set->count)
+ return;
+
+ if (!file_set->files)
+ return;
+
+ kho_unpreserve_free(file_set->files);
+ file_set->files = NULL;
+}
+
+static bool luo_token_is_used(struct luo_file_set *file_set, u64 token)
+{
+ struct luo_file *iter;
+
+ list_for_each_entry(iter, &file_set->files_list, list) {
+ if (iter->token == token)
+ return true;
+ }
+
+ return false;
+}
+
+/**
+ * luo_preserve_file - Initiate the preservation of a file descriptor.
+ * @file_set: The file_set to which the preserved file will be added.
+ * @token: A unique, user-provided identifier for the file.
+ * @fd: The file descriptor to be preserved.
+ *
+ * This function orchestrates the first phase of preserving a file. Upon entry,
+ * it takes a reference to the 'struct file' via fget(), effectively making LUO
+ * a co-owner of the file. This reference is held until the file is either
+ * unpreserved or successfully finished in the next kernel, preventing the file
+ * from being prematurely destroyed.
+ *
+ * This function orchestrates the first phase of preserving a file. It performs
+ * the following steps:
+ *
+ * 1. Validates that the @token is not already in use within the file_set.
+ * 2. Ensures the file_set's memory for files serialization is allocated
+ * (allocates if needed).
+ * 3. Iterates through registered handlers, calling can_preserve() to find one
+ * compatible with the given @fd.
+ * 4. Calls the handler's .preserve() operation, which saves the file's state
+ * and returns an opaque private data handle.
+ * 5. Adds the new instance to the file_set's internal list.
+ *
+ * On success, LUO takes a reference to the 'struct file' and considers it
+ * under its management until it is unpreserved or finished.
+ *
+ * In case of any failure, all intermediate allocations (file reference, memory
+ * for the 'luo_file' struct, etc.) are cleaned up before returning an error.
+ *
+ * Context: Can be called from an ioctl handler during normal system operation.
+ * Return: 0 on success. Returns a negative errno on failure:
+ * -EEXIST if the token is already used.
+ * -EBADF if the file descriptor is invalid.
+ * -ENOSPC if the file_set is full.
+ * -ENOENT if no compatible handler is found.
+ * -ENOMEM on memory allocation failure.
+ * Other erros might be returned by .preserve().
+ */
+int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd)
+{
+ struct liveupdate_file_op_args args = {0};
+ struct liveupdate_file_handler *fh;
+ struct luo_file *luo_file;
+ struct file *file;
+ int err;
+
+ if (luo_token_is_used(file_set, token))
+ return -EEXIST;
+
+ if (file_set->count == LUO_FILE_MAX)
+ return -ENOSPC;
+
+ file = fget(fd);
+ if (!file)
+ return -EBADF;
+
+ err = luo_alloc_files_mem(file_set);
+ if (err)
+ goto err_fput;
+
+ err = -ENOENT;
+ luo_list_for_each_private(fh, &luo_file_handler_list, list) {
+ if (fh->ops->can_preserve(fh, file)) {
+ err = 0;
+ break;
+ }
+ }
+
+ /* err is still -ENOENT if no handler was found */
+ if (err)
+ goto err_free_files_mem;
+
+ luo_file = kzalloc(sizeof(*luo_file), GFP_KERNEL);
+ if (!luo_file) {
+ err = -ENOMEM;
+ goto err_free_files_mem;
+ }
+
+ luo_file->file = file;
+ luo_file->fh = fh;
+ luo_file->token = token;
+ luo_file->retrieved = false;
+ mutex_init(&luo_file->mutex);
+
+ args.handler = fh;
+ args.file = file;
+ err = fh->ops->preserve(&args);
+ if (err)
+ goto err_kfree;
+
+ luo_file->serialized_data = args.serialized_data;
+ luo_file->private_data = args.private_data;
+ list_add_tail(&luo_file->list, &file_set->files_list);
+ file_set->count++;
+
+ return 0;
+
+err_kfree:
+ kfree(luo_file);
+err_free_files_mem:
+ luo_free_files_mem(file_set);
+err_fput:
+ fput(file);
+
+ return err;
+}
+
+/**
+ * luo_file_unpreserve_files - Unpreserves all files from a file_set.
+ * @file_set: The files to be cleaned up.
+ *
+ * This function serves as the primary cleanup path for a file_set. It is
+ * invoked when the userspace agent closes the file_set's file descriptor.
+ *
+ * For each file, it performs the following cleanup actions:
+ * 1. Calls the handler's .unpreserve() callback to allow the handler to
+ * release any resources it allocated.
+ * 2. Removes the file from the file_set's internal tracking list.
+ * 3. Releases the reference to the 'struct file' that was taken by
+ * luo_preserve_file() via fput(), returning ownership.
+ * 4. Frees the memory associated with the internal 'struct luo_file'.
+ *
+ * After all individual files are unpreserved, it frees the contiguous memory
+ * block that was allocated to hold their serialization data.
+ */
+void luo_file_unpreserve_files(struct luo_file_set *file_set)
+{
+ struct luo_file *luo_file;
+
+ while (!list_empty(&file_set->files_list)) {
+ struct liveupdate_file_op_args args = {0};
+
+ luo_file = list_last_entry(&file_set->files_list,
+ struct luo_file, list);
+
+ args.handler = luo_file->fh;
+ args.file = luo_file->file;
+ args.serialized_data = luo_file->serialized_data;
+ args.private_data = luo_file->private_data;
+ luo_file->fh->ops->unpreserve(&args);
+
+ list_del(&luo_file->list);
+ file_set->count--;
+
+ fput(luo_file->file);
+ mutex_destroy(&luo_file->mutex);
+ kfree(luo_file);
+ }
+
+ luo_free_files_mem(file_set);
+}
+
+static int luo_file_freeze_one(struct luo_file_set *file_set,
+ struct luo_file *luo_file)
+{
+ int err = 0;
+
+ guard(mutex)(&luo_file->mutex);
+
+ if (luo_file->fh->ops->freeze) {
+ struct liveupdate_file_op_args args = {0};
+
+ args.handler = luo_file->fh;
+ args.file = luo_file->file;
+ args.serialized_data = luo_file->serialized_data;
+ args.private_data = luo_file->private_data;
+
+ err = luo_file->fh->ops->freeze(&args);
+ if (!err)
+ luo_file->serialized_data = args.serialized_data;
+ }
+
+ return err;
+}
+
+static void luo_file_unfreeze_one(struct luo_file_set *file_set,
+ struct luo_file *luo_file)
+{
+ guard(mutex)(&luo_file->mutex);
+
+ if (luo_file->fh->ops->unfreeze) {
+ struct liveupdate_file_op_args args = {0};
+
+ args.handler = luo_file->fh;
+ args.file = luo_file->file;
+ args.serialized_data = luo_file->serialized_data;
+ args.private_data = luo_file->private_data;
+
+ luo_file->fh->ops->unfreeze(&args);
+ }
+
+ luo_file->serialized_data = 0;
+}
+
+static void __luo_file_unfreeze(struct luo_file_set *file_set,
+ struct luo_file *failed_entry)
+{
+ struct list_head *files_list = &file_set->files_list;
+ struct luo_file *luo_file;
+
+ list_for_each_entry(luo_file, files_list, list) {
+ if (luo_file == failed_entry)
+ break;
+
+ luo_file_unfreeze_one(file_set, luo_file);
+ }
+
+ memset(file_set->files, 0, LUO_FILE_PGCNT << PAGE_SHIFT);
+}
+
+/**
+ * luo_file_freeze - Freezes all preserved files and serializes their metadata.
+ * @file_set: The file_set whose files are to be frozen.
+ * @file_set_ser: Where to put the serialized file_set.
+ *
+ * This function is called from the reboot() syscall path, just before the
+ * kernel transitions to the new image via kexec. Its purpose is to perform the
+ * final preparation and serialization of all preserved files in the file_set.
+ *
+ * It iterates through each preserved file in FIFO order (the order of
+ * preservation) and performs two main actions:
+ *
+ * 1. Freezes the File: It calls the handler's .freeze() callback for each
+ * file. This gives the handler a final opportunity to quiesce the device or
+ * prepare its state for the upcoming reboot. The handler may update its
+ * private data handle during this step.
+ *
+ * 2. Serializes Metadata: After a successful freeze, it copies the final file
+ * metadata—the handler's compatible string, the user token, and the final
+ * private data handle—into the pre-allocated contiguous memory buffer
+ * (file_set->files) that will be handed over to the next kernel via KHO.
+ *
+ * Error Handling (Rollback):
+ * This function is atomic. If any handler's .freeze() operation fails, the
+ * entire live update is aborted. The __luo_file_unfreeze() helper is
+ * immediately called to invoke the .unfreeze() op on all files that were
+ * successfully frozen before the point of failure, rolling them back to a
+ * running state. The function then returns an error, causing the reboot()
+ * syscall to fail.
+ *
+ * Context: Called only from the liveupdate_reboot() path.
+ * Return: 0 on success, or a negative errno on failure.
+ */
+int luo_file_freeze(struct luo_file_set *file_set,
+ struct luo_file_set_ser *file_set_ser)
+{
+ struct luo_file_ser *file_ser = file_set->files;
+ struct luo_file *luo_file;
+ int err;
+ int i;
+
+ if (!file_set->count)
+ return 0;
+
+ if (WARN_ON(!file_ser))
+ return -EINVAL;
+
+ i = 0;
+ list_for_each_entry(luo_file, &file_set->files_list, list) {
+ err = luo_file_freeze_one(file_set, luo_file);
+ if (err < 0) {
+ pr_warn("Freeze failed for token[%#0llx] handler[%s] err[%pe]\n",
+ luo_file->token, luo_file->fh->compatible,
+ ERR_PTR(err));
+ goto err_unfreeze;
+ }
+
+ strscpy(file_ser[i].compatible, luo_file->fh->compatible,
+ sizeof(file_ser[i].compatible));
+ file_ser[i].data = luo_file->serialized_data;
+ file_ser[i].token = luo_file->token;
+ i++;
+ }
+
+ file_set_ser->count = file_set->count;
+ if (file_set->files)
+ file_set_ser->files = virt_to_phys(file_set->files);
+
+ return 0;
+
+err_unfreeze:
+ __luo_file_unfreeze(file_set, luo_file);
+
+ return err;
+}
+
+/**
+ * luo_file_unfreeze - Unfreezes all files in a file_set and clear serialization
+ * @file_set: The file_set whose files are to be unfrozen.
+ * @file_set_ser: Serialized file_set.
+ *
+ * This function rolls back the state of all files in a file_set after the
+ * freeze phase has begun but must be aborted. It is the counterpart to
+ * luo_file_freeze().
+ *
+ * It invokes the __luo_file_unfreeze() helper with a NULL argument, which
+ * signals the helper to iterate through all files in the file_set and call
+ * their respective .unfreeze() handler callbacks.
+ *
+ * Context: This is called when the live update is aborted during
+ * the reboot() syscall, after luo_file_freeze() has been called.
+ */
+void luo_file_unfreeze(struct luo_file_set *file_set,
+ struct luo_file_set_ser *file_set_ser)
+{
+ if (!file_set->count)
+ return;
+
+ __luo_file_unfreeze(file_set, NULL);
+ memset(file_set_ser, 0, sizeof(*file_set_ser));
+}
+
+/**
+ * luo_retrieve_file - Restores a preserved file from a file_set by its token.
+ * @file_set: The file_set from which to retrieve the file.
+ * @token: The unique token identifying the file to be restored.
+ * @filep: Output parameter; on success, this is populated with a pointer
+ * to the newly retrieved 'struct file'.
+ *
+ * This function is the primary mechanism for recreating a file in the new
+ * kernel after a live update. It searches the file_set's list of deserialized
+ * files for an entry matching the provided @token.
+ *
+ * The operation is idempotent: if a file has already been successfully
+ * retrieved, this function will simply return a pointer to the existing
+ * 'struct file' and report success without re-executing the retrieve
+ * operation. This is handled by checking the 'retrieved' flag under a lock.
+ *
+ * File retrieval can happen in any order; it is not bound by the order of
+ * preservation.
+ *
+ * Context: Can be called from an ioctl or other in-kernel code in the new
+ * kernel.
+ * Return: 0 on success. Returns a negative errno on failure:
+ * -ENOENT if no file with the matching token is found.
+ * Any error code returned by the handler's .retrieve() op.
+ */
+int luo_retrieve_file(struct luo_file_set *file_set, u64 token,
+ struct file **filep)
+{
+ struct liveupdate_file_op_args args = {0};
+ struct luo_file *luo_file;
+ int err;
+
+ if (list_empty(&file_set->files_list))
+ return -ENOENT;
+
+ list_for_each_entry(luo_file, &file_set->files_list, list) {
+ if (luo_file->token == token)
+ break;
+ }
+
+ if (luo_file->token != token)
+ return -ENOENT;
+
+ guard(mutex)(&luo_file->mutex);
+ if (luo_file->retrieved) {
+ /*
+ * Someone is asking for this file again, so get a reference
+ * for them.
+ */
+ get_file(luo_file->file);
+ *filep = luo_file->file;
+ return 0;
+ }
+
+ args.handler = luo_file->fh;
+ args.serialized_data = luo_file->serialized_data;
+ err = luo_file->fh->ops->retrieve(&args);
+ if (!err) {
+ luo_file->file = args.file;
+
+ /* Get reference so we can keep this file in LUO until finish */
+ get_file(luo_file->file);
+ *filep = luo_file->file;
+ luo_file->retrieved = true;
+ }
+
+ return err;
+}
+
+static int luo_file_can_finish_one(struct luo_file_set *file_set,
+ struct luo_file *luo_file)
+{
+ bool can_finish = true;
+
+ guard(mutex)(&luo_file->mutex);
+
+ if (luo_file->fh->ops->can_finish) {
+ struct liveupdate_file_op_args args = {0};
+
+ args.handler = luo_file->fh;
+ args.file = luo_file->file;
+ args.serialized_data = luo_file->serialized_data;
+ args.retrieved = luo_file->retrieved;
+ can_finish = luo_file->fh->ops->can_finish(&args);
+ }
+
+ return can_finish ? 0 : -EBUSY;
+}
+
+static void luo_file_finish_one(struct luo_file_set *file_set,
+ struct luo_file *luo_file)
+{
+ struct liveupdate_file_op_args args = {0};
+
+ guard(mutex)(&luo_file->mutex);
+
+ args.handler = luo_file->fh;
+ args.file = luo_file->file;
+ args.serialized_data = luo_file->serialized_data;
+ args.retrieved = luo_file->retrieved;
+
+ luo_file->fh->ops->finish(&args);
+}
+
+/**
+ * luo_file_finish - Completes the lifecycle for all files in a file_set.
+ * @file_set: The file_set to be finalized.
+ *
+ * This function orchestrates the final teardown of a live update file_set in
+ * the new kernel. It should be called after all necessary files have been
+ * retrieved and the userspace agent is ready to release the preserved state.
+ *
+ * The function iterates through all tracked files. For each file, it performs
+ * the following sequence of cleanup actions:
+ *
+ * 1. If file is not yet retrieved, retrieves it, and calls can_finish() on
+ * every file in the file_set. If all can_finish return true, continue to
+ * finish.
+ * 2. Calls the handler's .finish() callback (via luo_file_finish_one) to
+ * allow for final resource cleanup within the handler.
+ * 3. Releases LUO's ownership reference on the 'struct file' via fput(). This
+ * is the counterpart to the get_file() call in luo_retrieve_file().
+ * 4. Removes the 'struct luo_file' from the file_set's internal list.
+ * 5. Frees the memory for the 'struct luo_file' instance itself.
+ *
+ * After successfully finishing all individual files, it frees the
+ * contiguous memory block that was used to transfer the serialized metadata
+ * from the previous kernel.
+ *
+ * Error Handling (Atomic Failure):
+ * This operation is atomic. If any handler's .can_finish() op fails, the entire
+ * function aborts immediately and returns an error.
+ *
+ * Context: Can be called from an ioctl handler in the new kernel.
+ * Return: 0 on success, or a negative errno on failure.
+ */
+int luo_file_finish(struct luo_file_set *file_set)
+{
+ struct list_head *files_list = &file_set->files_list;
+ struct luo_file *luo_file;
+ int err;
+
+ if (!file_set->count)
+ return 0;
+
+ list_for_each_entry(luo_file, files_list, list) {
+ err = luo_file_can_finish_one(file_set, luo_file);
+ if (err)
+ return err;
+ }
+
+ while (!list_empty(&file_set->files_list)) {
+ luo_file = list_last_entry(&file_set->files_list,
+ struct luo_file, list);
+
+ luo_file_finish_one(file_set, luo_file);
+
+ if (luo_file->file)
+ fput(luo_file->file);
+ list_del(&luo_file->list);
+ file_set->count--;
+ mutex_destroy(&luo_file->mutex);
+ kfree(luo_file);
+ }
+
+ if (file_set->files) {
+ kho_restore_free(file_set->files);
+ file_set->files = NULL;
+ }
+
+ return 0;
+}
+
+/**
+ * luo_file_deserialize - Reconstructs the list of preserved files in the new kernel.
+ * @file_set: The incoming file_set to fill with deserialized data.
+ * @file_set_ser: Serialized KHO file_set data from the previous kernel.
+ *
+ * This function is called during the early boot process of the new kernel. It
+ * takes the raw, contiguous memory block of 'struct luo_file_ser' entries,
+ * provided by the previous kernel, and transforms it back into a live,
+ * in-memory linked list of 'struct luo_file' instances.
+ *
+ * For each serialized entry, it performs the following steps:
+ * 1. Reads the 'compatible' string.
+ * 2. Searches the global list of registered file handlers for one that
+ * matches the compatible string.
+ * 3. Allocates a new 'struct luo_file'.
+ * 4. Populates the new structure with the deserialized data (token, private
+ * data handle) and links it to the found handler. The 'file' pointer is
+ * initialized to NULL, as the file has not been retrieved yet.
+ * 5. Adds the new 'struct luo_file' to the file_set's files_list.
+ *
+ * This prepares the file_set for userspace, which can later call
+ * luo_retrieve_file() to restore the actual file descriptors.
+ *
+ * Context: Called from session deserialization.
+ */
+int luo_file_deserialize(struct luo_file_set *file_set,
+ struct luo_file_set_ser *file_set_ser)
+{
+ struct luo_file_ser *file_ser;
+ u64 i;
+
+ if (!file_set_ser->files) {
+ WARN_ON(file_set_ser->count);
+ return 0;
+ }
+
+ file_set->count = file_set_ser->count;
+ file_set->files = phys_to_virt(file_set_ser->files);
+
+ /*
+ * Note on error handling:
+ *
+ * If deserialization fails (e.g., allocation failure or corrupt data),
+ * we intentionally skip cleanup of files that were already restored.
+ *
+ * A partial failure leaves the preserved state inconsistent.
+ * Implementing a safe "undo" to unwind complex dependencies (sessions,
+ * files, hardware state) is error-prone and provides little value, as
+ * the system is effectively in a broken state.
+ *
+ * We treat these resources as leaked. The expected recovery path is for
+ * userspace to detect the failure and trigger a reboot, which will
+ * reliably reset devices and reclaim memory.
+ */
+ file_ser = file_set->files;
+ for (i = 0; i < file_set->count; i++) {
+ struct liveupdate_file_handler *fh;
+ bool handler_found = false;
+ struct luo_file *luo_file;
+
+ luo_list_for_each_private(fh, &luo_file_handler_list, list) {
+ if (!strcmp(fh->compatible, file_ser[i].compatible)) {
+ handler_found = true;
+ break;
+ }
+ }
+
+ if (!handler_found) {
+ pr_warn("No registered handler for compatible '%s'\n",
+ file_ser[i].compatible);
+ return -ENOENT;
+ }
+
+ luo_file = kzalloc(sizeof(*luo_file), GFP_KERNEL);
+ if (!luo_file)
+ return -ENOMEM;
+
+ luo_file->fh = fh;
+ luo_file->file = NULL;
+ luo_file->serialized_data = file_ser[i].data;
+ luo_file->token = file_ser[i].token;
+ luo_file->retrieved = false;
+ mutex_init(&luo_file->mutex);
+ list_add_tail(&luo_file->list, &file_set->files_list);
+ }
+
+ return 0;
+}
+
+void luo_file_set_init(struct luo_file_set *file_set)
+{
+ INIT_LIST_HEAD(&file_set->files_list);
+}
+
+void luo_file_set_destroy(struct luo_file_set *file_set)
+{
+ WARN_ON(file_set->count);
+ WARN_ON(!list_empty(&file_set->files_list));
+}
+
+/**
+ * liveupdate_register_file_handler - Register a file handler with LUO.
+ * @fh: Pointer to a caller-allocated &struct liveupdate_file_handler.
+ * The caller must initialize this structure, including a unique
+ * 'compatible' string and a valid 'fh' callbacks. This function adds the
+ * handler to the global list of supported file handlers.
+ *
+ * Context: Typically called during module initialization for file types that
+ * support live update preservation.
+ *
+ * Return: 0 on success. Negative errno on failure.
+ */
+int liveupdate_register_file_handler(struct liveupdate_file_handler *fh)
+{
+ struct liveupdate_file_handler *fh_iter;
+ int err;
+
+ if (!liveupdate_enabled())
+ return -EOPNOTSUPP;
+
+ /* Sanity check that all required callbacks are set */
+ if (!fh->ops->preserve || !fh->ops->unpreserve || !fh->ops->retrieve ||
+ !fh->ops->finish || !fh->ops->can_preserve) {
+ return -EINVAL;
+ }
+
+ /*
+ * Ensure the system is quiescent (no active sessions).
+ * This prevents registering new handlers while sessions are active or
+ * while deserialization is in progress.
+ */
+ if (!luo_session_quiesce())
+ return -EBUSY;
+
+ /* Check for duplicate compatible strings */
+ luo_list_for_each_private(fh_iter, &luo_file_handler_list, list) {
+ if (!strcmp(fh_iter->compatible, fh->compatible)) {
+ pr_err("File handler registration failed: Compatible string '%s' already registered.\n",
+ fh->compatible);
+ err = -EEXIST;
+ goto err_resume;
+ }
+ }
+
+ /* Pin the module implementing the handler */
+ if (!try_module_get(fh->ops->owner)) {
+ err = -EAGAIN;
+ goto err_resume;
+ }
+
+ INIT_LIST_HEAD(&ACCESS_PRIVATE(fh, list));
+ list_add_tail(&ACCESS_PRIVATE(fh, list), &luo_file_handler_list);
+ luo_session_resume();
+
+ return 0;
+
+err_resume:
+ luo_session_resume();
+ return err;
+}
+
+/**
+ * liveupdate_unregister_file_handler - Unregister a liveupdate file handler
+ * @fh: The file handler to unregister
+ *
+ * Unregisters the file handler from the liveupdate core. This function
+ * reverses the operations of liveupdate_register_file_handler().
+ *
+ * It ensures safe removal by checking that:
+ * No live update session is currently in progress.
+ *
+ * If the unregistration fails, the internal test state is reverted.
+ *
+ * Return: 0 Success. -EOPNOTSUPP when live update is not enabled. -EBUSY A live
+ * update is in progress, can't quiesce live update.
+ */
+int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh)
+{
+ if (!liveupdate_enabled())
+ return -EOPNOTSUPP;
+
+ if (!luo_session_quiesce())
+ return -EBUSY;
+
+ list_del(&ACCESS_PRIVATE(fh, list));
+ module_put(fh->ops->owner);
+ luo_session_resume();
+
+ return 0;
+}
diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h
new file mode 100644
index 000000000000..c8973b543d1d
--- /dev/null
+++ b/kernel/liveupdate/luo_internal.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+#ifndef _LINUX_LUO_INTERNAL_H
+#define _LINUX_LUO_INTERNAL_H
+
+#include <linux/liveupdate.h>
+#include <linux/uaccess.h>
+
+struct luo_ucmd {
+ void __user *ubuffer;
+ u32 user_size;
+ void *cmd;
+};
+
+static inline int luo_ucmd_respond(struct luo_ucmd *ucmd,
+ size_t kernel_cmd_size)
+{
+ /*
+ * Copy the minimum of what the user provided and what we actually
+ * have.
+ */
+ if (copy_to_user(ucmd->ubuffer, ucmd->cmd,
+ min_t(size_t, ucmd->user_size, kernel_cmd_size))) {
+ return -EFAULT;
+ }
+ return 0;
+}
+
+/*
+ * Handles a deserialization failure: devices and memory is in unpredictable
+ * state.
+ *
+ * Continuing the boot process after a failure is dangerous because it could
+ * lead to leaks of private data.
+ */
+#define luo_restore_fail(__fmt, ...) panic(__fmt, ##__VA_ARGS__)
+
+/* Mimics list_for_each_entry() but for private list head entries */
+#define luo_list_for_each_private(pos, head, member) \
+ for (struct list_head *__iter = (head)->next; \
+ __iter != (head) && \
+ ({ pos = container_of(__iter, typeof(*(pos)), member); 1; }); \
+ __iter = __iter->next)
+
+/**
+ * struct luo_file_set - A set of files that belong to the same sessions.
+ * @files_list: An ordered list of files associated with this session, it is
+ * ordered by preservation time.
+ * @files: The physically contiguous memory block that holds the serialized
+ * state of files.
+ * @count: A counter tracking the number of files currently stored in the
+ * @files_list for this session.
+ */
+struct luo_file_set {
+ struct list_head files_list;
+ struct luo_file_ser *files;
+ long count;
+};
+
+/**
+ * struct luo_session - Represents an active or incoming Live Update session.
+ * @name: A unique name for this session, used for identification and
+ * retrieval.
+ * @ser: Pointer to the serialized data for this session.
+ * @list: A list_head member used to link this session into a global list
+ * of either outgoing (to be preserved) or incoming (restored from
+ * previous kernel) sessions.
+ * @retrieved: A boolean flag indicating whether this session has been
+ * retrieved by a consumer in the new kernel.
+ * @file_set: A set of files that belong to this session.
+ * @mutex: protects fields in the luo_session.
+ */
+struct luo_session {
+ char name[LIVEUPDATE_SESSION_NAME_LENGTH];
+ struct luo_session_ser *ser;
+ struct list_head list;
+ bool retrieved;
+ struct luo_file_set file_set;
+ struct mutex mutex;
+};
+
+int luo_session_create(const char *name, struct file **filep);
+int luo_session_retrieve(const char *name, struct file **filep);
+int __init luo_session_setup_outgoing(void *fdt);
+int __init luo_session_setup_incoming(void *fdt);
+int luo_session_serialize(void);
+int luo_session_deserialize(void);
+bool luo_session_quiesce(void);
+void luo_session_resume(void);
+
+int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd);
+void luo_file_unpreserve_files(struct luo_file_set *file_set);
+int luo_file_freeze(struct luo_file_set *file_set,
+ struct luo_file_set_ser *file_set_ser);
+void luo_file_unfreeze(struct luo_file_set *file_set,
+ struct luo_file_set_ser *file_set_ser);
+int luo_retrieve_file(struct luo_file_set *file_set, u64 token,
+ struct file **filep);
+int luo_file_finish(struct luo_file_set *file_set);
+int luo_file_deserialize(struct luo_file_set *file_set,
+ struct luo_file_set_ser *file_set_ser);
+void luo_file_set_init(struct luo_file_set *file_set);
+void luo_file_set_destroy(struct luo_file_set *file_set);
+
+#endif /* _LINUX_LUO_INTERNAL_H */
diff --git a/kernel/liveupdate/luo_session.c b/kernel/liveupdate/luo_session.c
new file mode 100644
index 000000000000..dbdbc3bd7929
--- /dev/null
+++ b/kernel/liveupdate/luo_session.c
@@ -0,0 +1,646 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+/**
+ * DOC: LUO Sessions
+ *
+ * LUO Sessions provide the core mechanism for grouping and managing `struct
+ * file *` instances that need to be preserved across a kexec-based live
+ * update. Each session acts as a named container for a set of file objects,
+ * allowing a userspace agent to manage the lifecycle of resources critical to a
+ * workload.
+ *
+ * Core Concepts:
+ *
+ * - Named Containers: Sessions are identified by a unique, user-provided name,
+ * which is used for both creation in the current kernel and retrieval in the
+ * next kernel.
+ *
+ * - Userspace Interface: Session management is driven from userspace via
+ * ioctls on /dev/liveupdate.
+ *
+ * - Serialization: Session metadata is preserved using the KHO framework. When
+ * a live update is triggered via kexec, an array of `struct luo_session_ser`
+ * is populated and placed in a preserved memory region. An FDT node is also
+ * created, containing the count of sessions and the physical address of this
+ * array.
+ *
+ * Session Lifecycle:
+ *
+ * 1. Creation: A userspace agent calls `luo_session_create()` to create a
+ * new, empty session and receives a file descriptor for it.
+ *
+ * 2. Serialization: When the `reboot(LINUX_REBOOT_CMD_KEXEC)` syscall is
+ * made, `luo_session_serialize()` is called. It iterates through all
+ * active sessions and writes their metadata into a memory area preserved
+ * by KHO.
+ *
+ * 3. Deserialization (in new kernel): After kexec, `luo_session_deserialize()`
+ * runs, reading the serialized data and creating a list of `struct
+ * luo_session` objects representing the preserved sessions.
+ *
+ * 4. Retrieval: A userspace agent in the new kernel can then call
+ * `luo_session_retrieve()` with a session name to get a new file
+ * descriptor and access the preserved state.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/anon_inodes.h>
+#include <linux/cleanup.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/io.h>
+#include <linux/kexec_handover.h>
+#include <linux/kho/abi/luo.h>
+#include <linux/libfdt.h>
+#include <linux/list.h>
+#include <linux/liveupdate.h>
+#include <linux/mutex.h>
+#include <linux/rwsem.h>
+#include <linux/slab.h>
+#include <linux/unaligned.h>
+#include <uapi/linux/liveupdate.h>
+#include "luo_internal.h"
+
+/* 16 4K pages, give space for 744 sessions */
+#define LUO_SESSION_PGCNT 16ul
+#define LUO_SESSION_MAX (((LUO_SESSION_PGCNT << PAGE_SHIFT) - \
+ sizeof(struct luo_session_header_ser)) / \
+ sizeof(struct luo_session_ser))
+
+/**
+ * struct luo_session_header - Header struct for managing LUO sessions.
+ * @count: The number of sessions currently tracked in the @list.
+ * @list: The head of the linked list of `struct luo_session` instances.
+ * @rwsem: A read-write semaphore providing synchronized access to the
+ * session list and other fields in this structure.
+ * @header_ser: The header data of serialization array.
+ * @ser: The serialized session data (an array of
+ * `struct luo_session_ser`).
+ * @active: Set to true when first initialized. If previous kernel did not
+ * send session data, active stays false for incoming.
+ */
+struct luo_session_header {
+ long count;
+ struct list_head list;
+ struct rw_semaphore rwsem;
+ struct luo_session_header_ser *header_ser;
+ struct luo_session_ser *ser;
+ bool active;
+};
+
+/**
+ * struct luo_session_global - Global container for managing LUO sessions.
+ * @incoming: The sessions passed from the previous kernel.
+ * @outgoing: The sessions that are going to be passed to the next kernel.
+ */
+struct luo_session_global {
+ struct luo_session_header incoming;
+ struct luo_session_header outgoing;
+};
+
+static struct luo_session_global luo_session_global = {
+ .incoming = {
+ .list = LIST_HEAD_INIT(luo_session_global.incoming.list),
+ .rwsem = __RWSEM_INITIALIZER(luo_session_global.incoming.rwsem),
+ },
+ .outgoing = {
+ .list = LIST_HEAD_INIT(luo_session_global.outgoing.list),
+ .rwsem = __RWSEM_INITIALIZER(luo_session_global.outgoing.rwsem),
+ },
+};
+
+static struct luo_session *luo_session_alloc(const char *name)
+{
+ struct luo_session *session = kzalloc(sizeof(*session), GFP_KERNEL);
+
+ if (!session)
+ return ERR_PTR(-ENOMEM);
+
+ strscpy(session->name, name, sizeof(session->name));
+ INIT_LIST_HEAD(&session->file_set.files_list);
+ luo_file_set_init(&session->file_set);
+ INIT_LIST_HEAD(&session->list);
+ mutex_init(&session->mutex);
+
+ return session;
+}
+
+static void luo_session_free(struct luo_session *session)
+{
+ luo_file_set_destroy(&session->file_set);
+ mutex_destroy(&session->mutex);
+ kfree(session);
+}
+
+static int luo_session_insert(struct luo_session_header *sh,
+ struct luo_session *session)
+{
+ struct luo_session *it;
+
+ guard(rwsem_write)(&sh->rwsem);
+
+ /*
+ * For outgoing we should make sure there is room in serialization array
+ * for new session.
+ */
+ if (sh == &luo_session_global.outgoing) {
+ if (sh->count == LUO_SESSION_MAX)
+ return -ENOMEM;
+ }
+
+ /*
+ * For small number of sessions this loop won't hurt performance
+ * but if we ever start using a lot of sessions, this might
+ * become a bottle neck during deserialization time, as it would
+ * cause O(n*n) complexity.
+ */
+ list_for_each_entry(it, &sh->list, list) {
+ if (!strncmp(it->name, session->name, sizeof(it->name)))
+ return -EEXIST;
+ }
+ list_add_tail(&session->list, &sh->list);
+ sh->count++;
+
+ return 0;
+}
+
+static void luo_session_remove(struct luo_session_header *sh,
+ struct luo_session *session)
+{
+ guard(rwsem_write)(&sh->rwsem);
+ list_del(&session->list);
+ sh->count--;
+}
+
+static int luo_session_finish_one(struct luo_session *session)
+{
+ guard(mutex)(&session->mutex);
+ return luo_file_finish(&session->file_set);
+}
+
+static void luo_session_unfreeze_one(struct luo_session *session,
+ struct luo_session_ser *ser)
+{
+ guard(mutex)(&session->mutex);
+ luo_file_unfreeze(&session->file_set, &ser->file_set_ser);
+}
+
+static int luo_session_freeze_one(struct luo_session *session,
+ struct luo_session_ser *ser)
+{
+ guard(mutex)(&session->mutex);
+ return luo_file_freeze(&session->file_set, &ser->file_set_ser);
+}
+
+static int luo_session_release(struct inode *inodep, struct file *filep)
+{
+ struct luo_session *session = filep->private_data;
+ struct luo_session_header *sh;
+
+ /* If retrieved is set, it means this session is from incoming list */
+ if (session->retrieved) {
+ int err = luo_session_finish_one(session);
+
+ if (err) {
+ pr_warn("Unable to finish session [%s] on release\n",
+ session->name);
+ return err;
+ }
+ sh = &luo_session_global.incoming;
+ } else {
+ scoped_guard(mutex, &session->mutex)
+ luo_file_unpreserve_files(&session->file_set);
+ sh = &luo_session_global.outgoing;
+ }
+
+ luo_session_remove(sh, session);
+ luo_session_free(session);
+
+ return 0;
+}
+
+static int luo_session_preserve_fd(struct luo_session *session,
+ struct luo_ucmd *ucmd)
+{
+ struct liveupdate_session_preserve_fd *argp = ucmd->cmd;
+ int err;
+
+ guard(mutex)(&session->mutex);
+ err = luo_preserve_file(&session->file_set, argp->token, argp->fd);
+ if (err)
+ return err;
+
+ err = luo_ucmd_respond(ucmd, sizeof(*argp));
+ if (err)
+ pr_warn("The file was successfully preserved, but response to user failed\n");
+
+ return err;
+}
+
+static int luo_session_retrieve_fd(struct luo_session *session,
+ struct luo_ucmd *ucmd)
+{
+ struct liveupdate_session_retrieve_fd *argp = ucmd->cmd;
+ struct file *file;
+ int err;
+
+ argp->fd = get_unused_fd_flags(O_CLOEXEC);
+ if (argp->fd < 0)
+ return argp->fd;
+
+ guard(mutex)(&session->mutex);
+ err = luo_retrieve_file(&session->file_set, argp->token, &file);
+ if (err < 0)
+ goto err_put_fd;
+
+ err = luo_ucmd_respond(ucmd, sizeof(*argp));
+ if (err)
+ goto err_put_file;
+
+ fd_install(argp->fd, file);
+
+ return 0;
+
+err_put_file:
+ fput(file);
+err_put_fd:
+ put_unused_fd(argp->fd);
+
+ return err;
+}
+
+static int luo_session_finish(struct luo_session *session,
+ struct luo_ucmd *ucmd)
+{
+ struct liveupdate_session_finish *argp = ucmd->cmd;
+ int err = luo_session_finish_one(session);
+
+ if (err)
+ return err;
+
+ return luo_ucmd_respond(ucmd, sizeof(*argp));
+}
+
+union ucmd_buffer {
+ struct liveupdate_session_finish finish;
+ struct liveupdate_session_preserve_fd preserve;
+ struct liveupdate_session_retrieve_fd retrieve;
+};
+
+struct luo_ioctl_op {
+ unsigned int size;
+ unsigned int min_size;
+ unsigned int ioctl_num;
+ int (*execute)(struct luo_session *session, struct luo_ucmd *ucmd);
+};
+
+#define IOCTL_OP(_ioctl, _fn, _struct, _last) \
+ [_IOC_NR(_ioctl) - LIVEUPDATE_CMD_SESSION_BASE] = { \
+ .size = sizeof(_struct) + \
+ BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) < \
+ sizeof(_struct)), \
+ .min_size = offsetofend(_struct, _last), \
+ .ioctl_num = _ioctl, \
+ .execute = _fn, \
+ }
+
+static const struct luo_ioctl_op luo_session_ioctl_ops[] = {
+ IOCTL_OP(LIVEUPDATE_SESSION_FINISH, luo_session_finish,
+ struct liveupdate_session_finish, reserved),
+ IOCTL_OP(LIVEUPDATE_SESSION_PRESERVE_FD, luo_session_preserve_fd,
+ struct liveupdate_session_preserve_fd, token),
+ IOCTL_OP(LIVEUPDATE_SESSION_RETRIEVE_FD, luo_session_retrieve_fd,
+ struct liveupdate_session_retrieve_fd, token),
+};
+
+static long luo_session_ioctl(struct file *filep, unsigned int cmd,
+ unsigned long arg)
+{
+ struct luo_session *session = filep->private_data;
+ const struct luo_ioctl_op *op;
+ struct luo_ucmd ucmd = {};
+ union ucmd_buffer buf;
+ unsigned int nr;
+ int ret;
+
+ nr = _IOC_NR(cmd);
+ if (nr < LIVEUPDATE_CMD_SESSION_BASE || (nr - LIVEUPDATE_CMD_SESSION_BASE) >=
+ ARRAY_SIZE(luo_session_ioctl_ops)) {
+ return -EINVAL;
+ }
+
+ ucmd.ubuffer = (void __user *)arg;
+ ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer);
+ if (ret)
+ return ret;
+
+ op = &luo_session_ioctl_ops[nr - LIVEUPDATE_CMD_SESSION_BASE];
+ if (op->ioctl_num != cmd)
+ return -ENOIOCTLCMD;
+ if (ucmd.user_size < op->min_size)
+ return -EINVAL;
+
+ ucmd.cmd = &buf;
+ ret = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer,
+ ucmd.user_size);
+ if (ret)
+ return ret;
+
+ return op->execute(session, &ucmd);
+}
+
+static const struct file_operations luo_session_fops = {
+ .owner = THIS_MODULE,
+ .release = luo_session_release,
+ .unlocked_ioctl = luo_session_ioctl,
+};
+
+/* Create a "struct file" for session */
+static int luo_session_getfile(struct luo_session *session, struct file **filep)
+{
+ char name_buf[128];
+ struct file *file;
+
+ lockdep_assert_held(&session->mutex);
+ snprintf(name_buf, sizeof(name_buf), "[luo_session] %s", session->name);
+ file = anon_inode_getfile(name_buf, &luo_session_fops, session, O_RDWR);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ *filep = file;
+
+ return 0;
+}
+
+int luo_session_create(const char *name, struct file **filep)
+{
+ struct luo_session *session;
+ int err;
+
+ session = luo_session_alloc(name);
+ if (IS_ERR(session))
+ return PTR_ERR(session);
+
+ err = luo_session_insert(&luo_session_global.outgoing, session);
+ if (err)
+ goto err_free;
+
+ scoped_guard(mutex, &session->mutex)
+ err = luo_session_getfile(session, filep);
+ if (err)
+ goto err_remove;
+
+ return 0;
+
+err_remove:
+ luo_session_remove(&luo_session_global.outgoing, session);
+err_free:
+ luo_session_free(session);
+
+ return err;
+}
+
+int luo_session_retrieve(const char *name, struct file **filep)
+{
+ struct luo_session_header *sh = &luo_session_global.incoming;
+ struct luo_session *session = NULL;
+ struct luo_session *it;
+ int err;
+
+ scoped_guard(rwsem_read, &sh->rwsem) {
+ list_for_each_entry(it, &sh->list, list) {
+ if (!strncmp(it->name, name, sizeof(it->name))) {
+ session = it;
+ break;
+ }
+ }
+ }
+
+ if (!session)
+ return -ENOENT;
+
+ guard(mutex)(&session->mutex);
+ if (session->retrieved)
+ return -EINVAL;
+
+ err = luo_session_getfile(session, filep);
+ if (!err)
+ session->retrieved = true;
+
+ return err;
+}
+
+int __init luo_session_setup_outgoing(void *fdt_out)
+{
+ struct luo_session_header_ser *header_ser;
+ u64 header_ser_pa;
+ int err;
+
+ header_ser = kho_alloc_preserve(LUO_SESSION_PGCNT << PAGE_SHIFT);
+ if (IS_ERR(header_ser))
+ return PTR_ERR(header_ser);
+ header_ser_pa = virt_to_phys(header_ser);
+
+ err = fdt_begin_node(fdt_out, LUO_FDT_SESSION_NODE_NAME);
+ err |= fdt_property_string(fdt_out, "compatible",
+ LUO_FDT_SESSION_COMPATIBLE);
+ err |= fdt_property(fdt_out, LUO_FDT_SESSION_HEADER, &header_ser_pa,
+ sizeof(header_ser_pa));
+ err |= fdt_end_node(fdt_out);
+
+ if (err)
+ goto err_unpreserve;
+
+ luo_session_global.outgoing.header_ser = header_ser;
+ luo_session_global.outgoing.ser = (void *)(header_ser + 1);
+ luo_session_global.outgoing.active = true;
+
+ return 0;
+
+err_unpreserve:
+ kho_unpreserve_free(header_ser);
+ return err;
+}
+
+int __init luo_session_setup_incoming(void *fdt_in)
+{
+ struct luo_session_header_ser *header_ser;
+ int err, header_size, offset;
+ u64 header_ser_pa;
+ const void *ptr;
+
+ offset = fdt_subnode_offset(fdt_in, 0, LUO_FDT_SESSION_NODE_NAME);
+ if (offset < 0) {
+ pr_err("Unable to get session node: [%s]\n",
+ LUO_FDT_SESSION_NODE_NAME);
+ return -EINVAL;
+ }
+
+ err = fdt_node_check_compatible(fdt_in, offset,
+ LUO_FDT_SESSION_COMPATIBLE);
+ if (err) {
+ pr_err("Session node incompatible [%s]\n",
+ LUO_FDT_SESSION_COMPATIBLE);
+ return -EINVAL;
+ }
+
+ header_size = 0;
+ ptr = fdt_getprop(fdt_in, offset, LUO_FDT_SESSION_HEADER, &header_size);
+ if (!ptr || header_size != sizeof(u64)) {
+ pr_err("Unable to get session header '%s' [%d]\n",
+ LUO_FDT_SESSION_HEADER, header_size);
+ return -EINVAL;
+ }
+
+ header_ser_pa = get_unaligned((u64 *)ptr);
+ header_ser = phys_to_virt(header_ser_pa);
+
+ luo_session_global.incoming.header_ser = header_ser;
+ luo_session_global.incoming.ser = (void *)(header_ser + 1);
+ luo_session_global.incoming.active = true;
+
+ return 0;
+}
+
+int luo_session_deserialize(void)
+{
+ struct luo_session_header *sh = &luo_session_global.incoming;
+ static bool is_deserialized;
+ static int err;
+
+ /* If has been deserialized, always return the same error code */
+ if (is_deserialized)
+ return err;
+
+ is_deserialized = true;
+ if (!sh->active)
+ return 0;
+
+ /*
+ * Note on error handling:
+ *
+ * If deserialization fails (e.g., allocation failure or corrupt data),
+ * we intentionally skip cleanup of sessions that were already restored.
+ *
+ * A partial failure leaves the preserved state inconsistent.
+ * Implementing a safe "undo" to unwind complex dependencies (sessions,
+ * files, hardware state) is error-prone and provides little value, as
+ * the system is effectively in a broken state.
+ *
+ * We treat these resources as leaked. The expected recovery path is for
+ * userspace to detect the failure and trigger a reboot, which will
+ * reliably reset devices and reclaim memory.
+ */
+ for (int i = 0; i < sh->header_ser->count; i++) {
+ struct luo_session *session;
+
+ session = luo_session_alloc(sh->ser[i].name);
+ if (IS_ERR(session)) {
+ pr_warn("Failed to allocate session [%s] during deserialization %pe\n",
+ sh->ser[i].name, session);
+ return PTR_ERR(session);
+ }
+
+ err = luo_session_insert(sh, session);
+ if (err) {
+ pr_warn("Failed to insert session [%s] %pe\n",
+ session->name, ERR_PTR(err));
+ luo_session_free(session);
+ return err;
+ }
+
+ scoped_guard(mutex, &session->mutex) {
+ luo_file_deserialize(&session->file_set,
+ &sh->ser[i].file_set_ser);
+ }
+ }
+
+ kho_restore_free(sh->header_ser);
+ sh->header_ser = NULL;
+ sh->ser = NULL;
+
+ return 0;
+}
+
+int luo_session_serialize(void)
+{
+ struct luo_session_header *sh = &luo_session_global.outgoing;
+ struct luo_session *session;
+ int i = 0;
+ int err;
+
+ guard(rwsem_write)(&sh->rwsem);
+ list_for_each_entry(session, &sh->list, list) {
+ err = luo_session_freeze_one(session, &sh->ser[i]);
+ if (err)
+ goto err_undo;
+
+ strscpy(sh->ser[i].name, session->name,
+ sizeof(sh->ser[i].name));
+ i++;
+ }
+ sh->header_ser->count = sh->count;
+
+ return 0;
+
+err_undo:
+ list_for_each_entry_continue_reverse(session, &sh->list, list) {
+ i--;
+ luo_session_unfreeze_one(session, &sh->ser[i]);
+ memset(sh->ser[i].name, 0, sizeof(sh->ser[i].name));
+ }
+
+ return err;
+}
+
+/**
+ * luo_session_quiesce - Ensure no active sessions exist and lock session lists.
+ *
+ * Acquires exclusive write locks on both incoming and outgoing session lists.
+ * It then validates no sessions exist in either list.
+ *
+ * This mechanism is used during file handler un/registration to ensure that no
+ * sessions are currently using the handler, and no new sessions can be created
+ * while un/registration is in progress.
+ *
+ * This prevents registering new handlers while sessions are active or
+ * while deserialization is in progress.
+ *
+ * Return:
+ * true - System is quiescent (0 sessions) and locked.
+ * false - Active sessions exist. The locks are released internally.
+ */
+bool luo_session_quiesce(void)
+{
+ down_write(&luo_session_global.incoming.rwsem);
+ down_write(&luo_session_global.outgoing.rwsem);
+
+ if (luo_session_global.incoming.count ||
+ luo_session_global.outgoing.count) {
+ up_write(&luo_session_global.outgoing.rwsem);
+ up_write(&luo_session_global.incoming.rwsem);
+ return false;
+ }
+
+ return true;
+}
+
+/**
+ * luo_session_resume - Unlock session lists and resume normal activity.
+ *
+ * Releases the exclusive locks acquired by a successful call to
+ * luo_session_quiesce().
+ */
+void luo_session_resume(void)
+{
+ up_write(&luo_session_global.outgoing.rwsem);
+ up_write(&luo_session_global.incoming.rwsem);
+}
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index d51cabf28f38..a114949eeed5 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -5,7 +5,8 @@ KCOV_INSTRUMENT := n
obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
-# Avoid recursion lockdep -> KCSAN -> ... -> lockdep.
+# Avoid recursion lockdep -> sanitizer -> ... -> lockdep & improve performance.
+KASAN_SANITIZE_lockdep.o := n
KCSAN_SANITIZE_lockdep.o := n
ifdef CONFIG_FUNCTION_TRACER
diff --git a/kernel/locking/lock_events.c b/kernel/locking/lock_events.c
index fa2c2f951c6b..e68d82099558 100644
--- a/kernel/locking/lock_events.c
+++ b/kernel/locking/lock_events.c
@@ -146,7 +146,7 @@ static int __init init_lockevent_counts(void)
struct dentry *d_counts = debugfs_create_dir(LOCK_EVENTS_DIR, NULL);
int i;
- if (!d_counts)
+ if (IS_ERR(d_counts))
goto out;
/*
@@ -159,14 +159,14 @@ static int __init init_lockevent_counts(void)
for (i = 0; i < lockevent_num; i++) {
if (skip_lockevent(lockevent_names[i]))
continue;
- if (!debugfs_create_file(lockevent_names[i], 0400, d_counts,
- (void *)(long)i, &fops_lockevent))
+ if (IS_ERR(debugfs_create_file(lockevent_names[i], 0400, d_counts,
+ (void *)(long)i, &fops_lockevent)))
goto fail_undo;
}
- if (!debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200,
+ if (IS_ERR(debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200,
d_counts, (void *)(long)LOCKEVENT_reset_cnts,
- &fops_lockevent))
+ &fops_lockevent)))
goto fail_undo;
return 0;
diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h
index 8c7e7d25f09c..d2345e9c0190 100644
--- a/kernel/locking/lock_events.h
+++ b/kernel/locking/lock_events.h
@@ -53,8 +53,12 @@ static inline void __lockevent_add(enum lock_events event, int inc)
#else /* CONFIG_LOCK_EVENT_COUNTS */
#define lockevent_inc(ev)
-#define lockevent_add(ev, c)
-#define lockevent_cond_inc(ev, c)
+#define lockevent_add(ev, c) do { (void)(c); } while (0)
+#define lockevent_cond_inc(ev, c) do { (void)(c); } while (0)
#endif /* CONFIG_LOCK_EVENT_COUNTS */
+
+ssize_t lockevent_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos);
+
#endif /* __LOCKING_LOCK_EVENTS_H */
diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h
index 97fb6f3f840a..4e36258cc34f 100644
--- a/kernel/locking/lock_events_list.h
+++ b/kernel/locking/lock_events_list.h
@@ -50,6 +50,11 @@ LOCK_EVENT(lock_no_node) /* # of locking ops w/o using percpu node */
#endif /* CONFIG_QUEUED_SPINLOCKS */
/*
+ * Locking events for Resilient Queued Spin Lock
+ */
+LOCK_EVENT(rqspinlock_lock_timeout) /* # of locking ops that timeout */
+
+/*
* Locking events for rwsem
*/
LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */
@@ -67,3 +72,31 @@ LOCK_EVENT(rwsem_rlock_handoff) /* # of read lock handoffs */
LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */
LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */
LOCK_EVENT(rwsem_wlock_handoff) /* # of write lock handoffs */
+
+/*
+ * Locking events for rtlock_slowlock()
+ */
+LOCK_EVENT(rtlock_slowlock) /* # of rtlock_slowlock() calls */
+LOCK_EVENT(rtlock_slow_acq1) /* # of locks acquired after wait_lock */
+LOCK_EVENT(rtlock_slow_acq2) /* # of locks acquired in for loop */
+LOCK_EVENT(rtlock_slow_sleep) /* # of sleeps */
+LOCK_EVENT(rtlock_slow_wake) /* # of wakeup's */
+
+/*
+ * Locking events for rt_mutex_slowlock()
+ */
+LOCK_EVENT(rtmutex_slowlock) /* # of rt_mutex_slowlock() calls */
+LOCK_EVENT(rtmutex_slow_block) /* # of rt_mutex_slowlock_block() calls */
+LOCK_EVENT(rtmutex_slow_acq1) /* # of locks acquired after wait_lock */
+LOCK_EVENT(rtmutex_slow_acq2) /* # of locks acquired at the end */
+LOCK_EVENT(rtmutex_slow_acq3) /* # of locks acquired in *block() */
+LOCK_EVENT(rtmutex_slow_sleep) /* # of sleeps */
+LOCK_EVENT(rtmutex_slow_wake) /* # of wakeup's */
+LOCK_EVENT(rtmutex_deadlock) /* # of rt_mutex_handle_deadlock()'s */
+
+/*
+ * Locking events for lockdep
+ */
+LOCK_EVENT(lockdep_acquire)
+LOCK_EVENT(lockdep_lock)
+LOCK_EVENT(lockdep_nocheck)
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 4a882f83aeb9..2d4c5bab5af8 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -55,28 +55,61 @@
#include <linux/rcupdate.h>
#include <linux/kprobes.h>
#include <linux/lockdep.h>
+#include <linux/context_tracking.h>
+#include <linux/console.h>
+#include <linux/kasan.h>
#include <asm/sections.h>
#include "lockdep_internals.h"
+#include "lock_events.h"
-#define CREATE_TRACE_POINTS
#include <trace/events/lock.h>
#ifdef CONFIG_PROVE_LOCKING
-int prove_locking = 1;
+static int prove_locking = 1;
module_param(prove_locking, int, 0644);
#else
#define prove_locking 0
#endif
#ifdef CONFIG_LOCK_STAT
-int lock_stat = 1;
+static int lock_stat = 1;
module_param(lock_stat, int, 0644);
#else
#define lock_stat 0
#endif
+#ifdef CONFIG_SYSCTL
+static const struct ctl_table kern_lockdep_table[] = {
+#ifdef CONFIG_PROVE_LOCKING
+ {
+ .procname = "prove_locking",
+ .data = &prove_locking,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif /* CONFIG_PROVE_LOCKING */
+#ifdef CONFIG_LOCK_STAT
+ {
+ .procname = "lock_stat",
+ .data = &lock_stat,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif /* CONFIG_LOCK_STAT */
+};
+
+static __init int kernel_lockdep_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_lockdep_table);
+ return 0;
+}
+late_initcall(kernel_lockdep_sysctls_init);
+#endif /* CONFIG_SYSCTL */
+
DEFINE_PER_CPU(unsigned int, lockdep_recursion);
EXPORT_PER_CPU_SYMBOL_GPL(lockdep_recursion);
@@ -126,10 +159,12 @@ static inline void lockdep_unlock(void)
__this_cpu_dec(lockdep_recursion);
}
+#ifdef CONFIG_PROVE_LOCKING
static inline bool lockdep_assert_locked(void)
{
return DEBUG_LOCKS_WARN_ON(__owner != current);
}
+#endif
static struct task_struct *lockdep_selftest_task_struct;
@@ -137,6 +172,7 @@ static struct task_struct *lockdep_selftest_task_struct;
static int graph_lock(void)
{
lockdep_lock();
+ lockevent_inc(lockdep_lock);
/*
* Make sure that if another CPU detected a bug while
* walking the graph we dont change it (while the other
@@ -183,11 +219,10 @@ static DECLARE_BITMAP(list_entries_in_use, MAX_LOCKDEP_ENTRIES);
static struct hlist_head lock_keys_hash[KEYHASH_SIZE];
unsigned long nr_lock_classes;
unsigned long nr_zapped_classes;
-#ifndef CONFIG_DEBUG_LOCKDEP
-static
-#endif
+unsigned long nr_dynamic_keys;
+unsigned long max_lock_class_idx;
struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
-static DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS);
+DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS);
static inline struct lock_class *hlock_class(struct held_lock *hlock)
{
@@ -262,33 +297,30 @@ static inline void lock_time_add(struct lock_time *src, struct lock_time *dst)
dst->nr += src->nr;
}
-struct lock_class_stats lock_stats(struct lock_class *class)
+void lock_stats(struct lock_class *class, struct lock_class_stats *stats)
{
- struct lock_class_stats stats;
int cpu, i;
- memset(&stats, 0, sizeof(struct lock_class_stats));
+ memset(stats, 0, sizeof(struct lock_class_stats));
for_each_possible_cpu(cpu) {
struct lock_class_stats *pcs =
&per_cpu(cpu_lock_stats, cpu)[class - lock_classes];
- for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
- stats.contention_point[i] += pcs->contention_point[i];
+ for (i = 0; i < ARRAY_SIZE(stats->contention_point); i++)
+ stats->contention_point[i] += pcs->contention_point[i];
- for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++)
- stats.contending_point[i] += pcs->contending_point[i];
+ for (i = 0; i < ARRAY_SIZE(stats->contending_point); i++)
+ stats->contending_point[i] += pcs->contending_point[i];
- lock_time_add(&pcs->read_waittime, &stats.read_waittime);
- lock_time_add(&pcs->write_waittime, &stats.write_waittime);
+ lock_time_add(&pcs->read_waittime, &stats->read_waittime);
+ lock_time_add(&pcs->write_waittime, &stats->write_waittime);
- lock_time_add(&pcs->read_holdtime, &stats.read_holdtime);
- lock_time_add(&pcs->write_holdtime, &stats.write_holdtime);
+ lock_time_add(&pcs->read_holdtime, &stats->read_holdtime);
+ lock_time_add(&pcs->write_holdtime, &stats->write_holdtime);
- for (i = 0; i < ARRAY_SIZE(stats.bounces); i++)
- stats.bounces[i] += pcs->bounces[i];
+ for (i = 0; i < ARRAY_SIZE(stats->bounces); i++)
+ stats->bounces[i] += pcs->bounces[i];
}
-
- return stats;
}
void clear_lock_stats(struct lock_class *class)
@@ -338,7 +370,7 @@ static inline void lock_release_holdtime(struct held_lock *hlock)
* elements. These elements are linked together by the lock_entry member in
* struct lock_class.
*/
-LIST_HEAD(all_lock_classes);
+static LIST_HEAD(all_lock_classes);
static LIST_HEAD(free_lock_classes);
/**
@@ -401,7 +433,7 @@ static inline u16 hlock_id(struct held_lock *hlock)
return (hlock->class_idx | (hlock->read << MAX_LOCKDEP_KEYS_BITS));
}
-static inline unsigned int chain_hlock_class_idx(u16 hlock_id)
+static inline __maybe_unused unsigned int chain_hlock_class_idx(u16 hlock_id)
{
return hlock_id & (MAX_LOCKDEP_KEYS - 1);
}
@@ -545,8 +577,10 @@ static struct lock_trace *save_trace(void)
if (!debug_locks_off_graph_unlock())
return NULL;
+ nbcon_cpu_emergency_enter();
print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");
dump_stack();
+ nbcon_cpu_emergency_exit();
return NULL;
}
@@ -680,7 +714,7 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
usage[i] = '\0';
}
-static void __print_lock_name(struct lock_class *class)
+static void __print_lock_name(struct held_lock *hlock, struct lock_class *class)
{
char str[KSYM_NAME_LEN];
const char *name;
@@ -695,17 +729,19 @@ static void __print_lock_name(struct lock_class *class)
printk(KERN_CONT "#%d", class->name_version);
if (class->subclass)
printk(KERN_CONT "/%d", class->subclass);
+ if (hlock && class->print_fn)
+ class->print_fn(hlock->instance);
}
}
-static void print_lock_name(struct lock_class *class)
+static void print_lock_name(struct held_lock *hlock, struct lock_class *class)
{
char usage[LOCK_USAGE_CHARS];
get_usage_chars(class, usage);
printk(KERN_CONT " (");
- __print_lock_name(class);
+ __print_lock_name(hlock, class);
printk(KERN_CONT "){%s}-{%d:%d}", usage,
class->wait_type_outer ?: class->wait_type_inner,
class->wait_type_inner);
@@ -743,7 +779,7 @@ static void print_lock(struct held_lock *hlock)
}
printk(KERN_CONT "%px", hlock->instance);
- print_lock_name(lock);
+ print_lock_name(hlock, lock);
printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip);
}
@@ -755,7 +791,7 @@ static void lockdep_print_held_locks(struct task_struct *p)
printk("no locks held by %s/%d.\n", p->comm, task_pid_nr(p));
else
printk("%d lock%s held by %s/%d:\n", depth,
- depth > 1 ? "s" : "", p->comm, task_pid_nr(p));
+ str_plural(depth), p->comm, task_pid_nr(p));
/*
* It's not reliable to print a task's held locks if it's not sleeping
* and it's not the current task.
@@ -788,34 +824,26 @@ static int very_verbose(struct lock_class *class)
* Is this the address of a static object:
*/
#ifdef __KERNEL__
-/*
- * Check if an address is part of freed initmem. After initmem is freed,
- * memory can be allocated from it, and such allocations would then have
- * addresses within the range [_stext, _end].
- */
-#ifndef arch_is_kernel_initmem_freed
-static int arch_is_kernel_initmem_freed(unsigned long addr)
-{
- if (system_state < SYSTEM_FREEING_INITMEM)
- return 0;
-
- return init_section_contains((void *)addr, 1);
-}
-#endif
-
static int static_obj(const void *obj)
{
- unsigned long start = (unsigned long) &_stext,
- end = (unsigned long) &_end,
- addr = (unsigned long) obj;
+ unsigned long addr = (unsigned long) obj;
- if (arch_is_kernel_initmem_freed(addr))
- return 0;
+ if (is_kernel_core_data(addr))
+ return 1;
+
+ /*
+ * keys are allowed in the __ro_after_init section.
+ */
+ if (is_kernel_rodata(addr))
+ return 1;
/*
- * static variable?
+ * in initdata section and used during bootup only?
+ * NOTE: On some platforms the initdata section is
+ * outside of the _stext ... _end range.
*/
- if ((addr >= start) && (addr < end))
+ if (system_state < SYSTEM_FREEING_INITMEM &&
+ init_section_contains((void *)addr, 1))
return 1;
/*
@@ -865,11 +893,13 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass)
if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
instrumentation_begin();
debug_locks_off();
+ nbcon_cpu_emergency_enter();
printk(KERN_ERR
"BUG: looking up invalid subclass: %u\n", subclass);
printk(KERN_ERR
"turning off the locking correctness validator.\n");
dump_stack();
+ nbcon_cpu_emergency_exit();
instrumentation_end();
return NULL;
}
@@ -906,8 +936,10 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass)
* Huh! same key, different name? Did someone trample
* on some memory? We're most confused.
*/
- WARN_ON_ONCE(class->name != lock->name &&
- lock->key != &__lockdep_no_validate__);
+ WARN_ONCE(class->name != lock->name &&
+ lock->key != &__lockdep_no_validate__,
+ "Looking for class \"%s\" with key %ps, but found a different class \"%s\" with the same key\n",
+ lock->name, lock->key, class->name);
return class;
}
}
@@ -944,11 +976,13 @@ static bool assign_lock_key(struct lockdep_map *lock)
else {
/* Debug-check: all keys must be persistent! */
debug_locks_off();
+ nbcon_cpu_emergency_enter();
pr_err("INFO: trying to register non-static key.\n");
pr_err("The code is fine but needs lockdep annotation, or maybe\n");
pr_err("you didn't initialize this object before use?\n");
pr_err("turning off the locking correctness validator.\n");
dump_stack();
+ nbcon_cpu_emergency_exit();
return false;
}
@@ -1202,6 +1236,7 @@ void lockdep_register_key(struct lock_class_key *key)
goto out_unlock;
}
hlist_add_head_rcu(&key->hash_entry, hash_head);
+ nr_dynamic_keys++;
out_unlock:
graph_unlock();
restore_irqs:
@@ -1252,6 +1287,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
struct lockdep_subclass_key *key;
struct hlist_head *hash_head;
struct lock_class *class;
+ int idx;
DEBUG_LOCKS_WARN_ON(!irqs_disabled());
@@ -1291,8 +1327,10 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
return NULL;
}
+ nbcon_cpu_emergency_enter();
print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!");
dump_stack();
+ nbcon_cpu_emergency_exit();
return NULL;
}
nr_lock_classes++;
@@ -1317,15 +1355,20 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
* of classes.
*/
list_move_tail(&class->lock_entry, &all_lock_classes);
+ idx = class - lock_classes;
+ if (idx > max_lock_class_idx)
+ max_lock_class_idx = idx;
if (verbose(class)) {
graph_unlock();
+ nbcon_cpu_emergency_enter();
printk("\nnew class %px: %s", class->key, class->name);
if (class->name_version > 1)
printk(KERN_CONT "#%d", class->name_version);
printk(KERN_CONT "\n");
dump_stack();
+ nbcon_cpu_emergency_exit();
if (!graph_lock()) {
return NULL;
@@ -1364,8 +1407,10 @@ static struct lock_list *alloc_list_entry(void)
if (!debug_locks_off_graph_unlock())
return NULL;
+ nbcon_cpu_emergency_enter();
print_lockdep_off("BUG: MAX_LOCKDEP_ENTRIES too low!");
dump_stack();
+ nbcon_cpu_emergency_exit();
return NULL;
}
nr_list_entries++;
@@ -1378,7 +1423,7 @@ static struct lock_list *alloc_list_entry(void)
*/
static int add_lock_to_list(struct lock_class *this,
struct lock_class *links_to, struct list_head *head,
- unsigned long ip, u16 distance, u8 dep,
+ u16 distance, u8 dep,
const struct lock_trace *trace)
{
struct lock_list *entry;
@@ -1833,7 +1878,7 @@ print_circular_bug_entry(struct lock_list *target, int depth)
if (debug_locks_silent)
return;
printk("\n-> #%u", depth);
- print_lock_name(target->class);
+ print_lock_name(NULL, target->class);
printk(KERN_CONT ":\n");
print_lock_trace(target->trace, 6);
}
@@ -1846,6 +1891,8 @@ print_circular_lock_scenario(struct held_lock *src,
struct lock_class *source = hlock_class(src);
struct lock_class *target = hlock_class(tgt);
struct lock_class *parent = prt->class;
+ int src_read = src->read;
+ int tgt_read = tgt->read;
/*
* A direct locking problem where unsafe_class lock is taken
@@ -1862,28 +1909,36 @@ print_circular_lock_scenario(struct held_lock *src,
*/
if (parent != source) {
printk("Chain exists of:\n ");
- __print_lock_name(source);
+ __print_lock_name(src, source);
printk(KERN_CONT " --> ");
- __print_lock_name(parent);
+ __print_lock_name(NULL, parent);
printk(KERN_CONT " --> ");
- __print_lock_name(target);
+ __print_lock_name(tgt, target);
printk(KERN_CONT "\n\n");
}
printk(" Possible unsafe locking scenario:\n\n");
printk(" CPU0 CPU1\n");
printk(" ---- ----\n");
- printk(" lock(");
- __print_lock_name(target);
+ if (tgt_read != 0)
+ printk(" rlock(");
+ else
+ printk(" lock(");
+ __print_lock_name(tgt, target);
printk(KERN_CONT ");\n");
printk(" lock(");
- __print_lock_name(parent);
+ __print_lock_name(NULL, parent);
printk(KERN_CONT ");\n");
printk(" lock(");
- __print_lock_name(target);
+ __print_lock_name(tgt, target);
printk(KERN_CONT ");\n");
- printk(" lock(");
- __print_lock_name(source);
+ if (src_read != 0)
+ printk(" rlock(");
+ else if (src->sync)
+ printk(" sync(");
+ else
+ printk(" lock(");
+ __print_lock_name(src, source);
printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
}
@@ -1921,41 +1976,6 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
}
/*
- * We are about to add A -> B into the dependency graph, and in __bfs() a
- * strong dependency path A -> .. -> B is found: hlock_class equals
- * entry->class.
- *
- * If A -> .. -> B can replace A -> B in any __bfs() search (means the former
- * is _stronger_ than or equal to the latter), we consider A -> B as redundant.
- * For example if A -> .. -> B is -(EN)-> (i.e. A -(E*)-> .. -(*N)-> B), and A
- * -> B is -(ER)-> or -(EN)->, then we don't need to add A -> B into the
- * dependency graph, as any strong path ..-> A -> B ->.. we can get with
- * having dependency A -> B, we could already get a equivalent path ..-> A ->
- * .. -> B -> .. with A -> .. -> B. Therefore A -> B is redundant.
- *
- * We need to make sure both the start and the end of A -> .. -> B is not
- * weaker than A -> B. For the start part, please see the comment in
- * check_redundant(). For the end part, we need:
- *
- * Either
- *
- * a) A -> B is -(*R)-> (everything is not weaker than that)
- *
- * or
- *
- * b) A -> .. -> B is -(*N)-> (nothing is stronger than this)
- *
- */
-static inline bool hlock_equal(struct lock_list *entry, void *data)
-{
- struct held_lock *hlock = (struct held_lock *)data;
-
- return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */
- (hlock->read == 2 || /* A -> B is -(*R)-> */
- !entry->only_xr); /* A -> .. -> B is -(*N)-> */
-}
-
-/*
* We are about to add B -> A into the dependency graph, and in __bfs() a
* strong dependency path A -> .. -> B is found: hlock_class equals
* entry->class.
@@ -2001,6 +2021,8 @@ static noinline void print_circular_bug(struct lock_list *this,
depth = get_lock_depth(target);
+ nbcon_cpu_emergency_enter();
+
print_circular_bug_header(target, depth, check_src, check_tgt);
parent = get_lock_parent(target);
@@ -2019,6 +2041,8 @@ static noinline void print_circular_bug(struct lock_list *this,
printk("\nstack backtrace:\n");
dump_stack();
+
+ nbcon_cpu_emergency_exit();
}
static noinline void print_bfs_bug(int ret)
@@ -2029,6 +2053,9 @@ static noinline void print_bfs_bug(int ret)
/*
* Breadth-first-search failed, graph got corrupted?
*/
+ if (ret == BFS_EQUEUEFULL)
+ pr_warn("Increase LOCKDEP_CIRCULAR_QUEUE_BITS to avoid this warning:\n");
+
WARN(1, "lockdep bfs error:%d\n", ret);
}
@@ -2109,6 +2136,8 @@ check_path(struct held_lock *target, struct lock_list *src_entry,
return ret;
}
+static void print_deadlock_bug(struct task_struct *, struct held_lock *, struct held_lock *);
+
/*
* Prove that the dependency graph starting at <src> can not
* lead to <target>. If it can, there is a circle when adding
@@ -2140,7 +2169,10 @@ check_noncircular(struct held_lock *src, struct held_lock *target,
*trace = save_trace();
}
- print_circular_bug(&src_entry, target_entry, src, target);
+ if (src->class_idx == target->class_idx)
+ print_deadlock_bug(current, src, target);
+ else
+ print_circular_bug(&src_entry, target_entry, src, target);
}
return ret;
@@ -2218,6 +2250,9 @@ static inline bool usage_match(struct lock_list *entry, void *mask)
static inline bool usage_skip(struct lock_list *entry, void *mask)
{
+ if (entry->class->lock_type == LD_LOCK_NORMAL)
+ return false;
+
/*
* Skip local_lock() for irq inversion detection.
*
@@ -2244,14 +2279,16 @@ static inline bool usage_skip(struct lock_list *entry, void *mask)
* As a result, we will skip local_lock(), when we search for irq
* inversion bugs.
*/
- if (entry->class->lock_type == LD_LOCK_PERCPU) {
- if (DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
- return false;
+ if (entry->class->lock_type == LD_LOCK_PERCPU &&
+ DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
+ return false;
- return true;
- }
+ /*
+ * Skip WAIT_OVERRIDE for irq inversion detection -- it's not actually
+ * a lock and only used to override the wait_type.
+ */
- return false;
+ return true;
}
/*
@@ -2296,7 +2333,7 @@ static void print_lock_class_header(struct lock_class *class, int depth)
int bit;
printk("%*s->", depth, "");
- print_lock_name(class);
+ print_lock_name(NULL, class);
#ifdef CONFIG_DEBUG_LOCKDEP
printk(KERN_CONT " ops: %lu", debug_class_ops_read(class));
#endif
@@ -2478,11 +2515,11 @@ print_irq_lock_scenario(struct lock_list *safe_entry,
*/
if (middle_class != unsafe_class) {
printk("Chain exists of:\n ");
- __print_lock_name(safe_class);
+ __print_lock_name(NULL, safe_class);
printk(KERN_CONT " --> ");
- __print_lock_name(middle_class);
+ __print_lock_name(NULL, middle_class);
printk(KERN_CONT " --> ");
- __print_lock_name(unsafe_class);
+ __print_lock_name(NULL, unsafe_class);
printk(KERN_CONT "\n\n");
}
@@ -2490,18 +2527,18 @@ print_irq_lock_scenario(struct lock_list *safe_entry,
printk(" CPU0 CPU1\n");
printk(" ---- ----\n");
printk(" lock(");
- __print_lock_name(unsafe_class);
+ __print_lock_name(NULL, unsafe_class);
printk(KERN_CONT ");\n");
printk(" local_irq_disable();\n");
printk(" lock(");
- __print_lock_name(safe_class);
+ __print_lock_name(NULL, safe_class);
printk(KERN_CONT ");\n");
printk(" lock(");
- __print_lock_name(middle_class);
+ __print_lock_name(NULL, middle_class);
printk(KERN_CONT ");\n");
printk(" <Interrupt>\n");
printk(" lock(");
- __print_lock_name(safe_class);
+ __print_lock_name(NULL, safe_class);
printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
}
@@ -2521,6 +2558,8 @@ print_bad_irq_dependency(struct task_struct *curr,
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return;
+ nbcon_cpu_emergency_enter();
+
pr_warn("\n");
pr_warn("=====================================================\n");
pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n",
@@ -2538,20 +2577,20 @@ print_bad_irq_dependency(struct task_struct *curr,
pr_warn("\nand this task is already holding:\n");
print_lock(prev);
pr_warn("which would create a new lock dependency:\n");
- print_lock_name(hlock_class(prev));
+ print_lock_name(prev, hlock_class(prev));
pr_cont(" ->");
- print_lock_name(hlock_class(next));
+ print_lock_name(next, hlock_class(next));
pr_cont("\n");
pr_warn("\nbut this new dependency connects a %s-irq-safe lock:\n",
irqclass);
- print_lock_name(backwards_entry->class);
+ print_lock_name(NULL, backwards_entry->class);
pr_warn("\n... which became %s-irq-safe at:\n", irqclass);
print_lock_trace(backwards_entry->class->usage_traces[bit1], 1);
pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass);
- print_lock_name(forwards_entry->class);
+ print_lock_name(NULL, forwards_entry->class);
pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass);
pr_warn("...");
@@ -2570,11 +2609,13 @@ print_bad_irq_dependency(struct task_struct *curr,
pr_warn(" and %s-irq-unsafe lock:\n", irqclass);
next_root->trace = save_trace();
if (!next_root->trace)
- return;
+ goto out;
print_shortest_lock_dependencies(forwards_entry, next_root);
pr_warn("\nstack backtrace:\n");
dump_stack();
+out:
+ nbcon_cpu_emergency_exit();
}
static const char *state_names[] = {
@@ -2839,6 +2880,41 @@ static inline bool usage_skip(struct lock_list *entry, void *mask)
#ifdef CONFIG_LOCKDEP_SMALL
/*
+ * We are about to add A -> B into the dependency graph, and in __bfs() a
+ * strong dependency path A -> .. -> B is found: hlock_class equals
+ * entry->class.
+ *
+ * If A -> .. -> B can replace A -> B in any __bfs() search (means the former
+ * is _stronger_ than or equal to the latter), we consider A -> B as redundant.
+ * For example if A -> .. -> B is -(EN)-> (i.e. A -(E*)-> .. -(*N)-> B), and A
+ * -> B is -(ER)-> or -(EN)->, then we don't need to add A -> B into the
+ * dependency graph, as any strong path ..-> A -> B ->.. we can get with
+ * having dependency A -> B, we could already get a equivalent path ..-> A ->
+ * .. -> B -> .. with A -> .. -> B. Therefore A -> B is redundant.
+ *
+ * We need to make sure both the start and the end of A -> .. -> B is not
+ * weaker than A -> B. For the start part, please see the comment in
+ * check_redundant(). For the end part, we need:
+ *
+ * Either
+ *
+ * a) A -> B is -(*R)-> (everything is not weaker than that)
+ *
+ * or
+ *
+ * b) A -> .. -> B is -(*N)-> (nothing is stronger than this)
+ *
+ */
+static inline bool hlock_equal(struct lock_list *entry, void *data)
+{
+ struct held_lock *hlock = (struct held_lock *)data;
+
+ return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */
+ (hlock->read == 2 || /* A -> B is -(*R)-> */
+ !entry->only_xr); /* A -> .. -> B is -(*N)-> */
+}
+
+/*
* Check that the dependency graph starting at <src> can lead to
* <target> or not. If it can, <src> -> <target> dependency is already
* in the graph.
@@ -2921,10 +2997,10 @@ print_deadlock_scenario(struct held_lock *nxt, struct held_lock *prv)
printk(" CPU0\n");
printk(" ----\n");
printk(" lock(");
- __print_lock_name(prev);
+ __print_lock_name(prv, prev);
printk(KERN_CONT ");\n");
printk(" lock(");
- __print_lock_name(next);
+ __print_lock_name(nxt, next);
printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
printk(" May be due to missing lock nesting notation\n\n");
@@ -2934,9 +3010,13 @@ static void
print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
struct held_lock *next)
{
+ struct lock_class *class = hlock_class(prev);
+
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return;
+ nbcon_cpu_emergency_enter();
+
pr_warn("\n");
pr_warn("============================================\n");
pr_warn("WARNING: possible recursive locking detected\n");
@@ -2948,12 +3028,19 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
pr_warn("\nbut task is already holding lock:\n");
print_lock(prev);
+ if (class->cmp_fn) {
+ pr_warn("and the lock comparison function returns %i:\n",
+ class->cmp_fn(prev->instance, next->instance));
+ }
+
pr_warn("\nother info that might help us debug this:\n");
print_deadlock_scenario(next, prev);
lockdep_print_held_locks(curr);
pr_warn("\nstack backtrace:\n");
dump_stack();
+
+ nbcon_cpu_emergency_exit();
}
/*
@@ -2969,6 +3056,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
static int
check_deadlock(struct task_struct *curr, struct held_lock *next)
{
+ struct lock_class *class;
struct held_lock *prev;
struct held_lock *nest = NULL;
int i;
@@ -2989,6 +3077,12 @@ check_deadlock(struct task_struct *curr, struct held_lock *next)
if ((next->read == 2) && prev->read)
continue;
+ class = hlock_class(prev);
+
+ if (class->cmp_fn &&
+ class->cmp_fn(prev->instance, next->instance) < 0)
+ continue;
+
/*
* We're holding the nest_lock, which serializes this lock's
* nesting behaviour.
@@ -3050,6 +3144,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
return 2;
}
+ if (prev->class_idx == next->class_idx) {
+ struct lock_class *class = hlock_class(prev);
+
+ if (class->cmp_fn &&
+ class->cmp_fn(prev->instance, next->instance) < 0)
+ return 2;
+ }
+
/*
* Prove that the new <prev> -> <next> dependency would not
* create a circular dependency in the graph. (We do this by
@@ -3131,19 +3233,15 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
* to the previous lock's dependency list:
*/
ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
- &hlock_class(prev)->locks_after,
- next->acquire_ip, distance,
- calc_dep(prev, next),
- *trace);
+ &hlock_class(prev)->locks_after, distance,
+ calc_dep(prev, next), *trace);
if (!ret)
return 0;
ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
- &hlock_class(next)->locks_before,
- next->acquire_ip, distance,
- calc_depb(prev, next),
- *trace);
+ &hlock_class(next)->locks_before, distance,
+ calc_depb(prev, next), *trace);
if (!ret)
return 0;
@@ -3430,7 +3528,8 @@ static int alloc_chain_hlocks(int req)
size = chain_block_size(curr);
if (likely(size >= req)) {
del_chain_block(0, size, chain_block_next(curr));
- add_chain_block(curr + req, size - req);
+ if (size > req)
+ add_chain_block(curr + req, size - req);
return curr;
}
}
@@ -3462,7 +3561,7 @@ struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i)
u16 chain_hlock = chain_hlocks[chain->base + i];
unsigned int class_idx = chain_hlock_class_idx(chain_hlock);
- return lock_classes + class_idx - 1;
+ return lock_classes + class_idx;
}
/*
@@ -3530,7 +3629,7 @@ static void print_chain_keys_chain(struct lock_chain *chain)
hlock_id = chain_hlocks[chain->base + i];
chain_key = print_chain_key_iteration(hlock_id, chain_key);
- print_lock_name(lock_classes + chain_hlock_class_idx(hlock_id) - 1);
+ print_lock_name(NULL, lock_classes + chain_hlock_class_idx(hlock_id));
printk("\n");
}
}
@@ -3539,6 +3638,8 @@ static void print_collision(struct task_struct *curr,
struct held_lock *hlock_next,
struct lock_chain *chain)
{
+ nbcon_cpu_emergency_enter();
+
pr_warn("\n");
pr_warn("============================\n");
pr_warn("WARNING: chain_key collision\n");
@@ -3555,6 +3656,8 @@ static void print_collision(struct task_struct *curr,
pr_warn("\nstack backtrace:\n");
dump_stack();
+
+ nbcon_cpu_emergency_exit();
}
#endif
@@ -3645,8 +3748,10 @@ static inline int add_chain_cache(struct task_struct *curr,
if (!debug_locks_off_graph_unlock())
return 0;
+ nbcon_cpu_emergency_enter();
print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!");
dump_stack();
+ nbcon_cpu_emergency_exit();
return 0;
}
chain->chain_key = chain_key;
@@ -3663,8 +3768,10 @@ static inline int add_chain_cache(struct task_struct *curr,
if (!debug_locks_off_graph_unlock())
return 0;
+ nbcon_cpu_emergency_enter();
print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!");
dump_stack();
+ nbcon_cpu_emergency_exit();
return 0;
}
@@ -3887,11 +3994,11 @@ static void print_usage_bug_scenario(struct held_lock *lock)
printk(" CPU0\n");
printk(" ----\n");
printk(" lock(");
- __print_lock_name(class);
+ __print_lock_name(lock, class);
printk(KERN_CONT ");\n");
printk(" <Interrupt>\n");
printk(" lock(");
- __print_lock_name(class);
+ __print_lock_name(lock, class);
printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
}
@@ -3903,6 +4010,8 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
if (!debug_locks_off() || debug_locks_silent)
return;
+ nbcon_cpu_emergency_enter();
+
pr_warn("\n");
pr_warn("================================\n");
pr_warn("WARNING: inconsistent lock state\n");
@@ -3931,6 +4040,8 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
pr_warn("\nstack backtrace:\n");
dump_stack();
+
+ nbcon_cpu_emergency_exit();
}
/*
@@ -3965,6 +4076,8 @@ print_irq_inversion_bug(struct task_struct *curr,
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return;
+ nbcon_cpu_emergency_enter();
+
pr_warn("\n");
pr_warn("========================================================\n");
pr_warn("WARNING: possible irq lock inversion dependency detected\n");
@@ -3977,7 +4090,7 @@ print_irq_inversion_bug(struct task_struct *curr,
pr_warn("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
else
pr_warn("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
- print_lock_name(other->class);
+ print_lock_name(NULL, other->class);
pr_warn("\n\nand interrupts could create inverse lock ordering between them.\n\n");
pr_warn("\nother info that might help us debug this:\n");
@@ -4005,11 +4118,13 @@ print_irq_inversion_bug(struct task_struct *curr,
pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
root->trace = save_trace();
if (!root->trace)
- return;
+ goto out;
print_shortest_lock_dependencies(other, root);
pr_warn("\nstack backtrace:\n");
dump_stack();
+out:
+ nbcon_cpu_emergency_exit();
}
/*
@@ -4086,6 +4201,8 @@ void print_irqtrace_events(struct task_struct *curr)
{
const struct irqtrace_events *trace = &curr->irqtrace;
+ nbcon_cpu_emergency_enter();
+
printk("irq event stamp: %u\n", trace->irq_events);
printk("hardirqs last enabled at (%u): [<%px>] %pS\n",
trace->hardirq_enable_event, (void *)trace->hardirq_enable_ip,
@@ -4099,6 +4216,8 @@ void print_irqtrace_events(struct task_struct *curr)
printk("softirqs last disabled at (%u): [<%px>] %pS\n",
trace->softirq_disable_event, (void *)trace->softirq_disable_ip,
(void *)trace->softirq_disable_ip);
+
+ nbcon_cpu_emergency_exit();
}
static int HARDIRQ_verbose(struct lock_class *class)
@@ -4234,14 +4353,13 @@ static void __trace_hardirqs_on_caller(void)
/**
* lockdep_hardirqs_on_prepare - Prepare for enabling interrupts
- * @ip: Caller address
*
* Invoked before a possible transition to RCU idle from exit to user or
* guest mode. This ensures that all RCU operations are done before RCU
* stops watching. After the RCU transition lockdep_hardirqs_on() has to be
* invoked to set the final state.
*/
-void lockdep_hardirqs_on_prepare(unsigned long ip)
+void lockdep_hardirqs_on_prepare(void)
{
if (unlikely(!debug_locks))
return;
@@ -4472,6 +4590,30 @@ void lockdep_softirqs_off(unsigned long ip)
debug_atomic_inc(redundant_softirqs_off);
}
+/**
+ * lockdep_cleanup_dead_cpu - Ensure CPU lockdep state is cleanly stopped
+ *
+ * @cpu: index of offlined CPU
+ * @idle: task pointer for offlined CPU's idle thread
+ *
+ * Invoked after the CPU is dead. Ensures that the tracing infrastructure
+ * is left in a suitable state for the CPU to be subsequently brought
+ * online again.
+ */
+void lockdep_cleanup_dead_cpu(unsigned int cpu, struct task_struct *idle)
+{
+ if (unlikely(!debug_locks))
+ return;
+
+ if (unlikely(per_cpu(hardirqs_enabled, cpu))) {
+ pr_warn("CPU %u left hardirqs enabled!", cpu);
+ if (idle)
+ print_irqtrace_events(idle);
+ /* Clean it up for when the CPU comes online again. */
+ per_cpu(hardirqs_enabled, cpu) = 0;
+ }
+}
+
static int
mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
{
@@ -4501,7 +4643,13 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
return 0;
}
}
- if (!hlock->hardirqs_off) {
+
+ /*
+ * For lock_sync(), don't mark the ENABLED usage, since lock_sync()
+ * creates no critical section and no extra dependency can be introduced
+ * by interrupts
+ */
+ if (!hlock->hardirqs_off && !hlock->sync) {
if (hlock->read) {
if (!mark_lock(curr, hlock,
LOCK_ENABLED_HARDIRQ_READ))
@@ -4614,10 +4762,12 @@ unlock:
* We must printk outside of the graph_lock:
*/
if (ret == 2) {
+ nbcon_cpu_emergency_enter();
printk("\nmarked lock as {%s}:\n", usage_str[new_bit]);
print_lock(this);
print_irqtrace_events(curr);
dump_stack();
+ nbcon_cpu_emergency_exit();
}
return ret;
@@ -4658,6 +4808,8 @@ print_lock_invalid_wait_context(struct task_struct *curr,
if (debug_locks_silent)
return 0;
+ nbcon_cpu_emergency_enter();
+
pr_warn("\n");
pr_warn("=============================\n");
pr_warn("[ BUG: Invalid wait context ]\n");
@@ -4677,6 +4829,8 @@ print_lock_invalid_wait_context(struct task_struct *curr,
pr_warn("stack backtrace:\n");
dump_stack();
+ nbcon_cpu_emergency_exit();
+
return 0;
}
@@ -4722,7 +4876,8 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
for (; depth < curr->lockdep_depth; depth++) {
struct held_lock *prev = curr->held_locks + depth;
- u8 prev_inner = hlock_class(prev)->wait_type_inner;
+ struct lock_class *class = hlock_class(prev);
+ u8 prev_inner = class->wait_type_inner;
if (prev_inner) {
/*
@@ -4732,6 +4887,14 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
* Also due to trylocks.
*/
curr_inner = min(curr_inner, prev_inner);
+
+ /*
+ * Allow override for annotations -- this is typically
+ * only valid/needed for code that only exists when
+ * CONFIG_PREEMPT_RT=n.
+ */
+ if (unlikely(class->lock_type == LD_LOCK_WAIT_OVERRIDE))
+ curr_inner = prev_inner;
}
}
@@ -4836,16 +4999,47 @@ EXPORT_SYMBOL_GPL(lockdep_init_map_type);
struct lock_class_key __lockdep_no_validate__;
EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
+struct lock_class_key __lockdep_no_track__;
+EXPORT_SYMBOL_GPL(__lockdep_no_track__);
+
+#ifdef CONFIG_PROVE_LOCKING
+void lockdep_set_lock_cmp_fn(struct lockdep_map *lock, lock_cmp_fn cmp_fn,
+ lock_print_fn print_fn)
+{
+ struct lock_class *class = lock->class_cache[0];
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ lockdep_recursion_inc();
+
+ if (!class)
+ class = register_lock_class(lock, 0, 0);
+
+ if (class) {
+ WARN_ON(class->cmp_fn && class->cmp_fn != cmp_fn);
+ WARN_ON(class->print_fn && class->print_fn != print_fn);
+
+ class->cmp_fn = cmp_fn;
+ class->print_fn = print_fn;
+ }
+
+ lockdep_recursion_finish();
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lockdep_set_lock_cmp_fn);
+#endif
+
static void
print_lock_nested_lock_not_held(struct task_struct *curr,
- struct held_lock *hlock,
- unsigned long ip)
+ struct held_lock *hlock)
{
if (!debug_locks_off())
return;
if (debug_locks_silent)
return;
+ nbcon_cpu_emergency_enter();
+
pr_warn("\n");
pr_warn("==================================\n");
pr_warn("WARNING: Nested lock was not taken\n");
@@ -4866,6 +5060,8 @@ print_lock_nested_lock_not_held(struct task_struct *curr,
pr_warn("\nstack backtrace:\n");
dump_stack();
+
+ nbcon_cpu_emergency_exit();
}
static int __lock_is_held(const struct lockdep_map *lock, int read);
@@ -4881,7 +5077,7 @@ static int __lock_is_held(const struct lockdep_map *lock, int read);
static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
int trylock, int read, int check, int hardirqs_off,
struct lockdep_map *nest_lock, unsigned long ip,
- int references, int pin_count)
+ int references, int pin_count, int sync)
{
struct task_struct *curr = current;
struct lock_class *class = NULL;
@@ -4894,8 +5090,18 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (unlikely(!debug_locks))
return 0;
- if (!prove_locking || lock->key == &__lockdep_no_validate__)
+ if (unlikely(lock->key == &__lockdep_no_track__))
+ return 0;
+
+ lockevent_inc(lockdep_acquire);
+
+ if (!prove_locking || lock->key == &__lockdep_no_validate__) {
check = 0;
+ lockevent_inc(lockdep_nocheck);
+ }
+
+ if (DEBUG_LOCKS_WARN_ON(subclass >= MAX_LOCKDEP_SUBCLASSES))
+ return 0;
if (subclass < NR_LOCKDEP_CACHING_CLASSES)
class = lock->class_cache[subclass];
@@ -4911,11 +5117,13 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
debug_class_ops_inc(class);
if (very_verbose(class)) {
+ nbcon_cpu_emergency_enter();
printk("\nacquire class [%px] %s", class->key, class->name);
if (class->name_version > 1)
printk(KERN_CONT "#%d", class->name_version);
printk(KERN_CONT "\n");
dump_stack();
+ nbcon_cpu_emergency_exit();
}
/*
@@ -4932,7 +5140,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
class_idx = class - lock_classes;
- if (depth) { /* we're holding locks */
+ if (depth && !sync) {
+ /* we're holding locks and the new held lock is not a sync */
hlock = curr->held_locks + depth - 1;
if (hlock->class_idx == class_idx && nest_lock) {
if (!references)
@@ -4966,6 +5175,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
hlock->trylock = trylock;
hlock->read = read;
hlock->check = check;
+ hlock->sync = !!sync;
hlock->hardirqs_off = !!hardirqs_off;
hlock->references = references;
#ifdef CONFIG_LOCK_STAT
@@ -5015,7 +5225,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
chain_key = iterate_chain_key(chain_key, hlock_id(hlock));
if (nest_lock && !__lock_is_held(nest_lock, -1)) {
- print_lock_nested_lock_not_held(curr, hlock, ip);
+ print_lock_nested_lock_not_held(curr, hlock);
return 0;
}
@@ -5027,6 +5237,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (!validate_chain(curr, hlock, chain_head, chain_key))
return 0;
+ /* For lock_sync(), we are done here since no actual critical section */
+ if (hlock->sync)
+ return 1;
+
curr->curr_chain_key = chain_key;
curr->lockdep_depth++;
check_chain_key(curr);
@@ -5036,6 +5250,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
#endif
if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
debug_locks_off();
+ nbcon_cpu_emergency_enter();
print_lockdep_off("BUG: MAX_LOCK_DEPTH too low!");
printk(KERN_DEBUG "depth: %i max: %lu!\n",
curr->lockdep_depth, MAX_LOCK_DEPTH);
@@ -5043,6 +5258,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
lockdep_print_held_locks(current);
debug_show_all_locks();
dump_stack();
+ nbcon_cpu_emergency_exit();
return 0;
}
@@ -5062,6 +5278,8 @@ static void print_unlock_imbalance_bug(struct task_struct *curr,
if (debug_locks_silent)
return;
+ nbcon_cpu_emergency_enter();
+
pr_warn("\n");
pr_warn("=====================================\n");
pr_warn("WARNING: bad unlock balance detected!\n");
@@ -5078,6 +5296,8 @@ static void print_unlock_imbalance_bug(struct task_struct *curr,
pr_warn("\nstack backtrace:\n");
dump_stack();
+
+ nbcon_cpu_emergency_exit();
}
static noinstr int match_held_lock(const struct held_lock *hlock,
@@ -5168,7 +5388,7 @@ static int reacquire_held_locks(struct task_struct *curr, unsigned int depth,
hlock->read, hlock->check,
hlock->hardirqs_off,
hlock->nest_lock, hlock->acquire_ip,
- hlock->references, hlock->pin_count)) {
+ hlock->references, hlock->pin_count, 0)) {
case 0:
return 1;
case 1:
@@ -5212,9 +5432,10 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
return 0;
}
- lockdep_init_map_waits(lock, name, key, 0,
- lock->wait_type_inner,
- lock->wait_type_outer);
+ lockdep_init_map_type(lock, name, key, 0,
+ lock->wait_type_inner,
+ lock->wait_type_outer,
+ lock->lock_type);
class = register_lock_class(lock, subclass, 0);
hlock->class_idx = class - lock_classes;
@@ -5406,7 +5627,7 @@ static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock)
* be guessable and still allows some pin nesting in
* our u32 pin_count.
*/
- cookie.val = 1 + (prandom_u32() >> 16);
+ cookie.val = 1 + (sched_clock() & 0xffff);
hlock->pin_count += cookie.val;
return cookie;
}
@@ -5612,6 +5833,14 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (!debug_locks)
return;
+ /*
+ * As KASAN instrumentation is disabled and lock_acquire() is usually
+ * the first lockdep call when a task tries to acquire a lock, add
+ * kasan_check_byte() here to check for use-after-free and other
+ * memory errors.
+ */
+ kasan_check_byte(lock);
+
if (unlikely(!lockdep_enabled())) {
/* XXX allow trylock from NMI ?!? */
if (lockdep_nmi() && !trylock) {
@@ -5637,7 +5866,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
lockdep_recursion_inc();
__lock_acquire(lock, subclass, trylock, read, check,
- irqs_disabled_flags(flags), nest_lock, ip, 0, 0);
+ irqs_disabled_flags(flags), nest_lock, ip, 0, 0, 0);
lockdep_recursion_finish();
raw_local_irq_restore(flags);
}
@@ -5649,7 +5878,8 @@ void lock_release(struct lockdep_map *lock, unsigned long ip)
trace_lock_release(lock, ip);
- if (unlikely(!lockdep_enabled()))
+ if (unlikely(!lockdep_enabled() ||
+ lock->key == &__lockdep_no_track__))
return;
raw_local_irq_save(flags);
@@ -5663,6 +5893,34 @@ void lock_release(struct lockdep_map *lock, unsigned long ip)
}
EXPORT_SYMBOL_GPL(lock_release);
+/*
+ * lock_sync() - A special annotation for synchronize_{s,}rcu()-like API.
+ *
+ * No actual critical section is created by the APIs annotated with this: these
+ * APIs are used to wait for one or multiple critical sections (on other CPUs
+ * or threads), and it means that calling these APIs inside these critical
+ * sections is potential deadlock.
+ */
+void lock_sync(struct lockdep_map *lock, unsigned subclass, int read,
+ int check, struct lockdep_map *nest_lock, unsigned long ip)
+{
+ unsigned long flags;
+
+ if (unlikely(!lockdep_enabled()))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+
+ lockdep_recursion_inc();
+ __lock_acquire(lock, subclass, 0, read, check,
+ irqs_disabled_flags(flags), nest_lock, ip, 0, 0, 1);
+ check_chain_key(current);
+ lockdep_recursion_finish();
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_sync);
+
noinstr int lock_is_held_type(const struct lockdep_map *lock, int read)
{
unsigned long flags;
@@ -5752,6 +6010,8 @@ static void print_lock_contention_bug(struct task_struct *curr,
if (debug_locks_silent)
return;
+ nbcon_cpu_emergency_enter();
+
pr_warn("\n");
pr_warn("=================================\n");
pr_warn("WARNING: bad contention detected!\n");
@@ -5768,6 +6028,8 @@ static void print_lock_contention_bug(struct task_struct *curr,
pr_warn("\nstack backtrace:\n");
dump_stack();
+
+ nbcon_cpu_emergency_exit();
}
static void
@@ -5787,6 +6049,9 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
if (DEBUG_LOCKS_WARN_ON(!depth))
return;
+ if (unlikely(lock->key == &__lockdep_no_track__))
+ return;
+
hlock = find_held_lock(curr, lock, depth, &i);
if (!hlock) {
print_lock_contention_bug(curr, lock, ip);
@@ -5829,6 +6094,9 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
if (DEBUG_LOCKS_WARN_ON(!depth))
return;
+ if (unlikely(lock->key == &__lockdep_no_track__))
+ return;
+
hlock = find_held_lock(curr, lock, depth, &i);
if (!hlock) {
print_lock_contention_bug(curr, lock, _RET_IP_);
@@ -5998,8 +6266,13 @@ static void zap_class(struct pending_free *pf, struct lock_class *class)
hlist_del_rcu(&class->hash_entry);
WRITE_ONCE(class->key, NULL);
WRITE_ONCE(class->name, NULL);
+ /* Class allocated but not used, -1 in nr_unused_locks */
+ if (class->usage_mask == 0)
+ debug_atomic_dec(nr_unused_locks);
nr_lock_classes--;
__clear_bit(class - lock_classes, lock_classes_in_use);
+ if (class - lock_classes == max_lock_class_idx)
+ max_lock_class_idx--;
} else {
WARN_ONCE(true, "%s() failed for class %s\n", __func__,
class->name);
@@ -6011,13 +6284,10 @@ static void zap_class(struct pending_free *pf, struct lock_class *class)
static void reinit_class(struct lock_class *class)
{
- void *const p = class;
- const unsigned int offset = offsetof(struct lock_class, key);
-
WARN_ON_ONCE(!class->lock_entry.next);
WARN_ON_ONCE(!list_empty(&class->locks_after));
WARN_ON_ONCE(!list_empty(&class->locks_before));
- memset(p + offset, 0, sizeof(*class) - offset);
+ memset_startat(class, 0, key);
WARN_ON_ONCE(!class->lock_entry.next);
WARN_ON_ONCE(!list_empty(&class->locks_after));
WARN_ON_ONCE(!list_empty(&class->locks_before));
@@ -6042,25 +6312,27 @@ static struct pending_free *get_pending_free(void)
static void free_zapped_rcu(struct rcu_head *cb);
/*
- * Schedule an RCU callback if no RCU callback is pending. Must be called with
- * the graph lock held.
- */
-static void call_rcu_zapped(struct pending_free *pf)
+* See if we need to queue an RCU callback, must called with
+* the lockdep lock held, returns false if either we don't have
+* any pending free or the callback is already scheduled.
+* Otherwise, a call_rcu() must follow this function call.
+*/
+static bool prepare_call_rcu_zapped(struct pending_free *pf)
{
WARN_ON_ONCE(inside_selftest());
if (list_empty(&pf->zapped))
- return;
+ return false;
if (delayed_free.scheduled)
- return;
+ return false;
delayed_free.scheduled = true;
WARN_ON_ONCE(delayed_free.pf + delayed_free.index != pf);
delayed_free.index ^= 1;
- call_rcu(&delayed_free.rcu_head, free_zapped_rcu);
+ return true;
}
/* The caller must hold the graph lock. May be called from RCU context. */
@@ -6086,6 +6358,7 @@ static void free_zapped_rcu(struct rcu_head *ch)
{
struct pending_free *pf;
unsigned long flags;
+ bool need_callback;
if (WARN_ON_ONCE(ch != &delayed_free.rcu_head))
return;
@@ -6097,14 +6370,18 @@ static void free_zapped_rcu(struct rcu_head *ch)
pf = delayed_free.pf + (delayed_free.index ^ 1);
__free_zapped_classes(pf);
delayed_free.scheduled = false;
+ need_callback =
+ prepare_call_rcu_zapped(delayed_free.pf + delayed_free.index);
+ lockdep_unlock();
+ raw_local_irq_restore(flags);
/*
- * If there's anything on the open list, close and start a new callback.
- */
- call_rcu_zapped(delayed_free.pf + delayed_free.index);
+ * If there's pending free and its callback has not been scheduled,
+ * queue an RCU callback.
+ */
+ if (need_callback)
+ call_rcu(&delayed_free.rcu_head, free_zapped_rcu);
- lockdep_unlock();
- raw_local_irq_restore(flags);
}
/*
@@ -6144,6 +6421,7 @@ static void lockdep_free_key_range_reg(void *start, unsigned long size)
{
struct pending_free *pf;
unsigned long flags;
+ bool need_callback;
init_data_structures_once();
@@ -6151,10 +6429,11 @@ static void lockdep_free_key_range_reg(void *start, unsigned long size)
lockdep_lock();
pf = get_pending_free();
__lockdep_free_key_range(pf, start, size);
- call_rcu_zapped(pf);
+ need_callback = prepare_call_rcu_zapped(pf);
lockdep_unlock();
raw_local_irq_restore(flags);
-
+ if (need_callback)
+ call_rcu(&delayed_free.rcu_head, free_zapped_rcu);
/*
* Wait for any possible iterators from look_up_lock_class() to pass
* before continuing to free the memory they refer to.
@@ -6248,6 +6527,7 @@ static void lockdep_reset_lock_reg(struct lockdep_map *lock)
struct pending_free *pf;
unsigned long flags;
int locked;
+ bool need_callback = false;
raw_local_irq_save(flags);
locked = graph_lock();
@@ -6256,11 +6536,13 @@ static void lockdep_reset_lock_reg(struct lockdep_map *lock)
pf = get_pending_free();
__lockdep_reset_lock(pf, lock);
- call_rcu_zapped(pf);
+ need_callback = prepare_call_rcu_zapped(pf);
graph_unlock();
out_irq:
raw_local_irq_restore(flags);
+ if (need_callback)
+ call_rcu(&delayed_free.rcu_head, free_zapped_rcu);
}
/*
@@ -6290,7 +6572,13 @@ void lockdep_reset_lock(struct lockdep_map *lock)
lockdep_reset_lock_reg(lock);
}
-/* Unregister a dynamically allocated key. */
+/*
+ * Unregister a dynamically allocated key.
+ *
+ * Unlike lockdep_register_key(), a search is always done to find a matching
+ * key irrespective of debug_locks to avoid potential invalid access to freed
+ * memory in lock_class entry.
+ */
void lockdep_unregister_key(struct lock_class_key *key)
{
struct hlist_head *hash_head = keyhashentry(key);
@@ -6298,6 +6586,7 @@ void lockdep_unregister_key(struct lock_class_key *key)
struct pending_free *pf;
unsigned long flags;
bool found = false;
+ bool need_callback = false;
might_sleep();
@@ -6305,10 +6594,8 @@ void lockdep_unregister_key(struct lock_class_key *key)
return;
raw_local_irq_save(flags);
- if (!graph_lock())
- goto out_irq;
+ lockdep_lock();
- pf = get_pending_free();
hlist_for_each_entry_rcu(k, hash_head, hash_entry) {
if (k == key) {
hlist_del_rcu(&k->hash_entry);
@@ -6316,31 +6603,45 @@ void lockdep_unregister_key(struct lock_class_key *key)
break;
}
}
- WARN_ON_ONCE(!found);
- __lockdep_free_key_range(pf, key, 1);
- call_rcu_zapped(pf);
- graph_unlock();
-out_irq:
+ WARN_ON_ONCE(!found && debug_locks);
+ if (found) {
+ pf = get_pending_free();
+ __lockdep_free_key_range(pf, key, 1);
+ need_callback = prepare_call_rcu_zapped(pf);
+ nr_dynamic_keys--;
+ }
+ lockdep_unlock();
raw_local_irq_restore(flags);
- /* Wait until is_dynamic_key() has finished accessing k->hash_entry. */
- synchronize_rcu();
+ if (need_callback)
+ call_rcu(&delayed_free.rcu_head, free_zapped_rcu);
+
+ /*
+ * Wait until is_dynamic_key() has finished accessing k->hash_entry.
+ *
+ * Some operations like __qdisc_destroy() will call this in a debug
+ * kernel, and the network traffic is disabled while waiting, hence
+ * the delay of the wait matters in debugging cases. Currently use a
+ * synchronize_rcu_expedited() to speed up the wait at the cost of
+ * system IPIs. TODO: Replace RCU with hazptr for this.
+ */
+ synchronize_rcu_expedited();
}
EXPORT_SYMBOL_GPL(lockdep_unregister_key);
void __init lockdep_init(void)
{
- printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n");
+ pr_info("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n");
- printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES);
- printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH);
- printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS);
- printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE);
- printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES);
- printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS);
- printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE);
+ pr_info("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES);
+ pr_info("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH);
+ pr_info("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS);
+ pr_info("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE);
+ pr_info("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES);
+ pr_info("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS);
+ pr_info("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE);
- printk(" memory used by lock dependency info: %zu kB\n",
+ pr_info(" memory used by lock dependency info: %zu kB\n",
(sizeof(lock_classes) +
sizeof(lock_classes_in_use) +
sizeof(classhash_table) +
@@ -6358,12 +6659,12 @@ void __init lockdep_init(void)
);
#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
- printk(" memory used for stack traces: %zu kB\n",
+ pr_info(" memory used for stack traces: %zu kB\n",
(sizeof(stack_trace) + sizeof(stack_trace_hash)) / 1024
);
#endif
- printk(" per task-struct memory footprint: %zu bytes\n",
+ pr_info(" per task-struct memory footprint: %zu bytes\n",
sizeof(((struct task_struct *)NULL)->held_locks));
}
@@ -6376,6 +6677,8 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
if (debug_locks_silent)
return;
+ nbcon_cpu_emergency_enter();
+
pr_warn("\n");
pr_warn("=========================\n");
pr_warn("WARNING: held lock freed!\n");
@@ -6388,6 +6691,8 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
pr_warn("\nstack backtrace:\n");
dump_stack();
+
+ nbcon_cpu_emergency_exit();
}
static inline int not_in_range(const void* mem_from, unsigned long mem_len,
@@ -6434,6 +6739,8 @@ static void print_held_locks_bug(void)
if (debug_locks_silent)
return;
+ nbcon_cpu_emergency_enter();
+
pr_warn("\n");
pr_warn("====================================\n");
pr_warn("WARNING: %s/%d still has locks held!\n",
@@ -6443,6 +6750,8 @@ static void print_held_locks_bug(void)
lockdep_print_held_locks(current);
pr_warn("\nstack backtrace:\n");
dump_stack();
+
+ nbcon_cpu_emergency_exit();
}
void debug_check_no_locks_held(void)
@@ -6500,6 +6809,7 @@ asmlinkage __visible void lockdep_sys_exit(void)
if (unlikely(curr->lockdep_depth)) {
if (!debug_locks_off())
return;
+ nbcon_cpu_emergency_enter();
pr_warn("\n");
pr_warn("================================================\n");
pr_warn("WARNING: lock held when returning to user space!\n");
@@ -6508,6 +6818,7 @@ asmlinkage __visible void lockdep_sys_exit(void)
pr_warn("%s/%d is leaving the kernel with locks still held!\n",
curr->comm, curr->pid);
lockdep_print_held_locks(curr);
+ nbcon_cpu_emergency_exit();
}
/*
@@ -6521,8 +6832,10 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
{
struct task_struct *curr = current;
int dl = READ_ONCE(debug_locks);
+ bool rcu = warn_rcu_enter();
/* Note: the following can be executed concurrently, so be careful. */
+ nbcon_cpu_emergency_enter();
pr_warn("\n");
pr_warn("=============================\n");
pr_warn("WARNING: suspicious RCU usage\n");
@@ -6539,7 +6852,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
/*
* If a CPU is in the RCU-free window in idle (ie: in the section
- * between rcu_idle_enter() and rcu_idle_exit(), then RCU
+ * between ct_idle_enter() and ct_idle_exit(), then RCU
* considers that CPU to be in an "extended quiescent state",
* which means that RCU will be completely ignoring that CPU.
* Therefore, rcu_read_lock() and friends have absolutely no
@@ -6561,5 +6874,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
lockdep_print_held_locks(curr);
pr_warn("\nstack backtrace:\n");
dump_stack();
+ nbcon_cpu_emergency_exit();
+ warn_rcu_exit(rcu);
}
EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index ecb8662e7a4e..0e5e6ffe91a3 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -47,29 +47,31 @@ enum {
__LOCKF(USED_READ)
};
+enum {
#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE |
-static const unsigned long LOCKF_ENABLED_IRQ =
+ LOCKF_ENABLED_IRQ =
#include "lockdep_states.h"
- 0;
+ 0,
#undef LOCKDEP_STATE
#define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE |
-static const unsigned long LOCKF_USED_IN_IRQ =
+ LOCKF_USED_IN_IRQ =
#include "lockdep_states.h"
- 0;
+ 0,
#undef LOCKDEP_STATE
#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE##_READ |
-static const unsigned long LOCKF_ENABLED_IRQ_READ =
+ LOCKF_ENABLED_IRQ_READ =
#include "lockdep_states.h"
- 0;
+ 0,
#undef LOCKDEP_STATE
#define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE##_READ |
-static const unsigned long LOCKF_USED_IN_IRQ_READ =
+ LOCKF_USED_IN_IRQ_READ =
#include "lockdep_states.h"
- 0;
+ 0,
#undef LOCKDEP_STATE
+};
#define LOCKF_ENABLED_IRQ_ALL (LOCKF_ENABLED_IRQ | LOCKF_ENABLED_IRQ_READ)
#define LOCKF_USED_IN_IRQ_ALL (LOCKF_USED_IN_IRQ | LOCKF_USED_IN_IRQ_READ)
@@ -119,9 +121,9 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ =
#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
-#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
+#define AVG_LOCKDEP_CHAIN_DEPTH 5
+#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS * AVG_LOCKDEP_CHAIN_DEPTH)
-extern struct list_head all_lock_classes;
extern struct lock_chain lock_chains[];
#define LOCK_USAGE_CHARS (2*XXX_LOCK_USAGE_STATES + 1)
@@ -138,6 +140,7 @@ extern unsigned long nr_lock_classes;
extern unsigned long nr_zapped_classes;
extern unsigned long nr_zapped_lock_chains;
extern unsigned long nr_list_entries;
+extern unsigned long nr_dynamic_keys;
long lockdep_next_lockchain(long i);
unsigned long lock_chain_count(void);
extern unsigned long nr_stack_trace_entries;
@@ -151,6 +154,10 @@ extern unsigned int nr_large_chain_blocks;
extern unsigned int max_lockdep_depth;
extern unsigned int max_bfs_queue_depth;
+extern unsigned long max_lock_class_idx;
+
+extern struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
+extern unsigned long lock_classes_in_use[];
#ifdef CONFIG_PROVE_LOCKING
extern unsigned long lockdep_count_forward_deps(struct lock_class *);
@@ -205,7 +212,6 @@ struct lockdep_stats {
};
DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats);
-extern struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
#define __debug_atomic_inc(ptr) \
this_cpu_inc(lockdep_stats.ptr);
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index b8d9a050c337..1916db9aa46b 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -24,14 +24,33 @@
#include "lockdep_internals.h"
+/*
+ * Since iteration of lock_classes is done without holding the lockdep lock,
+ * it is not safe to iterate all_lock_classes list directly as the iteration
+ * may branch off to free_lock_classes or the zapped list. Iteration is done
+ * directly on the lock_classes array by checking the lock_classes_in_use
+ * bitmap and max_lock_class_idx.
+ */
+#define iterate_lock_classes(idx, class) \
+ for (idx = 0, class = lock_classes; idx <= max_lock_class_idx; \
+ idx++, class++)
+
static void *l_next(struct seq_file *m, void *v, loff_t *pos)
{
- return seq_list_next(v, &all_lock_classes, pos);
+ struct lock_class *class = v;
+
+ ++class;
+ *pos = class - lock_classes;
+ return (*pos > max_lock_class_idx) ? NULL : class;
}
static void *l_start(struct seq_file *m, loff_t *pos)
{
- return seq_list_start_head(&all_lock_classes, *pos);
+ unsigned long idx = *pos;
+
+ if (idx > max_lock_class_idx)
+ return NULL;
+ return lock_classes + idx;
}
static void l_stop(struct seq_file *m, void *v)
@@ -57,14 +76,16 @@ static void print_name(struct seq_file *m, struct lock_class *class)
static int l_show(struct seq_file *m, void *v)
{
- struct lock_class *class = list_entry(v, struct lock_class, lock_entry);
+ struct lock_class *class = v;
struct lock_list *entry;
char usage[LOCK_USAGE_CHARS];
+ int idx = class - lock_classes;
- if (v == &all_lock_classes) {
+ if (v == lock_classes)
seq_printf(m, "all lock classes:\n");
+
+ if (!test_bit(idx, lock_classes_in_use))
return 0;
- }
seq_printf(m, "%p", class->key);
#ifdef CONFIG_DEBUG_LOCKDEP
@@ -220,8 +241,11 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
#ifdef CONFIG_PROVE_LOCKING
struct lock_class *class;
+ unsigned long idx;
- list_for_each_entry(class, &all_lock_classes, lock_entry) {
+ iterate_lock_classes(idx, class) {
+ if (!test_bit(idx, lock_classes_in_use))
+ continue;
if (class->usage_mask == 0)
nr_unused++;
@@ -254,6 +278,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
sum_forward_deps += lockdep_count_forward_deps(class);
}
+
#ifdef CONFIG_DEBUG_LOCKDEP
DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused);
#endif
@@ -261,6 +286,8 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
#endif
seq_printf(m, " lock-classes: %11lu [max: %lu]\n",
nr_lock_classes, MAX_LOCKDEP_KEYS);
+ seq_printf(m, " dynamic-keys: %11lu\n",
+ nr_dynamic_keys);
seq_printf(m, " direct dependencies: %11lu [max: %lu]\n",
nr_list_entries, MAX_LOCKDEP_ENTRIES);
seq_printf(m, " indirect dependencies: %11lu\n",
@@ -345,6 +372,8 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
seq_printf(m, " max bfs queue depth: %11u\n",
max_bfs_queue_depth);
#endif
+ seq_printf(m, " max lock class index: %11lu\n",
+ max_lock_class_idx);
lockdep_stats_debug_show(m);
seq_printf(m, " debug_locks: %11u\n",
debug_locks);
@@ -397,7 +426,7 @@ static void seq_line(struct seq_file *m, char c, int offset, int length)
for (i = 0; i < offset; i++)
seq_puts(m, " ");
for (i = 0; i < length; i++)
- seq_printf(m, "%c", c);
+ seq_putc(m, c);
seq_puts(m, "\n");
}
@@ -413,7 +442,7 @@ static void snprint_time(char *buf, size_t bufsiz, s64 nr)
static void seq_time(struct seq_file *m, s64 time)
{
- char num[15];
+ char num[22];
snprint_time(num, sizeof(num), time);
seq_printf(m, " %14s", num);
@@ -622,12 +651,16 @@ static int lock_stat_open(struct inode *inode, struct file *file)
if (!res) {
struct lock_stat_data *iter = data->stats;
struct seq_file *m = file->private_data;
+ unsigned long idx;
- list_for_each_entry(class, &all_lock_classes, lock_entry) {
+ iterate_lock_classes(idx, class) {
+ if (!test_bit(idx, lock_classes_in_use))
+ continue;
iter->class = class;
- iter->stats = lock_stats(class);
+ lock_stats(class, &iter->stats);
iter++;
}
+
data->iter_end = iter;
sort(data->stats, data->iter_end - data->stats,
@@ -645,6 +678,7 @@ static ssize_t lock_stat_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
struct lock_class *class;
+ unsigned long idx;
char c;
if (count) {
@@ -654,8 +688,11 @@ static ssize_t lock_stat_write(struct file *file, const char __user *buf,
if (c != '0')
return count;
- list_for_each_entry(class, &all_lock_classes, lock_entry)
+ iterate_lock_classes(idx, class) {
+ if (!test_bit(idx, lock_classes_in_use))
+ continue;
clear_lock_stats(class);
+ }
}
return count;
}
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 9c2fb613a55d..6567e5eeacc0 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -30,30 +30,84 @@
#include <linux/torture.h>
#include <linux/reboot.h>
+MODULE_DESCRIPTION("torture test facility for locking");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
-torture_param(int, nwriters_stress, -1,
- "Number of write-locking stress-test threads");
-torture_param(int, nreaders_stress, -1,
- "Number of read-locking stress-test threads");
+torture_param(int, acq_writer_lim, 0, "Write_acquisition time limit (jiffies).");
+torture_param(int, call_rcu_chains, 0, "Self-propagate call_rcu() chains during test (0=disable).");
+torture_param(int, long_hold, 100, "Do occasional long hold of lock (ms), 0=disable");
+torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)");
+torture_param(int, nreaders_stress, -1, "Number of read-locking stress-test threads");
+torture_param(int, nwriters_stress, -1, "Number of write-locking stress-test threads");
torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
-torture_param(int, onoff_interval, 0,
- "Time between CPU hotplugs (s), 0=disable");
-torture_param(int, shuffle_interval, 3,
- "Number of jiffies between shuffles, 0=disable");
+torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (s), 0=disable");
+torture_param(int, rt_boost, 2,
+ "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types.");
+torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens.");
+torture_param(int, shuffle_interval, 3, "Number of jiffies between shuffles, 0=disable");
torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable.");
-torture_param(int, stat_interval, 60,
- "Number of seconds between stats printk()s");
+torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s");
torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable");
-torture_param(int, verbose, 1,
- "Enable verbose debugging printk()s");
+torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
+torture_param(int, writer_fifo, 0, "Run writers at sched_set_fifo() priority");
+/* Going much higher trips "BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!" errors */
+#define MAX_NESTED_LOCKS 8
-static char *torture_type = "spin_lock";
+static char *torture_type = IS_ENABLED(CONFIG_PREEMPT_RT) ? "raw_spin_lock" : "spin_lock";
module_param(torture_type, charp, 0444);
MODULE_PARM_DESC(torture_type,
"Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)");
+static cpumask_var_t bind_readers; // Bind the readers to the specified set of CPUs.
+static cpumask_var_t bind_writers; // Bind the writers to the specified set of CPUs.
+
+// Parse a cpumask kernel parameter. If there are more users later on,
+// this might need to got to a more central location.
+static int param_set_cpumask(const char *val, const struct kernel_param *kp)
+{
+ cpumask_var_t *cm_bind = kp->arg;
+ int ret;
+ char *s;
+
+ if (!alloc_cpumask_var(cm_bind, GFP_KERNEL)) {
+ s = "Out of memory";
+ ret = -ENOMEM;
+ goto out_err;
+ }
+ ret = cpulist_parse(val, *cm_bind);
+ if (!ret)
+ return ret;
+ s = "Bad CPU range";
+out_err:
+ pr_warn("%s: %s, all CPUs set\n", kp->name, s);
+ cpumask_setall(*cm_bind);
+ return ret;
+}
+
+// Output a cpumask kernel parameter.
+static int param_get_cpumask(char *buffer, const struct kernel_param *kp)
+{
+ cpumask_var_t *cm_bind = kp->arg;
+
+ return sprintf(buffer, "%*pbl", cpumask_pr_args(*cm_bind));
+}
+
+static bool cpumask_nonempty(cpumask_var_t mask)
+{
+ return cpumask_available(mask) && !cpumask_empty(mask);
+}
+
+static const struct kernel_param_ops lt_bind_ops = {
+ .set = param_set_cpumask,
+ .get = param_get_cpumask,
+};
+
+module_param_cb(bind_readers, &lt_bind_ops, &bind_readers, 0444);
+module_param_cb(bind_writers, &lt_bind_ops, &bind_writers, 0444);
+
+long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask, bool dowarn);
+
static struct task_struct *stats_task;
static struct task_struct **writer_tasks;
static struct task_struct **reader_tasks;
@@ -67,6 +121,12 @@ struct lock_stress_stats {
long n_lock_acquired;
};
+struct call_rcu_chain {
+ struct rcu_head crc_rh;
+ bool crc_stop;
+};
+struct call_rcu_chain *call_rcu_chain_list;
+
/* Forward reference. */
static void lock_torture_cleanup(void);
@@ -76,10 +136,12 @@ static void lock_torture_cleanup(void);
struct lock_torture_ops {
void (*init)(void);
void (*exit)(void);
+ int (*nested_lock)(int tid, u32 lockset);
int (*writelock)(int tid);
void (*write_delay)(struct torture_random_state *trsp);
void (*task_boost)(struct torture_random_state *trsp);
void (*writeunlock)(int tid);
+ void (*nested_unlock)(int tid, u32 lockset);
int (*readlock)(int tid);
void (*read_delay)(struct torture_random_state *trsp);
void (*readunlock)(int tid);
@@ -112,12 +174,9 @@ static int torture_lock_busted_write_lock(int tid __maybe_unused)
static void torture_lock_busted_write_delay(struct torture_random_state *trsp)
{
- const unsigned long longdelay_ms = 100;
-
/* We want a long delay occasionally to force massive contention. */
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms);
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+ mdelay(long_hold);
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
torture_preempt_schedule(); /* Allow test to be preempted. */
}
@@ -127,15 +186,50 @@ static void torture_lock_busted_write_unlock(int tid __maybe_unused)
/* BUGGY, do not use in real life!!! */
}
-static void torture_boost_dummy(struct torture_random_state *trsp)
+static void __torture_rt_boost(struct torture_random_state *trsp)
{
- /* Only rtmutexes care about priority */
+ const unsigned int factor = rt_boost_factor;
+
+ if (!rt_task(current)) {
+ /*
+ * Boost priority once every rt_boost_factor operations. When
+ * the task tries to take the lock, the rtmutex it will account
+ * for the new priority, and do any corresponding pi-dance.
+ */
+ if (trsp && !(torture_random(trsp) %
+ (cxt.nrealwriters_stress * factor))) {
+ sched_set_fifo(current);
+ } else /* common case, do nothing */
+ return;
+ } else {
+ /*
+ * The task will remain boosted for another 10 * rt_boost_factor
+ * operations, then restored back to its original prio, and so
+ * forth.
+ *
+ * When @trsp is nil, we want to force-reset the task for
+ * stopping the kthread.
+ */
+ if (!trsp || !(torture_random(trsp) %
+ (cxt.nrealwriters_stress * factor * 2))) {
+ sched_set_normal(current, 0);
+ } else /* common case, do nothing */
+ return;
+ }
+}
+
+static void torture_rt_boost(struct torture_random_state *trsp)
+{
+ if (rt_boost != 2)
+ return;
+
+ __torture_rt_boost(trsp);
}
static struct lock_torture_ops lock_busted_ops = {
.writelock = torture_lock_busted_write_lock,
.write_delay = torture_lock_busted_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_lock_busted_write_unlock,
.readlock = NULL,
.read_delay = NULL,
@@ -155,16 +249,17 @@ __acquires(torture_spinlock)
static void torture_spin_lock_write_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 2;
- const unsigned long longdelay_ms = 100;
+ unsigned long j;
/* We want a short delay mostly to emulate likely code, and
* we want a long delay occasionally to force massive contention.
*/
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms);
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2 * shortdelay_us)))
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) {
+ j = jiffies;
+ mdelay(long_hold);
+ pr_alert("%s: delay = %lu jiffies.\n", __func__, jiffies - j);
+ }
+ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 200 * shortdelay_us)))
udelay(shortdelay_us);
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
torture_preempt_schedule(); /* Allow test to be preempted. */
@@ -179,7 +274,7 @@ __releases(torture_spinlock)
static struct lock_torture_ops spin_lock_ops = {
.writelock = torture_spin_lock_write_lock,
.write_delay = torture_spin_lock_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_spin_lock_write_unlock,
.readlock = NULL,
.read_delay = NULL,
@@ -206,7 +301,7 @@ __releases(torture_spinlock)
static struct lock_torture_ops spin_lock_irq_ops = {
.writelock = torture_spin_lock_write_lock_irq,
.write_delay = torture_spin_lock_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_lock_spin_write_unlock_irq,
.readlock = NULL,
.read_delay = NULL,
@@ -214,6 +309,113 @@ static struct lock_torture_ops spin_lock_irq_ops = {
.name = "spin_lock_irq"
};
+static DEFINE_RAW_SPINLOCK(torture_raw_spinlock);
+
+static int torture_raw_spin_lock_write_lock(int tid __maybe_unused)
+__acquires(torture_raw_spinlock)
+{
+ raw_spin_lock(&torture_raw_spinlock);
+ return 0;
+}
+
+static void torture_raw_spin_lock_write_unlock(int tid __maybe_unused)
+__releases(torture_raw_spinlock)
+{
+ raw_spin_unlock(&torture_raw_spinlock);
+}
+
+static struct lock_torture_ops raw_spin_lock_ops = {
+ .writelock = torture_raw_spin_lock_write_lock,
+ .write_delay = torture_spin_lock_write_delay,
+ .task_boost = torture_rt_boost,
+ .writeunlock = torture_raw_spin_lock_write_unlock,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "raw_spin_lock"
+};
+
+static int torture_raw_spin_lock_write_lock_irq(int tid __maybe_unused)
+__acquires(torture_raw_spinlock)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&torture_raw_spinlock, flags);
+ cxt.cur_ops->flags = flags;
+ return 0;
+}
+
+static void torture_raw_spin_lock_write_unlock_irq(int tid __maybe_unused)
+__releases(torture_raw_spinlock)
+{
+ raw_spin_unlock_irqrestore(&torture_raw_spinlock, cxt.cur_ops->flags);
+}
+
+static struct lock_torture_ops raw_spin_lock_irq_ops = {
+ .writelock = torture_raw_spin_lock_write_lock_irq,
+ .write_delay = torture_spin_lock_write_delay,
+ .task_boost = torture_rt_boost,
+ .writeunlock = torture_raw_spin_lock_write_unlock_irq,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "raw_spin_lock_irq"
+};
+
+#ifdef CONFIG_BPF_SYSCALL
+
+#include <asm/rqspinlock.h>
+static rqspinlock_t rqspinlock;
+
+static int torture_raw_res_spin_write_lock(int tid __maybe_unused)
+{
+ raw_res_spin_lock(&rqspinlock);
+ return 0;
+}
+
+static void torture_raw_res_spin_write_unlock(int tid __maybe_unused)
+{
+ raw_res_spin_unlock(&rqspinlock);
+}
+
+static struct lock_torture_ops raw_res_spin_lock_ops = {
+ .writelock = torture_raw_res_spin_write_lock,
+ .write_delay = torture_spin_lock_write_delay,
+ .task_boost = torture_rt_boost,
+ .writeunlock = torture_raw_res_spin_write_unlock,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "raw_res_spin_lock"
+};
+
+static int torture_raw_res_spin_write_lock_irq(int tid __maybe_unused)
+{
+ unsigned long flags;
+
+ raw_res_spin_lock_irqsave(&rqspinlock, flags);
+ cxt.cur_ops->flags = flags;
+ return 0;
+}
+
+static void torture_raw_res_spin_write_unlock_irq(int tid __maybe_unused)
+{
+ raw_res_spin_unlock_irqrestore(&rqspinlock, cxt.cur_ops->flags);
+}
+
+static struct lock_torture_ops raw_res_spin_lock_irq_ops = {
+ .writelock = torture_raw_res_spin_write_lock_irq,
+ .write_delay = torture_spin_lock_write_delay,
+ .task_boost = torture_rt_boost,
+ .writeunlock = torture_raw_res_spin_write_unlock_irq,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "raw_res_spin_lock_irq"
+};
+
+#endif
+
static DEFINE_RWLOCK(torture_rwlock);
static int torture_rwlock_write_lock(int tid __maybe_unused)
@@ -226,14 +428,12 @@ __acquires(torture_rwlock)
static void torture_rwlock_write_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 2;
- const unsigned long longdelay_ms = 100;
/* We want a short delay mostly to emulate likely code, and
* we want a long delay occasionally to force massive contention.
*/
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms);
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+ mdelay(long_hold);
else
udelay(shortdelay_us);
}
@@ -254,14 +454,12 @@ __acquires(torture_rwlock)
static void torture_rwlock_read_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 10;
- const unsigned long longdelay_ms = 100;
/* We want a short delay mostly to emulate likely code, and
* we want a long delay occasionally to force massive contention.
*/
- if (!(torture_random(trsp) %
- (cxt.nrealreaders_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms);
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealreaders_stress * 2000 * long_hold)))
+ mdelay(long_hold);
else
udelay(shortdelay_us);
}
@@ -275,7 +473,7 @@ __releases(torture_rwlock)
static struct lock_torture_ops rw_lock_ops = {
.writelock = torture_rwlock_write_lock,
.write_delay = torture_rwlock_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_rwlock_write_unlock,
.readlock = torture_rwlock_read_lock,
.read_delay = torture_rwlock_read_delay,
@@ -318,7 +516,7 @@ __releases(torture_rwlock)
static struct lock_torture_ops rw_lock_irq_ops = {
.writelock = torture_rwlock_write_lock_irq,
.write_delay = torture_rwlock_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_rwlock_write_unlock_irq,
.readlock = torture_rwlock_read_lock_irq,
.read_delay = torture_rwlock_read_delay,
@@ -327,6 +525,28 @@ static struct lock_torture_ops rw_lock_irq_ops = {
};
static DEFINE_MUTEX(torture_mutex);
+static struct mutex torture_nested_mutexes[MAX_NESTED_LOCKS];
+static struct lock_class_key nested_mutex_keys[MAX_NESTED_LOCKS];
+
+static void torture_mutex_init(void)
+{
+ int i;
+
+ for (i = 0; i < MAX_NESTED_LOCKS; i++)
+ __mutex_init(&torture_nested_mutexes[i], __func__,
+ &nested_mutex_keys[i]);
+}
+
+static int torture_mutex_nested_lock(int tid __maybe_unused,
+ u32 lockset)
+{
+ int i;
+
+ for (i = 0; i < nested_locks; i++)
+ if (lockset & (1 << i))
+ mutex_lock(&torture_nested_mutexes[i]);
+ return 0;
+}
static int torture_mutex_lock(int tid __maybe_unused)
__acquires(torture_mutex)
@@ -337,14 +557,9 @@ __acquires(torture_mutex)
static void torture_mutex_delay(struct torture_random_state *trsp)
{
- const unsigned long longdelay_ms = 100;
-
/* We want a long delay occasionally to force massive contention. */
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms * 5);
- else
- mdelay(longdelay_ms / 5);
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+ mdelay(long_hold * 5);
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
torture_preempt_schedule(); /* Allow test to be preempted. */
}
@@ -355,11 +570,24 @@ __releases(torture_mutex)
mutex_unlock(&torture_mutex);
}
+static void torture_mutex_nested_unlock(int tid __maybe_unused,
+ u32 lockset)
+{
+ int i;
+
+ for (i = nested_locks - 1; i >= 0; i--)
+ if (lockset & (1 << i))
+ mutex_unlock(&torture_nested_mutexes[i]);
+}
+
static struct lock_torture_ops mutex_lock_ops = {
+ .init = torture_mutex_init,
+ .nested_lock = torture_mutex_nested_lock,
.writelock = torture_mutex_lock,
.write_delay = torture_mutex_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_mutex_unlock,
+ .nested_unlock = torture_mutex_nested_unlock,
.readlock = NULL,
.read_delay = NULL,
.readunlock = NULL,
@@ -456,7 +684,7 @@ static struct lock_torture_ops ww_mutex_lock_ops = {
.exit = torture_ww_mutex_exit,
.writelock = torture_ww_mutex_lock,
.write_delay = torture_mutex_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_ww_mutex_unlock,
.readlock = NULL,
.read_delay = NULL,
@@ -466,59 +694,48 @@ static struct lock_torture_ops ww_mutex_lock_ops = {
#ifdef CONFIG_RT_MUTEXES
static DEFINE_RT_MUTEX(torture_rtmutex);
+static struct rt_mutex torture_nested_rtmutexes[MAX_NESTED_LOCKS];
+static struct lock_class_key nested_rtmutex_keys[MAX_NESTED_LOCKS];
-static int torture_rtmutex_lock(int tid __maybe_unused)
-__acquires(torture_rtmutex)
+static void torture_rtmutex_init(void)
{
- rt_mutex_lock(&torture_rtmutex);
- return 0;
+ int i;
+
+ for (i = 0; i < MAX_NESTED_LOCKS; i++)
+ __rt_mutex_init(&torture_nested_rtmutexes[i], __func__,
+ &nested_rtmutex_keys[i]);
}
-static void torture_rtmutex_boost(struct torture_random_state *trsp)
+static int torture_rtmutex_nested_lock(int tid __maybe_unused,
+ u32 lockset)
{
- const unsigned int factor = 50000; /* yes, quite arbitrary */
+ int i;
- if (!rt_task(current)) {
- /*
- * Boost priority once every ~50k operations. When the
- * task tries to take the lock, the rtmutex it will account
- * for the new priority, and do any corresponding pi-dance.
- */
- if (trsp && !(torture_random(trsp) %
- (cxt.nrealwriters_stress * factor))) {
- sched_set_fifo(current);
- } else /* common case, do nothing */
- return;
- } else {
- /*
- * The task will remain boosted for another ~500k operations,
- * then restored back to its original prio, and so forth.
- *
- * When @trsp is nil, we want to force-reset the task for
- * stopping the kthread.
- */
- if (!trsp || !(torture_random(trsp) %
- (cxt.nrealwriters_stress * factor * 2))) {
- sched_set_normal(current, 0);
- } else /* common case, do nothing */
- return;
- }
+ for (i = 0; i < nested_locks; i++)
+ if (lockset & (1 << i))
+ rt_mutex_lock(&torture_nested_rtmutexes[i]);
+ return 0;
+}
+
+static int torture_rtmutex_lock(int tid __maybe_unused)
+__acquires(torture_rtmutex)
+{
+ rt_mutex_lock(&torture_rtmutex);
+ return 0;
}
static void torture_rtmutex_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 2;
- const unsigned long longdelay_ms = 100;
/*
* We want a short delay mostly to emulate likely code, and
* we want a long delay occasionally to force massive contention.
*/
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+ mdelay(long_hold);
if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms);
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2 * shortdelay_us)))
+ (cxt.nrealwriters_stress * 200 * shortdelay_us)))
udelay(shortdelay_us);
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
torture_preempt_schedule(); /* Allow test to be preempted. */
@@ -530,11 +747,32 @@ __releases(torture_rtmutex)
rt_mutex_unlock(&torture_rtmutex);
}
+static void torture_rt_boost_rtmutex(struct torture_random_state *trsp)
+{
+ if (!rt_boost)
+ return;
+
+ __torture_rt_boost(trsp);
+}
+
+static void torture_rtmutex_nested_unlock(int tid __maybe_unused,
+ u32 lockset)
+{
+ int i;
+
+ for (i = nested_locks - 1; i >= 0; i--)
+ if (lockset & (1 << i))
+ rt_mutex_unlock(&torture_nested_rtmutexes[i]);
+}
+
static struct lock_torture_ops rtmutex_lock_ops = {
+ .init = torture_rtmutex_init,
+ .nested_lock = torture_rtmutex_nested_lock,
.writelock = torture_rtmutex_lock,
.write_delay = torture_rtmutex_delay,
- .task_boost = torture_rtmutex_boost,
+ .task_boost = torture_rt_boost_rtmutex,
.writeunlock = torture_rtmutex_unlock,
+ .nested_unlock = torture_rtmutex_nested_unlock,
.readlock = NULL,
.read_delay = NULL,
.readunlock = NULL,
@@ -552,14 +790,9 @@ __acquires(torture_rwsem)
static void torture_rwsem_write_delay(struct torture_random_state *trsp)
{
- const unsigned long longdelay_ms = 100;
-
/* We want a long delay occasionally to force massive contention. */
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms * 10);
- else
- mdelay(longdelay_ms / 10);
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+ mdelay(long_hold * 10);
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
torture_preempt_schedule(); /* Allow test to be preempted. */
}
@@ -579,14 +812,11 @@ __acquires(torture_rwsem)
static void torture_rwsem_read_delay(struct torture_random_state *trsp)
{
- const unsigned long longdelay_ms = 100;
-
/* We want a long delay occasionally to force massive contention. */
- if (!(torture_random(trsp) %
- (cxt.nrealreaders_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms * 2);
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealreaders_stress * 2000 * long_hold)))
+ mdelay(long_hold * 2);
else
- mdelay(longdelay_ms / 2);
+ mdelay(long_hold / 2);
if (!(torture_random(trsp) % (cxt.nrealreaders_stress * 20000)))
torture_preempt_schedule(); /* Allow test to be preempted. */
}
@@ -600,7 +830,7 @@ __releases(torture_rwsem)
static struct lock_torture_ops rwsem_lock_ops = {
.writelock = torture_rwsem_down_write,
.write_delay = torture_rwsem_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_rwsem_up_write,
.readlock = torture_rwsem_down_read,
.read_delay = torture_rwsem_read_delay,
@@ -652,7 +882,7 @@ static struct lock_torture_ops percpu_rwsem_lock_ops = {
.exit = torture_percpu_rwsem_exit,
.writelock = torture_percpu_rwsem_down_write,
.write_delay = torture_rwsem_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_percpu_rwsem_up_write,
.readlock = torture_percpu_rwsem_down_read,
.read_delay = torture_rwsem_read_delay,
@@ -666,30 +896,63 @@ static struct lock_torture_ops percpu_rwsem_lock_ops = {
*/
static int lock_torture_writer(void *arg)
{
+ unsigned long j;
+ unsigned long j1;
+ u32 lockset_mask;
struct lock_stress_stats *lwsp = arg;
- int tid = lwsp - cxt.lwsa;
DEFINE_TORTURE_RANDOM(rand);
+ bool skip_main_lock;
+ int tid = lwsp - cxt.lwsa;
VERBOSE_TOROUT_STRING("lock_torture_writer task started");
- set_user_nice(current, MAX_NICE);
+ if (!rt_task(current))
+ set_user_nice(current, MAX_NICE);
do {
if ((torture_random(&rand) & 0xfffff) == 0)
schedule_timeout_uninterruptible(1);
+ lockset_mask = torture_random(&rand);
+ /*
+ * When using nested_locks, we want to occasionally
+ * skip the main lock so we can avoid always serializing
+ * the lock chains on that central lock. By skipping the
+ * main lock occasionally, we can create different
+ * contention patterns (allowing for multiple disjoint
+ * blocked trees)
+ */
+ skip_main_lock = (nested_locks &&
+ !(torture_random(&rand) % 100));
+
cxt.cur_ops->task_boost(&rand);
- cxt.cur_ops->writelock(tid);
- if (WARN_ON_ONCE(lock_is_write_held))
- lwsp->n_lock_fail++;
- lock_is_write_held = true;
- if (WARN_ON_ONCE(atomic_read(&lock_is_read_held)))
- lwsp->n_lock_fail++; /* rare, but... */
+ if (cxt.cur_ops->nested_lock)
+ cxt.cur_ops->nested_lock(tid, lockset_mask);
+
+ if (!skip_main_lock) {
+ if (acq_writer_lim > 0)
+ j = jiffies;
+ cxt.cur_ops->writelock(tid);
+ if (WARN_ON_ONCE(lock_is_write_held))
+ lwsp->n_lock_fail++;
+ lock_is_write_held = true;
+ if (WARN_ON_ONCE(atomic_read(&lock_is_read_held)))
+ lwsp->n_lock_fail++; /* rare, but... */
+ if (acq_writer_lim > 0) {
+ j1 = jiffies;
+ WARN_ONCE(time_after(j1, j + acq_writer_lim),
+ "%s: Lock acquisition took %lu jiffies.\n",
+ __func__, j1 - j);
+ }
+ lwsp->n_lock_acquired++;
- lwsp->n_lock_acquired++;
- cxt.cur_ops->write_delay(&rand);
- lock_is_write_held = false;
- WRITE_ONCE(last_lock_release, jiffies);
- cxt.cur_ops->writeunlock(tid);
+ cxt.cur_ops->write_delay(&rand);
+
+ lock_is_write_held = false;
+ WRITE_ONCE(last_lock_release, jiffies);
+ cxt.cur_ops->writeunlock(tid);
+ }
+ if (cxt.cur_ops->nested_unlock)
+ cxt.cur_ops->nested_unlock(tid, lockset_mask);
stutter_wait("lock_torture_writer");
} while (!torture_must_stop());
@@ -825,16 +1088,69 @@ static int lock_torture_stats(void *arg)
return 0;
}
+
static inline void
lock_torture_print_module_parms(struct lock_torture_ops *cur_ops,
const char *tag)
{
+ static cpumask_t cpumask_all;
+ cpumask_t *rcmp = cpumask_nonempty(bind_readers) ? bind_readers : &cpumask_all;
+ cpumask_t *wcmp = cpumask_nonempty(bind_writers) ? bind_writers : &cpumask_all;
+
+ cpumask_setall(&cpumask_all);
pr_alert("%s" TORTURE_FLAG
- "--- %s%s: nwriters_stress=%d nreaders_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n",
+ "--- %s%s: acq_writer_lim=%d bind_readers=%*pbl bind_writers=%*pbl call_rcu_chains=%d long_hold=%d nested_locks=%d nreaders_stress=%d nwriters_stress=%d onoff_holdoff=%d onoff_interval=%d rt_boost=%d rt_boost_factor=%d shuffle_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d verbose=%d writer_fifo=%d\n",
torture_type, tag, cxt.debug_lock ? " [debug]": "",
- cxt.nrealwriters_stress, cxt.nrealreaders_stress, stat_interval,
- verbose, shuffle_interval, stutter, shutdown_secs,
- onoff_interval, onoff_holdoff);
+ acq_writer_lim, cpumask_pr_args(rcmp), cpumask_pr_args(wcmp),
+ call_rcu_chains, long_hold, nested_locks, cxt.nrealreaders_stress,
+ cxt.nrealwriters_stress, onoff_holdoff, onoff_interval, rt_boost,
+ rt_boost_factor, shuffle_interval, shutdown_secs, stat_interval, stutter,
+ verbose, writer_fifo);
+}
+
+// If requested, maintain call_rcu() chains to keep a grace period always
+// in flight. These increase the probability of getting an RCU CPU stall
+// warning and associated diagnostics when a locking primitive stalls.
+
+static void call_rcu_chain_cb(struct rcu_head *rhp)
+{
+ struct call_rcu_chain *crcp = container_of(rhp, struct call_rcu_chain, crc_rh);
+
+ if (!smp_load_acquire(&crcp->crc_stop)) {
+ (void)start_poll_synchronize_rcu(); // Start one grace period...
+ call_rcu(&crcp->crc_rh, call_rcu_chain_cb); // ... and later start another.
+ }
+}
+
+// Start the requested number of call_rcu() chains.
+static int call_rcu_chain_init(void)
+{
+ int i;
+
+ if (call_rcu_chains <= 0)
+ return 0;
+ call_rcu_chain_list = kcalloc(call_rcu_chains, sizeof(*call_rcu_chain_list), GFP_KERNEL);
+ if (!call_rcu_chain_list)
+ return -ENOMEM;
+ for (i = 0; i < call_rcu_chains; i++) {
+ call_rcu_chain_list[i].crc_stop = false;
+ call_rcu(&call_rcu_chain_list[i].crc_rh, call_rcu_chain_cb);
+ }
+ return 0;
+}
+
+// Stop all of the call_rcu() chains.
+static void call_rcu_chain_cleanup(void)
+{
+ int i;
+
+ if (!call_rcu_chain_list)
+ return;
+ for (i = 0; i < call_rcu_chains; i++)
+ smp_store_release(&call_rcu_chain_list[i].crc_stop, true);
+ rcu_barrier();
+ kfree(call_rcu_chain_list);
+ call_rcu_chain_list = NULL;
}
static void lock_torture_cleanup(void)
@@ -856,8 +1172,7 @@ static void lock_torture_cleanup(void)
if (writer_tasks) {
for (i = 0; i < cxt.nrealwriters_stress; i++)
- torture_stop_kthread(lock_torture_writer,
- writer_tasks[i]);
+ torture_stop_kthread(lock_torture_writer, writer_tasks[i]);
kfree(writer_tasks);
writer_tasks = NULL;
}
@@ -888,12 +1203,18 @@ static void lock_torture_cleanup(void)
kfree(cxt.lrsa);
cxt.lrsa = NULL;
+ call_rcu_chain_cleanup();
+
end:
if (cxt.init_called) {
if (cxt.cur_ops->exit)
cxt.cur_ops->exit();
cxt.init_called = false;
}
+
+ free_cpumask_var(bind_readers);
+ free_cpumask_var(bind_writers);
+
torture_cleanup_end();
}
@@ -904,6 +1225,10 @@ static int __init lock_torture_init(void)
static struct lock_torture_ops *torture_ops[] = {
&lock_busted_ops,
&spin_lock_ops, &spin_lock_irq_ops,
+ &raw_spin_lock_ops, &raw_spin_lock_irq_ops,
+#ifdef CONFIG_BPF_SYSCALL
+ &raw_res_spin_lock_ops, &raw_res_spin_lock_irq_ops,
+#endif
&rw_lock_ops, &rw_lock_irq_ops,
&mutex_lock_ops,
&ww_mutex_lock_ops,
@@ -1016,6 +1341,10 @@ static int __init lock_torture_init(void)
}
}
+ firsterr = call_rcu_chain_init();
+ if (torture_init_error(firsterr))
+ goto unwind;
+
lock_torture_print_module_parms(cxt.cur_ops, "Start of test");
/* Prepare torture context. */
@@ -1053,6 +1382,10 @@ static int __init lock_torture_init(void)
}
}
+ /* cap nested_locks to MAX_NESTED_LOCKS */
+ if (nested_locks > MAX_NESTED_LOCKS)
+ nested_locks = MAX_NESTED_LOCKS;
+
if (cxt.cur_ops->readlock) {
reader_tasks = kcalloc(cxt.nrealreaders_stress,
sizeof(reader_tasks[0]),
@@ -1080,10 +1413,13 @@ static int __init lock_torture_init(void)
goto create_reader;
/* Create writer. */
- firsterr = torture_create_kthread(lock_torture_writer, &cxt.lwsa[i],
- writer_tasks[i]);
+ firsterr = torture_create_kthread_cb(lock_torture_writer, &cxt.lwsa[i],
+ writer_tasks[i],
+ writer_fifo ? sched_set_fifo : NULL);
if (torture_init_error(firsterr))
goto unwind;
+ if (cpumask_nonempty(bind_writers))
+ torture_sched_setaffinity(writer_tasks[i]->pid, bind_writers, true);
create_reader:
if (cxt.cur_ops->readlock == NULL || (j >= cxt.nrealreaders_stress))
@@ -1093,6 +1429,8 @@ static int __init lock_torture_init(void)
reader_tasks[j]);
if (torture_init_error(firsterr))
goto unwind;
+ if (cpumask_nonempty(bind_readers))
+ torture_sched_setaffinity(reader_tasks[j]->pid, bind_readers, true);
}
if (stat_interval > 0) {
firsterr = torture_create_kthread(lock_torture_stats, NULL,
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 85251d8771d9..5c92ba199b90 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -15,12 +15,6 @@
#include <asm/mcs_spinlock.h>
-struct mcs_spinlock {
- struct mcs_spinlock *next;
- int locked; /* 1 if lock acquired */
- int count; /* nesting count, see qspinlock.c */
-};
-
#ifndef arch_mcs_spin_lock_contended
/*
* Using smp_cond_load_acquire() provides the acquire semantics
@@ -30,9 +24,7 @@ struct mcs_spinlock {
* spinning, and smp_cond_load_acquire() provides that behavior.
*/
#define arch_mcs_spin_lock_contended(l) \
-do { \
- smp_cond_load_acquire(l, VAL); \
-} while (0)
+ smp_cond_load_acquire(l, VAL)
#endif
#ifndef arch_mcs_spin_unlock_contended
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index bc8abb8549d2..2c6b02d4699b 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -12,6 +12,7 @@
*/
#include <linux/mutex.h>
#include <linux/delay.h>
+#include <linux/device.h>
#include <linux/export.h>
#include <linux/poison.h>
#include <linux/sched.h>
@@ -52,17 +53,18 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
{
lockdep_assert_held(&lock->wait_lock);
- /* Mark the current thread as blocked on the lock: */
- task->blocked_on = waiter;
+ /* Current thread can't be already blocked (since it's executing!) */
+ DEBUG_LOCKS_WARN_ON(__get_task_blocked_on(task));
}
void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
struct task_struct *task)
{
+ struct mutex *blocked_on = __get_task_blocked_on(task);
+
DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
DEBUG_LOCKS_WARN_ON(waiter->task != task);
- DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter);
- task->blocked_on = NULL;
+ DEBUG_LOCKS_WARN_ON(blocked_on && blocked_on != lock);
INIT_LIST_HEAD(&waiter->list);
waiter->task = NULL;
@@ -76,19 +78,22 @@ void debug_mutex_unlock(struct mutex *lock)
}
}
-void debug_mutex_init(struct mutex *lock, const char *name,
- struct lock_class_key *key)
+void debug_mutex_init(struct mutex *lock)
{
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- /*
- * Make sure we are not reinitializing a held lock:
- */
- debug_check_no_locks_freed((void *)lock, sizeof(*lock));
- lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP);
-#endif
lock->magic = lock;
}
+static void devm_mutex_release(void *res)
+{
+ mutex_destroy(res);
+}
+
+int __devm_mutex_init(struct device *dev, struct mutex *lock)
+{
+ return devm_add_action_or_reset(dev, devm_mutex_release, lock);
+}
+EXPORT_SYMBOL_GPL(__devm_mutex_init);
+
/***
* mutex_destroy - mark a mutex unusable
* @lock: the mutex to be destroyed
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 5e3585950ec8..2a1d165b3167 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -29,6 +29,10 @@
#include <linux/interrupt.h>
#include <linux/debug_locks.h>
#include <linux/osq_lock.h>
+#include <linux/hung_task.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/lock.h>
#ifndef CONFIG_PREEMPT_RT
#include "mutex.h"
@@ -39,8 +43,7 @@
# define MUTEX_WARN_ON(cond)
#endif
-void
-__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
+static void __mutex_init_generic(struct mutex *lock)
{
atomic_long_set(&lock->owner, 0);
raw_spin_lock_init(&lock->wait_lock);
@@ -48,34 +51,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
osq_lock_init(&lock->osq);
#endif
-
- debug_mutex_init(lock, name, key);
-}
-EXPORT_SYMBOL(__mutex_init);
-
-/*
- * @owner: contains: 'struct task_struct *' to the current lock owner,
- * NULL means not owned. Since task_struct pointers are aligned at
- * at least L1_CACHE_BYTES, we have low bits to store extra state.
- *
- * Bit0 indicates a non-empty waiter list; unlock must issue a wakeup.
- * Bit1 indicates unlock needs to hand the lock to the top-waiter
- * Bit2 indicates handoff has been done and we're waiting for pickup.
- */
-#define MUTEX_FLAG_WAITERS 0x01
-#define MUTEX_FLAG_HANDOFF 0x02
-#define MUTEX_FLAG_PICKUP 0x04
-
-#define MUTEX_FLAGS 0x07
-
-/*
- * Internal helper function; C doesn't allow us to hide it :/
- *
- * DO NOT USE (outside of mutex code).
- */
-static inline struct task_struct *__mutex_owner(struct mutex *lock)
-{
- return (struct task_struct *)(atomic_long_read(&lock->owner) & ~MUTEX_FLAGS);
+ debug_mutex_init(lock);
}
static inline struct task_struct *__owner_task(unsigned long owner)
@@ -94,6 +70,14 @@ static inline unsigned long __owner_flags(unsigned long owner)
return owner & MUTEX_FLAGS;
}
+/* Do not use the return value as a pointer directly. */
+unsigned long mutex_get_owner(struct mutex *lock)
+{
+ unsigned long owner = atomic_long_read(&lock->owner);
+
+ return (unsigned long)__owner_task(owner);
+}
+
/*
* Returns: __mutex_owner(lock) on failure or NULL on success.
*/
@@ -155,6 +139,11 @@ static inline bool __mutex_trylock(struct mutex *lock)
* There is nothing that would stop spreading the lockdep annotations outwards
* except more code.
*/
+void mutex_init_generic(struct mutex *lock)
+{
+ __mutex_init_generic(lock);
+}
+EXPORT_SYMBOL(mutex_init_generic);
/*
* Optimistic trylock that only works in the uncontended case. Make sure to
@@ -165,6 +154,8 @@ static __always_inline bool __mutex_trylock_fast(struct mutex *lock)
unsigned long curr = (unsigned long)current;
unsigned long zero = 0UL;
+ MUTEX_WARN_ON(lock->magic != lock);
+
if (atomic_long_try_cmpxchg_acquire(&lock->owner, &zero, curr))
return true;
@@ -177,7 +168,21 @@ static __always_inline bool __mutex_unlock_fast(struct mutex *lock)
return atomic_long_try_cmpxchg_release(&lock->owner, &curr, 0UL);
}
-#endif
+
+#else /* !CONFIG_DEBUG_LOCK_ALLOC */
+
+void mutex_init_lockep(struct mutex *lock, const char *name, struct lock_class_key *key)
+{
+ __mutex_init_generic(lock);
+
+ /*
+ * Make sure we are not reinitializing a held lock:
+ */
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP);
+}
+EXPORT_SYMBOL(mutex_init_lockep);
+#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
static inline void __mutex_set_flag(struct mutex *lock, unsigned long flag)
{
@@ -202,6 +207,7 @@ static void
__mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
struct list_head *list)
{
+ hung_task_set_blocker(lock, BLOCKER_TYPE_MUTEX);
debug_mutex_add_waiter(lock, waiter, current);
list_add_tail(&waiter->list, list);
@@ -217,6 +223,7 @@ __mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter)
__mutex_clear_flag(lock, MUTEX_FLAGS);
debug_mutex_remove_waiter(lock, waiter, current);
+ hung_task_clear_blocker();
}
/*
@@ -529,6 +536,11 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
* This function must not be used in interrupt context. Unlocking
* of a not locked mutex is not allowed.
*
+ * The caller must ensure that the mutex stays alive until this function has
+ * returned - mutex_unlock() can NOT directly be used to release an object such
+ * that another concurrent task can free it.
+ * Mutexes are different from spinlocks & refcounts in this aspect.
+ *
* This function is similar to (but not equivalent to) up().
*/
void __sched mutex_unlock(struct mutex *lock)
@@ -567,8 +579,10 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
struct lockdep_map *nest_lock, unsigned long ip,
struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
{
+ DEFINE_WAKE_Q(wake_q);
struct mutex_waiter waiter;
struct ww_mutex *ww;
+ unsigned long flags;
int ret;
if (!use_ww_ctx)
@@ -599,23 +613,25 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
preempt_disable();
mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
+ trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
if (__mutex_trylock(lock) ||
mutex_optimistic_spin(lock, ww_ctx, NULL)) {
/* got the lock, yay! */
lock_acquired(&lock->dep_map, ip);
if (ww_ctx)
ww_mutex_set_context_fastpath(ww, ww_ctx);
+ trace_contention_end(lock, 0);
preempt_enable();
return 0;
}
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
/*
* After waiting to acquire the wait_lock, try again.
*/
if (__mutex_trylock(lock)) {
if (ww_ctx)
- __ww_mutex_check_waiters(lock, ww_ctx);
+ __ww_mutex_check_waiters(lock, ww_ctx, &wake_q);
goto skip_wait;
}
@@ -635,12 +651,14 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
* Add in stamp order, waking up waiters that must kill
* themselves.
*/
- ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx);
+ ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx, &wake_q);
if (ret)
goto err_early_kill;
}
+ __set_task_blocked_on(current, lock);
set_current_state(state);
+ trace_contention_begin(lock, LCB_F_MUTEX);
for (;;) {
bool first;
@@ -669,25 +687,46 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
goto err;
}
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
+
schedule_preempt_disabled();
first = __mutex_waiter_is_first(lock, &waiter);
+ /*
+ * As we likely have been woken up by task
+ * that has cleared our blocked_on state, re-set
+ * it to the lock we are trying to acquire.
+ */
+ set_task_blocked_on(current, lock);
set_current_state(state);
/*
* Here we order against unlock; we must either see it change
* state back to RUNNING and fall through the next schedule(),
* or we must see its unlock and acquire.
*/
- if (__mutex_trylock_or_handoff(lock, first) ||
- (first && mutex_optimistic_spin(lock, ww_ctx, &waiter)))
+ if (__mutex_trylock_or_handoff(lock, first))
break;
- raw_spin_lock(&lock->wait_lock);
+ if (first) {
+ trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
+ /*
+ * mutex_optimistic_spin() can call schedule(), so
+ * clear blocked on so we don't become unselectable
+ * to run.
+ */
+ clear_task_blocked_on(current, lock);
+ if (mutex_optimistic_spin(lock, ww_ctx, &waiter))
+ break;
+ set_task_blocked_on(current, lock);
+ trace_contention_begin(lock, LCB_F_MUTEX);
+ }
+
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
}
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
acquired:
+ __clear_task_blocked_on(current, lock);
__set_current_state(TASK_RUNNING);
if (ww_ctx) {
@@ -697,7 +736,7 @@ acquired:
*/
if (!ww_ctx->is_wait_die &&
!__mutex_waiter_is_first(lock, &waiter))
- __ww_mutex_check_waiters(lock, ww_ctx);
+ __ww_mutex_check_waiters(lock, ww_ctx, &wake_q);
}
__mutex_remove_waiter(lock, &waiter);
@@ -707,19 +746,23 @@ acquired:
skip_wait:
/* got the lock - cleanup and rejoice! */
lock_acquired(&lock->dep_map, ip);
+ trace_contention_end(lock, 0);
if (ww_ctx)
ww_mutex_lock_acquired(ww, ww_ctx);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
preempt_enable();
return 0;
err:
+ __clear_task_blocked_on(current, lock);
__set_current_state(TASK_RUNNING);
__mutex_remove_waiter(lock, &waiter);
err_early_kill:
- raw_spin_unlock(&lock->wait_lock);
+ WARN_ON(__get_task_blocked_on(current));
+ trace_contention_end(lock, ret);
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
debug_mutex_free_waiter(&waiter);
mutex_release(&lock->dep_map, ip);
preempt_enable();
@@ -795,11 +838,12 @@ _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
int __sched
-mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
+_mutex_lock_killable(struct mutex *lock, unsigned int subclass,
+ struct lockdep_map *nest)
{
- return __mutex_lock(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_);
+ return __mutex_lock(lock, TASK_KILLABLE, subclass, nest, _RET_IP_);
}
-EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
+EXPORT_SYMBOL_GPL(_mutex_lock_killable);
int __sched
mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
@@ -889,6 +933,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
struct task_struct *next = NULL;
DEFINE_WAKE_Q(wake_q);
unsigned long owner;
+ unsigned long flags;
mutex_release(&lock->dep_map, ip);
@@ -915,7 +960,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
}
}
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
debug_mutex_unlock(lock);
if (!list_empty(&lock->wait_list)) {
/* get the first entry from the wait-list: */
@@ -926,15 +971,14 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
next = waiter->task;
debug_mutex_wake_waiter(lock, waiter);
+ __clear_task_blocked_on(next, lock);
wake_q_add(&wake_q, next);
}
if (owner & MUTEX_FLAG_HANDOFF)
__mutex_handoff(lock, next);
- raw_spin_unlock(&lock->wait_lock);
-
- wake_up_q(&wake_q);
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
}
#ifndef CONFIG_DEBUG_LOCK_ALLOC
@@ -1050,6 +1094,7 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
#endif
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
/**
* mutex_trylock - try to acquire the mutex, without waiting
* @lock: the mutex to be acquired
@@ -1066,17 +1111,24 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
*/
int __sched mutex_trylock(struct mutex *lock)
{
+ MUTEX_WARN_ON(lock->magic != lock);
+ return __mutex_trylock(lock);
+}
+EXPORT_SYMBOL(mutex_trylock);
+#else
+int __sched _mutex_trylock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock)
+{
bool locked;
MUTEX_WARN_ON(lock->magic != lock);
-
locked = __mutex_trylock(lock);
if (locked)
- mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ mutex_acquire_nest(&lock->dep_map, 0, 1, nest_lock, _RET_IP_);
return locked;
}
-EXPORT_SYMBOL(mutex_trylock);
+EXPORT_SYMBOL(_mutex_trylock_nest_lock);
+#endif
#ifndef CONFIG_DEBUG_LOCK_ALLOC
int __sched
@@ -1112,6 +1164,9 @@ EXPORT_SYMBOL(ww_mutex_lock_interruptible);
#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
#endif /* !CONFIG_PREEMPT_RT */
+EXPORT_TRACEPOINT_SYMBOL_GPL(contention_begin);
+EXPORT_TRACEPOINT_SYMBOL_GPL(contention_end);
+
/**
* atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
* @cnt: the atomic which we are to dec
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index 0b2a79c4013b..9ad4da8cea00 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -6,7 +6,7 @@
*
* Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
*/
-
+#ifndef CONFIG_PREEMPT_RT
/*
* This is the control structure for tasks blocked on mutex, which resides
* on the blocked task's kernel stack:
@@ -20,6 +20,33 @@ struct mutex_waiter {
#endif
};
+/*
+ * @owner: contains: 'struct task_struct *' to the current lock owner,
+ * NULL means not owned. Since task_struct pointers are aligned at
+ * at least L1_CACHE_BYTES, we have low bits to store extra state.
+ *
+ * Bit0 indicates a non-empty waiter list; unlock must issue a wakeup.
+ * Bit1 indicates unlock needs to hand the lock to the top-waiter
+ * Bit2 indicates handoff has been done and we're waiting for pickup.
+ */
+#define MUTEX_FLAG_WAITERS 0x01
+#define MUTEX_FLAG_HANDOFF 0x02
+#define MUTEX_FLAG_PICKUP 0x04
+
+#define MUTEX_FLAGS 0x07
+
+/*
+ * Internal helper function; C doesn't allow us to hide it :/
+ *
+ * DO NOT USE (outside of mutex & scheduler code).
+ */
+static inline struct task_struct *__mutex_owner(struct mutex *lock)
+{
+ if (!lock)
+ return NULL;
+ return (struct task_struct *)(atomic_long_read(&lock->owner) & ~MUTEX_FLAGS);
+}
+
#ifdef CONFIG_DEBUG_MUTEXES
extern void debug_mutex_lock_common(struct mutex *lock,
struct mutex_waiter *waiter);
@@ -32,8 +59,7 @@ extern void debug_mutex_add_waiter(struct mutex *lock,
extern void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
struct task_struct *task);
extern void debug_mutex_unlock(struct mutex *lock);
-extern void debug_mutex_init(struct mutex *lock, const char *name,
- struct lock_class_key *key);
+extern void debug_mutex_init(struct mutex *lock);
#else /* CONFIG_DEBUG_MUTEXES */
# define debug_mutex_lock_common(lock, waiter) do { } while (0)
# define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
@@ -41,5 +67,6 @@ extern void debug_mutex_init(struct mutex *lock, const char *name,
# define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0)
# define debug_mutex_remove_waiter(lock, waiter, ti) do { } while (0)
# define debug_mutex_unlock(lock) do { } while (0)
-# define debug_mutex_init(lock, name, key) do { } while (0)
+# define debug_mutex_init(lock) do { } while (0)
#endif /* !CONFIG_DEBUG_MUTEXES */
+#endif /* CONFIG_PREEMPT_RT */
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index d5610ad52b92..b4233dc2c2b0 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -11,6 +11,13 @@
* called from interrupt context and we have preemption disabled while
* spinning.
*/
+
+struct optimistic_spin_node {
+ struct optimistic_spin_node *next, *prev;
+ int locked; /* 1 if lock acquired */
+ int cpu; /* encoded CPU # + 1 value */
+};
+
static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
/*
@@ -37,32 +44,28 @@ static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
/*
* Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
* Can return NULL in case we were the last queued and we updated @lock instead.
+ *
+ * If osq_lock() is being cancelled there must be a previous node
+ * and 'old_cpu' is its CPU #.
+ * For osq_unlock() there is never a previous node and old_cpu is
+ * set to OSQ_UNLOCKED_VAL.
*/
static inline struct optimistic_spin_node *
osq_wait_next(struct optimistic_spin_queue *lock,
struct optimistic_spin_node *node,
- struct optimistic_spin_node *prev)
+ int old_cpu)
{
- struct optimistic_spin_node *next = NULL;
int curr = encode_cpu(smp_processor_id());
- int old;
-
- /*
- * If there is a prev node in queue, then the 'old' value will be
- * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if
- * we're currently last in queue, then the queue will then become empty.
- */
- old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;
for (;;) {
if (atomic_read(&lock->tail) == curr &&
- atomic_cmpxchg_acquire(&lock->tail, curr, old) == curr) {
+ atomic_cmpxchg_acquire(&lock->tail, curr, old_cpu) == curr) {
/*
* We were the last queued, we moved @lock back. @prev
* will now observe @lock and will complete its
* unlock()/unqueue().
*/
- break;
+ return NULL;
}
/*
@@ -76,15 +79,15 @@ osq_wait_next(struct optimistic_spin_queue *lock,
* wait for a new @node->next from its Step-C.
*/
if (node->next) {
+ struct optimistic_spin_node *next;
+
next = xchg(&node->next, NULL);
if (next)
- break;
+ return next;
}
cpu_relax();
}
-
- return next;
}
bool osq_lock(struct optimistic_spin_queue *lock)
@@ -186,7 +189,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
* back to @prev.
*/
- next = osq_wait_next(lock, node, prev);
+ next = osq_wait_next(lock, node, prev->cpu);
if (!next)
return false;
@@ -212,8 +215,7 @@ void osq_unlock(struct optimistic_spin_queue *lock)
/*
* Fast path for the uncontended case.
*/
- if (likely(atomic_cmpxchg_release(&lock->tail, curr,
- OSQ_UNLOCKED_VAL) == curr))
+ if (atomic_try_cmpxchg_release(&lock->tail, &curr, OSQ_UNLOCKED_VAL))
return;
/*
@@ -226,7 +228,7 @@ void osq_unlock(struct optimistic_spin_queue *lock)
return;
}
- next = osq_wait_next(lock, node, NULL);
+ next = osq_wait_next(lock, node, OSQ_UNLOCKED_VAL);
if (next)
WRITE_ONCE(next->locked, 1);
}
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 70a32a576f3f..ef234469baac 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -7,7 +7,9 @@
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
+#include <linux/sched/debug.h>
#include <linux/errno.h>
+#include <trace/events/lock.h>
int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
const char *name, struct lock_class_key *key)
@@ -136,7 +138,8 @@ static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
return !reader; /* wake (readers until) 1 writer */
}
-static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader)
+static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader,
+ bool freeze)
{
DEFINE_WAIT_FUNC(wq_entry, percpu_rwsem_wake_function);
bool wait;
@@ -154,7 +157,8 @@ static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader)
spin_unlock_irq(&sem->waiters.lock);
while (wait) {
- set_current_state(TASK_UNINTERRUPTIBLE);
+ set_current_state(TASK_UNINTERRUPTIBLE |
+ (freeze ? TASK_FREEZABLE : 0));
if (!smp_load_acquire(&wq_entry.private))
break;
schedule();
@@ -162,7 +166,8 @@ static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader)
__set_current_state(TASK_RUNNING);
}
-bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
+bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try,
+ bool freeze)
{
if (__percpu_down_read_trylock(sem))
return true;
@@ -170,9 +175,11 @@ bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
if (try)
return false;
+ trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_READ);
preempt_enable();
- percpu_rwsem_wait(sem, /* .reader = */ true);
+ percpu_rwsem_wait(sem, /* .reader = */ true, freeze);
preempt_disable();
+ trace_contention_end(sem, 0);
return true;
}
@@ -180,7 +187,7 @@ EXPORT_SYMBOL_GPL(__percpu_down_read);
#define per_cpu_sum(var) \
({ \
- typeof(var) __sum = 0; \
+ TYPEOF_UNQUAL(var) __sum = 0; \
int cpu; \
compiletime_assert_atomic_type(__sum); \
for_each_possible_cpu(cpu) \
@@ -188,6 +195,12 @@ EXPORT_SYMBOL_GPL(__percpu_down_read);
__sum; \
})
+bool percpu_is_read_locked(struct percpu_rw_semaphore *sem)
+{
+ return per_cpu_sum(*sem->read_count) != 0 && !atomic_read(&sem->block);
+}
+EXPORT_SYMBOL_GPL(percpu_is_read_locked);
+
/*
* Return true if the modular sum of the sem->read_count per-CPU variable is
* zero. If this sum is zero, then it is stable due to the fact that if any
@@ -211,8 +224,10 @@ static bool readers_active_check(struct percpu_rw_semaphore *sem)
return true;
}
-void percpu_down_write(struct percpu_rw_semaphore *sem)
+void __sched percpu_down_write(struct percpu_rw_semaphore *sem)
{
+ bool contended = false;
+
might_sleep();
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
@@ -223,8 +238,11 @@ void percpu_down_write(struct percpu_rw_semaphore *sem)
* Try set sem->block; this provides writer-writer exclusion.
* Having sem->block set makes new readers block.
*/
- if (!__percpu_down_write_trylock(sem))
- percpu_rwsem_wait(sem, /* .reader = */ false);
+ if (!__percpu_down_write_trylock(sem)) {
+ trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_WRITE);
+ percpu_rwsem_wait(sem, /* .reader = */ false, false);
+ contended = true;
+ }
/* smp_mb() implied by __percpu_down_write_trylock() on success -- D matches A */
@@ -236,6 +254,8 @@ void percpu_down_write(struct percpu_rw_semaphore *sem)
/* Wait for all active readers to complete. */
rcuwait_wait_event(&sem->writer, readers_active_check(sem), TASK_UNINTERRUPTIBLE);
+ if (contended)
+ trace_contention_end(sem, 0);
}
EXPORT_SYMBOL_GPL(percpu_down_write);
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index ec36b73f4733..d2ef312a8611 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -12,12 +12,13 @@
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/spinlock.h>
+#include <trace/events/lock.h>
/**
- * queued_read_lock_slowpath - acquire read lock of a queue rwlock
- * @lock: Pointer to queue rwlock structure
+ * queued_read_lock_slowpath - acquire read lock of a queued rwlock
+ * @lock: Pointer to queued rwlock structure
*/
-void queued_read_lock_slowpath(struct qrwlock *lock)
+void __lockfunc queued_read_lock_slowpath(struct qrwlock *lock)
{
/*
* Readers come here when they cannot get the lock without waiting
@@ -34,6 +35,8 @@ void queued_read_lock_slowpath(struct qrwlock *lock)
}
atomic_sub(_QR_BIAS, &lock->cnts);
+ trace_contention_begin(lock, LCB_F_SPIN | LCB_F_READ);
+
/*
* Put the reader into the wait queue
*/
@@ -51,17 +54,21 @@ void queued_read_lock_slowpath(struct qrwlock *lock)
* Signal the next one in queue to become queue head
*/
arch_spin_unlock(&lock->wait_lock);
+
+ trace_contention_end(lock, 0);
}
EXPORT_SYMBOL(queued_read_lock_slowpath);
/**
- * queued_write_lock_slowpath - acquire write lock of a queue rwlock
- * @lock : Pointer to queue rwlock structure
+ * queued_write_lock_slowpath - acquire write lock of a queued rwlock
+ * @lock : Pointer to queued rwlock structure
*/
-void queued_write_lock_slowpath(struct qrwlock *lock)
+void __lockfunc queued_write_lock_slowpath(struct qrwlock *lock)
{
int cnts;
+ trace_contention_begin(lock, LCB_F_SPIN | LCB_F_WRITE);
+
/* Put the writer into the wait queue */
arch_spin_lock(&lock->wait_lock);
@@ -79,5 +86,7 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
} while (!atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED));
unlock:
arch_spin_unlock(&lock->wait_lock);
+
+ trace_contention_end(lock, 0);
}
EXPORT_SYMBOL(queued_write_lock_slowpath);
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index cbff6ba53d56..af8d122bb649 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -22,10 +22,12 @@
#include <linux/prefetch.h>
#include <asm/byteorder.h>
#include <asm/qspinlock.h>
+#include <trace/events/lock.h>
/*
- * Include queued spinlock statistics code
+ * Include queued spinlock definitions and statistics code
*/
+#include "qspinlock.h"
#include "qspinlock_stat.h"
/*
@@ -66,36 +68,6 @@
*/
#include "mcs_spinlock.h"
-#define MAX_NODES 4
-
-/*
- * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in
- * size and four of them will fit nicely in one 64-byte cacheline. For
- * pvqspinlock, however, we need more space for extra data. To accommodate
- * that, we insert two more long words to pad it up to 32 bytes. IOW, only
- * two of them can fit in a cacheline in this case. That is OK as it is rare
- * to have more than 2 levels of slowpath nesting in actual use. We don't
- * want to penalize pvqspinlocks to optimize for a rare case in native
- * qspinlocks.
- */
-struct qnode {
- struct mcs_spinlock mcs;
-#ifdef CONFIG_PARAVIRT_SPINLOCKS
- long reserved[2];
-#endif
-};
-
-/*
- * The pending bit spinning loop count.
- * This heuristic is used to limit the number of lockword accesses
- * made by atomic_cond_read_relaxed when waiting for the lock to
- * transition out of the "== _Q_PENDING_VAL" state. We don't spin
- * indefinitely because there's no guarantee that we'll make forward
- * progress.
- */
-#ifndef _Q_PENDING_LOOPS
-#define _Q_PENDING_LOOPS 1
-#endif
/*
* Per-CPU queue node structures; we can never have more than 4 nested
@@ -105,164 +77,7 @@ struct qnode {
*
* PV doubles the storage and uses the second cacheline for PV state.
*/
-static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[MAX_NODES]);
-
-/*
- * We must be able to distinguish between no-tail and the tail at 0:0,
- * therefore increment the cpu number by one.
- */
-
-static inline __pure u32 encode_tail(int cpu, int idx)
-{
- u32 tail;
-
- tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
- tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */
-
- return tail;
-}
-
-static inline __pure struct mcs_spinlock *decode_tail(u32 tail)
-{
- int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
- int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
-
- return per_cpu_ptr(&qnodes[idx].mcs, cpu);
-}
-
-static inline __pure
-struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx)
-{
- return &((struct qnode *)base + idx)->mcs;
-}
-
-#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
-
-#if _Q_PENDING_BITS == 8
-/**
- * clear_pending - clear the pending bit.
- * @lock: Pointer to queued spinlock structure
- *
- * *,1,* -> *,0,*
- */
-static __always_inline void clear_pending(struct qspinlock *lock)
-{
- WRITE_ONCE(lock->pending, 0);
-}
-
-/**
- * clear_pending_set_locked - take ownership and clear the pending bit.
- * @lock: Pointer to queued spinlock structure
- *
- * *,1,0 -> *,0,1
- *
- * Lock stealing is not allowed if this function is used.
- */
-static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
-{
- WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
-}
-
-/*
- * xchg_tail - Put in the new queue tail code word & retrieve previous one
- * @lock : Pointer to queued spinlock structure
- * @tail : The new queue tail code word
- * Return: The previous queue tail code word
- *
- * xchg(lock, tail), which heads an address dependency
- *
- * p,*,* -> n,*,* ; prev = xchg(lock, node)
- */
-static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
-{
- /*
- * We can use relaxed semantics since the caller ensures that the
- * MCS node is properly initialized before updating the tail.
- */
- return (u32)xchg_relaxed(&lock->tail,
- tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
-}
-
-#else /* _Q_PENDING_BITS == 8 */
-
-/**
- * clear_pending - clear the pending bit.
- * @lock: Pointer to queued spinlock structure
- *
- * *,1,* -> *,0,*
- */
-static __always_inline void clear_pending(struct qspinlock *lock)
-{
- atomic_andnot(_Q_PENDING_VAL, &lock->val);
-}
-
-/**
- * clear_pending_set_locked - take ownership and clear the pending bit.
- * @lock: Pointer to queued spinlock structure
- *
- * *,1,0 -> *,0,1
- */
-static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
-{
- atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
-}
-
-/**
- * xchg_tail - Put in the new queue tail code word & retrieve previous one
- * @lock : Pointer to queued spinlock structure
- * @tail : The new queue tail code word
- * Return: The previous queue tail code word
- *
- * xchg(lock, tail)
- *
- * p,*,* -> n,*,* ; prev = xchg(lock, node)
- */
-static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
-{
- u32 old, new, val = atomic_read(&lock->val);
-
- for (;;) {
- new = (val & _Q_LOCKED_PENDING_MASK) | tail;
- /*
- * We can use relaxed semantics since the caller ensures that
- * the MCS node is properly initialized before updating the
- * tail.
- */
- old = atomic_cmpxchg_relaxed(&lock->val, val, new);
- if (old == val)
- break;
-
- val = old;
- }
- return old;
-}
-#endif /* _Q_PENDING_BITS == 8 */
-
-/**
- * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending
- * @lock : Pointer to queued spinlock structure
- * Return: The previous lock value
- *
- * *,*,* -> *,1,*
- */
-#ifndef queued_fetch_set_pending_acquire
-static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock)
-{
- return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
-}
-#endif
-
-/**
- * set_locked - Set the lock bit and own the lock
- * @lock: Pointer to queued spinlock structure
- *
- * *,*,0 -> *,0,1
- */
-static __always_inline void set_locked(struct qspinlock *lock)
-{
- WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
-}
-
+static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[_Q_MAX_NODES]);
/*
* Generate the native code for queued_spin_unlock_slowpath(); provide NOPs for
@@ -312,7 +127,7 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
* contended : (*,x,y) +--> (*,0,0) ---> (*,0,1) -' :
* queue : ^--' :
*/
-void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
{
struct mcs_spinlock *prev, *next, *node;
u32 old, tail;
@@ -370,7 +185,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
/*
* We're pending, wait for the owner to go away.
*
- * 0,1,1 -> 0,1,0
+ * 0,1,1 -> *,1,0
*
* this wait loop must be a load-acquire such that we match the
* store-release that clears the locked bit and create lock
@@ -379,7 +194,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
* barriers.
*/
if (val & _Q_LOCKED_MASK)
- atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_MASK));
+ smp_cond_load_acquire(&lock->locked, !VAL);
/*
* take ownership and clear the pending bit.
@@ -401,6 +216,8 @@ pv_queue:
idx = node->count++;
tail = encode_tail(smp_processor_id(), idx);
+ trace_contention_begin(lock, LCB_F_SPIN);
+
/*
* 4 nodes are allocated based on the assumption that there will
* not be nested NMIs taking spinlocks. That may not be true in
@@ -410,7 +227,7 @@ pv_queue:
* any MCS node. This is not the most elegant solution, but is
* simple enough.
*/
- if (unlikely(idx >= MAX_NODES)) {
+ if (unlikely(idx >= _Q_MAX_NODES)) {
lockevent_inc(lock_no_node);
while (!queued_spin_trylock(lock))
cpu_relax();
@@ -465,7 +282,7 @@ pv_queue:
* head of the waitqueue.
*/
if (old & _Q_TAIL_MASK) {
- prev = decode_tail(old);
+ prev = decode_tail(old, qnodes);
/* Link @node into the waitqueue. */
WRITE_ONCE(prev->next, node);
@@ -554,6 +371,8 @@ locked:
pv_kick_node(lock, next);
release:
+ trace_contention_end(lock, 0);
+
/*
* release the node
*/
@@ -581,7 +400,7 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath);
#include "qspinlock_paravirt.h"
#include "qspinlock.c"
-bool nopvspin __initdata;
+bool nopvspin;
static __init int parse_nopvspin(char *arg)
{
nopvspin = true;
diff --git a/kernel/locking/qspinlock.h b/kernel/locking/qspinlock.h
new file mode 100644
index 000000000000..d69958a844f7
--- /dev/null
+++ b/kernel/locking/qspinlock.h
@@ -0,0 +1,201 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Queued spinlock defines
+ *
+ * This file contains macro definitions and functions shared between different
+ * qspinlock slow path implementations.
+ */
+#ifndef __LINUX_QSPINLOCK_H
+#define __LINUX_QSPINLOCK_H
+
+#include <asm-generic/percpu.h>
+#include <linux/percpu-defs.h>
+#include <asm-generic/qspinlock.h>
+#include <asm-generic/mcs_spinlock.h>
+
+#define _Q_MAX_NODES 4
+
+/*
+ * The pending bit spinning loop count.
+ * This heuristic is used to limit the number of lockword accesses
+ * made by atomic_cond_read_relaxed when waiting for the lock to
+ * transition out of the "== _Q_PENDING_VAL" state. We don't spin
+ * indefinitely because there's no guarantee that we'll make forward
+ * progress.
+ */
+#ifndef _Q_PENDING_LOOPS
+#define _Q_PENDING_LOOPS 1
+#endif
+
+/*
+ * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in
+ * size and four of them will fit nicely in one 64-byte cacheline. For
+ * pvqspinlock, however, we need more space for extra data. To accommodate
+ * that, we insert two more long words to pad it up to 32 bytes. IOW, only
+ * two of them can fit in a cacheline in this case. That is OK as it is rare
+ * to have more than 2 levels of slowpath nesting in actual use. We don't
+ * want to penalize pvqspinlocks to optimize for a rare case in native
+ * qspinlocks.
+ */
+struct qnode {
+ struct mcs_spinlock mcs;
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+ long reserved[2];
+#endif
+};
+
+/*
+ * We must be able to distinguish between no-tail and the tail at 0:0,
+ * therefore increment the cpu number by one.
+ */
+
+static inline __pure u32 encode_tail(int cpu, int idx)
+{
+ u32 tail;
+
+ tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
+ tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */
+
+ return tail;
+}
+
+static inline __pure struct mcs_spinlock *decode_tail(u32 tail,
+ struct qnode __percpu *qnodes)
+{
+ int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
+ int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
+
+ return per_cpu_ptr(&qnodes[idx].mcs, cpu);
+}
+
+static inline __pure
+struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx)
+{
+ return &((struct qnode *)base + idx)->mcs;
+}
+
+#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
+
+#if _Q_PENDING_BITS == 8
+/**
+ * clear_pending - clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,* -> *,0,*
+ */
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+ WRITE_ONCE(lock->pending, 0);
+}
+
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,0 -> *,0,1
+ *
+ * Lock stealing is not allowed if this function is used.
+ */
+static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
+{
+ WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
+}
+
+/*
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail), which heads an address dependency
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+ /*
+ * We can use relaxed semantics since the caller ensures that the
+ * MCS node is properly initialized before updating the tail.
+ */
+ return (u32)xchg_relaxed(&lock->tail,
+ tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+}
+
+#else /* _Q_PENDING_BITS == 8 */
+
+/**
+ * clear_pending - clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,* -> *,0,*
+ */
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+ atomic_andnot(_Q_PENDING_VAL, &lock->val);
+}
+
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,0 -> *,0,1
+ */
+static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
+{
+ atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
+}
+
+/**
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail)
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+ u32 old, new;
+
+ old = atomic_read(&lock->val);
+ do {
+ new = (old & _Q_LOCKED_PENDING_MASK) | tail;
+ /*
+ * We can use relaxed semantics since the caller ensures that
+ * the MCS node is properly initialized before updating the
+ * tail.
+ */
+ } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new));
+
+ return old;
+}
+#endif /* _Q_PENDING_BITS == 8 */
+
+/**
+ * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending
+ * @lock : Pointer to queued spinlock structure
+ * Return: The previous lock value
+ *
+ * *,*,* -> *,1,*
+ */
+#ifndef queued_fetch_set_pending_acquire
+static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock)
+{
+ return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
+}
+#endif
+
+/**
+ * set_locked - Set the lock bit and own the lock
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,*,0 -> *,0,1
+ */
+static __always_inline void set_locked(struct qspinlock *lock)
+{
+ WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
+}
+
+#endif /* __LINUX_QSPINLOCK_H */
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index e84d21aa0722..dc1cb90e3644 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -38,13 +38,13 @@
#define PV_PREV_CHECK_MASK 0xff
/*
- * Queue node uses: vcpu_running & vcpu_halted.
- * Queue head uses: vcpu_running & vcpu_hashed.
+ * Queue node uses: VCPU_RUNNING & VCPU_HALTED.
+ * Queue head uses: VCPU_RUNNING & VCPU_HASHED.
*/
enum vcpu_state {
- vcpu_running = 0,
- vcpu_halted, /* Used only in pv_wait_node */
- vcpu_hashed, /* = pv_hash'ed + vcpu_halted */
+ VCPU_RUNNING = 0,
+ VCPU_HALTED, /* Used only in pv_wait_node */
+ VCPU_HASHED, /* = pv_hash'ed + VCPU_HALTED */
};
struct pv_node {
@@ -86,9 +86,10 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)
*/
for (;;) {
int val = atomic_read(&lock->val);
+ u8 old = 0;
if (!(val & _Q_LOCKED_PENDING_MASK) &&
- (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) {
+ try_cmpxchg_acquire(&lock->locked, &old, _Q_LOCKED_VAL)) {
lockevent_inc(pv_lock_stealing);
return true;
}
@@ -116,11 +117,12 @@ static __always_inline void set_pending(struct qspinlock *lock)
* barrier. Therefore, an atomic cmpxchg_acquire() is used to acquire the
* lock just to be sure that it will get it.
*/
-static __always_inline int trylock_clear_pending(struct qspinlock *lock)
+static __always_inline bool trylock_clear_pending(struct qspinlock *lock)
{
+ u16 old = _Q_PENDING_VAL;
+
return !READ_ONCE(lock->locked) &&
- (cmpxchg_acquire(&lock->locked_pending, _Q_PENDING_VAL,
- _Q_LOCKED_VAL) == _Q_PENDING_VAL);
+ try_cmpxchg_acquire(&lock->locked_pending, &old, _Q_LOCKED_VAL);
}
#else /* _Q_PENDING_BITS == 8 */
static __always_inline void set_pending(struct qspinlock *lock)
@@ -128,27 +130,21 @@ static __always_inline void set_pending(struct qspinlock *lock)
atomic_or(_Q_PENDING_VAL, &lock->val);
}
-static __always_inline int trylock_clear_pending(struct qspinlock *lock)
+static __always_inline bool trylock_clear_pending(struct qspinlock *lock)
{
- int val = atomic_read(&lock->val);
-
- for (;;) {
- int old, new;
-
- if (val & _Q_LOCKED_MASK)
- break;
+ int old, new;
+ old = atomic_read(&lock->val);
+ do {
+ if (old & _Q_LOCKED_MASK)
+ return false;
/*
* Try to clear pending bit & set locked bit
*/
- old = val;
- new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
- val = atomic_cmpxchg_acquire(&lock->val, old, new);
+ new = (old & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
+ } while (!atomic_try_cmpxchg_acquire (&lock->val, &old, new));
- if (val == old)
- return 1;
- }
- return 0;
+ return true;
}
#endif /* _Q_PENDING_BITS == 8 */
@@ -216,8 +212,9 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
int hopcnt = 0;
for_each_hash_entry(he, offset, hash) {
+ struct qspinlock *old = NULL;
hopcnt++;
- if (!cmpxchg(&he->lock, NULL, lock)) {
+ if (try_cmpxchg(&he->lock, &old, lock)) {
WRITE_ONCE(he->node, node);
lockevent_pv_hop(hopcnt);
return &he->lock;
@@ -269,7 +266,7 @@ pv_wait_early(struct pv_node *prev, int loop)
if ((loop & PV_PREV_CHECK_MASK) != 0)
return false;
- return READ_ONCE(prev->state) != vcpu_running;
+ return READ_ONCE(prev->state) != VCPU_RUNNING;
}
/*
@@ -282,7 +279,7 @@ static void pv_init_node(struct mcs_spinlock *node)
BUILD_BUG_ON(sizeof(struct pv_node) > sizeof(struct qnode));
pn->cpu = smp_processor_id();
- pn->state = vcpu_running;
+ pn->state = VCPU_RUNNING;
}
/*
@@ -294,8 +291,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
{
struct pv_node *pn = (struct pv_node *)node;
struct pv_node *pp = (struct pv_node *)prev;
- int loop;
bool wait_early;
+ int loop;
for (;;) {
for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
@@ -311,26 +308,26 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
/*
* Order pn->state vs pn->locked thusly:
*
- * [S] pn->state = vcpu_halted [S] next->locked = 1
+ * [S] pn->state = VCPU_HALTED [S] next->locked = 1
* MB MB
- * [L] pn->locked [RmW] pn->state = vcpu_hashed
+ * [L] pn->locked [RmW] pn->state = VCPU_HASHED
*
* Matches the cmpxchg() from pv_kick_node().
*/
- smp_store_mb(pn->state, vcpu_halted);
+ smp_store_mb(pn->state, VCPU_HALTED);
if (!READ_ONCE(node->locked)) {
lockevent_inc(pv_wait_node);
lockevent_cond_inc(pv_wait_early, wait_early);
- pv_wait(&pn->state, vcpu_halted);
+ pv_wait(&pn->state, VCPU_HALTED);
}
/*
- * If pv_kick_node() changed us to vcpu_hashed, retain that
+ * If pv_kick_node() changed us to VCPU_HASHED, retain that
* value so that pv_wait_head_or_lock() knows to not also try
* to hash this lock.
*/
- cmpxchg(&pn->state, vcpu_halted, vcpu_running);
+ cmpxchg(&pn->state, VCPU_HALTED, VCPU_RUNNING);
/*
* If the locked flag is still not set after wakeup, it is a
@@ -360,7 +357,7 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
{
struct pv_node *pn = (struct pv_node *)node;
-
+ u8 old = VCPU_HALTED;
/*
* If the vCPU is indeed halted, advance its state to match that of
* pv_wait_node(). If OTOH this fails, the vCPU was running and will
@@ -377,8 +374,7 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
* subsequent writes.
*/
smp_mb__before_atomic();
- if (cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_hashed)
- != vcpu_halted)
+ if (!try_cmpxchg_relaxed(&pn->state, &old, VCPU_HASHED))
return;
/*
@@ -411,7 +407,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
* If pv_kick_node() already advanced our state, we don't need to
* insert ourselves into the hash table anymore.
*/
- if (READ_ONCE(pn->state) == vcpu_hashed)
+ if (READ_ONCE(pn->state) == VCPU_HASHED)
lp = (struct qspinlock **)1;
/*
@@ -424,7 +420,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
* Set correct vCPU state to be used by queue node wait-early
* mechanism.
*/
- WRITE_ONCE(pn->state, vcpu_running);
+ WRITE_ONCE(pn->state, VCPU_RUNNING);
/*
* Set the pending bit in the active lock spinning loop to
@@ -464,7 +460,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
goto gotlock;
}
}
- WRITE_ONCE(pn->state, vcpu_hashed);
+ WRITE_ONCE(pn->state, VCPU_HASHED);
lockevent_inc(pv_wait_head);
lockevent_cond_inc(pv_wait_again, waitcnt);
pv_wait(&lock->locked, _Q_SLOW_VAL);
@@ -486,10 +482,20 @@ gotlock:
}
/*
+ * Include the architecture specific callee-save thunk of the
+ * __pv_queued_spin_unlock(). This thunk is put together with
+ * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
+ * function close to each other sharing consecutive instruction cachelines.
+ * Alternatively, architecture specific version of __pv_queued_spin_unlock()
+ * can be defined.
+ */
+#include <asm/qspinlock_paravirt.h>
+
+/*
* PV versions of the unlock fastpath and slowpath functions to be used
* instead of queued_spin_unlock().
*/
-__visible void
+__visible __lockfunc void
__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
{
struct pv_node *node;
@@ -533,28 +539,17 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
pv_kick(node->cpu);
}
-/*
- * Include the architecture specific callee-save thunk of the
- * __pv_queued_spin_unlock(). This thunk is put together with
- * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
- * function close to each other sharing consecutive instruction cachelines.
- * Alternatively, architecture specific version of __pv_queued_spin_unlock()
- * can be defined.
- */
-#include <asm/qspinlock_paravirt.h>
-
#ifndef __pv_queued_spin_unlock
-__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+__visible __lockfunc void __pv_queued_spin_unlock(struct qspinlock *lock)
{
- u8 locked;
+ u8 locked = _Q_LOCKED_VAL;
/*
* We must not unlock if SLOW, because in that case we must first
* unhash. Otherwise it would be possible to have multiple @lock
* entries, which would be BAD.
*/
- locked = cmpxchg_release(&lock->locked, _Q_LOCKED_VAL, 0);
- if (likely(locked == _Q_LOCKED_VAL))
+ if (try_cmpxchg_release(&lock->locked, &locked, 0))
return;
__pv_queued_spin_unlock_slowpath(lock, locked);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 8555c4efe97c..c80902eacd79 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -24,7 +24,10 @@
#include <linux/sched/wake_q.h>
#include <linux/ww_mutex.h>
+#include <trace/events/lock.h>
+
#include "rtmutex_common.h"
+#include "lock_events.h"
#ifndef WW_RT
# define build_ww_mutex() (false)
@@ -32,13 +35,15 @@
static inline int __ww_mutex_add_waiter(struct rt_mutex_waiter *waiter,
struct rt_mutex *lock,
- struct ww_acquire_ctx *ww_ctx)
+ struct ww_acquire_ctx *ww_ctx,
+ struct wake_q_head *wake_q)
{
return 0;
}
static inline void __ww_mutex_check_waiters(struct rt_mutex *lock,
- struct ww_acquire_ctx *ww_ctx)
+ struct ww_acquire_ctx *ww_ctx,
+ struct wake_q_head *wake_q)
{
}
@@ -87,15 +92,31 @@ static inline int __ww_mutex_check_kill(struct rt_mutex *lock,
* set this bit before looking at the lock.
*/
-static __always_inline void
-rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner)
+static __always_inline struct task_struct *
+rt_mutex_owner_encode(struct rt_mutex_base *lock, struct task_struct *owner)
{
unsigned long val = (unsigned long)owner;
if (rt_mutex_has_waiters(lock))
val |= RT_MUTEX_HAS_WAITERS;
- WRITE_ONCE(lock->owner, (struct task_struct *)val);
+ return (struct task_struct *)val;
+}
+
+static __always_inline void
+rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner)
+{
+ /*
+ * lock->wait_lock is held but explicit acquire semantics are needed
+ * for a new lock owner so WRITE_ONCE is insufficient.
+ */
+ xchg_acquire(&lock->owner, rt_mutex_owner_encode(lock, owner));
+}
+
+static __always_inline void rt_mutex_clear_owner(struct rt_mutex_base *lock)
+{
+ /* lock->wait_lock is held so the unlock provides release semantics. */
+ WRITE_ONCE(lock->owner, rt_mutex_owner_encode(lock, NULL));
}
static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock)
@@ -104,7 +125,8 @@ static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock)
((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
}
-static __always_inline void fixup_rt_mutex_waiters(struct rt_mutex_base *lock)
+static __always_inline void
+fixup_rt_mutex_waiters(struct rt_mutex_base *lock, bool acquire_lock)
{
unsigned long owner, *p = (unsigned long *) &lock->owner;
@@ -170,8 +192,21 @@ static __always_inline void fixup_rt_mutex_waiters(struct rt_mutex_base *lock)
* still set.
*/
owner = READ_ONCE(*p);
- if (owner & RT_MUTEX_HAS_WAITERS)
- WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
+ if (owner & RT_MUTEX_HAS_WAITERS) {
+ /*
+ * See rt_mutex_set_owner() and rt_mutex_clear_owner() on
+ * why xchg_acquire() is used for updating owner for
+ * locking and WRITE_ONCE() for unlocking.
+ *
+ * WRITE_ONCE() would work for the acquire case too, but
+ * in case that the lock acquisition failed it might
+ * force other lockers into the slow path unnecessarily.
+ */
+ if (acquire_lock)
+ xchg_acquire(p, owner & ~RT_MUTEX_HAS_WAITERS);
+ else
+ WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
+ }
}
/*
@@ -186,6 +221,11 @@ static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock,
return try_cmpxchg_acquire(&lock->owner, &old, new);
}
+static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock)
+{
+ return rt_mutex_cmpxchg_acquire(lock, NULL, current);
+}
+
static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock,
struct task_struct *old,
struct task_struct *new)
@@ -200,12 +240,20 @@ static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock,
*/
static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock)
{
- unsigned long owner, *p = (unsigned long *) &lock->owner;
+ unsigned long *p = (unsigned long *) &lock->owner;
+ unsigned long owner, new;
+ owner = READ_ONCE(*p);
do {
- owner = *p;
- } while (cmpxchg_relaxed(p, owner,
- owner | RT_MUTEX_HAS_WAITERS) != owner);
+ new = owner | RT_MUTEX_HAS_WAITERS;
+ } while (!try_cmpxchg_relaxed(p, &owner, new));
+
+ /*
+ * The cmpxchg loop above is relaxed to avoid back-to-back ACQUIRE
+ * operations in the event of contention. Ensure the successful
+ * cmpxchg is visible.
+ */
+ smp_mb__after_atomic();
}
/*
@@ -258,6 +306,20 @@ static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock,
}
+static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock);
+
+static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock)
+{
+ /*
+ * With debug enabled rt_mutex_cmpxchg trylock() will always fail.
+ *
+ * Avoid unconditionally taking the slow path by using
+ * rt_mutex_slow_trylock() which is covered by the debug code and can
+ * acquire a non-contended rtmutex.
+ */
+ return rt_mutex_slowtrylock(lock);
+}
+
static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock,
struct task_struct *old,
struct task_struct *new)
@@ -288,27 +350,49 @@ static __always_inline int __waiter_prio(struct task_struct *task)
{
int prio = task->prio;
- if (!rt_prio(prio))
+ if (!rt_or_dl_prio(prio))
return DEFAULT_PRIO;
return prio;
}
+/*
+ * Update the waiter->tree copy of the sort keys.
+ */
static __always_inline void
waiter_update_prio(struct rt_mutex_waiter *waiter, struct task_struct *task)
{
- waiter->prio = __waiter_prio(task);
- waiter->deadline = task->dl.deadline;
+ lockdep_assert_held(&waiter->lock->wait_lock);
+ lockdep_assert(RB_EMPTY_NODE(&waiter->tree.entry));
+
+ waiter->tree.prio = __waiter_prio(task);
+ waiter->tree.deadline = task->dl.deadline;
+}
+
+/*
+ * Update the waiter->pi_tree copy of the sort keys (from the tree copy).
+ */
+static __always_inline void
+waiter_clone_prio(struct rt_mutex_waiter *waiter, struct task_struct *task)
+{
+ lockdep_assert_held(&waiter->lock->wait_lock);
+ lockdep_assert_held(&task->pi_lock);
+ lockdep_assert(RB_EMPTY_NODE(&waiter->pi_tree.entry));
+
+ waiter->pi_tree.prio = waiter->tree.prio;
+ waiter->pi_tree.deadline = waiter->tree.deadline;
}
/*
- * Only use with rt_mutex_waiter_{less,equal}()
+ * Only use with rt_waiter_node_{less,equal}()
*/
+#define task_to_waiter_node(p) \
+ &(struct rt_waiter_node){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline }
#define task_to_waiter(p) \
- &(struct rt_mutex_waiter){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline }
+ &(struct rt_mutex_waiter){ .tree = *task_to_waiter_node(p) }
-static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left,
- struct rt_mutex_waiter *right)
+static __always_inline int rt_waiter_node_less(struct rt_waiter_node *left,
+ struct rt_waiter_node *right)
{
if (left->prio < right->prio)
return 1;
@@ -325,8 +409,8 @@ static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left,
return 0;
}
-static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
- struct rt_mutex_waiter *right)
+static __always_inline int rt_waiter_node_equal(struct rt_waiter_node *left,
+ struct rt_waiter_node *right)
{
if (left->prio != right->prio)
return 0;
@@ -346,7 +430,7 @@ static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter,
struct rt_mutex_waiter *top_waiter)
{
- if (rt_mutex_waiter_less(waiter, top_waiter))
+ if (rt_waiter_node_less(&waiter->tree, &top_waiter->tree))
return true;
#ifdef RT_MUTEX_BUILD_SPINLOCKS
@@ -354,30 +438,30 @@ static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter,
* Note that RT tasks are excluded from same priority (lateral)
* steals to prevent the introduction of an unbounded latency.
*/
- if (rt_prio(waiter->prio) || dl_prio(waiter->prio))
+ if (rt_or_dl_prio(waiter->tree.prio))
return false;
- return rt_mutex_waiter_equal(waiter, top_waiter);
+ return rt_waiter_node_equal(&waiter->tree, &top_waiter->tree);
#else
return false;
#endif
}
#define __node_2_waiter(node) \
- rb_entry((node), struct rt_mutex_waiter, tree_entry)
+ rb_entry((node), struct rt_mutex_waiter, tree.entry)
static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_node *b)
{
struct rt_mutex_waiter *aw = __node_2_waiter(a);
struct rt_mutex_waiter *bw = __node_2_waiter(b);
- if (rt_mutex_waiter_less(aw, bw))
+ if (rt_waiter_node_less(&aw->tree, &bw->tree))
return 1;
if (!build_ww_mutex())
return 0;
- if (rt_mutex_waiter_less(bw, aw))
+ if (rt_waiter_node_less(&bw->tree, &aw->tree))
return 0;
/* NOTE: relies on waiter->ww_ctx being set before insertion */
@@ -395,48 +479,58 @@ static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_nod
static __always_inline void
rt_mutex_enqueue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter)
{
- rb_add_cached(&waiter->tree_entry, &lock->waiters, __waiter_less);
+ lockdep_assert_held(&lock->wait_lock);
+
+ rb_add_cached(&waiter->tree.entry, &lock->waiters, __waiter_less);
}
static __always_inline void
rt_mutex_dequeue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter)
{
- if (RB_EMPTY_NODE(&waiter->tree_entry))
+ lockdep_assert_held(&lock->wait_lock);
+
+ if (RB_EMPTY_NODE(&waiter->tree.entry))
return;
- rb_erase_cached(&waiter->tree_entry, &lock->waiters);
- RB_CLEAR_NODE(&waiter->tree_entry);
+ rb_erase_cached(&waiter->tree.entry, &lock->waiters);
+ RB_CLEAR_NODE(&waiter->tree.entry);
}
-#define __node_2_pi_waiter(node) \
- rb_entry((node), struct rt_mutex_waiter, pi_tree_entry)
+#define __node_2_rt_node(node) \
+ rb_entry((node), struct rt_waiter_node, entry)
-static __always_inline bool
-__pi_waiter_less(struct rb_node *a, const struct rb_node *b)
+static __always_inline bool __pi_waiter_less(struct rb_node *a, const struct rb_node *b)
{
- return rt_mutex_waiter_less(__node_2_pi_waiter(a), __node_2_pi_waiter(b));
+ return rt_waiter_node_less(__node_2_rt_node(a), __node_2_rt_node(b));
}
static __always_inline void
rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
{
- rb_add_cached(&waiter->pi_tree_entry, &task->pi_waiters, __pi_waiter_less);
+ lockdep_assert_held(&task->pi_lock);
+
+ rb_add_cached(&waiter->pi_tree.entry, &task->pi_waiters, __pi_waiter_less);
}
static __always_inline void
rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
{
- if (RB_EMPTY_NODE(&waiter->pi_tree_entry))
+ lockdep_assert_held(&task->pi_lock);
+
+ if (RB_EMPTY_NODE(&waiter->pi_tree.entry))
return;
- rb_erase_cached(&waiter->pi_tree_entry, &task->pi_waiters);
- RB_CLEAR_NODE(&waiter->pi_tree_entry);
+ rb_erase_cached(&waiter->pi_tree.entry, &task->pi_waiters);
+ RB_CLEAR_NODE(&waiter->pi_tree.entry);
}
-static __always_inline void rt_mutex_adjust_prio(struct task_struct *p)
+static __always_inline void rt_mutex_adjust_prio(struct rt_mutex_base *lock,
+ struct task_struct *p)
{
struct task_struct *pi_task = NULL;
+ lockdep_assert_held(&lock->wait_lock);
+ lockdep_assert(rt_mutex_owner(lock) == p);
lockdep_assert_held(&p->pi_lock);
if (task_has_pi_waiters(p))
@@ -532,9 +626,14 @@ static __always_inline struct rt_mutex_base *task_blocked_on_lock(struct task_st
* Chain walk basics and protection scope
*
* [R] refcount on task
- * [P] task->pi_lock held
+ * [Pn] task->pi_lock held
* [L] rtmutex->wait_lock held
*
+ * Normal locking order:
+ *
+ * rtmutex->wait_lock
+ * task->pi_lock
+ *
* Step Description Protected by
* function arguments:
* @task [R]
@@ -549,27 +648,32 @@ static __always_inline struct rt_mutex_base *task_blocked_on_lock(struct task_st
* again:
* loop_sanity_check();
* retry:
- * [1] lock(task->pi_lock); [R] acquire [P]
- * [2] waiter = task->pi_blocked_on; [P]
- * [3] check_exit_conditions_1(); [P]
- * [4] lock = waiter->lock; [P]
- * [5] if (!try_lock(lock->wait_lock)) { [P] try to acquire [L]
- * unlock(task->pi_lock); release [P]
+ * [1] lock(task->pi_lock); [R] acquire [P1]
+ * [2] waiter = task->pi_blocked_on; [P1]
+ * [3] check_exit_conditions_1(); [P1]
+ * [4] lock = waiter->lock; [P1]
+ * [5] if (!try_lock(lock->wait_lock)) { [P1] try to acquire [L]
+ * unlock(task->pi_lock); release [P1]
* goto retry;
* }
- * [6] check_exit_conditions_2(); [P] + [L]
- * [7] requeue_lock_waiter(lock, waiter); [P] + [L]
- * [8] unlock(task->pi_lock); release [P]
+ * [6] check_exit_conditions_2(); [P1] + [L]
+ * [7] requeue_lock_waiter(lock, waiter); [P1] + [L]
+ * [8] unlock(task->pi_lock); release [P1]
* put_task_struct(task); release [R]
* [9] check_exit_conditions_3(); [L]
* [10] task = owner(lock); [L]
* get_task_struct(task); [L] acquire [R]
- * lock(task->pi_lock); [L] acquire [P]
- * [11] requeue_pi_waiter(tsk, waiters(lock));[P] + [L]
- * [12] check_exit_conditions_4(); [P] + [L]
- * [13] unlock(task->pi_lock); release [P]
+ * lock(task->pi_lock); [L] acquire [P2]
+ * [11] requeue_pi_waiter(tsk, waiters(lock));[P2] + [L]
+ * [12] check_exit_conditions_4(); [P2] + [L]
+ * [13] unlock(task->pi_lock); release [P2]
* unlock(lock->wait_lock); release [L]
* goto again;
+ *
+ * Where P1 is the blocking task and P2 is the lock owner; going up one step
+ * the owner becomes the next blocked task etc..
+ *
+*
*/
static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
enum rtmutex_chainwalk chwalk,
@@ -717,7 +821,7 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
* enabled we continue, but stop the requeueing in the chain
* walk.
*/
- if (rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
+ if (rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) {
if (!detect_deadlock)
goto out_unlock_pi;
else
@@ -725,13 +829,18 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
}
/*
- * [4] Get the next lock
+ * [4] Get the next lock; per holding task->pi_lock we can't unblock
+ * and guarantee @lock's existence.
*/
lock = waiter->lock;
/*
* [5] We need to trylock here as we are holding task->pi_lock,
* which is the reverse lock order versus the other rtmutex
* operations.
+ *
+ * Per the above, holding task->pi_lock guarantees lock exists, so
+ * inverting this lock order is infeasible from a life-time
+ * perspective.
*/
if (!raw_spin_trylock(&lock->wait_lock)) {
raw_spin_unlock_irq(&task->pi_lock);
@@ -835,17 +944,18 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
* or
*
* DL CBS enforcement advancing the effective deadline.
- *
- * Even though pi_waiters also uses these fields, and that tree is only
- * updated in [11], we can do this here, since we hold [L], which
- * serializes all pi_waiters access and rb_erase() does not care about
- * the values of the node being removed.
*/
waiter_update_prio(waiter, task);
rt_mutex_enqueue(lock, waiter);
- /* [8] Release the task */
+ /*
+ * [8] Release the (blocking) task in preparation for
+ * taking the owner task in [10].
+ *
+ * Since we hold lock->waiter_lock, task cannot unblock, even if we
+ * release task->pi_lock.
+ */
raw_spin_unlock(&task->pi_lock);
put_task_struct(task);
@@ -862,13 +972,19 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
* then we need to wake the new top waiter up to try
* to get the lock.
*/
- if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
- wake_up_state(waiter->task, waiter->wake_state);
+ top_waiter = rt_mutex_top_waiter(lock);
+ if (prerequeue_top_waiter != top_waiter)
+ wake_up_state(top_waiter->task, top_waiter->wake_state);
raw_spin_unlock_irq(&lock->wait_lock);
return 0;
}
- /* [10] Grab the next task, i.e. the owner of @lock */
+ /*
+ * [10] Grab the next task, i.e. the owner of @lock
+ *
+ * Per holding lock->wait_lock and checking for !owner above, there
+ * must be an owner and it cannot go away.
+ */
task = get_task_struct(rt_mutex_owner(lock));
raw_spin_lock(&task->pi_lock);
@@ -881,8 +997,9 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
* and adjust the priority of the owner.
*/
rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
+ waiter_clone_prio(waiter, task);
rt_mutex_enqueue_pi(task, waiter);
- rt_mutex_adjust_prio(task);
+ rt_mutex_adjust_prio(lock, task);
} else if (prerequeue_top_waiter == waiter) {
/*
@@ -897,8 +1014,9 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
*/
rt_mutex_dequeue_pi(task, waiter);
waiter = rt_mutex_top_waiter(lock);
+ waiter_clone_prio(waiter, task);
rt_mutex_enqueue_pi(task, waiter);
- rt_mutex_adjust_prio(task);
+ rt_mutex_adjust_prio(lock, task);
} else {
/*
* Nothing changed. No need to do any priority
@@ -1086,7 +1204,8 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
struct rt_mutex_waiter *waiter,
struct task_struct *task,
struct ww_acquire_ctx *ww_ctx,
- enum rtmutex_chainwalk chwalk)
+ enum rtmutex_chainwalk chwalk,
+ struct wake_q_head *wake_q)
{
struct task_struct *owner = rt_mutex_owner(lock);
struct rt_mutex_waiter *top_waiter = waiter;
@@ -1114,6 +1233,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
waiter->task = task;
waiter->lock = lock;
waiter_update_prio(waiter, task);
+ waiter_clone_prio(waiter, task);
/* Get the top priority waiter on the lock */
if (rt_mutex_has_waiters(lock))
@@ -1129,7 +1249,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
/* Check whether the waiter should back out immediately */
rtm = container_of(lock, struct rt_mutex, rtmutex);
- res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx);
+ res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx, wake_q);
if (res) {
raw_spin_lock(&task->pi_lock);
rt_mutex_dequeue(lock, waiter);
@@ -1147,7 +1267,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
rt_mutex_dequeue_pi(owner, top_waiter);
rt_mutex_enqueue_pi(owner, waiter);
- rt_mutex_adjust_prio(owner);
+ rt_mutex_adjust_prio(lock, owner);
if (owner->pi_blocked_on)
chain_walk = 1;
} else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
@@ -1173,7 +1293,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
*/
get_task_struct(owner);
- raw_spin_unlock_irq(&lock->wait_lock);
+ raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q);
res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
next_lock, waiter, task);
@@ -1194,6 +1314,8 @@ static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh,
{
struct rt_mutex_waiter *waiter;
+ lockdep_assert_held(&lock->wait_lock);
+
raw_spin_lock(&current->pi_lock);
waiter = rt_mutex_top_waiter(lock);
@@ -1206,7 +1328,7 @@ static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh,
* task unblocks.
*/
rt_mutex_dequeue_pi(current, waiter);
- rt_mutex_adjust_prio(current);
+ rt_mutex_adjust_prio(lock, current);
/*
* As we are waking up the top waiter, and the waiter stays
@@ -1241,7 +1363,7 @@ static int __sched __rt_mutex_slowtrylock(struct rt_mutex_base *lock)
* try_to_take_rt_mutex() sets the lock waiters bit
* unconditionally. Clean this up.
*/
- fixup_rt_mutex_waiters(lock);
+ fixup_rt_mutex_waiters(lock, true);
return ret;
}
@@ -1442,7 +1564,7 @@ static void __sched remove_waiter(struct rt_mutex_base *lock,
if (rt_mutex_has_waiters(lock))
rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
- rt_mutex_adjust_prio(owner);
+ rt_mutex_adjust_prio(lock, owner);
/* Store the lock on which owner is blocked or NULL */
next_lock = task_blocked_on_lock(owner);
@@ -1475,6 +1597,7 @@ static void __sched remove_waiter(struct rt_mutex_base *lock,
* or TASK_UNINTERRUPTIBLE)
* @timeout: the pre-initialized and started timer, or NULL for none
* @waiter: the pre-initialized rt_mutex_waiter
+ * @wake_q: wake_q of tasks to wake when we drop the lock->wait_lock
*
* Must be called with lock->wait_lock held and interrupts disabled
*/
@@ -1482,16 +1605,21 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
struct ww_acquire_ctx *ww_ctx,
unsigned int state,
struct hrtimer_sleeper *timeout,
- struct rt_mutex_waiter *waiter)
+ struct rt_mutex_waiter *waiter,
+ struct wake_q_head *wake_q)
+ __releases(&lock->wait_lock) __acquires(&lock->wait_lock)
{
struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex);
struct task_struct *owner;
int ret = 0;
+ lockevent_inc(rtmutex_slow_block);
for (;;) {
/* Try to acquire the lock: */
- if (try_to_take_rt_mutex(lock, current, waiter))
+ if (try_to_take_rt_mutex(lock, current, waiter)) {
+ lockevent_inc(rtmutex_slow_acq3);
break;
+ }
if (timeout && !timeout->task) {
ret = -ETIMEDOUT;
@@ -1512,10 +1640,12 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
owner = rt_mutex_owner(lock);
else
owner = NULL;
- raw_spin_unlock_irq(&lock->wait_lock);
+ raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q);
- if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner))
- schedule();
+ if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) {
+ lockevent_inc(rtmutex_slow_sleep);
+ rt_mutex_schedule();
+ }
raw_spin_lock_irq(&lock->wait_lock);
set_current_state(state);
@@ -1526,6 +1656,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
}
static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
+ struct rt_mutex_base *lock,
struct rt_mutex_waiter *w)
{
/*
@@ -1538,13 +1669,13 @@ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
if (build_ww_mutex() && w->ww_ctx)
return;
- /*
- * Yell loudly and stop the task right here.
- */
+ raw_spin_unlock_irq(&lock->wait_lock);
+
WARN(1, "rtmutex deadlock detected\n");
+
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
- schedule();
+ rt_mutex_schedule();
}
}
@@ -1555,58 +1686,70 @@ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
* @state: The task state for sleeping
* @chwalk: Indicator whether full or partial chainwalk is requested
* @waiter: Initializer waiter for blocking
+ * @wake_q: The wake_q to wake tasks after we release the wait_lock
*/
static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
struct ww_acquire_ctx *ww_ctx,
unsigned int state,
enum rtmutex_chainwalk chwalk,
- struct rt_mutex_waiter *waiter)
+ struct rt_mutex_waiter *waiter,
+ struct wake_q_head *wake_q)
{
struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex);
struct ww_mutex *ww = ww_container_of(rtm);
int ret;
lockdep_assert_held(&lock->wait_lock);
+ lockevent_inc(rtmutex_slowlock);
/* Try to acquire the lock again: */
if (try_to_take_rt_mutex(lock, current, NULL)) {
if (build_ww_mutex() && ww_ctx) {
- __ww_mutex_check_waiters(rtm, ww_ctx);
+ __ww_mutex_check_waiters(rtm, ww_ctx, wake_q);
ww_mutex_lock_acquired(ww, ww_ctx);
}
+ lockevent_inc(rtmutex_slow_acq1);
return 0;
}
set_current_state(state);
- ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk);
+ trace_contention_begin(lock, LCB_F_RT);
+
+ ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk, wake_q);
if (likely(!ret))
- ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter);
+ ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter, wake_q);
if (likely(!ret)) {
/* acquired the lock */
if (build_ww_mutex() && ww_ctx) {
if (!ww_ctx->is_wait_die)
- __ww_mutex_check_waiters(rtm, ww_ctx);
+ __ww_mutex_check_waiters(rtm, ww_ctx, wake_q);
ww_mutex_lock_acquired(ww, ww_ctx);
}
+ lockevent_inc(rtmutex_slow_acq2);
} else {
__set_current_state(TASK_RUNNING);
remove_waiter(lock, waiter);
- rt_mutex_handle_deadlock(ret, chwalk, waiter);
+ rt_mutex_handle_deadlock(ret, chwalk, lock, waiter);
+ lockevent_inc(rtmutex_deadlock);
}
/*
* try_to_take_rt_mutex() sets the waiter bit
* unconditionally. We might have to fix that up.
*/
- fixup_rt_mutex_waiters(lock);
+ fixup_rt_mutex_waiters(lock, true);
+
+ trace_contention_end(lock, ret);
+
return ret;
}
static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock,
struct ww_acquire_ctx *ww_ctx,
- unsigned int state)
+ unsigned int state,
+ struct wake_q_head *wake_q)
{
struct rt_mutex_waiter waiter;
int ret;
@@ -1615,9 +1758,10 @@ static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock,
waiter.ww_ctx = ww_ctx;
ret = __rt_mutex_slowlock(lock, ww_ctx, state, RT_MUTEX_MIN_CHAINWALK,
- &waiter);
+ &waiter, wake_q);
debug_rt_mutex_free_waiter(&waiter);
+ lockevent_cond_inc(rtmutex_slow_wake, !wake_q_empty(wake_q));
return ret;
}
@@ -1631,10 +1775,20 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
struct ww_acquire_ctx *ww_ctx,
unsigned int state)
{
+ DEFINE_WAKE_Q(wake_q);
unsigned long flags;
int ret;
/*
+ * Do all pre-schedule work here, before we queue a waiter and invoke
+ * PI -- any such work that trips on rtlock (PREEMPT_RT spinlock) would
+ * otherwise recurse back into task_blocks_on_rt_mutex() through
+ * rtlock_slowlock() and will then enqueue a second waiter for this
+ * same task and things get really confusing real fast.
+ */
+ rt_mutex_pre_schedule();
+
+ /*
* Technically we could use raw_spin_[un]lock_irq() here, but this can
* be called in early boot if the cmpxchg() fast path is disabled
* (debug, no architecture support). In this case we will acquire the
@@ -1643,8 +1797,9 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
* irqsave/restore variants.
*/
raw_spin_lock_irqsave(&lock->wait_lock, flags);
- ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state);
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+ ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state, &wake_q);
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
+ rt_mutex_post_schedule();
return ret;
}
@@ -1652,7 +1807,9 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock,
unsigned int state)
{
- if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
+ lockdep_assert(!current->pi_blocked_on);
+
+ if (likely(rt_mutex_try_acquire(lock)))
return 0;
return rt_mutex_slowlock(lock, NULL, state);
@@ -1667,37 +1824,49 @@ static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock,
/**
* rtlock_slowlock_locked - Slow path lock acquisition for RT locks
* @lock: The underlying RT mutex
+ * @wake_q: The wake_q to wake tasks after we release the wait_lock
*/
-static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock)
+static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock,
+ struct wake_q_head *wake_q)
+ __releases(&lock->wait_lock) __acquires(&lock->wait_lock)
{
struct rt_mutex_waiter waiter;
struct task_struct *owner;
lockdep_assert_held(&lock->wait_lock);
+ lockevent_inc(rtlock_slowlock);
- if (try_to_take_rt_mutex(lock, current, NULL))
+ if (try_to_take_rt_mutex(lock, current, NULL)) {
+ lockevent_inc(rtlock_slow_acq1);
return;
+ }
rt_mutex_init_rtlock_waiter(&waiter);
/* Save current state and set state to TASK_RTLOCK_WAIT */
current_save_and_set_rtlock_wait_state();
- task_blocks_on_rt_mutex(lock, &waiter, current, NULL, RT_MUTEX_MIN_CHAINWALK);
+ trace_contention_begin(lock, LCB_F_RT);
+
+ task_blocks_on_rt_mutex(lock, &waiter, current, NULL, RT_MUTEX_MIN_CHAINWALK, wake_q);
for (;;) {
/* Try to acquire the lock again */
- if (try_to_take_rt_mutex(lock, current, &waiter))
+ if (try_to_take_rt_mutex(lock, current, &waiter)) {
+ lockevent_inc(rtlock_slow_acq2);
break;
+ }
if (&waiter == rt_mutex_top_waiter(lock))
owner = rt_mutex_owner(lock);
else
owner = NULL;
- raw_spin_unlock_irq(&lock->wait_lock);
+ raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q);
- if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner))
+ if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner)) {
+ lockevent_inc(rtlock_slow_sleep);
schedule_rtlock();
+ }
raw_spin_lock_irq(&lock->wait_lock);
set_current_state(TASK_RTLOCK_WAIT);
@@ -1710,17 +1879,21 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock)
* try_to_take_rt_mutex() sets the waiter bit unconditionally.
* We might have to fix that up:
*/
- fixup_rt_mutex_waiters(lock);
+ fixup_rt_mutex_waiters(lock, true);
debug_rt_mutex_free_waiter(&waiter);
+
+ trace_contention_end(lock, 0);
+ lockevent_cond_inc(rtlock_slow_wake, !wake_q_empty(wake_q));
}
static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock)
{
unsigned long flags;
+ DEFINE_WAKE_Q(wake_q);
raw_spin_lock_irqsave(&lock->wait_lock, flags);
- rtlock_slowlock_locked(lock);
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+ rtlock_slowlock_locked(lock, &wake_q);
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
}
#endif /* RT_MUTEX_BUILD_SPINLOCKS */
diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c
index 900220941caa..59dbd29cb219 100644
--- a/kernel/locking/rtmutex_api.c
+++ b/kernel/locking/rtmutex_api.c
@@ -13,6 +13,24 @@
*/
int max_lock_depth = 1024;
+static const struct ctl_table rtmutex_sysctl_table[] = {
+ {
+ .procname = "max_lock_depth",
+ .data = &max_lock_depth,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+};
+
+static int __init init_rtmutex_sysctl(void)
+{
+ register_sysctl_init("kernel", rtmutex_sysctl_table);
+ return 0;
+}
+
+subsys_initcall(init_rtmutex_sysctl);
+
/*
* Debug aware fast / slowpath lock,trylock,unlock
*
@@ -175,10 +193,10 @@ bool __sched __rt_mutex_futex_unlock(struct rt_mutex_base *lock,
}
/*
- * We've already deboosted, mark_wakeup_next_waiter() will
- * retain preempt_disabled when we drop the wait_lock, to
- * avoid inversion prior to the wakeup. preempt_disable()
- * therein pairs with rt_mutex_postunlock().
+ * mark_wakeup_next_waiter() deboosts and retains preemption
+ * disabled when dropping the wait_lock, to avoid inversion prior
+ * to the wakeup. preempt_disable() therein pairs with the
+ * preempt_enable() in rt_mutex_postunlock().
*/
mark_wakeup_next_waiter(wqh, lock);
@@ -267,7 +285,7 @@ void __sched rt_mutex_init_proxy_locked(struct rt_mutex_base *lock,
void __sched rt_mutex_proxy_unlock(struct rt_mutex_base *lock)
{
debug_rt_mutex_proxy_unlock(lock);
- rt_mutex_set_owner(lock, NULL);
+ rt_mutex_clear_owner(lock);
}
/**
@@ -275,6 +293,7 @@ void __sched rt_mutex_proxy_unlock(struct rt_mutex_base *lock)
* @lock: the rt_mutex to take
* @waiter: the pre-initialized rt_mutex_waiter
* @task: the task to prepare
+ * @wake_q: the wake_q to wake tasks after we release the wait_lock
*
* Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
* detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
@@ -291,7 +310,8 @@ void __sched rt_mutex_proxy_unlock(struct rt_mutex_base *lock)
*/
int __sched __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
struct rt_mutex_waiter *waiter,
- struct task_struct *task)
+ struct task_struct *task,
+ struct wake_q_head *wake_q)
{
int ret;
@@ -302,7 +322,7 @@ int __sched __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
/* We enforce deadlock detection for futexes */
ret = task_blocks_on_rt_mutex(lock, waiter, task, NULL,
- RT_MUTEX_FULL_CHAINWALK);
+ RT_MUTEX_FULL_CHAINWALK, wake_q);
if (ret && !rt_mutex_owner(lock)) {
/*
@@ -341,12 +361,16 @@ int __sched rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
struct task_struct *task)
{
int ret;
+ DEFINE_WAKE_Q(wake_q);
raw_spin_lock_irq(&lock->wait_lock);
- ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
+ ret = __rt_mutex_start_proxy_lock(lock, waiter, task, &wake_q);
if (unlikely(ret))
remove_waiter(lock, waiter);
+ preempt_disable();
raw_spin_unlock_irq(&lock->wait_lock);
+ wake_up_q(&wake_q);
+ preempt_enable();
return ret;
}
@@ -377,12 +401,12 @@ int __sched rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock,
raw_spin_lock_irq(&lock->wait_lock);
/* sleep on the mutex */
set_current_state(TASK_INTERRUPTIBLE);
- ret = rt_mutex_slowlock_block(lock, NULL, TASK_INTERRUPTIBLE, to, waiter);
+ ret = rt_mutex_slowlock_block(lock, NULL, TASK_INTERRUPTIBLE, to, waiter, NULL);
/*
* try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
* have to fix that up.
*/
- fixup_rt_mutex_waiters(lock);
+ fixup_rt_mutex_waiters(lock, true);
raw_spin_unlock_irq(&lock->wait_lock);
return ret;
@@ -438,7 +462,7 @@ bool __sched rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock,
* try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
* have to fix that up.
*/
- fixup_rt_mutex_waiters(lock);
+ fixup_rt_mutex_waiters(lock, false);
raw_spin_unlock_irq(&lock->wait_lock);
@@ -459,7 +483,7 @@ void __sched rt_mutex_adjust_pi(struct task_struct *task)
raw_spin_lock_irqsave(&task->pi_lock, flags);
waiter = task->pi_blocked_on;
- if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
+ if (!waiter || rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) {
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
return;
}
@@ -491,13 +515,11 @@ void rt_mutex_debug_task_free(struct task_struct *task)
#ifdef CONFIG_PREEMPT_RT
/* Mutexes */
-void __mutex_rt_init(struct mutex *mutex, const char *name,
- struct lock_class_key *key)
+static void __mutex_rt_init_generic(struct mutex *mutex)
{
+ rt_mutex_base_init(&mutex->rtmutex);
debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
- lockdep_init_map_wait(&mutex->dep_map, name, key, 0, LD_WAIT_SLEEP);
}
-EXPORT_SYMBOL(__mutex_rt_init);
static __always_inline int __mutex_lock_common(struct mutex *lock,
unsigned int state,
@@ -518,6 +540,13 @@ static __always_inline int __mutex_lock_common(struct mutex *lock,
}
#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void mutex_rt_init_lockdep(struct mutex *mutex, const char *name, struct lock_class_key *key)
+{
+ __mutex_rt_init_generic(mutex);
+ lockdep_init_map_wait(&mutex->dep_map, name, key, 0, LD_WAIT_SLEEP);
+}
+EXPORT_SYMBOL(mutex_rt_init_lockdep);
+
void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass)
{
__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
@@ -538,12 +567,12 @@ int __sched mutex_lock_interruptible_nested(struct mutex *lock,
}
EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
-int __sched mutex_lock_killable_nested(struct mutex *lock,
- unsigned int subclass)
+int __sched _mutex_lock_killable(struct mutex *lock, unsigned int subclass,
+ struct lockdep_map *nest_lock)
{
- return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_);
+ return __mutex_lock_common(lock, TASK_KILLABLE, subclass, nest_lock, _RET_IP_);
}
-EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
+EXPORT_SYMBOL_GPL(_mutex_lock_killable);
void __sched mutex_lock_io_nested(struct mutex *lock, unsigned int subclass)
{
@@ -557,8 +586,29 @@ void __sched mutex_lock_io_nested(struct mutex *lock, unsigned int subclass)
}
EXPORT_SYMBOL_GPL(mutex_lock_io_nested);
+int __sched _mutex_trylock_nest_lock(struct mutex *lock,
+ struct lockdep_map *nest_lock)
+{
+ int ret;
+
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task()))
+ return 0;
+
+ ret = __rt_mutex_trylock(&lock->rtmutex);
+ if (ret)
+ mutex_acquire_nest(&lock->dep_map, 0, 1, nest_lock, _RET_IP_);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(_mutex_trylock_nest_lock);
#else /* CONFIG_DEBUG_LOCK_ALLOC */
+void mutex_rt_init_generic(struct mutex *mutex)
+{
+ __mutex_rt_init_generic(mutex);
+}
+EXPORT_SYMBOL(mutex_rt_init_generic);
+
void __sched mutex_lock(struct mutex *lock)
{
__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
@@ -585,22 +635,16 @@ void __sched mutex_lock_io(struct mutex *lock)
io_schedule_finish(token);
}
EXPORT_SYMBOL(mutex_lock_io);
-#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
int __sched mutex_trylock(struct mutex *lock)
{
- int ret;
-
if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task()))
return 0;
- ret = __rt_mutex_trylock(&lock->rtmutex);
- if (ret)
- mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
-
- return ret;
+ return __rt_mutex_trylock(&lock->rtmutex);
}
EXPORT_SYMBOL(mutex_trylock);
+#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
void __sched mutex_unlock(struct mutex *lock)
{
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index c47e8361bfb5..cf6ddd1b23a2 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -17,33 +17,50 @@
#include <linux/rtmutex.h>
#include <linux/sched/wake_q.h>
+
+/*
+ * This is a helper for the struct rt_mutex_waiter below. A waiter goes in two
+ * separate trees and they need their own copy of the sort keys because of
+ * different locking requirements.
+ *
+ * @entry: rbtree node to enqueue into the waiters tree
+ * @prio: Priority of the waiter
+ * @deadline: Deadline of the waiter if applicable
+ *
+ * See rt_waiter_node_less() and waiter_*_prio().
+ */
+struct rt_waiter_node {
+ struct rb_node entry;
+ int prio;
+ u64 deadline;
+};
+
/*
* This is the control structure for tasks blocked on a rt_mutex,
* which is allocated on the kernel stack on of the blocked task.
*
- * @tree_entry: pi node to enqueue into the mutex waiters tree
- * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree
+ * @tree: node to enqueue into the mutex waiters tree
+ * @pi_tree: node to enqueue into the mutex owner waiters tree
* @task: task reference to the blocked task
* @lock: Pointer to the rt_mutex on which the waiter blocks
* @wake_state: Wakeup state to use (TASK_NORMAL or TASK_RTLOCK_WAIT)
- * @prio: Priority of the waiter
- * @deadline: Deadline of the waiter if applicable
* @ww_ctx: WW context pointer
+ *
+ * @tree is ordered by @lock->wait_lock
+ * @pi_tree is ordered by rt_mutex_owner(@lock)->pi_lock
*/
struct rt_mutex_waiter {
- struct rb_node tree_entry;
- struct rb_node pi_tree_entry;
+ struct rt_waiter_node tree;
+ struct rt_waiter_node pi_tree;
struct task_struct *task;
struct rt_mutex_base *lock;
unsigned int wake_state;
- int prio;
- u64 deadline;
struct ww_acquire_ctx *ww_ctx;
};
/**
- * rt_wake_q_head - Wrapper around regular wake_q_head to support
- * "sleeping" spinlocks on RT
+ * struct rt_wake_q_head - Wrapper around regular wake_q_head to support
+ * "sleeping" spinlocks on RT
* @head: The regular wake_q_head for sleeping lock variants
* @rtlock_task: Task pointer for RT lock (spin/rwlock) wakeups
*/
@@ -66,7 +83,8 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex_base *lock,
extern void rt_mutex_proxy_unlock(struct rt_mutex_base *lock);
extern int __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
struct rt_mutex_waiter *waiter,
- struct task_struct *task);
+ struct task_struct *task,
+ struct wake_q_head *);
extern int rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
struct rt_mutex_waiter *waiter,
struct task_struct *task);
@@ -105,7 +123,7 @@ static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock,
{
struct rb_node *leftmost = rb_first_cached(&lock->waiters);
- return rb_entry(leftmost, struct rt_mutex_waiter, tree_entry) == waiter;
+ return rb_entry(leftmost, struct rt_mutex_waiter, tree.entry) == waiter;
}
static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *lock)
@@ -113,8 +131,10 @@ static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *
struct rb_node *leftmost = rb_first_cached(&lock->waiters);
struct rt_mutex_waiter *w = NULL;
+ lockdep_assert_held(&lock->wait_lock);
+
if (leftmost) {
- w = rb_entry(leftmost, struct rt_mutex_waiter, tree_entry);
+ w = rb_entry(leftmost, struct rt_mutex_waiter, tree.entry);
BUG_ON(w->lock != lock);
}
return w;
@@ -127,17 +147,10 @@ static inline int task_has_pi_waiters(struct task_struct *p)
static inline struct rt_mutex_waiter *task_top_pi_waiter(struct task_struct *p)
{
- return rb_entry(p->pi_waiters.rb_leftmost, struct rt_mutex_waiter,
- pi_tree_entry);
-}
-
-#define RT_MUTEX_HAS_WAITERS 1UL
-
-static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock)
-{
- unsigned long owner = (unsigned long) READ_ONCE(lock->owner);
+ lockdep_assert_held(&p->pi_lock);
- return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS);
+ return rb_entry(p->pi_waiters.rb_leftmost, struct rt_mutex_waiter,
+ pi_tree.entry);
}
/*
@@ -190,8 +203,8 @@ static inline void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
static inline void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
{
debug_rt_mutex_init_waiter(waiter);
- RB_CLEAR_NODE(&waiter->pi_tree_entry);
- RB_CLEAR_NODE(&waiter->tree_entry);
+ RB_CLEAR_NODE(&waiter->pi_tree.entry);
+ RB_CLEAR_NODE(&waiter->tree.entry);
waiter->wake_state = TASK_NORMAL;
waiter->task = NULL;
}
diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c
index 6fd3162e4098..9f4322c07486 100644
--- a/kernel/locking/rwbase_rt.c
+++ b/kernel/locking/rwbase_rt.c
@@ -69,18 +69,11 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
unsigned int state)
{
struct rt_mutex_base *rtm = &rwb->rtmutex;
+ DEFINE_WAKE_Q(wake_q);
int ret;
+ rwbase_pre_schedule();
raw_spin_lock_irq(&rtm->wait_lock);
- /*
- * Allow readers, as long as the writer has not completely
- * acquired the semaphore for write.
- */
- if (atomic_read(&rwb->readers) != WRITER_BIAS) {
- atomic_inc(&rwb->readers);
- raw_spin_unlock_irq(&rtm->wait_lock);
- return 0;
- }
/*
* Call into the slow lock path with the rtmutex->wait_lock
@@ -112,11 +105,13 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
* Reader2 to call up_read(), which might be unbound.
*/
+ trace_contention_begin(rwb, LCB_F_RT | LCB_F_READ);
+
/*
* For rwlocks this returns 0 unconditionally, so the below
* !ret conditionals are optimized out.
*/
- ret = rwbase_rtmutex_slowlock_locked(rtm, state);
+ ret = rwbase_rtmutex_slowlock_locked(rtm, state, &wake_q);
/*
* On success the rtmutex is held, so there can't be a writer
@@ -127,15 +122,25 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
*/
if (!ret)
atomic_inc(&rwb->readers);
+
+ preempt_disable();
raw_spin_unlock_irq(&rtm->wait_lock);
+ wake_up_q(&wake_q);
+ preempt_enable();
+
if (!ret)
rwbase_rtmutex_unlock(rtm);
+
+ trace_contention_end(rwb, ret);
+ rwbase_post_schedule();
return ret;
}
static __always_inline int rwbase_read_lock(struct rwbase_rt *rwb,
unsigned int state)
{
+ lockdep_assert(!current->pi_blocked_on);
+
if (rwbase_read_trylock(rwb))
return 0;
@@ -242,16 +247,21 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
/* Force readers into slow path */
atomic_sub(READER_BIAS, &rwb->readers);
+ rwbase_pre_schedule();
+
raw_spin_lock_irqsave(&rtm->wait_lock, flags);
if (__rwbase_write_trylock(rwb))
goto out_unlock;
rwbase_set_and_save_current_state(state);
+ trace_contention_begin(rwb, LCB_F_RT | LCB_F_WRITE);
for (;;) {
/* Optimized out for rwlocks */
if (rwbase_signal_pending_state(state, current)) {
rwbase_restore_current_state();
__rwbase_write_unlock(rwb, 0, flags);
+ rwbase_post_schedule();
+ trace_contention_end(rwb, -EINTR);
return -EINTR;
}
@@ -265,9 +275,11 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
set_current_state(state);
}
rwbase_restore_current_state();
+ trace_contention_end(rwb, 0);
out_unlock:
raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+ rwbase_post_schedule();
return 0;
}
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 69aba4abe104..24df4d98f7d2 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -27,6 +27,8 @@
#include <linux/export.h>
#include <linux/rwsem.h>
#include <linux/atomic.h>
+#include <linux/hung_task.h>
+#include <trace/events/lock.h>
#ifndef CONFIG_PREEMPT_RT
#include "lock_events.h"
@@ -34,7 +36,7 @@
/*
* The least significant 2 bits of the owner value has the following
* meanings when set.
- * - Bit 0: RWSEM_READER_OWNED - The rwsem is owned by readers
+ * - Bit 0: RWSEM_READER_OWNED - rwsem may be owned by readers (just a hint)
* - Bit 1: RWSEM_NONSPINNABLE - Cannot spin on a reader-owned lock
*
* When the rwsem is reader-owned and a spinning writer has timed out,
@@ -132,14 +134,19 @@
* the owner value concurrently without lock. Read from owner, however,
* may not need READ_ONCE() as long as the pointer value is only used
* for comparison and isn't being dereferenced.
+ *
+ * Both rwsem_{set,clear}_owner() functions should be in the same
+ * preempt disable section as the atomic op that changes sem->count.
*/
static inline void rwsem_set_owner(struct rw_semaphore *sem)
{
+ lockdep_assert_preemption_disabled();
atomic_long_set(&sem->owner, (long)current);
}
static inline void rwsem_clear_owner(struct rw_semaphore *sem)
{
+ lockdep_assert_preemption_disabled();
atomic_long_set(&sem->owner, 0);
}
@@ -175,12 +182,21 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
__rwsem_set_reader_owned(sem, current);
}
+#if defined(CONFIG_DEBUG_RWSEMS) || defined(CONFIG_DETECT_HUNG_TASK_BLOCKER)
+/*
+ * Return just the real task structure pointer of the owner
+ */
+struct task_struct *rwsem_owner(struct rw_semaphore *sem)
+{
+ return (struct task_struct *)
+ (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK);
+}
+
/*
* Return true if the rwsem is owned by a reader.
*/
-static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
+bool is_rwsem_reader_owned(struct rw_semaphore *sem)
{
-#ifdef CONFIG_DEBUG_RWSEMS
/*
* Check the count to see if it is write-locked.
*/
@@ -188,16 +204,14 @@ static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
if (count & RWSEM_WRITER_MASK)
return false;
-#endif
return rwsem_test_oflags(sem, RWSEM_READER_OWNED);
}
-#ifdef CONFIG_DEBUG_RWSEMS
/*
- * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there
- * is a task pointer in owner of a reader-owned rwsem, it will be the
- * real owner or one of the real owners. The only exception is when the
- * unlock is done by up_read_non_owner().
+ * With CONFIG_DEBUG_RWSEMS or CONFIG_DETECT_HUNG_TASK_BLOCKER configured,
+ * it will make sure that the owner field of a reader-owned rwsem either
+ * points to a real reader-owner(s) or gets cleared. The only exception is
+ * when the unlock is done by up_read_non_owner().
*/
static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
{
@@ -260,15 +274,6 @@ static inline bool rwsem_write_trylock(struct rw_semaphore *sem)
}
/*
- * Return just the real task structure pointer of the owner
- */
-static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem)
-{
- return (struct task_struct *)
- (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK);
-}
-
-/*
* Return the real task structure pointer of the owner and the embedded
* flags in the owner. pflags must be non-NULL.
*/
@@ -334,8 +339,6 @@ struct rwsem_waiter {
struct task_struct *task;
enum rwsem_waiter_type type;
unsigned long timeout;
-
- /* Writer only, not initialized in reader */
bool handoff_set;
};
#define rwsem_first_waiter(sem) \
@@ -375,16 +378,19 @@ rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
*
* Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of
* this function. Modify with care.
+ *
+ * Return: true if wait_list isn't empty and false otherwise
*/
-static inline void
+static inline bool
rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
{
lockdep_assert_held(&sem->wait_lock);
list_del(&waiter->list);
if (likely(!list_empty(&sem->wait_list)))
- return;
+ return true;
atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count);
+ return false;
}
/*
@@ -455,10 +461,12 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
* to give up the lock), request a HANDOFF to
* force the issue.
*/
- if (!(oldcount & RWSEM_FLAG_HANDOFF) &&
- time_after(jiffies, waiter->timeout)) {
- adjustment -= RWSEM_FLAG_HANDOFF;
- lockevent_inc(rwsem_rlock_handoff);
+ if (time_after(jiffies, waiter->timeout)) {
+ if (!(oldcount & RWSEM_FLAG_HANDOFF)) {
+ adjustment -= RWSEM_FLAG_HANDOFF;
+ lockevent_inc(rwsem_rlock_handoff);
+ }
+ waiter->handoff_set = true;
}
atomic_long_add(-adjustment, &sem->count);
@@ -559,6 +567,33 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
}
/*
+ * Remove a waiter and try to wake up other waiters in the wait queue
+ * This function is called from the out_nolock path of both the reader and
+ * writer slowpaths with wait_lock held. It releases the wait_lock and
+ * optionally wake up waiters before it returns.
+ */
+static inline void
+rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
+ struct wake_q_head *wake_q)
+ __releases(&sem->wait_lock)
+{
+ bool first = rwsem_first_waiter(sem) == waiter;
+
+ wake_q_init(wake_q);
+
+ /*
+ * If the wait_list isn't empty and the waiter to be deleted is
+ * the first waiter, we wake up the remaining waiters as they may
+ * be eligible to acquire or spin on the lock.
+ */
+ if (rwsem_del_waiter(sem, waiter) && first)
+ rwsem_mark_wake(sem, RWSEM_WAKE_ANY, wake_q);
+ raw_spin_unlock_irq(&sem->wait_lock);
+ if (!wake_q_empty(wake_q))
+ wake_up_q(wake_q);
+}
+
+/*
* This function must be called with the sem->wait_lock held to prevent
* race conditions between checking the rwsem wait list and setting the
* sem->count accordingly.
@@ -568,7 +603,7 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
struct rwsem_waiter *waiter)
{
- bool first = rwsem_first_waiter(sem) == waiter;
+ struct rwsem_waiter *first = rwsem_first_waiter(sem);
long count, new;
lockdep_assert_held(&sem->wait_lock);
@@ -578,17 +613,24 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
if (has_handoff) {
- if (!first)
+ /*
+ * Honor handoff bit and yield only when the first
+ * waiter is the one that set it. Otherwisee, we
+ * still try to acquire the rwsem.
+ */
+ if (first->handoff_set && (waiter != first))
return false;
-
- /* First waiter inherits a previously set handoff bit */
- waiter->handoff_set = true;
}
new = count;
if (count & RWSEM_LOCK_MASK) {
- if (has_handoff || (!rt_task(waiter->task) &&
+ /*
+ * A waiter (first or not) can set the handoff bit
+ * if it is an RT task or wait in the wait queue
+ * for too long.
+ */
+ if (has_handoff || (!rt_or_dl_task(waiter->task) &&
!time_after(jiffies, waiter->timeout)))
return false;
@@ -603,11 +645,12 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
} while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));
/*
- * We have either acquired the lock with handoff bit cleared or
- * set the handoff bit.
+ * We have either acquired the lock with handoff bit cleared or set
+ * the handoff bit. Only the first waiter can have its handoff_set
+ * set here to enable optimistic spinning in slowpath loop.
*/
if (new & RWSEM_FLAG_HANDOFF) {
- waiter->handoff_set = true;
+ first->handoff_set = true;
lockevent_inc(rwsem_wlock_handoff);
return false;
}
@@ -669,7 +712,6 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
return false;
}
- preempt_disable();
/*
* Disable preemption is equal to the RCU read-side crital section,
* thus the task_strcut structure won't go away.
@@ -681,14 +723,11 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
if ((flags & RWSEM_NONSPINNABLE) ||
(owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner)))
ret = false;
- preempt_enable();
lockevent_cond_inc(rwsem_opt_fail, !ret);
return ret;
}
-#define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER | OWNER_READER)
-
static inline enum owner_state
rwsem_owner_state(struct task_struct *owner, unsigned long flags)
{
@@ -781,8 +820,6 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
int loop = 0;
u64 rspin_threshold = 0;
- preempt_disable();
-
/* sem->wait_lock should not be held when doing optimistic spinning */
if (!osq_lock(&sem->osq))
goto done;
@@ -797,7 +834,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
enum owner_state owner_state;
owner_state = rwsem_spin_on_owner(sem);
- if (!(owner_state & OWNER_SPINNABLE))
+ if (owner_state == OWNER_NONSPINNABLE)
break;
/*
@@ -874,7 +911,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
if (owner_state != OWNER_WRITER) {
if (need_resched())
break;
- if (rt_task(current) &&
+ if (rt_or_dl_task(current) &&
(prev_owner_state != OWNER_WRITER))
break;
}
@@ -890,7 +927,6 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
}
osq_unlock(&sem->osq);
done:
- preempt_enable();
lockevent_cond_inc(rwsem_opt_fail, !taken);
return taken;
}
@@ -901,7 +937,7 @@ done:
*/
static inline void clear_nonspinnable(struct rw_semaphore *sem)
{
- if (rwsem_test_oflags(sem, RWSEM_NONSPINNABLE))
+ if (unlikely(rwsem_test_oflags(sem, RWSEM_NONSPINNABLE)))
atomic_long_andnot(RWSEM_NONSPINNABLE, &sem->owner);
}
@@ -926,6 +962,31 @@ rwsem_spin_on_owner(struct rw_semaphore *sem)
#endif
/*
+ * Prepare to wake up waiter(s) in the wait queue by putting them into the
+ * given wake_q if the rwsem lock owner isn't a writer. If rwsem is likely
+ * reader-owned, wake up read lock waiters in queue front or wake up any
+ * front waiter otherwise.
+
+ * This is being called from both reader and writer slow paths.
+ */
+static inline void rwsem_cond_wake_waiter(struct rw_semaphore *sem, long count,
+ struct wake_q_head *wake_q)
+{
+ enum rwsem_wake_type wake_type;
+
+ if (count & RWSEM_WRITER_MASK)
+ return;
+
+ if (count & RWSEM_READER_MASK) {
+ wake_type = RWSEM_WAKE_READERS;
+ } else {
+ wake_type = RWSEM_WAKE_ANY;
+ clear_nonspinnable(sem);
+ }
+ rwsem_mark_wake(sem, wake_type, wake_q);
+}
+
+/*
* Wait for the read lock to be granted
*/
static struct rw_semaphore __sched *
@@ -935,12 +996,11 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat
long rcnt = (count >> RWSEM_READER_SHIFT);
struct rwsem_waiter waiter;
DEFINE_WAKE_Q(wake_q);
- bool wake = false;
/*
* To prevent a constant stream of readers from starving a sleeping
- * waiter, don't attempt optimistic lock stealing if the lock is
- * currently owned by readers.
+ * writer, don't attempt optimistic lock stealing if the lock is
+ * very likely owned by readers.
*/
if ((atomic_long_read(&sem->owner) & RWSEM_READER_OWNED) &&
(rcnt > 1) && !(count & RWSEM_WRITER_LOCKED))
@@ -972,17 +1032,17 @@ queue:
waiter.task = current;
waiter.type = RWSEM_WAITING_FOR_READ;
waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
+ waiter.handoff_set = false;
raw_spin_lock_irq(&sem->wait_lock);
if (list_empty(&sem->wait_list)) {
/*
* In case the wait queue is empty and the lock isn't owned
- * by a writer or has the handoff bit set, this reader can
- * exit the slowpath and return immediately as its
- * RWSEM_READER_BIAS has already been set in the count.
+ * by a writer, this reader can exit the slowpath and return
+ * immediately as its RWSEM_READER_BIAS has already been set
+ * in the count.
*/
- if (!(atomic_long_read(&sem->count) &
- (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
+ if (!(atomic_long_read(&sem->count) & RWSEM_WRITER_MASK)) {
/* Provide lock ACQUIRE */
smp_acquire__after_ctrl_dep();
raw_spin_unlock_irq(&sem->wait_lock);
@@ -997,26 +1057,20 @@ queue:
/* we're now waiting on the lock, but no longer actively locking */
count = atomic_long_add_return(adjustment, &sem->count);
- /*
- * If there are no active locks, wake the front queued process(es).
- *
- * If there are no writers and we are first in the queue,
- * wake our own waiter to join the existing active readers !
- */
- if (!(count & RWSEM_LOCK_MASK)) {
- clear_nonspinnable(sem);
- wake = true;
- }
- if (wake || (!(count & RWSEM_WRITER_MASK) &&
- (adjustment & RWSEM_FLAG_WAITERS)))
- rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
-
+ rwsem_cond_wake_waiter(sem, count, &wake_q);
raw_spin_unlock_irq(&sem->wait_lock);
- wake_up_q(&wake_q);
+
+ if (!wake_q_empty(&wake_q))
+ wake_up_q(&wake_q);
+
+ trace_contention_begin(sem, LCB_F_READ);
+ set_current_state(state);
+
+ if (state == TASK_UNINTERRUPTIBLE)
+ hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_READER);
/* wait to be given the lock */
for (;;) {
- set_current_state(state);
if (!smp_load_acquire(&waiter.task)) {
/* Matches rwsem_mark_wake()'s smp_store_release(). */
break;
@@ -1029,29 +1083,33 @@ queue:
/* Ordered by sem->wait_lock against rwsem_mark_wake(). */
break;
}
- schedule();
+ schedule_preempt_disabled();
lockevent_inc(rwsem_sleep_reader);
+ set_current_state(state);
}
+ if (state == TASK_UNINTERRUPTIBLE)
+ hung_task_clear_blocker();
+
__set_current_state(TASK_RUNNING);
lockevent_inc(rwsem_rlock);
+ trace_contention_end(sem, 0);
return sem;
out_nolock:
- rwsem_del_waiter(sem, &waiter);
- raw_spin_unlock_irq(&sem->wait_lock);
+ rwsem_del_wake_waiter(sem, &waiter, &wake_q);
__set_current_state(TASK_RUNNING);
lockevent_inc(rwsem_rlock_fail);
+ trace_contention_end(sem, -EINTR);
return ERR_PTR(-EINTR);
}
/*
* Wait until we successfully acquire the write lock
*/
-static struct rw_semaphore *
+static struct rw_semaphore __sched *
rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
{
- long count;
struct rwsem_waiter waiter;
DEFINE_WAKE_Q(wake_q);
@@ -1075,23 +1133,8 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
/* we're now waiting on the lock */
if (rwsem_first_waiter(sem) != &waiter) {
- count = atomic_long_read(&sem->count);
-
- /*
- * If there were already threads queued before us and:
- * 1) there are no active locks, wake the front
- * queued process(es) as the handoff bit might be set.
- * 2) there are no active writers and some readers, the lock
- * must be read owned; so we try to wake any read lock
- * waiters that were queued ahead of us.
- */
- if (count & RWSEM_WRITER_MASK)
- goto wait;
-
- rwsem_mark_wake(sem, (count & RWSEM_READER_MASK)
- ? RWSEM_WAKE_READERS
- : RWSEM_WAKE_ANY, &wake_q);
-
+ rwsem_cond_wake_waiter(sem, atomic_long_read(&sem->count),
+ &wake_q);
if (!wake_q_empty(&wake_q)) {
/*
* We want to minimize wait_lock hold time especially
@@ -1099,16 +1142,19 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
*/
raw_spin_unlock_irq(&sem->wait_lock);
wake_up_q(&wake_q);
- wake_q_init(&wake_q); /* Used again, reinit */
raw_spin_lock_irq(&sem->wait_lock);
}
} else {
atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
}
-wait:
/* wait until we successfully acquire the lock */
set_current_state(state);
+ trace_contention_begin(sem, LCB_F_WRITE);
+
+ if (state == TASK_UNINTERRUPTIBLE)
+ hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_WRITER);
+
for (;;) {
if (rwsem_try_write_lock(sem, &waiter)) {
/* rwsem_try_write_lock() implies ACQUIRE on success */
@@ -1131,34 +1177,33 @@ wait:
if (waiter.handoff_set) {
enum owner_state owner_state;
- preempt_disable();
owner_state = rwsem_spin_on_owner(sem);
- preempt_enable();
-
if (owner_state == OWNER_NULL)
goto trylock_again;
}
- schedule();
+ schedule_preempt_disabled();
lockevent_inc(rwsem_sleep_writer);
set_current_state(state);
trylock_again:
raw_spin_lock_irq(&sem->wait_lock);
}
+
+ if (state == TASK_UNINTERRUPTIBLE)
+ hung_task_clear_blocker();
+
__set_current_state(TASK_RUNNING);
raw_spin_unlock_irq(&sem->wait_lock);
lockevent_inc(rwsem_wlock);
+ trace_contention_end(sem, 0);
return sem;
out_nolock:
__set_current_state(TASK_RUNNING);
raw_spin_lock_irq(&sem->wait_lock);
- rwsem_del_waiter(sem, &waiter);
- if (!list_empty(&sem->wait_list))
- rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
- raw_spin_unlock_irq(&sem->wait_lock);
- wake_up_q(&wake_q);
+ rwsem_del_wake_waiter(sem, &waiter, &wake_q);
lockevent_inc(rwsem_wlock_fail);
+ trace_contention_end(sem, -EINTR);
return ERR_PTR(-EINTR);
}
@@ -1206,77 +1251,96 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
/*
* lock for reading
*/
-static inline int __down_read_common(struct rw_semaphore *sem, int state)
+static __always_inline int __down_read_common(struct rw_semaphore *sem, int state)
{
+ int ret = 0;
long count;
+ preempt_disable();
if (!rwsem_read_trylock(sem, &count)) {
- if (IS_ERR(rwsem_down_read_slowpath(sem, count, state)))
- return -EINTR;
+ if (IS_ERR(rwsem_down_read_slowpath(sem, count, state))) {
+ ret = -EINTR;
+ goto out;
+ }
DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
}
- return 0;
+out:
+ preempt_enable();
+ return ret;
}
-static inline void __down_read(struct rw_semaphore *sem)
+static __always_inline void __down_read(struct rw_semaphore *sem)
{
__down_read_common(sem, TASK_UNINTERRUPTIBLE);
}
-static inline int __down_read_interruptible(struct rw_semaphore *sem)
+static __always_inline int __down_read_interruptible(struct rw_semaphore *sem)
{
return __down_read_common(sem, TASK_INTERRUPTIBLE);
}
-static inline int __down_read_killable(struct rw_semaphore *sem)
+static __always_inline int __down_read_killable(struct rw_semaphore *sem)
{
return __down_read_common(sem, TASK_KILLABLE);
}
static inline int __down_read_trylock(struct rw_semaphore *sem)
{
+ int ret = 0;
long tmp;
DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
+ preempt_disable();
tmp = atomic_long_read(&sem->count);
while (!(tmp & RWSEM_READ_FAILED_MASK)) {
if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
tmp + RWSEM_READER_BIAS)) {
rwsem_set_reader_owned(sem);
- return 1;
+ ret = 1;
+ break;
}
}
- return 0;
+ preempt_enable();
+ return ret;
}
/*
* lock for writing
*/
-static inline int __down_write_common(struct rw_semaphore *sem, int state)
+static __always_inline int __down_write_common(struct rw_semaphore *sem, int state)
{
+ int ret = 0;
+
+ preempt_disable();
if (unlikely(!rwsem_write_trylock(sem))) {
if (IS_ERR(rwsem_down_write_slowpath(sem, state)))
- return -EINTR;
+ ret = -EINTR;
}
-
- return 0;
+ preempt_enable();
+ return ret;
}
-static inline void __down_write(struct rw_semaphore *sem)
+static __always_inline void __down_write(struct rw_semaphore *sem)
{
__down_write_common(sem, TASK_UNINTERRUPTIBLE);
}
-static inline int __down_write_killable(struct rw_semaphore *sem)
+static __always_inline int __down_write_killable(struct rw_semaphore *sem)
{
return __down_write_common(sem, TASK_KILLABLE);
}
static inline int __down_write_trylock(struct rw_semaphore *sem)
{
+ int ret;
+
+ preempt_disable();
DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
- return rwsem_write_trylock(sem);
+ ret = rwsem_write_trylock(sem);
+ preempt_enable();
+
+ return ret;
}
/*
@@ -1289,6 +1353,7 @@ static inline void __up_read(struct rw_semaphore *sem)
DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
+ preempt_disable();
rwsem_clear_reader_owned(sem);
tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
DEBUG_RWSEMS_WARN_ON(tmp < 0, sem);
@@ -1297,6 +1362,7 @@ static inline void __up_read(struct rw_semaphore *sem)
clear_nonspinnable(sem);
rwsem_wake(sem);
}
+ preempt_enable();
}
/*
@@ -1314,10 +1380,12 @@ static inline void __up_write(struct rw_semaphore *sem)
DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) &&
!rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem);
+ preempt_disable();
rwsem_clear_owner(sem);
tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);
if (unlikely(tmp & RWSEM_FLAG_WAITERS))
rwsem_wake(sem);
+ preempt_enable();
}
/*
@@ -1335,11 +1403,13 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
* write side. As such, rely on RELEASE semantics.
*/
DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem);
+ preempt_disable();
tmp = atomic_long_fetch_add_release(
-RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count);
rwsem_set_reader_owned(sem);
if (tmp & RWSEM_FLAG_WAITERS)
rwsem_downgrade_wake(sem);
+ preempt_enable();
}
#else /* !CONFIG_PREEMPT_RT */
@@ -1356,8 +1426,8 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
#define rwbase_rtmutex_lock_state(rtm, state) \
__rt_mutex_lock(rtm, state)
-#define rwbase_rtmutex_slowlock_locked(rtm, state) \
- __rt_mutex_slowlock_locked(rtm, NULL, state)
+#define rwbase_rtmutex_slowlock_locked(rtm, state, wq) \
+ __rt_mutex_slowlock_locked(rtm, NULL, state, wq)
#define rwbase_rtmutex_unlock(rtm) \
__rt_mutex_unlock(rtm)
@@ -1368,8 +1438,14 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
#define rwbase_signal_pending_state(state, current) \
signal_pending_state(state, current)
+#define rwbase_pre_schedule() \
+ rt_mutex_pre_schedule()
+
#define rwbase_schedule() \
- schedule()
+ rt_mutex_schedule()
+
+#define rwbase_post_schedule() \
+ rt_mutex_post_schedule()
#include "rwbase_rt.c"
@@ -1614,6 +1690,12 @@ void down_read_non_owner(struct rw_semaphore *sem)
{
might_sleep();
__down_read(sem);
+ /*
+ * The owner value for a reader-owned lock is mostly for debugging
+ * purpose only and is not critical to the correct functioning of
+ * rwsem. So it is perfectly fine to set it in a preempt-enabled
+ * context here.
+ */
__rwsem_set_reader_owned(sem, NULL);
}
EXPORT_SYMBOL(down_read_non_owner);
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 9ee381e4d2a4..3ef032e22f7e 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -29,15 +29,53 @@
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
+#include <linux/sched/wake_q.h>
#include <linux/semaphore.h>
#include <linux/spinlock.h>
#include <linux/ftrace.h>
+#include <trace/events/lock.h>
+#include <linux/hung_task.h>
static noinline void __down(struct semaphore *sem);
static noinline int __down_interruptible(struct semaphore *sem);
static noinline int __down_killable(struct semaphore *sem);
static noinline int __down_timeout(struct semaphore *sem, long timeout);
-static noinline void __up(struct semaphore *sem);
+static noinline void __up(struct semaphore *sem, struct wake_q_head *wake_q);
+
+#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
+static inline void hung_task_sem_set_holder(struct semaphore *sem)
+{
+ WRITE_ONCE((sem)->last_holder, (unsigned long)current);
+}
+
+static inline void hung_task_sem_clear_if_holder(struct semaphore *sem)
+{
+ if (READ_ONCE((sem)->last_holder) == (unsigned long)current)
+ WRITE_ONCE((sem)->last_holder, 0UL);
+}
+
+unsigned long sem_last_holder(struct semaphore *sem)
+{
+ return READ_ONCE(sem->last_holder);
+}
+#else
+static inline void hung_task_sem_set_holder(struct semaphore *sem)
+{
+}
+static inline void hung_task_sem_clear_if_holder(struct semaphore *sem)
+{
+}
+unsigned long sem_last_holder(struct semaphore *sem)
+{
+ return 0UL;
+}
+#endif
+
+static inline void __sem_acquire(struct semaphore *sem)
+{
+ sem->count--;
+ hung_task_sem_set_holder(sem);
+}
/**
* down - acquire the semaphore
@@ -50,14 +88,14 @@ static noinline void __up(struct semaphore *sem);
* Use of this function is deprecated, please use down_interruptible() or
* down_killable() instead.
*/
-void down(struct semaphore *sem)
+void __sched down(struct semaphore *sem)
{
unsigned long flags;
might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
- sem->count--;
+ __sem_acquire(sem);
else
__down(sem);
raw_spin_unlock_irqrestore(&sem->lock, flags);
@@ -73,7 +111,7 @@ EXPORT_SYMBOL(down);
* If the sleep is interrupted by a signal, this function will return -EINTR.
* If the semaphore is successfully acquired, this function returns 0.
*/
-int down_interruptible(struct semaphore *sem)
+int __sched down_interruptible(struct semaphore *sem)
{
unsigned long flags;
int result = 0;
@@ -81,7 +119,7 @@ int down_interruptible(struct semaphore *sem)
might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
- sem->count--;
+ __sem_acquire(sem);
else
result = __down_interruptible(sem);
raw_spin_unlock_irqrestore(&sem->lock, flags);
@@ -100,7 +138,7 @@ EXPORT_SYMBOL(down_interruptible);
* -EINTR. If the semaphore is successfully acquired, this function returns
* 0.
*/
-int down_killable(struct semaphore *sem)
+int __sched down_killable(struct semaphore *sem)
{
unsigned long flags;
int result = 0;
@@ -108,7 +146,7 @@ int down_killable(struct semaphore *sem)
might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
- sem->count--;
+ __sem_acquire(sem);
else
result = __down_killable(sem);
raw_spin_unlock_irqrestore(&sem->lock, flags);
@@ -130,7 +168,7 @@ EXPORT_SYMBOL(down_killable);
* Unlike mutex_trylock, this function can be used from interrupt context,
* and the semaphore can be released by any task or interrupt.
*/
-int down_trylock(struct semaphore *sem)
+int __sched down_trylock(struct semaphore *sem)
{
unsigned long flags;
int count;
@@ -138,7 +176,7 @@ int down_trylock(struct semaphore *sem)
raw_spin_lock_irqsave(&sem->lock, flags);
count = sem->count - 1;
if (likely(count >= 0))
- sem->count = count;
+ __sem_acquire(sem);
raw_spin_unlock_irqrestore(&sem->lock, flags);
return (count < 0);
@@ -155,7 +193,7 @@ EXPORT_SYMBOL(down_trylock);
* If the semaphore is not released within the specified number of jiffies,
* this function returns -ETIME. It returns 0 if the semaphore was acquired.
*/
-int down_timeout(struct semaphore *sem, long timeout)
+int __sched down_timeout(struct semaphore *sem, long timeout)
{
unsigned long flags;
int result = 0;
@@ -163,7 +201,7 @@ int down_timeout(struct semaphore *sem, long timeout)
might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
- sem->count--;
+ __sem_acquire(sem);
else
result = __down_timeout(sem, timeout);
raw_spin_unlock_irqrestore(&sem->lock, flags);
@@ -179,16 +217,22 @@ EXPORT_SYMBOL(down_timeout);
* Release the semaphore. Unlike mutexes, up() may be called from any
* context and even by tasks which have never called down().
*/
-void up(struct semaphore *sem)
+void __sched up(struct semaphore *sem)
{
unsigned long flags;
+ DEFINE_WAKE_Q(wake_q);
raw_spin_lock_irqsave(&sem->lock, flags);
+
+ hung_task_sem_clear_if_holder(sem);
+
if (likely(list_empty(&sem->wait_list)))
sem->count++;
else
- __up(sem);
+ __up(sem, &wake_q);
raw_spin_unlock_irqrestore(&sem->lock, flags);
+ if (!wake_q_empty(&wake_q))
+ wake_up_q(&wake_q);
}
EXPORT_SYMBOL(up);
@@ -205,7 +249,7 @@ struct semaphore_waiter {
* constant, and thus optimised away by the compiler. Likewise the
* 'timeout' parameter for the cases without timeouts.
*/
-static inline int __sched __down_common(struct semaphore *sem, long state,
+static inline int __sched ___down_common(struct semaphore *sem, long state,
long timeout)
{
struct semaphore_waiter waiter;
@@ -223,8 +267,10 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
raw_spin_unlock_irq(&sem->lock);
timeout = schedule_timeout(timeout);
raw_spin_lock_irq(&sem->lock);
- if (waiter.up)
+ if (waiter.up) {
+ hung_task_sem_set_holder(sem);
return 0;
+ }
}
timed_out:
@@ -236,6 +282,22 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
return -EINTR;
}
+static inline int __sched __down_common(struct semaphore *sem, long state,
+ long timeout)
+{
+ int ret;
+
+ hung_task_set_blocker(sem, BLOCKER_TYPE_SEM);
+
+ trace_contention_begin(sem, 0);
+ ret = ___down_common(sem, state, timeout);
+ trace_contention_end(sem, ret);
+
+ hung_task_clear_blocker();
+
+ return ret;
+}
+
static noinline void __sched __down(struct semaphore *sem)
{
__down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
@@ -256,11 +318,12 @@ static noinline int __sched __down_timeout(struct semaphore *sem, long timeout)
return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout);
}
-static noinline void __sched __up(struct semaphore *sem)
+static noinline void __sched __up(struct semaphore *sem,
+ struct wake_q_head *wake_q)
{
struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
struct semaphore_waiter, list);
list_del(&waiter->list);
waiter->up = true;
- wake_up_process(waiter->task);
+ wake_q_add(wake_q, waiter->task);
}
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 7f49baaa4979..7685defd7c52 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -65,7 +65,7 @@ EXPORT_PER_CPU_SYMBOL(__mmiowb_state);
* towards that other CPU that it should break the lock ASAP.
*/
#define BUILD_LOCK_OPS(op, locktype) \
-void __lockfunc __raw_##op##_lock(locktype##_t *lock) \
+static void __lockfunc __raw_##op##_lock(locktype##_t *lock) \
{ \
for (;;) { \
preempt_disable(); \
@@ -77,7 +77,7 @@ void __lockfunc __raw_##op##_lock(locktype##_t *lock) \
} \
} \
\
-unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
+static unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
{ \
unsigned long flags; \
\
@@ -95,12 +95,12 @@ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
return flags; \
} \
\
-void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock) \
+static void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock) \
{ \
_raw_##op##_lock_irqsave(lock); \
} \
\
-void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \
+static void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \
{ \
unsigned long flags; \
\
@@ -133,7 +133,7 @@ BUILD_LOCK_OPS(write, rwlock);
#endif
#ifndef CONFIG_INLINE_SPIN_TRYLOCK
-int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock)
+noinline int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock)
{
return __raw_spin_trylock(lock);
}
@@ -141,7 +141,7 @@ EXPORT_SYMBOL(_raw_spin_trylock);
#endif
#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH
-int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock)
+noinline int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock)
{
return __raw_spin_trylock_bh(lock);
}
@@ -149,7 +149,7 @@ EXPORT_SYMBOL(_raw_spin_trylock_bh);
#endif
#ifndef CONFIG_INLINE_SPIN_LOCK
-void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)
{
__raw_spin_lock(lock);
}
@@ -157,7 +157,7 @@ EXPORT_SYMBOL(_raw_spin_lock);
#endif
#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE
-unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock)
+noinline unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock)
{
return __raw_spin_lock_irqsave(lock);
}
@@ -165,7 +165,7 @@ EXPORT_SYMBOL(_raw_spin_lock_irqsave);
#endif
#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ
-void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock)
{
__raw_spin_lock_irq(lock);
}
@@ -173,7 +173,7 @@ EXPORT_SYMBOL(_raw_spin_lock_irq);
#endif
#ifndef CONFIG_INLINE_SPIN_LOCK_BH
-void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
{
__raw_spin_lock_bh(lock);
}
@@ -181,7 +181,7 @@ EXPORT_SYMBOL(_raw_spin_lock_bh);
#endif
#ifdef CONFIG_UNINLINE_SPIN_UNLOCK
-void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
{
__raw_spin_unlock(lock);
}
@@ -189,7 +189,7 @@ EXPORT_SYMBOL(_raw_spin_unlock);
#endif
#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE
-void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags)
+noinline void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags)
{
__raw_spin_unlock_irqrestore(lock, flags);
}
@@ -197,7 +197,7 @@ EXPORT_SYMBOL(_raw_spin_unlock_irqrestore);
#endif
#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ
-void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock)
{
__raw_spin_unlock_irq(lock);
}
@@ -205,7 +205,7 @@ EXPORT_SYMBOL(_raw_spin_unlock_irq);
#endif
#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH
-void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
{
__raw_spin_unlock_bh(lock);
}
@@ -215,7 +215,7 @@ EXPORT_SYMBOL(_raw_spin_unlock_bh);
#ifndef CONFIG_PREEMPT_RT
#ifndef CONFIG_INLINE_READ_TRYLOCK
-int __lockfunc _raw_read_trylock(rwlock_t *lock)
+noinline int __lockfunc _raw_read_trylock(rwlock_t *lock)
{
return __raw_read_trylock(lock);
}
@@ -223,7 +223,7 @@ EXPORT_SYMBOL(_raw_read_trylock);
#endif
#ifndef CONFIG_INLINE_READ_LOCK
-void __lockfunc _raw_read_lock(rwlock_t *lock)
+noinline void __lockfunc _raw_read_lock(rwlock_t *lock)
{
__raw_read_lock(lock);
}
@@ -231,7 +231,7 @@ EXPORT_SYMBOL(_raw_read_lock);
#endif
#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE
-unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock)
+noinline unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock)
{
return __raw_read_lock_irqsave(lock);
}
@@ -239,7 +239,7 @@ EXPORT_SYMBOL(_raw_read_lock_irqsave);
#endif
#ifndef CONFIG_INLINE_READ_LOCK_IRQ
-void __lockfunc _raw_read_lock_irq(rwlock_t *lock)
+noinline void __lockfunc _raw_read_lock_irq(rwlock_t *lock)
{
__raw_read_lock_irq(lock);
}
@@ -247,7 +247,7 @@ EXPORT_SYMBOL(_raw_read_lock_irq);
#endif
#ifndef CONFIG_INLINE_READ_LOCK_BH
-void __lockfunc _raw_read_lock_bh(rwlock_t *lock)
+noinline void __lockfunc _raw_read_lock_bh(rwlock_t *lock)
{
__raw_read_lock_bh(lock);
}
@@ -255,7 +255,7 @@ EXPORT_SYMBOL(_raw_read_lock_bh);
#endif
#ifndef CONFIG_INLINE_READ_UNLOCK
-void __lockfunc _raw_read_unlock(rwlock_t *lock)
+noinline void __lockfunc _raw_read_unlock(rwlock_t *lock)
{
__raw_read_unlock(lock);
}
@@ -263,7 +263,7 @@ EXPORT_SYMBOL(_raw_read_unlock);
#endif
#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE
-void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+noinline void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
{
__raw_read_unlock_irqrestore(lock, flags);
}
@@ -271,7 +271,7 @@ EXPORT_SYMBOL(_raw_read_unlock_irqrestore);
#endif
#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ
-void __lockfunc _raw_read_unlock_irq(rwlock_t *lock)
+noinline void __lockfunc _raw_read_unlock_irq(rwlock_t *lock)
{
__raw_read_unlock_irq(lock);
}
@@ -279,7 +279,7 @@ EXPORT_SYMBOL(_raw_read_unlock_irq);
#endif
#ifndef CONFIG_INLINE_READ_UNLOCK_BH
-void __lockfunc _raw_read_unlock_bh(rwlock_t *lock)
+noinline void __lockfunc _raw_read_unlock_bh(rwlock_t *lock)
{
__raw_read_unlock_bh(lock);
}
@@ -287,7 +287,7 @@ EXPORT_SYMBOL(_raw_read_unlock_bh);
#endif
#ifndef CONFIG_INLINE_WRITE_TRYLOCK
-int __lockfunc _raw_write_trylock(rwlock_t *lock)
+noinline int __lockfunc _raw_write_trylock(rwlock_t *lock)
{
return __raw_write_trylock(lock);
}
@@ -295,7 +295,7 @@ EXPORT_SYMBOL(_raw_write_trylock);
#endif
#ifndef CONFIG_INLINE_WRITE_LOCK
-void __lockfunc _raw_write_lock(rwlock_t *lock)
+noinline void __lockfunc _raw_write_lock(rwlock_t *lock)
{
__raw_write_lock(lock);
}
@@ -313,7 +313,7 @@ EXPORT_SYMBOL(_raw_write_lock_nested);
#endif
#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE
-unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock)
+noinline unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock)
{
return __raw_write_lock_irqsave(lock);
}
@@ -321,7 +321,7 @@ EXPORT_SYMBOL(_raw_write_lock_irqsave);
#endif
#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ
-void __lockfunc _raw_write_lock_irq(rwlock_t *lock)
+noinline void __lockfunc _raw_write_lock_irq(rwlock_t *lock)
{
__raw_write_lock_irq(lock);
}
@@ -329,7 +329,7 @@ EXPORT_SYMBOL(_raw_write_lock_irq);
#endif
#ifndef CONFIG_INLINE_WRITE_LOCK_BH
-void __lockfunc _raw_write_lock_bh(rwlock_t *lock)
+noinline void __lockfunc _raw_write_lock_bh(rwlock_t *lock)
{
__raw_write_lock_bh(lock);
}
@@ -337,7 +337,7 @@ EXPORT_SYMBOL(_raw_write_lock_bh);
#endif
#ifndef CONFIG_INLINE_WRITE_UNLOCK
-void __lockfunc _raw_write_unlock(rwlock_t *lock)
+noinline void __lockfunc _raw_write_unlock(rwlock_t *lock)
{
__raw_write_unlock(lock);
}
@@ -345,7 +345,7 @@ EXPORT_SYMBOL(_raw_write_unlock);
#endif
#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE
-void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+noinline void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
{
__raw_write_unlock_irqrestore(lock, flags);
}
@@ -353,7 +353,7 @@ EXPORT_SYMBOL(_raw_write_unlock_irqrestore);
#endif
#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ
-void __lockfunc _raw_write_unlock_irq(rwlock_t *lock)
+noinline void __lockfunc _raw_write_unlock_irq(rwlock_t *lock)
{
__raw_write_unlock_irq(lock);
}
@@ -361,7 +361,7 @@ EXPORT_SYMBOL(_raw_write_unlock_irq);
#endif
#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH
-void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
+noinline void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
{
__raw_write_unlock_bh(lock);
}
@@ -413,3 +413,11 @@ notrace int in_lock_functions(unsigned long addr)
&& addr < (unsigned long)__lock_text_end;
}
EXPORT_SYMBOL(in_lock_functions);
+
+#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_PREEMPT_RT)
+void notrace lockdep_assert_in_softirq_func(void)
+{
+ lockdep_assert_in_softirq();
+}
+EXPORT_SYMBOL(lockdep_assert_in_softirq_func);
+#endif
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
index 14235671a1a7..2338b3adfb55 100644
--- a/kernel/locking/spinlock_debug.c
+++ b/kernel/locking/spinlock_debug.c
@@ -12,6 +12,7 @@
#include <linux/debug_locks.h>
#include <linux/delay.h>
#include <linux/export.h>
+#include <linux/pid.h>
void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
struct lock_class_key *key, short inner)
@@ -183,8 +184,8 @@ void do_raw_read_unlock(rwlock_t *lock)
static inline void debug_write_lock_before(rwlock_t *lock)
{
RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
- RWLOCK_BUG_ON(lock->owner == current, lock, "recursion");
- RWLOCK_BUG_ON(lock->owner_cpu == raw_smp_processor_id(),
+ RWLOCK_BUG_ON(READ_ONCE(lock->owner) == current, lock, "recursion");
+ RWLOCK_BUG_ON(READ_ONCE(lock->owner_cpu) == raw_smp_processor_id(),
lock, "cpu recursion");
}
diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c
index 48a19ed8486d..db1e11b45de6 100644
--- a/kernel/locking/spinlock_rt.c
+++ b/kernel/locking/spinlock_rt.c
@@ -37,6 +37,8 @@
static __always_inline void rtlock_lock(struct rt_mutex_base *rtm)
{
+ lockdep_assert(!current->pi_blocked_on);
+
if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current)))
rtlock_slowlock(rtm);
}
@@ -49,7 +51,7 @@ static __always_inline void __rt_spin_lock(spinlock_t *lock)
migrate_disable();
}
-void __sched rt_spin_lock(spinlock_t *lock)
+void __sched rt_spin_lock(spinlock_t *lock) __acquires(RCU)
{
spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
__rt_spin_lock(lock);
@@ -73,7 +75,7 @@ void __sched rt_spin_lock_nest_lock(spinlock_t *lock,
EXPORT_SYMBOL(rt_spin_lock_nest_lock);
#endif
-void __sched rt_spin_unlock(spinlock_t *lock)
+void __sched rt_spin_unlock(spinlock_t *lock) __releases(RCU)
{
spin_release(&lock->dep_map, _RET_IP_);
migrate_enable();
@@ -160,9 +162,10 @@ rwbase_rtmutex_lock_state(struct rt_mutex_base *rtm, unsigned int state)
}
static __always_inline int
-rwbase_rtmutex_slowlock_locked(struct rt_mutex_base *rtm, unsigned int state)
+rwbase_rtmutex_slowlock_locked(struct rt_mutex_base *rtm, unsigned int state,
+ struct wake_q_head *wake_q)
{
- rtlock_slowlock_locked(rtm);
+ rtlock_slowlock_locked(rtm, wake_q);
return 0;
}
@@ -184,9 +187,13 @@ static __always_inline int rwbase_rtmutex_trylock(struct rt_mutex_base *rtm)
#define rwbase_signal_pending_state(state, current) (0)
+#define rwbase_pre_schedule()
+
#define rwbase_schedule() \
schedule_rtlock()
+#define rwbase_post_schedule()
+
#include "rwbase_rt.c"
/*
* The common functions which get wrapped into the rwlock API.
@@ -219,7 +226,7 @@ int __sched rt_write_trylock(rwlock_t *rwlock)
}
EXPORT_SYMBOL(rt_write_trylock);
-void __sched rt_read_lock(rwlock_t *rwlock)
+void __sched rt_read_lock(rwlock_t *rwlock) __acquires(RCU)
{
rtlock_might_resched();
rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_);
@@ -229,7 +236,7 @@ void __sched rt_read_lock(rwlock_t *rwlock)
}
EXPORT_SYMBOL(rt_read_lock);
-void __sched rt_write_lock(rwlock_t *rwlock)
+void __sched rt_write_lock(rwlock_t *rwlock) __acquires(RCU)
{
rtlock_might_resched();
rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
@@ -240,7 +247,7 @@ void __sched rt_write_lock(rwlock_t *rwlock)
EXPORT_SYMBOL(rt_write_lock);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
-void __sched rt_write_lock_nested(rwlock_t *rwlock, int subclass)
+void __sched rt_write_lock_nested(rwlock_t *rwlock, int subclass) __acquires(RCU)
{
rtlock_might_resched();
rwlock_acquire(&rwlock->dep_map, subclass, 0, _RET_IP_);
@@ -251,7 +258,7 @@ void __sched rt_write_lock_nested(rwlock_t *rwlock, int subclass)
EXPORT_SYMBOL(rt_write_lock_nested);
#endif
-void __sched rt_read_unlock(rwlock_t *rwlock)
+void __sched rt_read_unlock(rwlock_t *rwlock) __releases(RCU)
{
rwlock_release(&rwlock->dep_map, _RET_IP_);
migrate_enable();
@@ -260,7 +267,7 @@ void __sched rt_read_unlock(rwlock_t *rwlock)
}
EXPORT_SYMBOL(rt_read_unlock);
-void __sched rt_write_unlock(rwlock_t *rwlock)
+void __sched rt_write_unlock(rwlock_t *rwlock) __releases(RCU)
{
rwlock_release(&rwlock->dep_map, _RET_IP_);
rcu_read_unlock();
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 353004155d65..bcb1b9fea588 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -9,7 +9,7 @@
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/module.h>
-#include <linux/random.h>
+#include <linux/prandom.h>
#include <linux/slab.h>
#include <linux/ww_mutex.h>
@@ -62,7 +62,8 @@ static int __test_mutex(unsigned int flags)
int ret;
ww_mutex_init(&mtx.mutex, &ww_class);
- ww_acquire_init(&ctx, &ww_class);
+ if (flags & TEST_MTX_CTX)
+ ww_acquire_init(&ctx, &ww_class);
INIT_WORK_ONSTACK(&mtx.work, test_mutex_work);
init_completion(&mtx.ready);
@@ -90,7 +91,8 @@ static int __test_mutex(unsigned int flags)
ret = wait_for_completion_timeout(&mtx.done, TIMEOUT);
}
ww_mutex_unlock(&mtx.mutex);
- ww_acquire_fini(&ctx);
+ if (flags & TEST_MTX_CTX)
+ ww_acquire_fini(&ctx);
if (ret) {
pr_err("%s(flags=%x): mutual exclusion failure\n",
@@ -386,10 +388,23 @@ struct stress {
int nlocks;
};
+struct rnd_state rng;
+DEFINE_SPINLOCK(rng_lock);
+
+static inline u32 prandom_u32_below(u32 ceil)
+{
+ u32 ret;
+
+ spin_lock(&rng_lock);
+ ret = prandom_u32_state(&rng) % ceil;
+ spin_unlock(&rng_lock);
+ return ret;
+}
+
static int *get_random_order(int count)
{
int *order;
- int n, r, tmp;
+ int n, r;
order = kmalloc_array(count, sizeof(*order), GFP_KERNEL);
if (!order)
@@ -399,12 +414,9 @@ static int *get_random_order(int count)
order[n] = n;
for (n = count - 1; n > 1; n--) {
- r = get_random_int() % (n + 1);
- if (r != n) {
- tmp = order[n];
- order[n] = order[r];
- order[r] = tmp;
- }
+ r = prandom_u32_below(n + 1);
+ if (r != n)
+ swap(order[n], order[r]);
}
return order;
@@ -452,21 +464,21 @@ retry:
ww_mutex_unlock(&locks[order[n]]);
if (err == -EDEADLK) {
- ww_mutex_lock_slow(&locks[order[contended]], &ctx);
- goto retry;
+ if (!time_after(jiffies, stress->timeout)) {
+ ww_mutex_lock_slow(&locks[order[contended]], &ctx);
+ goto retry;
+ }
}
+ ww_acquire_fini(&ctx);
if (err) {
pr_err_once("stress (%s) failed with %d\n",
__func__, err);
break;
}
-
- ww_acquire_fini(&ctx);
} while (!time_after(jiffies, stress->timeout));
kfree(order);
- kfree(stress);
}
struct reorder_lock {
@@ -531,14 +543,13 @@ out:
list_for_each_entry_safe(ll, ln, &locks, link)
kfree(ll);
kfree(order);
- kfree(stress);
}
static void stress_one_work(struct work_struct *work)
{
struct stress *stress = container_of(work, typeof(*stress), work);
const int nlocks = stress->nlocks;
- struct ww_mutex *lock = stress->locks + (get_random_int() % nlocks);
+ struct ww_mutex *lock = stress->locks + get_random_u32_below(nlocks);
int err;
do {
@@ -552,8 +563,6 @@ static void stress_one_work(struct work_struct *work)
break;
}
} while (!time_after(jiffies, stress->timeout));
-
- kfree(stress);
}
#define STRESS_INORDER BIT(0)
@@ -564,15 +573,24 @@ static void stress_one_work(struct work_struct *work)
static int stress(int nlocks, int nthreads, unsigned int flags)
{
struct ww_mutex *locks;
- int n;
+ struct stress *stress_array;
+ int n, count;
locks = kmalloc_array(nlocks, sizeof(*locks), GFP_KERNEL);
if (!locks)
return -ENOMEM;
+ stress_array = kmalloc_array(nthreads, sizeof(*stress_array),
+ GFP_KERNEL);
+ if (!stress_array) {
+ kfree(locks);
+ return -ENOMEM;
+ }
+
for (n = 0; n < nlocks; n++)
ww_mutex_init(&locks[n], &ww_class);
+ count = 0;
for (n = 0; nthreads; n++) {
struct stress *stress;
void (*fn)(struct work_struct *work);
@@ -596,9 +614,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags)
if (!fn)
continue;
- stress = kmalloc(sizeof(*stress), GFP_KERNEL);
- if (!stress)
- break;
+ stress = &stress_array[count++];
INIT_WORK(&stress->work, fn);
stress->locks = locks;
@@ -613,6 +629,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags)
for (n = 0; n < nlocks; n++)
ww_mutex_destroy(&locks[n]);
+ kfree(stress_array);
kfree(locks);
return 0;
@@ -625,6 +642,8 @@ static int __init test_ww_mutex_init(void)
printk(KERN_INFO "Beginning ww mutex selftests\n");
+ prandom_seed_state(&rng, get_random_u64());
+
wq = alloc_workqueue("test-ww_mutex", WQ_UNBOUND, 0);
if (!wq)
return -ENOMEM;
@@ -659,7 +678,7 @@ static int __init test_ww_mutex_init(void)
if (ret)
return ret;
- ret = stress(4095, hweight32(STRESS_ALL)*ncpus, STRESS_ALL);
+ ret = stress(2046, hweight32(STRESS_ALL)*ncpus, STRESS_ALL);
if (ret)
return ret;
@@ -677,3 +696,4 @@ module_exit(test_ww_mutex_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Intel Corporation");
+MODULE_DESCRIPTION("API test facility for ww_mutexes");
diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h
index 56f139201f24..31a785afee6c 100644
--- a/kernel/locking/ww_mutex.h
+++ b/kernel/locking/ww_mutex.h
@@ -70,14 +70,14 @@ __ww_mutex_has_waiters(struct mutex *lock)
return atomic_long_read(&lock->owner) & MUTEX_FLAG_WAITERS;
}
-static inline void lock_wait_lock(struct mutex *lock)
+static inline void lock_wait_lock(struct mutex *lock, unsigned long *flags)
{
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irqsave(&lock->wait_lock, *flags);
}
-static inline void unlock_wait_lock(struct mutex *lock)
+static inline void unlock_wait_lock(struct mutex *lock, unsigned long *flags)
{
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, *flags);
}
static inline void lockdep_assert_wait_lock_held(struct mutex *lock)
@@ -96,25 +96,25 @@ __ww_waiter_first(struct rt_mutex *lock)
struct rb_node *n = rb_first(&lock->rtmutex.waiters.rb_root);
if (!n)
return NULL;
- return rb_entry(n, struct rt_mutex_waiter, tree_entry);
+ return rb_entry(n, struct rt_mutex_waiter, tree.entry);
}
static inline struct rt_mutex_waiter *
__ww_waiter_next(struct rt_mutex *lock, struct rt_mutex_waiter *w)
{
- struct rb_node *n = rb_next(&w->tree_entry);
+ struct rb_node *n = rb_next(&w->tree.entry);
if (!n)
return NULL;
- return rb_entry(n, struct rt_mutex_waiter, tree_entry);
+ return rb_entry(n, struct rt_mutex_waiter, tree.entry);
}
static inline struct rt_mutex_waiter *
__ww_waiter_prev(struct rt_mutex *lock, struct rt_mutex_waiter *w)
{
- struct rb_node *n = rb_prev(&w->tree_entry);
+ struct rb_node *n = rb_prev(&w->tree.entry);
if (!n)
return NULL;
- return rb_entry(n, struct rt_mutex_waiter, tree_entry);
+ return rb_entry(n, struct rt_mutex_waiter, tree.entry);
}
static inline struct rt_mutex_waiter *
@@ -123,7 +123,7 @@ __ww_waiter_last(struct rt_mutex *lock)
struct rb_node *n = rb_last(&lock->rtmutex.waiters.rb_root);
if (!n)
return NULL;
- return rb_entry(n, struct rt_mutex_waiter, tree_entry);
+ return rb_entry(n, struct rt_mutex_waiter, tree.entry);
}
static inline void
@@ -144,14 +144,14 @@ __ww_mutex_has_waiters(struct rt_mutex *lock)
return rt_mutex_has_waiters(&lock->rtmutex);
}
-static inline void lock_wait_lock(struct rt_mutex *lock)
+static inline void lock_wait_lock(struct rt_mutex *lock, unsigned long *flags)
{
- raw_spin_lock(&lock->rtmutex.wait_lock);
+ raw_spin_lock_irqsave(&lock->rtmutex.wait_lock, *flags);
}
-static inline void unlock_wait_lock(struct rt_mutex *lock)
+static inline void unlock_wait_lock(struct rt_mutex *lock, unsigned long *flags)
{
- raw_spin_unlock(&lock->rtmutex.wait_lock);
+ raw_spin_unlock_irqrestore(&lock->rtmutex.wait_lock, *flags);
}
static inline void lockdep_assert_wait_lock_held(struct rt_mutex *lock)
@@ -237,7 +237,7 @@ __ww_ctx_less(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b)
int a_prio = a->task->prio;
int b_prio = b->task->prio;
- if (rt_prio(a_prio) || rt_prio(b_prio)) {
+ if (rt_or_dl_prio(a_prio) || rt_or_dl_prio(b_prio)) {
if (a_prio > b_prio)
return true;
@@ -275,7 +275,7 @@ __ww_ctx_less(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b)
*/
static bool
__ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
- struct ww_acquire_ctx *ww_ctx)
+ struct ww_acquire_ctx *ww_ctx, struct wake_q_head *wake_q)
{
if (!ww_ctx->is_wait_die)
return false;
@@ -284,7 +284,13 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
#ifndef WW_RT
debug_mutex_wake_waiter(lock, waiter);
#endif
- wake_up_process(waiter->task);
+ /*
+ * When waking up the task to die, be sure to clear the
+ * blocked_on pointer. Otherwise we can see circular
+ * blocked_on relationships that can't resolve.
+ */
+ __clear_task_blocked_on(waiter->task, lock);
+ wake_q_add(wake_q, waiter->task);
}
return true;
@@ -299,7 +305,8 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
*/
static bool __ww_mutex_wound(struct MUTEX *lock,
struct ww_acquire_ctx *ww_ctx,
- struct ww_acquire_ctx *hold_ctx)
+ struct ww_acquire_ctx *hold_ctx,
+ struct wake_q_head *wake_q)
{
struct task_struct *owner = __ww_mutex_owner(lock);
@@ -330,9 +337,19 @@ static bool __ww_mutex_wound(struct MUTEX *lock,
* it's wounded in __ww_mutex_check_kill() or has a
* wakeup pending to re-read the wounded state.
*/
- if (owner != current)
- wake_up_process(owner);
-
+ if (owner != current) {
+ /*
+ * When waking up the task to wound, be sure to clear the
+ * blocked_on pointer. Otherwise we can see circular
+ * blocked_on relationships that can't resolve.
+ *
+ * NOTE: We pass NULL here instead of lock, because we
+ * are waking the mutex owner, who may be currently
+ * blocked on a different mutex.
+ */
+ __clear_task_blocked_on(owner, NULL);
+ wake_q_add(wake_q, owner);
+ }
return true;
}
@@ -352,7 +369,8 @@ static bool __ww_mutex_wound(struct MUTEX *lock,
* The current task must not be on the wait list.
*/
static void
-__ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx)
+__ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx,
+ struct wake_q_head *wake_q)
{
struct MUTEX_WAITER *cur;
@@ -364,8 +382,8 @@ __ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx)
if (!cur->ww_ctx)
continue;
- if (__ww_mutex_die(lock, cur, ww_ctx) ||
- __ww_mutex_wound(lock, cur->ww_ctx, ww_ctx))
+ if (__ww_mutex_die(lock, cur, ww_ctx, wake_q) ||
+ __ww_mutex_wound(lock, cur->ww_ctx, ww_ctx, wake_q))
break;
}
}
@@ -377,6 +395,9 @@ __ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx)
static __always_inline void
ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
{
+ DEFINE_WAKE_Q(wake_q);
+ unsigned long flags;
+
ww_mutex_lock_acquired(lock, ctx);
/*
@@ -404,9 +425,12 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
* Uh oh, we raced in fastpath, check if any of the waiters need to
* die or wound us.
*/
- lock_wait_lock(&lock->base);
- __ww_mutex_check_waiters(&lock->base, ctx);
- unlock_wait_lock(&lock->base);
+ lock_wait_lock(&lock->base, &flags);
+ __ww_mutex_check_waiters(&lock->base, ctx, &wake_q);
+ preempt_disable();
+ unlock_wait_lock(&lock->base, &flags);
+ wake_up_q(&wake_q);
+ preempt_enable();
}
static __always_inline int
@@ -488,7 +512,8 @@ __ww_mutex_check_kill(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
static inline int
__ww_mutex_add_waiter(struct MUTEX_WAITER *waiter,
struct MUTEX *lock,
- struct ww_acquire_ctx *ww_ctx)
+ struct ww_acquire_ctx *ww_ctx,
+ struct wake_q_head *wake_q)
{
struct MUTEX_WAITER *cur, *pos = NULL;
bool is_wait_die;
@@ -532,7 +557,7 @@ __ww_mutex_add_waiter(struct MUTEX_WAITER *waiter,
pos = cur;
/* Wait-Die: ensure younger waiters die. */
- __ww_mutex_die(lock, cur, ww_ctx);
+ __ww_mutex_die(lock, cur, ww_ctx, wake_q);
}
__ww_waiter_add(lock, waiter, pos);
@@ -550,7 +575,7 @@ __ww_mutex_add_waiter(struct MUTEX_WAITER *waiter,
* such that either we or the fastpath will wound @ww->ctx.
*/
smp_mb();
- __ww_mutex_wound(lock, ww_ctx, ww->ctx);
+ __ww_mutex_wound(lock, ww_ctx, ww->ctx, wake_q);
}
return 0;
diff --git a/kernel/locking/ww_rt_mutex.c b/kernel/locking/ww_rt_mutex.c
index d1473c624105..c7196de838ed 100644
--- a/kernel/locking/ww_rt_mutex.c
+++ b/kernel/locking/ww_rt_mutex.c
@@ -62,7 +62,7 @@ __ww_rt_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx,
}
mutex_acquire_nest(&rtm->dep_map, 0, 0, nest_lock, ip);
- if (likely(rt_mutex_cmpxchg_acquire(&rtm->rtmutex, NULL, current))) {
+ if (likely(rt_mutex_try_acquire(&rtm->rtmutex))) {
if (ww_ctx)
ww_mutex_set_context_fastpath(lock, ww_ctx);
return 0;
diff --git a/kernel/module-internal.h b/kernel/module-internal.h
deleted file mode 100644
index 8c381c99062f..000000000000
--- a/kernel/module-internal.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/* Module internals
- *
- * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
-
-#include <linux/elf.h>
-#include <asm/module.h>
-
-struct load_info {
- const char *name;
- /* pointer to module in temporary copy, freed at end of load_module() */
- struct module *mod;
- Elf_Ehdr *hdr;
- unsigned long len;
- Elf_Shdr *sechdrs;
- char *secstrings, *strtab;
- unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs;
- struct _ddebug *debug;
- unsigned int num_debug;
- bool sig_ok;
-#ifdef CONFIG_KALLSYMS
- unsigned long mod_kallsyms_init_off;
-#endif
-#ifdef CONFIG_MODULE_DECOMPRESS
- struct page **pages;
- unsigned int max_pages;
- unsigned int used_pages;
-#endif
- struct {
- unsigned int sym, str, mod, vers, info, pcpu;
- } index;
-};
-
-extern int mod_verify_sig(const void *mod, struct load_info *info);
-
-#ifdef CONFIG_MODULE_DECOMPRESS
-int module_decompress(struct load_info *info, const void *buf, size_t size);
-void module_decompress_cleanup(struct load_info *info);
-#else
-static inline int module_decompress(struct load_info *info,
- const void *buf, size_t size)
-{
- return -EOPNOTSUPP;
-}
-static inline void module_decompress_cleanup(struct load_info *info)
-{
-}
-#endif
diff --git a/kernel/module.c b/kernel/module.c
deleted file mode 100644
index 24dab046e16c..000000000000
--- a/kernel/module.c
+++ /dev/null
@@ -1,4825 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Copyright (C) 2002 Richard Henderson
- * Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
- */
-
-#define INCLUDE_VERMAGIC
-
-#include <linux/export.h>
-#include <linux/extable.h>
-#include <linux/moduleloader.h>
-#include <linux/module_signature.h>
-#include <linux/trace_events.h>
-#include <linux/init.h>
-#include <linux/kallsyms.h>
-#include <linux/buildid.h>
-#include <linux/file.h>
-#include <linux/fs.h>
-#include <linux/sysfs.h>
-#include <linux/kernel.h>
-#include <linux/kernel_read_file.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/elf.h>
-#include <linux/proc_fs.h>
-#include <linux/security.h>
-#include <linux/seq_file.h>
-#include <linux/syscalls.h>
-#include <linux/fcntl.h>
-#include <linux/rcupdate.h>
-#include <linux/capability.h>
-#include <linux/cpu.h>
-#include <linux/moduleparam.h>
-#include <linux/errno.h>
-#include <linux/err.h>
-#include <linux/vermagic.h>
-#include <linux/notifier.h>
-#include <linux/sched.h>
-#include <linux/device.h>
-#include <linux/string.h>
-#include <linux/mutex.h>
-#include <linux/rculist.h>
-#include <linux/uaccess.h>
-#include <asm/cacheflush.h>
-#include <linux/set_memory.h>
-#include <asm/mmu_context.h>
-#include <linux/license.h>
-#include <asm/sections.h>
-#include <linux/tracepoint.h>
-#include <linux/ftrace.h>
-#include <linux/livepatch.h>
-#include <linux/async.h>
-#include <linux/percpu.h>
-#include <linux/kmemleak.h>
-#include <linux/jump_label.h>
-#include <linux/pfn.h>
-#include <linux/bsearch.h>
-#include <linux/dynamic_debug.h>
-#include <linux/audit.h>
-#include <uapi/linux/module.h>
-#include "module-internal.h"
-
-#define CREATE_TRACE_POINTS
-#include <trace/events/module.h>
-
-#ifndef ARCH_SHF_SMALL
-#define ARCH_SHF_SMALL 0
-#endif
-
-/*
- * Modules' sections will be aligned on page boundaries
- * to ensure complete separation of code and data, but
- * only when CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y
- */
-#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX
-# define debug_align(X) ALIGN(X, PAGE_SIZE)
-#else
-# define debug_align(X) (X)
-#endif
-
-/* If this is set, the section belongs in the init part of the module */
-#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
-
-/*
- * Mutex protects:
- * 1) List of modules (also safely readable with preempt_disable),
- * 2) module_use links,
- * 3) module_addr_min/module_addr_max.
- * (delete and add uses RCU list operations).
- */
-static DEFINE_MUTEX(module_mutex);
-static LIST_HEAD(modules);
-
-/* Work queue for freeing init sections in success case */
-static void do_free_init(struct work_struct *w);
-static DECLARE_WORK(init_free_wq, do_free_init);
-static LLIST_HEAD(init_free_list);
-
-#ifdef CONFIG_MODULES_TREE_LOOKUP
-
-/*
- * Use a latched RB-tree for __module_address(); this allows us to use
- * RCU-sched lookups of the address from any context.
- *
- * This is conditional on PERF_EVENTS || TRACING because those can really hit
- * __module_address() hard by doing a lot of stack unwinding; potentially from
- * NMI context.
- */
-
-static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n)
-{
- struct module_layout *layout = container_of(n, struct module_layout, mtn.node);
-
- return (unsigned long)layout->base;
-}
-
-static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n)
-{
- struct module_layout *layout = container_of(n, struct module_layout, mtn.node);
-
- return (unsigned long)layout->size;
-}
-
-static __always_inline bool
-mod_tree_less(struct latch_tree_node *a, struct latch_tree_node *b)
-{
- return __mod_tree_val(a) < __mod_tree_val(b);
-}
-
-static __always_inline int
-mod_tree_comp(void *key, struct latch_tree_node *n)
-{
- unsigned long val = (unsigned long)key;
- unsigned long start, end;
-
- start = __mod_tree_val(n);
- if (val < start)
- return -1;
-
- end = start + __mod_tree_size(n);
- if (val >= end)
- return 1;
-
- return 0;
-}
-
-static const struct latch_tree_ops mod_tree_ops = {
- .less = mod_tree_less,
- .comp = mod_tree_comp,
-};
-
-static struct mod_tree_root {
- struct latch_tree_root root;
- unsigned long addr_min;
- unsigned long addr_max;
-} mod_tree __cacheline_aligned = {
- .addr_min = -1UL,
-};
-
-#define module_addr_min mod_tree.addr_min
-#define module_addr_max mod_tree.addr_max
-
-static noinline void __mod_tree_insert(struct mod_tree_node *node)
-{
- latch_tree_insert(&node->node, &mod_tree.root, &mod_tree_ops);
-}
-
-static void __mod_tree_remove(struct mod_tree_node *node)
-{
- latch_tree_erase(&node->node, &mod_tree.root, &mod_tree_ops);
-}
-
-/*
- * These modifications: insert, remove_init and remove; are serialized by the
- * module_mutex.
- */
-static void mod_tree_insert(struct module *mod)
-{
- mod->core_layout.mtn.mod = mod;
- mod->init_layout.mtn.mod = mod;
-
- __mod_tree_insert(&mod->core_layout.mtn);
- if (mod->init_layout.size)
- __mod_tree_insert(&mod->init_layout.mtn);
-}
-
-static void mod_tree_remove_init(struct module *mod)
-{
- if (mod->init_layout.size)
- __mod_tree_remove(&mod->init_layout.mtn);
-}
-
-static void mod_tree_remove(struct module *mod)
-{
- __mod_tree_remove(&mod->core_layout.mtn);
- mod_tree_remove_init(mod);
-}
-
-static struct module *mod_find(unsigned long addr)
-{
- struct latch_tree_node *ltn;
-
- ltn = latch_tree_find((void *)addr, &mod_tree.root, &mod_tree_ops);
- if (!ltn)
- return NULL;
-
- return container_of(ltn, struct mod_tree_node, node)->mod;
-}
-
-#else /* MODULES_TREE_LOOKUP */
-
-static unsigned long module_addr_min = -1UL, module_addr_max = 0;
-
-static void mod_tree_insert(struct module *mod) { }
-static void mod_tree_remove_init(struct module *mod) { }
-static void mod_tree_remove(struct module *mod) { }
-
-static struct module *mod_find(unsigned long addr)
-{
- struct module *mod;
-
- list_for_each_entry_rcu(mod, &modules, list,
- lockdep_is_held(&module_mutex)) {
- if (within_module(addr, mod))
- return mod;
- }
-
- return NULL;
-}
-
-#endif /* MODULES_TREE_LOOKUP */
-
-/*
- * Bounds of module text, for speeding up __module_address.
- * Protected by module_mutex.
- */
-static void __mod_update_bounds(void *base, unsigned int size)
-{
- unsigned long min = (unsigned long)base;
- unsigned long max = min + size;
-
- if (min < module_addr_min)
- module_addr_min = min;
- if (max > module_addr_max)
- module_addr_max = max;
-}
-
-static void mod_update_bounds(struct module *mod)
-{
- __mod_update_bounds(mod->core_layout.base, mod->core_layout.size);
- if (mod->init_layout.size)
- __mod_update_bounds(mod->init_layout.base, mod->init_layout.size);
-}
-
-#ifdef CONFIG_KGDB_KDB
-struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
-#endif /* CONFIG_KGDB_KDB */
-
-static void module_assert_mutex_or_preempt(void)
-{
-#ifdef CONFIG_LOCKDEP
- if (unlikely(!debug_locks))
- return;
-
- WARN_ON_ONCE(!rcu_read_lock_sched_held() &&
- !lockdep_is_held(&module_mutex));
-#endif
-}
-
-#ifdef CONFIG_MODULE_SIG
-static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE);
-module_param(sig_enforce, bool_enable_only, 0644);
-
-void set_module_sig_enforced(void)
-{
- sig_enforce = true;
-}
-#else
-#define sig_enforce false
-#endif
-
-/*
- * Export sig_enforce kernel cmdline parameter to allow other subsystems rely
- * on that instead of directly to CONFIG_MODULE_SIG_FORCE config.
- */
-bool is_module_sig_enforced(void)
-{
- return sig_enforce;
-}
-EXPORT_SYMBOL(is_module_sig_enforced);
-
-/* Block module loading/unloading? */
-int modules_disabled = 0;
-core_param(nomodule, modules_disabled, bint, 0);
-
-/* Waiting for a module to finish initializing? */
-static DECLARE_WAIT_QUEUE_HEAD(module_wq);
-
-static BLOCKING_NOTIFIER_HEAD(module_notify_list);
-
-int register_module_notifier(struct notifier_block *nb)
-{
- return blocking_notifier_chain_register(&module_notify_list, nb);
-}
-EXPORT_SYMBOL(register_module_notifier);
-
-int unregister_module_notifier(struct notifier_block *nb)
-{
- return blocking_notifier_chain_unregister(&module_notify_list, nb);
-}
-EXPORT_SYMBOL(unregister_module_notifier);
-
-/*
- * We require a truly strong try_module_get(): 0 means success.
- * Otherwise an error is returned due to ongoing or failed
- * initialization etc.
- */
-static inline int strong_try_module_get(struct module *mod)
-{
- BUG_ON(mod && mod->state == MODULE_STATE_UNFORMED);
- if (mod && mod->state == MODULE_STATE_COMING)
- return -EBUSY;
- if (try_module_get(mod))
- return 0;
- else
- return -ENOENT;
-}
-
-static inline void add_taint_module(struct module *mod, unsigned flag,
- enum lockdep_ok lockdep_ok)
-{
- add_taint(flag, lockdep_ok);
- set_bit(flag, &mod->taints);
-}
-
-/*
- * A thread that wants to hold a reference to a module only while it
- * is running can call this to safely exit. nfsd and lockd use this.
- */
-void __noreturn __module_put_and_kthread_exit(struct module *mod, long code)
-{
- module_put(mod);
- kthread_exit(code);
-}
-EXPORT_SYMBOL(__module_put_and_kthread_exit);
-
-/* Find a module section: 0 means not found. */
-static unsigned int find_sec(const struct load_info *info, const char *name)
-{
- unsigned int i;
-
- for (i = 1; i < info->hdr->e_shnum; i++) {
- Elf_Shdr *shdr = &info->sechdrs[i];
- /* Alloc bit cleared means "ignore it." */
- if ((shdr->sh_flags & SHF_ALLOC)
- && strcmp(info->secstrings + shdr->sh_name, name) == 0)
- return i;
- }
- return 0;
-}
-
-/* Find a module section, or NULL. */
-static void *section_addr(const struct load_info *info, const char *name)
-{
- /* Section 0 has sh_addr 0. */
- return (void *)info->sechdrs[find_sec(info, name)].sh_addr;
-}
-
-/* Find a module section, or NULL. Fill in number of "objects" in section. */
-static void *section_objs(const struct load_info *info,
- const char *name,
- size_t object_size,
- unsigned int *num)
-{
- unsigned int sec = find_sec(info, name);
-
- /* Section 0 has sh_addr 0 and sh_size 0. */
- *num = info->sechdrs[sec].sh_size / object_size;
- return (void *)info->sechdrs[sec].sh_addr;
-}
-
-/* Find a module section: 0 means not found. Ignores SHF_ALLOC flag. */
-static unsigned int find_any_sec(const struct load_info *info, const char *name)
-{
- unsigned int i;
-
- for (i = 1; i < info->hdr->e_shnum; i++) {
- Elf_Shdr *shdr = &info->sechdrs[i];
- if (strcmp(info->secstrings + shdr->sh_name, name) == 0)
- return i;
- }
- return 0;
-}
-
-/*
- * Find a module section, or NULL. Fill in number of "objects" in section.
- * Ignores SHF_ALLOC flag.
- */
-static __maybe_unused void *any_section_objs(const struct load_info *info,
- const char *name,
- size_t object_size,
- unsigned int *num)
-{
- unsigned int sec = find_any_sec(info, name);
-
- /* Section 0 has sh_addr 0 and sh_size 0. */
- *num = info->sechdrs[sec].sh_size / object_size;
- return (void *)info->sechdrs[sec].sh_addr;
-}
-
-/* Provided by the linker */
-extern const struct kernel_symbol __start___ksymtab[];
-extern const struct kernel_symbol __stop___ksymtab[];
-extern const struct kernel_symbol __start___ksymtab_gpl[];
-extern const struct kernel_symbol __stop___ksymtab_gpl[];
-extern const s32 __start___kcrctab[];
-extern const s32 __start___kcrctab_gpl[];
-
-#ifndef CONFIG_MODVERSIONS
-#define symversion(base, idx) NULL
-#else
-#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
-#endif
-
-struct symsearch {
- const struct kernel_symbol *start, *stop;
- const s32 *crcs;
- enum mod_license {
- NOT_GPL_ONLY,
- GPL_ONLY,
- } license;
-};
-
-struct find_symbol_arg {
- /* Input */
- const char *name;
- bool gplok;
- bool warn;
-
- /* Output */
- struct module *owner;
- const s32 *crc;
- const struct kernel_symbol *sym;
- enum mod_license license;
-};
-
-static bool check_exported_symbol(const struct symsearch *syms,
- struct module *owner,
- unsigned int symnum, void *data)
-{
- struct find_symbol_arg *fsa = data;
-
- if (!fsa->gplok && syms->license == GPL_ONLY)
- return false;
- fsa->owner = owner;
- fsa->crc = symversion(syms->crcs, symnum);
- fsa->sym = &syms->start[symnum];
- fsa->license = syms->license;
- return true;
-}
-
-static unsigned long kernel_symbol_value(const struct kernel_symbol *sym)
-{
-#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
- return (unsigned long)offset_to_ptr(&sym->value_offset);
-#else
- return sym->value;
-#endif
-}
-
-static const char *kernel_symbol_name(const struct kernel_symbol *sym)
-{
-#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
- return offset_to_ptr(&sym->name_offset);
-#else
- return sym->name;
-#endif
-}
-
-static const char *kernel_symbol_namespace(const struct kernel_symbol *sym)
-{
-#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
- if (!sym->namespace_offset)
- return NULL;
- return offset_to_ptr(&sym->namespace_offset);
-#else
- return sym->namespace;
-#endif
-}
-
-static int cmp_name(const void *name, const void *sym)
-{
- return strcmp(name, kernel_symbol_name(sym));
-}
-
-static bool find_exported_symbol_in_section(const struct symsearch *syms,
- struct module *owner,
- void *data)
-{
- struct find_symbol_arg *fsa = data;
- struct kernel_symbol *sym;
-
- sym = bsearch(fsa->name, syms->start, syms->stop - syms->start,
- sizeof(struct kernel_symbol), cmp_name);
-
- if (sym != NULL && check_exported_symbol(syms, owner,
- sym - syms->start, data))
- return true;
-
- return false;
-}
-
-/*
- * Find an exported symbol and return it, along with, (optional) crc and
- * (optional) module which owns it. Needs preempt disabled or module_mutex.
- */
-static bool find_symbol(struct find_symbol_arg *fsa)
-{
- static const struct symsearch arr[] = {
- { __start___ksymtab, __stop___ksymtab, __start___kcrctab,
- NOT_GPL_ONLY },
- { __start___ksymtab_gpl, __stop___ksymtab_gpl,
- __start___kcrctab_gpl,
- GPL_ONLY },
- };
- struct module *mod;
- unsigned int i;
-
- module_assert_mutex_or_preempt();
-
- for (i = 0; i < ARRAY_SIZE(arr); i++)
- if (find_exported_symbol_in_section(&arr[i], NULL, fsa))
- return true;
-
- list_for_each_entry_rcu(mod, &modules, list,
- lockdep_is_held(&module_mutex)) {
- struct symsearch arr[] = {
- { mod->syms, mod->syms + mod->num_syms, mod->crcs,
- NOT_GPL_ONLY },
- { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms,
- mod->gpl_crcs,
- GPL_ONLY },
- };
-
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
-
- for (i = 0; i < ARRAY_SIZE(arr); i++)
- if (find_exported_symbol_in_section(&arr[i], mod, fsa))
- return true;
- }
-
- pr_debug("Failed to find symbol %s\n", fsa->name);
- return false;
-}
-
-/*
- * Search for module by name: must hold module_mutex (or preempt disabled
- * for read-only access).
- */
-static struct module *find_module_all(const char *name, size_t len,
- bool even_unformed)
-{
- struct module *mod;
-
- module_assert_mutex_or_preempt();
-
- list_for_each_entry_rcu(mod, &modules, list,
- lockdep_is_held(&module_mutex)) {
- if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
- continue;
- if (strlen(mod->name) == len && !memcmp(mod->name, name, len))
- return mod;
- }
- return NULL;
-}
-
-struct module *find_module(const char *name)
-{
- return find_module_all(name, strlen(name), false);
-}
-
-#ifdef CONFIG_SMP
-
-static inline void __percpu *mod_percpu(struct module *mod)
-{
- return mod->percpu;
-}
-
-static int percpu_modalloc(struct module *mod, struct load_info *info)
-{
- Elf_Shdr *pcpusec = &info->sechdrs[info->index.pcpu];
- unsigned long align = pcpusec->sh_addralign;
-
- if (!pcpusec->sh_size)
- return 0;
-
- if (align > PAGE_SIZE) {
- pr_warn("%s: per-cpu alignment %li > %li\n",
- mod->name, align, PAGE_SIZE);
- align = PAGE_SIZE;
- }
-
- mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align);
- if (!mod->percpu) {
- pr_warn("%s: Could not allocate %lu bytes percpu data\n",
- mod->name, (unsigned long)pcpusec->sh_size);
- return -ENOMEM;
- }
- mod->percpu_size = pcpusec->sh_size;
- return 0;
-}
-
-static void percpu_modfree(struct module *mod)
-{
- free_percpu(mod->percpu);
-}
-
-static unsigned int find_pcpusec(struct load_info *info)
-{
- return find_sec(info, ".data..percpu");
-}
-
-static void percpu_modcopy(struct module *mod,
- const void *from, unsigned long size)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
-}
-
-bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
-{
- struct module *mod;
- unsigned int cpu;
-
- preempt_disable();
-
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- if (!mod->percpu_size)
- continue;
- for_each_possible_cpu(cpu) {
- void *start = per_cpu_ptr(mod->percpu, cpu);
- void *va = (void *)addr;
-
- if (va >= start && va < start + mod->percpu_size) {
- if (can_addr) {
- *can_addr = (unsigned long) (va - start);
- *can_addr += (unsigned long)
- per_cpu_ptr(mod->percpu,
- get_boot_cpu_id());
- }
- preempt_enable();
- return true;
- }
- }
- }
-
- preempt_enable();
- return false;
-}
-
-/**
- * is_module_percpu_address() - test whether address is from module static percpu
- * @addr: address to test
- *
- * Test whether @addr belongs to module static percpu area.
- *
- * Return: %true if @addr is from module static percpu area
- */
-bool is_module_percpu_address(unsigned long addr)
-{
- return __is_module_percpu_address(addr, NULL);
-}
-
-#else /* ... !CONFIG_SMP */
-
-static inline void __percpu *mod_percpu(struct module *mod)
-{
- return NULL;
-}
-static int percpu_modalloc(struct module *mod, struct load_info *info)
-{
- /* UP modules shouldn't have this section: ENOMEM isn't quite right */
- if (info->sechdrs[info->index.pcpu].sh_size != 0)
- return -ENOMEM;
- return 0;
-}
-static inline void percpu_modfree(struct module *mod)
-{
-}
-static unsigned int find_pcpusec(struct load_info *info)
-{
- return 0;
-}
-static inline void percpu_modcopy(struct module *mod,
- const void *from, unsigned long size)
-{
- /* pcpusec should be 0, and size of that section should be 0. */
- BUG_ON(size != 0);
-}
-bool is_module_percpu_address(unsigned long addr)
-{
- return false;
-}
-
-bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
-{
- return false;
-}
-
-#endif /* CONFIG_SMP */
-
-#define MODINFO_ATTR(field) \
-static void setup_modinfo_##field(struct module *mod, const char *s) \
-{ \
- mod->field = kstrdup(s, GFP_KERNEL); \
-} \
-static ssize_t show_modinfo_##field(struct module_attribute *mattr, \
- struct module_kobject *mk, char *buffer) \
-{ \
- return scnprintf(buffer, PAGE_SIZE, "%s\n", mk->mod->field); \
-} \
-static int modinfo_##field##_exists(struct module *mod) \
-{ \
- return mod->field != NULL; \
-} \
-static void free_modinfo_##field(struct module *mod) \
-{ \
- kfree(mod->field); \
- mod->field = NULL; \
-} \
-static struct module_attribute modinfo_##field = { \
- .attr = { .name = __stringify(field), .mode = 0444 }, \
- .show = show_modinfo_##field, \
- .setup = setup_modinfo_##field, \
- .test = modinfo_##field##_exists, \
- .free = free_modinfo_##field, \
-};
-
-MODINFO_ATTR(version);
-MODINFO_ATTR(srcversion);
-
-static char last_unloaded_module[MODULE_NAME_LEN+1];
-
-#ifdef CONFIG_MODULE_UNLOAD
-
-EXPORT_TRACEPOINT_SYMBOL(module_get);
-
-/* MODULE_REF_BASE is the base reference count by kmodule loader. */
-#define MODULE_REF_BASE 1
-
-/* Init the unload section of the module. */
-static int module_unload_init(struct module *mod)
-{
- /*
- * Initialize reference counter to MODULE_REF_BASE.
- * refcnt == 0 means module is going.
- */
- atomic_set(&mod->refcnt, MODULE_REF_BASE);
-
- INIT_LIST_HEAD(&mod->source_list);
- INIT_LIST_HEAD(&mod->target_list);
-
- /* Hold reference count during initialization. */
- atomic_inc(&mod->refcnt);
-
- return 0;
-}
-
-/* Does a already use b? */
-static int already_uses(struct module *a, struct module *b)
-{
- struct module_use *use;
-
- list_for_each_entry(use, &b->source_list, source_list) {
- if (use->source == a) {
- pr_debug("%s uses %s!\n", a->name, b->name);
- return 1;
- }
- }
- pr_debug("%s does not use %s!\n", a->name, b->name);
- return 0;
-}
-
-/*
- * Module a uses b
- * - we add 'a' as a "source", 'b' as a "target" of module use
- * - the module_use is added to the list of 'b' sources (so
- * 'b' can walk the list to see who sourced them), and of 'a'
- * targets (so 'a' can see what modules it targets).
- */
-static int add_module_usage(struct module *a, struct module *b)
-{
- struct module_use *use;
-
- pr_debug("Allocating new usage for %s.\n", a->name);
- use = kmalloc(sizeof(*use), GFP_ATOMIC);
- if (!use)
- return -ENOMEM;
-
- use->source = a;
- use->target = b;
- list_add(&use->source_list, &b->source_list);
- list_add(&use->target_list, &a->target_list);
- return 0;
-}
-
-/* Module a uses b: caller needs module_mutex() */
-static int ref_module(struct module *a, struct module *b)
-{
- int err;
-
- if (b == NULL || already_uses(a, b))
- return 0;
-
- /* If module isn't available, we fail. */
- err = strong_try_module_get(b);
- if (err)
- return err;
-
- err = add_module_usage(a, b);
- if (err) {
- module_put(b);
- return err;
- }
- return 0;
-}
-
-/* Clear the unload stuff of the module. */
-static void module_unload_free(struct module *mod)
-{
- struct module_use *use, *tmp;
-
- mutex_lock(&module_mutex);
- list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) {
- struct module *i = use->target;
- pr_debug("%s unusing %s\n", mod->name, i->name);
- module_put(i);
- list_del(&use->source_list);
- list_del(&use->target_list);
- kfree(use);
- }
- mutex_unlock(&module_mutex);
-}
-
-#ifdef CONFIG_MODULE_FORCE_UNLOAD
-static inline int try_force_unload(unsigned int flags)
-{
- int ret = (flags & O_TRUNC);
- if (ret)
- add_taint(TAINT_FORCED_RMMOD, LOCKDEP_NOW_UNRELIABLE);
- return ret;
-}
-#else
-static inline int try_force_unload(unsigned int flags)
-{
- return 0;
-}
-#endif /* CONFIG_MODULE_FORCE_UNLOAD */
-
-/* Try to release refcount of module, 0 means success. */
-static int try_release_module_ref(struct module *mod)
-{
- int ret;
-
- /* Try to decrement refcnt which we set at loading */
- ret = atomic_sub_return(MODULE_REF_BASE, &mod->refcnt);
- BUG_ON(ret < 0);
- if (ret)
- /* Someone can put this right now, recover with checking */
- ret = atomic_add_unless(&mod->refcnt, MODULE_REF_BASE, 0);
-
- return ret;
-}
-
-static int try_stop_module(struct module *mod, int flags, int *forced)
-{
- /* If it's not unused, quit unless we're forcing. */
- if (try_release_module_ref(mod) != 0) {
- *forced = try_force_unload(flags);
- if (!(*forced))
- return -EWOULDBLOCK;
- }
-
- /* Mark it as dying. */
- mod->state = MODULE_STATE_GOING;
-
- return 0;
-}
-
-/**
- * module_refcount() - return the refcount or -1 if unloading
- * @mod: the module we're checking
- *
- * Return:
- * -1 if the module is in the process of unloading
- * otherwise the number of references in the kernel to the module
- */
-int module_refcount(struct module *mod)
-{
- return atomic_read(&mod->refcnt) - MODULE_REF_BASE;
-}
-EXPORT_SYMBOL(module_refcount);
-
-/* This exists whether we can unload or not */
-static void free_module(struct module *mod);
-
-SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
- unsigned int, flags)
-{
- struct module *mod;
- char name[MODULE_NAME_LEN];
- int ret, forced = 0;
-
- if (!capable(CAP_SYS_MODULE) || modules_disabled)
- return -EPERM;
-
- if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0)
- return -EFAULT;
- name[MODULE_NAME_LEN-1] = '\0';
-
- audit_log_kern_module(name);
-
- if (mutex_lock_interruptible(&module_mutex) != 0)
- return -EINTR;
-
- mod = find_module(name);
- if (!mod) {
- ret = -ENOENT;
- goto out;
- }
-
- if (!list_empty(&mod->source_list)) {
- /* Other modules depend on us: get rid of them first. */
- ret = -EWOULDBLOCK;
- goto out;
- }
-
- /* Doing init or already dying? */
- if (mod->state != MODULE_STATE_LIVE) {
- /* FIXME: if (force), slam module count damn the torpedoes */
- pr_debug("%s already dying\n", mod->name);
- ret = -EBUSY;
- goto out;
- }
-
- /* If it has an init func, it must have an exit func to unload */
- if (mod->init && !mod->exit) {
- forced = try_force_unload(flags);
- if (!forced) {
- /* This module can't be removed */
- ret = -EBUSY;
- goto out;
- }
- }
-
- ret = try_stop_module(mod, flags, &forced);
- if (ret != 0)
- goto out;
-
- mutex_unlock(&module_mutex);
- /* Final destruction now no one is using it. */
- if (mod->exit != NULL)
- mod->exit();
- blocking_notifier_call_chain(&module_notify_list,
- MODULE_STATE_GOING, mod);
- klp_module_going(mod);
- ftrace_release_mod(mod);
-
- async_synchronize_full();
-
- /* Store the name of the last unloaded module for diagnostic purposes */
- strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
-
- free_module(mod);
- /* someone could wait for the module in add_unformed_module() */
- wake_up_all(&module_wq);
- return 0;
-out:
- mutex_unlock(&module_mutex);
- return ret;
-}
-
-static inline void print_unload_info(struct seq_file *m, struct module *mod)
-{
- struct module_use *use;
- int printed_something = 0;
-
- seq_printf(m, " %i ", module_refcount(mod));
-
- /*
- * Always include a trailing , so userspace can differentiate
- * between this and the old multi-field proc format.
- */
- list_for_each_entry(use, &mod->source_list, source_list) {
- printed_something = 1;
- seq_printf(m, "%s,", use->source->name);
- }
-
- if (mod->init != NULL && mod->exit == NULL) {
- printed_something = 1;
- seq_puts(m, "[permanent],");
- }
-
- if (!printed_something)
- seq_puts(m, "-");
-}
-
-void __symbol_put(const char *symbol)
-{
- struct find_symbol_arg fsa = {
- .name = symbol,
- .gplok = true,
- };
-
- preempt_disable();
- BUG_ON(!find_symbol(&fsa));
- module_put(fsa.owner);
- preempt_enable();
-}
-EXPORT_SYMBOL(__symbol_put);
-
-/* Note this assumes addr is a function, which it currently always is. */
-void symbol_put_addr(void *addr)
-{
- struct module *modaddr;
- unsigned long a = (unsigned long)dereference_function_descriptor(addr);
-
- if (core_kernel_text(a))
- return;
-
- /*
- * Even though we hold a reference on the module; we still need to
- * disable preemption in order to safely traverse the data structure.
- */
- preempt_disable();
- modaddr = __module_text_address(a);
- BUG_ON(!modaddr);
- module_put(modaddr);
- preempt_enable();
-}
-EXPORT_SYMBOL_GPL(symbol_put_addr);
-
-static ssize_t show_refcnt(struct module_attribute *mattr,
- struct module_kobject *mk, char *buffer)
-{
- return sprintf(buffer, "%i\n", module_refcount(mk->mod));
-}
-
-static struct module_attribute modinfo_refcnt =
- __ATTR(refcnt, 0444, show_refcnt, NULL);
-
-void __module_get(struct module *module)
-{
- if (module) {
- preempt_disable();
- atomic_inc(&module->refcnt);
- trace_module_get(module, _RET_IP_);
- preempt_enable();
- }
-}
-EXPORT_SYMBOL(__module_get);
-
-bool try_module_get(struct module *module)
-{
- bool ret = true;
-
- if (module) {
- preempt_disable();
- /* Note: here, we can fail to get a reference */
- if (likely(module_is_live(module) &&
- atomic_inc_not_zero(&module->refcnt) != 0))
- trace_module_get(module, _RET_IP_);
- else
- ret = false;
-
- preempt_enable();
- }
- return ret;
-}
-EXPORT_SYMBOL(try_module_get);
-
-void module_put(struct module *module)
-{
- int ret;
-
- if (module) {
- preempt_disable();
- ret = atomic_dec_if_positive(&module->refcnt);
- WARN_ON(ret < 0); /* Failed to put refcount */
- trace_module_put(module, _RET_IP_);
- preempt_enable();
- }
-}
-EXPORT_SYMBOL(module_put);
-
-#else /* !CONFIG_MODULE_UNLOAD */
-static inline void print_unload_info(struct seq_file *m, struct module *mod)
-{
- /* We don't know the usage count, or what modules are using. */
- seq_puts(m, " - -");
-}
-
-static inline void module_unload_free(struct module *mod)
-{
-}
-
-static int ref_module(struct module *a, struct module *b)
-{
- return strong_try_module_get(b);
-}
-
-static inline int module_unload_init(struct module *mod)
-{
- return 0;
-}
-#endif /* CONFIG_MODULE_UNLOAD */
-
-static size_t module_flags_taint(struct module *mod, char *buf)
-{
- size_t l = 0;
- int i;
-
- for (i = 0; i < TAINT_FLAGS_COUNT; i++) {
- if (taint_flags[i].module && test_bit(i, &mod->taints))
- buf[l++] = taint_flags[i].c_true;
- }
-
- return l;
-}
-
-static ssize_t show_initstate(struct module_attribute *mattr,
- struct module_kobject *mk, char *buffer)
-{
- const char *state = "unknown";
-
- switch (mk->mod->state) {
- case MODULE_STATE_LIVE:
- state = "live";
- break;
- case MODULE_STATE_COMING:
- state = "coming";
- break;
- case MODULE_STATE_GOING:
- state = "going";
- break;
- default:
- BUG();
- }
- return sprintf(buffer, "%s\n", state);
-}
-
-static struct module_attribute modinfo_initstate =
- __ATTR(initstate, 0444, show_initstate, NULL);
-
-static ssize_t store_uevent(struct module_attribute *mattr,
- struct module_kobject *mk,
- const char *buffer, size_t count)
-{
- int rc;
-
- rc = kobject_synth_uevent(&mk->kobj, buffer, count);
- return rc ? rc : count;
-}
-
-struct module_attribute module_uevent =
- __ATTR(uevent, 0200, NULL, store_uevent);
-
-static ssize_t show_coresize(struct module_attribute *mattr,
- struct module_kobject *mk, char *buffer)
-{
- return sprintf(buffer, "%u\n", mk->mod->core_layout.size);
-}
-
-static struct module_attribute modinfo_coresize =
- __ATTR(coresize, 0444, show_coresize, NULL);
-
-static ssize_t show_initsize(struct module_attribute *mattr,
- struct module_kobject *mk, char *buffer)
-{
- return sprintf(buffer, "%u\n", mk->mod->init_layout.size);
-}
-
-static struct module_attribute modinfo_initsize =
- __ATTR(initsize, 0444, show_initsize, NULL);
-
-static ssize_t show_taint(struct module_attribute *mattr,
- struct module_kobject *mk, char *buffer)
-{
- size_t l;
-
- l = module_flags_taint(mk->mod, buffer);
- buffer[l++] = '\n';
- return l;
-}
-
-static struct module_attribute modinfo_taint =
- __ATTR(taint, 0444, show_taint, NULL);
-
-static struct module_attribute *modinfo_attrs[] = {
- &module_uevent,
- &modinfo_version,
- &modinfo_srcversion,
- &modinfo_initstate,
- &modinfo_coresize,
- &modinfo_initsize,
- &modinfo_taint,
-#ifdef CONFIG_MODULE_UNLOAD
- &modinfo_refcnt,
-#endif
- NULL,
-};
-
-static const char vermagic[] = VERMAGIC_STRING;
-
-static int try_to_force_load(struct module *mod, const char *reason)
-{
-#ifdef CONFIG_MODULE_FORCE_LOAD
- if (!test_taint(TAINT_FORCED_MODULE))
- pr_warn("%s: %s: kernel tainted.\n", mod->name, reason);
- add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE);
- return 0;
-#else
- return -ENOEXEC;
-#endif
-}
-
-#ifdef CONFIG_MODVERSIONS
-
-static u32 resolve_rel_crc(const s32 *crc)
-{
- return *(u32 *)((void *)crc + *crc);
-}
-
-static int check_version(const struct load_info *info,
- const char *symname,
- struct module *mod,
- const s32 *crc)
-{
- Elf_Shdr *sechdrs = info->sechdrs;
- unsigned int versindex = info->index.vers;
- unsigned int i, num_versions;
- struct modversion_info *versions;
-
- /* Exporting module didn't supply crcs? OK, we're already tainted. */
- if (!crc)
- return 1;
-
- /* No versions at all? modprobe --force does this. */
- if (versindex == 0)
- return try_to_force_load(mod, symname) == 0;
-
- versions = (void *) sechdrs[versindex].sh_addr;
- num_versions = sechdrs[versindex].sh_size
- / sizeof(struct modversion_info);
-
- for (i = 0; i < num_versions; i++) {
- u32 crcval;
-
- if (strcmp(versions[i].name, symname) != 0)
- continue;
-
- if (IS_ENABLED(CONFIG_MODULE_REL_CRCS))
- crcval = resolve_rel_crc(crc);
- else
- crcval = *crc;
- if (versions[i].crc == crcval)
- return 1;
- pr_debug("Found checksum %X vs module %lX\n",
- crcval, versions[i].crc);
- goto bad_version;
- }
-
- /* Broken toolchain. Warn once, then let it go.. */
- pr_warn_once("%s: no symbol version for %s\n", info->name, symname);
- return 1;
-
-bad_version:
- pr_warn("%s: disagrees about version of symbol %s\n",
- info->name, symname);
- return 0;
-}
-
-static inline int check_modstruct_version(const struct load_info *info,
- struct module *mod)
-{
- struct find_symbol_arg fsa = {
- .name = "module_layout",
- .gplok = true,
- };
-
- /*
- * Since this should be found in kernel (which can't be removed), no
- * locking is necessary -- use preempt_disable() to placate lockdep.
- */
- preempt_disable();
- if (!find_symbol(&fsa)) {
- preempt_enable();
- BUG();
- }
- preempt_enable();
- return check_version(info, "module_layout", mod, fsa.crc);
-}
-
-/* First part is kernel version, which we ignore if module has crcs. */
-static inline int same_magic(const char *amagic, const char *bmagic,
- bool has_crcs)
-{
- if (has_crcs) {
- amagic += strcspn(amagic, " ");
- bmagic += strcspn(bmagic, " ");
- }
- return strcmp(amagic, bmagic) == 0;
-}
-#else
-static inline int check_version(const struct load_info *info,
- const char *symname,
- struct module *mod,
- const s32 *crc)
-{
- return 1;
-}
-
-static inline int check_modstruct_version(const struct load_info *info,
- struct module *mod)
-{
- return 1;
-}
-
-static inline int same_magic(const char *amagic, const char *bmagic,
- bool has_crcs)
-{
- return strcmp(amagic, bmagic) == 0;
-}
-#endif /* CONFIG_MODVERSIONS */
-
-static char *get_modinfo(const struct load_info *info, const char *tag);
-static char *get_next_modinfo(const struct load_info *info, const char *tag,
- char *prev);
-
-static int verify_namespace_is_imported(const struct load_info *info,
- const struct kernel_symbol *sym,
- struct module *mod)
-{
- const char *namespace;
- char *imported_namespace;
-
- namespace = kernel_symbol_namespace(sym);
- if (namespace && namespace[0]) {
- imported_namespace = get_modinfo(info, "import_ns");
- while (imported_namespace) {
- if (strcmp(namespace, imported_namespace) == 0)
- return 0;
- imported_namespace = get_next_modinfo(
- info, "import_ns", imported_namespace);
- }
-#ifdef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
- pr_warn(
-#else
- pr_err(
-#endif
- "%s: module uses symbol (%s) from namespace %s, but does not import it.\n",
- mod->name, kernel_symbol_name(sym), namespace);
-#ifndef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
- return -EINVAL;
-#endif
- }
- return 0;
-}
-
-static bool inherit_taint(struct module *mod, struct module *owner)
-{
- if (!owner || !test_bit(TAINT_PROPRIETARY_MODULE, &owner->taints))
- return true;
-
- if (mod->using_gplonly_symbols) {
- pr_err("%s: module using GPL-only symbols uses symbols from proprietary module %s.\n",
- mod->name, owner->name);
- return false;
- }
-
- if (!test_bit(TAINT_PROPRIETARY_MODULE, &mod->taints)) {
- pr_warn("%s: module uses symbols from proprietary module %s, inheriting taint.\n",
- mod->name, owner->name);
- set_bit(TAINT_PROPRIETARY_MODULE, &mod->taints);
- }
- return true;
-}
-
-/* Resolve a symbol for this module. I.e. if we find one, record usage. */
-static const struct kernel_symbol *resolve_symbol(struct module *mod,
- const struct load_info *info,
- const char *name,
- char ownername[])
-{
- struct find_symbol_arg fsa = {
- .name = name,
- .gplok = !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)),
- .warn = true,
- };
- int err;
-
- /*
- * The module_mutex should not be a heavily contended lock;
- * if we get the occasional sleep here, we'll go an extra iteration
- * in the wait_event_interruptible(), which is harmless.
- */
- sched_annotate_sleep();
- mutex_lock(&module_mutex);
- if (!find_symbol(&fsa))
- goto unlock;
-
- if (fsa.license == GPL_ONLY)
- mod->using_gplonly_symbols = true;
-
- if (!inherit_taint(mod, fsa.owner)) {
- fsa.sym = NULL;
- goto getname;
- }
-
- if (!check_version(info, name, mod, fsa.crc)) {
- fsa.sym = ERR_PTR(-EINVAL);
- goto getname;
- }
-
- err = verify_namespace_is_imported(info, fsa.sym, mod);
- if (err) {
- fsa.sym = ERR_PTR(err);
- goto getname;
- }
-
- err = ref_module(mod, fsa.owner);
- if (err) {
- fsa.sym = ERR_PTR(err);
- goto getname;
- }
-
-getname:
- /* We must make copy under the lock if we failed to get ref. */
- strncpy(ownername, module_name(fsa.owner), MODULE_NAME_LEN);
-unlock:
- mutex_unlock(&module_mutex);
- return fsa.sym;
-}
-
-static const struct kernel_symbol *
-resolve_symbol_wait(struct module *mod,
- const struct load_info *info,
- const char *name)
-{
- const struct kernel_symbol *ksym;
- char owner[MODULE_NAME_LEN];
-
- if (wait_event_interruptible_timeout(module_wq,
- !IS_ERR(ksym = resolve_symbol(mod, info, name, owner))
- || PTR_ERR(ksym) != -EBUSY,
- 30 * HZ) <= 0) {
- pr_warn("%s: gave up waiting for init of module %s.\n",
- mod->name, owner);
- }
- return ksym;
-}
-
-#ifdef CONFIG_KALLSYMS
-static inline bool sect_empty(const Elf_Shdr *sect)
-{
- return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
-}
-#endif
-
-/*
- * /sys/module/foo/sections stuff
- * J. Corbet <corbet@lwn.net>
- */
-#ifdef CONFIG_SYSFS
-
-#ifdef CONFIG_KALLSYMS
-struct module_sect_attr {
- struct bin_attribute battr;
- unsigned long address;
-};
-
-struct module_sect_attrs {
- struct attribute_group grp;
- unsigned int nsections;
- struct module_sect_attr attrs[];
-};
-
-#define MODULE_SECT_READ_SIZE (3 /* "0x", "\n" */ + (BITS_PER_LONG / 4))
-static ssize_t module_sect_read(struct file *file, struct kobject *kobj,
- struct bin_attribute *battr,
- char *buf, loff_t pos, size_t count)
-{
- struct module_sect_attr *sattr =
- container_of(battr, struct module_sect_attr, battr);
- char bounce[MODULE_SECT_READ_SIZE + 1];
- size_t wrote;
-
- if (pos != 0)
- return -EINVAL;
-
- /*
- * Since we're a binary read handler, we must account for the
- * trailing NUL byte that sprintf will write: if "buf" is
- * too small to hold the NUL, or the NUL is exactly the last
- * byte, the read will look like it got truncated by one byte.
- * Since there is no way to ask sprintf nicely to not write
- * the NUL, we have to use a bounce buffer.
- */
- wrote = scnprintf(bounce, sizeof(bounce), "0x%px\n",
- kallsyms_show_value(file->f_cred)
- ? (void *)sattr->address : NULL);
- count = min(count, wrote);
- memcpy(buf, bounce, count);
-
- return count;
-}
-
-static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
-{
- unsigned int section;
-
- for (section = 0; section < sect_attrs->nsections; section++)
- kfree(sect_attrs->attrs[section].battr.attr.name);
- kfree(sect_attrs);
-}
-
-static void add_sect_attrs(struct module *mod, const struct load_info *info)
-{
- unsigned int nloaded = 0, i, size[2];
- struct module_sect_attrs *sect_attrs;
- struct module_sect_attr *sattr;
- struct bin_attribute **gattr;
-
- /* Count loaded sections and allocate structures */
- for (i = 0; i < info->hdr->e_shnum; i++)
- if (!sect_empty(&info->sechdrs[i]))
- nloaded++;
- size[0] = ALIGN(struct_size(sect_attrs, attrs, nloaded),
- sizeof(sect_attrs->grp.bin_attrs[0]));
- size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]);
- sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL);
- if (sect_attrs == NULL)
- return;
-
- /* Setup section attributes. */
- sect_attrs->grp.name = "sections";
- sect_attrs->grp.bin_attrs = (void *)sect_attrs + size[0];
-
- sect_attrs->nsections = 0;
- sattr = &sect_attrs->attrs[0];
- gattr = &sect_attrs->grp.bin_attrs[0];
- for (i = 0; i < info->hdr->e_shnum; i++) {
- Elf_Shdr *sec = &info->sechdrs[i];
- if (sect_empty(sec))
- continue;
- sysfs_bin_attr_init(&sattr->battr);
- sattr->address = sec->sh_addr;
- sattr->battr.attr.name =
- kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL);
- if (sattr->battr.attr.name == NULL)
- goto out;
- sect_attrs->nsections++;
- sattr->battr.read = module_sect_read;
- sattr->battr.size = MODULE_SECT_READ_SIZE;
- sattr->battr.attr.mode = 0400;
- *(gattr++) = &(sattr++)->battr;
- }
- *gattr = NULL;
-
- if (sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp))
- goto out;
-
- mod->sect_attrs = sect_attrs;
- return;
- out:
- free_sect_attrs(sect_attrs);
-}
-
-static void remove_sect_attrs(struct module *mod)
-{
- if (mod->sect_attrs) {
- sysfs_remove_group(&mod->mkobj.kobj,
- &mod->sect_attrs->grp);
- /*
- * We are positive that no one is using any sect attrs
- * at this point. Deallocate immediately.
- */
- free_sect_attrs(mod->sect_attrs);
- mod->sect_attrs = NULL;
- }
-}
-
-/*
- * /sys/module/foo/notes/.section.name gives contents of SHT_NOTE sections.
- */
-
-struct module_notes_attrs {
- struct kobject *dir;
- unsigned int notes;
- struct bin_attribute attrs[];
-};
-
-static ssize_t module_notes_read(struct file *filp, struct kobject *kobj,
- struct bin_attribute *bin_attr,
- char *buf, loff_t pos, size_t count)
-{
- /*
- * The caller checked the pos and count against our size.
- */
- memcpy(buf, bin_attr->private + pos, count);
- return count;
-}
-
-static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
- unsigned int i)
-{
- if (notes_attrs->dir) {
- while (i-- > 0)
- sysfs_remove_bin_file(notes_attrs->dir,
- &notes_attrs->attrs[i]);
- kobject_put(notes_attrs->dir);
- }
- kfree(notes_attrs);
-}
-
-static void add_notes_attrs(struct module *mod, const struct load_info *info)
-{
- unsigned int notes, loaded, i;
- struct module_notes_attrs *notes_attrs;
- struct bin_attribute *nattr;
-
- /* failed to create section attributes, so can't create notes */
- if (!mod->sect_attrs)
- return;
-
- /* Count notes sections and allocate structures. */
- notes = 0;
- for (i = 0; i < info->hdr->e_shnum; i++)
- if (!sect_empty(&info->sechdrs[i]) &&
- (info->sechdrs[i].sh_type == SHT_NOTE))
- ++notes;
-
- if (notes == 0)
- return;
-
- notes_attrs = kzalloc(struct_size(notes_attrs, attrs, notes),
- GFP_KERNEL);
- if (notes_attrs == NULL)
- return;
-
- notes_attrs->notes = notes;
- nattr = &notes_attrs->attrs[0];
- for (loaded = i = 0; i < info->hdr->e_shnum; ++i) {
- if (sect_empty(&info->sechdrs[i]))
- continue;
- if (info->sechdrs[i].sh_type == SHT_NOTE) {
- sysfs_bin_attr_init(nattr);
- nattr->attr.name = mod->sect_attrs->attrs[loaded].battr.attr.name;
- nattr->attr.mode = S_IRUGO;
- nattr->size = info->sechdrs[i].sh_size;
- nattr->private = (void *) info->sechdrs[i].sh_addr;
- nattr->read = module_notes_read;
- ++nattr;
- }
- ++loaded;
- }
-
- notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj);
- if (!notes_attrs->dir)
- goto out;
-
- for (i = 0; i < notes; ++i)
- if (sysfs_create_bin_file(notes_attrs->dir,
- &notes_attrs->attrs[i]))
- goto out;
-
- mod->notes_attrs = notes_attrs;
- return;
-
- out:
- free_notes_attrs(notes_attrs, i);
-}
-
-static void remove_notes_attrs(struct module *mod)
-{
- if (mod->notes_attrs)
- free_notes_attrs(mod->notes_attrs, mod->notes_attrs->notes);
-}
-
-#else
-
-static inline void add_sect_attrs(struct module *mod,
- const struct load_info *info)
-{
-}
-
-static inline void remove_sect_attrs(struct module *mod)
-{
-}
-
-static inline void add_notes_attrs(struct module *mod,
- const struct load_info *info)
-{
-}
-
-static inline void remove_notes_attrs(struct module *mod)
-{
-}
-#endif /* CONFIG_KALLSYMS */
-
-static void del_usage_links(struct module *mod)
-{
-#ifdef CONFIG_MODULE_UNLOAD
- struct module_use *use;
-
- mutex_lock(&module_mutex);
- list_for_each_entry(use, &mod->target_list, target_list)
- sysfs_remove_link(use->target->holders_dir, mod->name);
- mutex_unlock(&module_mutex);
-#endif
-}
-
-static int add_usage_links(struct module *mod)
-{
- int ret = 0;
-#ifdef CONFIG_MODULE_UNLOAD
- struct module_use *use;
-
- mutex_lock(&module_mutex);
- list_for_each_entry(use, &mod->target_list, target_list) {
- ret = sysfs_create_link(use->target->holders_dir,
- &mod->mkobj.kobj, mod->name);
- if (ret)
- break;
- }
- mutex_unlock(&module_mutex);
- if (ret)
- del_usage_links(mod);
-#endif
- return ret;
-}
-
-static void module_remove_modinfo_attrs(struct module *mod, int end);
-
-static int module_add_modinfo_attrs(struct module *mod)
-{
- struct module_attribute *attr;
- struct module_attribute *temp_attr;
- int error = 0;
- int i;
-
- mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) *
- (ARRAY_SIZE(modinfo_attrs) + 1)),
- GFP_KERNEL);
- if (!mod->modinfo_attrs)
- return -ENOMEM;
-
- temp_attr = mod->modinfo_attrs;
- for (i = 0; (attr = modinfo_attrs[i]); i++) {
- if (!attr->test || attr->test(mod)) {
- memcpy(temp_attr, attr, sizeof(*temp_attr));
- sysfs_attr_init(&temp_attr->attr);
- error = sysfs_create_file(&mod->mkobj.kobj,
- &temp_attr->attr);
- if (error)
- goto error_out;
- ++temp_attr;
- }
- }
-
- return 0;
-
-error_out:
- if (i > 0)
- module_remove_modinfo_attrs(mod, --i);
- else
- kfree(mod->modinfo_attrs);
- return error;
-}
-
-static void module_remove_modinfo_attrs(struct module *mod, int end)
-{
- struct module_attribute *attr;
- int i;
-
- for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) {
- if (end >= 0 && i > end)
- break;
- /* pick a field to test for end of list */
- if (!attr->attr.name)
- break;
- sysfs_remove_file(&mod->mkobj.kobj, &attr->attr);
- if (attr->free)
- attr->free(mod);
- }
- kfree(mod->modinfo_attrs);
-}
-
-static void mod_kobject_put(struct module *mod)
-{
- DECLARE_COMPLETION_ONSTACK(c);
- mod->mkobj.kobj_completion = &c;
- kobject_put(&mod->mkobj.kobj);
- wait_for_completion(&c);
-}
-
-static int mod_sysfs_init(struct module *mod)
-{
- int err;
- struct kobject *kobj;
-
- if (!module_sysfs_initialized) {
- pr_err("%s: module sysfs not initialized\n", mod->name);
- err = -EINVAL;
- goto out;
- }
-
- kobj = kset_find_obj(module_kset, mod->name);
- if (kobj) {
- pr_err("%s: module is already loaded\n", mod->name);
- kobject_put(kobj);
- err = -EINVAL;
- goto out;
- }
-
- mod->mkobj.mod = mod;
-
- memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
- mod->mkobj.kobj.kset = module_kset;
- err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL,
- "%s", mod->name);
- if (err)
- mod_kobject_put(mod);
-
-out:
- return err;
-}
-
-static int mod_sysfs_setup(struct module *mod,
- const struct load_info *info,
- struct kernel_param *kparam,
- unsigned int num_params)
-{
- int err;
-
- err = mod_sysfs_init(mod);
- if (err)
- goto out;
-
- mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj);
- if (!mod->holders_dir) {
- err = -ENOMEM;
- goto out_unreg;
- }
-
- err = module_param_sysfs_setup(mod, kparam, num_params);
- if (err)
- goto out_unreg_holders;
-
- err = module_add_modinfo_attrs(mod);
- if (err)
- goto out_unreg_param;
-
- err = add_usage_links(mod);
- if (err)
- goto out_unreg_modinfo_attrs;
-
- add_sect_attrs(mod, info);
- add_notes_attrs(mod, info);
-
- return 0;
-
-out_unreg_modinfo_attrs:
- module_remove_modinfo_attrs(mod, -1);
-out_unreg_param:
- module_param_sysfs_remove(mod);
-out_unreg_holders:
- kobject_put(mod->holders_dir);
-out_unreg:
- mod_kobject_put(mod);
-out:
- return err;
-}
-
-static void mod_sysfs_fini(struct module *mod)
-{
- remove_notes_attrs(mod);
- remove_sect_attrs(mod);
- mod_kobject_put(mod);
-}
-
-static void init_param_lock(struct module *mod)
-{
- mutex_init(&mod->param_lock);
-}
-#else /* !CONFIG_SYSFS */
-
-static int mod_sysfs_setup(struct module *mod,
- const struct load_info *info,
- struct kernel_param *kparam,
- unsigned int num_params)
-{
- return 0;
-}
-
-static void mod_sysfs_fini(struct module *mod)
-{
-}
-
-static void module_remove_modinfo_attrs(struct module *mod, int end)
-{
-}
-
-static void del_usage_links(struct module *mod)
-{
-}
-
-static void init_param_lock(struct module *mod)
-{
-}
-#endif /* CONFIG_SYSFS */
-
-static void mod_sysfs_teardown(struct module *mod)
-{
- del_usage_links(mod);
- module_remove_modinfo_attrs(mod, -1);
- module_param_sysfs_remove(mod);
- kobject_put(mod->mkobj.drivers_dir);
- kobject_put(mod->holders_dir);
- mod_sysfs_fini(mod);
-}
-
-/*
- * LKM RO/NX protection: protect module's text/ro-data
- * from modification and any data from execution.
- *
- * General layout of module is:
- * [text] [read-only-data] [ro-after-init] [writable data]
- * text_size -----^ ^ ^ ^
- * ro_size ------------------------| | |
- * ro_after_init_size -----------------------------| |
- * size -----------------------------------------------------------|
- *
- * These values are always page-aligned (as is base)
- */
-
-/*
- * Since some arches are moving towards PAGE_KERNEL module allocations instead
- * of PAGE_KERNEL_EXEC, keep frob_text() and module_enable_x() outside of the
- * CONFIG_STRICT_MODULE_RWX block below because they are needed regardless of
- * whether we are strict.
- */
-#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX
-static void frob_text(const struct module_layout *layout,
- int (*set_memory)(unsigned long start, int num_pages))
-{
- BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
- BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1));
- set_memory((unsigned long)layout->base,
- layout->text_size >> PAGE_SHIFT);
-}
-
-static void module_enable_x(const struct module *mod)
-{
- frob_text(&mod->core_layout, set_memory_x);
- frob_text(&mod->init_layout, set_memory_x);
-}
-#else /* !CONFIG_ARCH_HAS_STRICT_MODULE_RWX */
-static void module_enable_x(const struct module *mod) { }
-#endif /* CONFIG_ARCH_HAS_STRICT_MODULE_RWX */
-
-#ifdef CONFIG_STRICT_MODULE_RWX
-static void frob_rodata(const struct module_layout *layout,
- int (*set_memory)(unsigned long start, int num_pages))
-{
- BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
- BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1));
- BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1));
- set_memory((unsigned long)layout->base + layout->text_size,
- (layout->ro_size - layout->text_size) >> PAGE_SHIFT);
-}
-
-static void frob_ro_after_init(const struct module_layout *layout,
- int (*set_memory)(unsigned long start, int num_pages))
-{
- BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
- BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1));
- BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1));
- set_memory((unsigned long)layout->base + layout->ro_size,
- (layout->ro_after_init_size - layout->ro_size) >> PAGE_SHIFT);
-}
-
-static void frob_writable_data(const struct module_layout *layout,
- int (*set_memory)(unsigned long start, int num_pages))
-{
- BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
- BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1));
- BUG_ON((unsigned long)layout->size & (PAGE_SIZE-1));
- set_memory((unsigned long)layout->base + layout->ro_after_init_size,
- (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT);
-}
-
-static void module_enable_ro(const struct module *mod, bool after_init)
-{
- if (!rodata_enabled)
- return;
-
- set_vm_flush_reset_perms(mod->core_layout.base);
- set_vm_flush_reset_perms(mod->init_layout.base);
- frob_text(&mod->core_layout, set_memory_ro);
-
- frob_rodata(&mod->core_layout, set_memory_ro);
- frob_text(&mod->init_layout, set_memory_ro);
- frob_rodata(&mod->init_layout, set_memory_ro);
-
- if (after_init)
- frob_ro_after_init(&mod->core_layout, set_memory_ro);
-}
-
-static void module_enable_nx(const struct module *mod)
-{
- frob_rodata(&mod->core_layout, set_memory_nx);
- frob_ro_after_init(&mod->core_layout, set_memory_nx);
- frob_writable_data(&mod->core_layout, set_memory_nx);
- frob_rodata(&mod->init_layout, set_memory_nx);
- frob_writable_data(&mod->init_layout, set_memory_nx);
-}
-
-static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
- char *secstrings, struct module *mod)
-{
- const unsigned long shf_wx = SHF_WRITE|SHF_EXECINSTR;
- int i;
-
- for (i = 0; i < hdr->e_shnum; i++) {
- if ((sechdrs[i].sh_flags & shf_wx) == shf_wx) {
- pr_err("%s: section %s (index %d) has invalid WRITE|EXEC flags\n",
- mod->name, secstrings + sechdrs[i].sh_name, i);
- return -ENOEXEC;
- }
- }
-
- return 0;
-}
-
-#else /* !CONFIG_STRICT_MODULE_RWX */
-static void module_enable_nx(const struct module *mod) { }
-static void module_enable_ro(const struct module *mod, bool after_init) {}
-static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
- char *secstrings, struct module *mod)
-{
- return 0;
-}
-#endif /* CONFIG_STRICT_MODULE_RWX */
-
-#ifdef CONFIG_LIVEPATCH
-/*
- * Persist Elf information about a module. Copy the Elf header,
- * section header table, section string table, and symtab section
- * index from info to mod->klp_info.
- */
-static int copy_module_elf(struct module *mod, struct load_info *info)
-{
- unsigned int size, symndx;
- int ret;
-
- size = sizeof(*mod->klp_info);
- mod->klp_info = kmalloc(size, GFP_KERNEL);
- if (mod->klp_info == NULL)
- return -ENOMEM;
-
- /* Elf header */
- size = sizeof(mod->klp_info->hdr);
- memcpy(&mod->klp_info->hdr, info->hdr, size);
-
- /* Elf section header table */
- size = sizeof(*info->sechdrs) * info->hdr->e_shnum;
- mod->klp_info->sechdrs = kmemdup(info->sechdrs, size, GFP_KERNEL);
- if (mod->klp_info->sechdrs == NULL) {
- ret = -ENOMEM;
- goto free_info;
- }
-
- /* Elf section name string table */
- size = info->sechdrs[info->hdr->e_shstrndx].sh_size;
- mod->klp_info->secstrings = kmemdup(info->secstrings, size, GFP_KERNEL);
- if (mod->klp_info->secstrings == NULL) {
- ret = -ENOMEM;
- goto free_sechdrs;
- }
-
- /* Elf symbol section index */
- symndx = info->index.sym;
- mod->klp_info->symndx = symndx;
-
- /*
- * For livepatch modules, core_kallsyms.symtab is a complete
- * copy of the original symbol table. Adjust sh_addr to point
- * to core_kallsyms.symtab since the copy of the symtab in module
- * init memory is freed at the end of do_init_module().
- */
- mod->klp_info->sechdrs[symndx].sh_addr = \
- (unsigned long) mod->core_kallsyms.symtab;
-
- return 0;
-
-free_sechdrs:
- kfree(mod->klp_info->sechdrs);
-free_info:
- kfree(mod->klp_info);
- return ret;
-}
-
-static void free_module_elf(struct module *mod)
-{
- kfree(mod->klp_info->sechdrs);
- kfree(mod->klp_info->secstrings);
- kfree(mod->klp_info);
-}
-#else /* !CONFIG_LIVEPATCH */
-static int copy_module_elf(struct module *mod, struct load_info *info)
-{
- return 0;
-}
-
-static void free_module_elf(struct module *mod)
-{
-}
-#endif /* CONFIG_LIVEPATCH */
-
-void __weak module_memfree(void *module_region)
-{
- /*
- * This memory may be RO, and freeing RO memory in an interrupt is not
- * supported by vmalloc.
- */
- WARN_ON(in_interrupt());
- vfree(module_region);
-}
-
-void __weak module_arch_cleanup(struct module *mod)
-{
-}
-
-void __weak module_arch_freeing_init(struct module *mod)
-{
-}
-
-static void cfi_cleanup(struct module *mod);
-
-/* Free a module, remove from lists, etc. */
-static void free_module(struct module *mod)
-{
- trace_module_free(mod);
-
- mod_sysfs_teardown(mod);
-
- /*
- * We leave it in list to prevent duplicate loads, but make sure
- * that noone uses it while it's being deconstructed.
- */
- mutex_lock(&module_mutex);
- mod->state = MODULE_STATE_UNFORMED;
- mutex_unlock(&module_mutex);
-
- /* Remove dynamic debug info */
- ddebug_remove_module(mod->name);
-
- /* Arch-specific cleanup. */
- module_arch_cleanup(mod);
-
- /* Module unload stuff */
- module_unload_free(mod);
-
- /* Free any allocated parameters. */
- destroy_params(mod->kp, mod->num_kp);
-
- if (is_livepatch_module(mod))
- free_module_elf(mod);
-
- /* Now we can delete it from the lists */
- mutex_lock(&module_mutex);
- /* Unlink carefully: kallsyms could be walking list. */
- list_del_rcu(&mod->list);
- mod_tree_remove(mod);
- /* Remove this module from bug list, this uses list_del_rcu */
- module_bug_cleanup(mod);
- /* Wait for RCU-sched synchronizing before releasing mod->list and buglist. */
- synchronize_rcu();
- mutex_unlock(&module_mutex);
-
- /* Clean up CFI for the module. */
- cfi_cleanup(mod);
-
- /* This may be empty, but that's OK */
- module_arch_freeing_init(mod);
- module_memfree(mod->init_layout.base);
- kfree(mod->args);
- percpu_modfree(mod);
-
- /* Free lock-classes; relies on the preceding sync_rcu(). */
- lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
-
- /* Finally, free the core (containing the module structure) */
- module_memfree(mod->core_layout.base);
-}
-
-void *__symbol_get(const char *symbol)
-{
- struct find_symbol_arg fsa = {
- .name = symbol,
- .gplok = true,
- .warn = true,
- };
-
- preempt_disable();
- if (!find_symbol(&fsa) || strong_try_module_get(fsa.owner)) {
- preempt_enable();
- return NULL;
- }
- preempt_enable();
- return (void *)kernel_symbol_value(fsa.sym);
-}
-EXPORT_SYMBOL_GPL(__symbol_get);
-
-/*
- * Ensure that an exported symbol [global namespace] does not already exist
- * in the kernel or in some other module's exported symbol table.
- *
- * You must hold the module_mutex.
- */
-static int verify_exported_symbols(struct module *mod)
-{
- unsigned int i;
- const struct kernel_symbol *s;
- struct {
- const struct kernel_symbol *sym;
- unsigned int num;
- } arr[] = {
- { mod->syms, mod->num_syms },
- { mod->gpl_syms, mod->num_gpl_syms },
- };
-
- for (i = 0; i < ARRAY_SIZE(arr); i++) {
- for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
- struct find_symbol_arg fsa = {
- .name = kernel_symbol_name(s),
- .gplok = true,
- };
- if (find_symbol(&fsa)) {
- pr_err("%s: exports duplicate symbol %s"
- " (owned by %s)\n",
- mod->name, kernel_symbol_name(s),
- module_name(fsa.owner));
- return -ENOEXEC;
- }
- }
- }
- return 0;
-}
-
-static bool ignore_undef_symbol(Elf_Half emachine, const char *name)
-{
- /*
- * On x86, PIC code and Clang non-PIC code may have call foo@PLT. GNU as
- * before 2.37 produces an unreferenced _GLOBAL_OFFSET_TABLE_ on x86-64.
- * i386 has a similar problem but may not deserve a fix.
- *
- * If we ever have to ignore many symbols, consider refactoring the code to
- * only warn if referenced by a relocation.
- */
- if (emachine == EM_386 || emachine == EM_X86_64)
- return !strcmp(name, "_GLOBAL_OFFSET_TABLE_");
- return false;
-}
-
-/* Change all symbols so that st_value encodes the pointer directly. */
-static int simplify_symbols(struct module *mod, const struct load_info *info)
-{
- Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
- Elf_Sym *sym = (void *)symsec->sh_addr;
- unsigned long secbase;
- unsigned int i;
- int ret = 0;
- const struct kernel_symbol *ksym;
-
- for (i = 1; i < symsec->sh_size / sizeof(Elf_Sym); i++) {
- const char *name = info->strtab + sym[i].st_name;
-
- switch (sym[i].st_shndx) {
- case SHN_COMMON:
- /* Ignore common symbols */
- if (!strncmp(name, "__gnu_lto", 9))
- break;
-
- /*
- * We compiled with -fno-common. These are not
- * supposed to happen.
- */
- pr_debug("Common symbol: %s\n", name);
- pr_warn("%s: please compile with -fno-common\n",
- mod->name);
- ret = -ENOEXEC;
- break;
-
- case SHN_ABS:
- /* Don't need to do anything */
- pr_debug("Absolute symbol: 0x%08lx\n",
- (long)sym[i].st_value);
- break;
-
- case SHN_LIVEPATCH:
- /* Livepatch symbols are resolved by livepatch */
- break;
-
- case SHN_UNDEF:
- ksym = resolve_symbol_wait(mod, info, name);
- /* Ok if resolved. */
- if (ksym && !IS_ERR(ksym)) {
- sym[i].st_value = kernel_symbol_value(ksym);
- break;
- }
-
- /* Ok if weak or ignored. */
- if (!ksym &&
- (ELF_ST_BIND(sym[i].st_info) == STB_WEAK ||
- ignore_undef_symbol(info->hdr->e_machine, name)))
- break;
-
- ret = PTR_ERR(ksym) ?: -ENOENT;
- pr_warn("%s: Unknown symbol %s (err %d)\n",
- mod->name, name, ret);
- break;
-
- default:
- /* Divert to percpu allocation if a percpu var. */
- if (sym[i].st_shndx == info->index.pcpu)
- secbase = (unsigned long)mod_percpu(mod);
- else
- secbase = info->sechdrs[sym[i].st_shndx].sh_addr;
- sym[i].st_value += secbase;
- break;
- }
- }
-
- return ret;
-}
-
-static int apply_relocations(struct module *mod, const struct load_info *info)
-{
- unsigned int i;
- int err = 0;
-
- /* Now do relocations. */
- for (i = 1; i < info->hdr->e_shnum; i++) {
- unsigned int infosec = info->sechdrs[i].sh_info;
-
- /* Not a valid relocation section? */
- if (infosec >= info->hdr->e_shnum)
- continue;
-
- /* Don't bother with non-allocated sections */
- if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC))
- continue;
-
- if (info->sechdrs[i].sh_flags & SHF_RELA_LIVEPATCH)
- err = klp_apply_section_relocs(mod, info->sechdrs,
- info->secstrings,
- info->strtab,
- info->index.sym, i,
- NULL);
- else if (info->sechdrs[i].sh_type == SHT_REL)
- err = apply_relocate(info->sechdrs, info->strtab,
- info->index.sym, i, mod);
- else if (info->sechdrs[i].sh_type == SHT_RELA)
- err = apply_relocate_add(info->sechdrs, info->strtab,
- info->index.sym, i, mod);
- if (err < 0)
- break;
- }
- return err;
-}
-
-/* Additional bytes needed by arch in front of individual sections */
-unsigned int __weak arch_mod_section_prepend(struct module *mod,
- unsigned int section)
-{
- /* default implementation just returns zero */
- return 0;
-}
-
-/* Update size with this section: return offset. */
-static long get_offset(struct module *mod, unsigned int *size,
- Elf_Shdr *sechdr, unsigned int section)
-{
- long ret;
-
- *size += arch_mod_section_prepend(mod, section);
- ret = ALIGN(*size, sechdr->sh_addralign ?: 1);
- *size = ret + sechdr->sh_size;
- return ret;
-}
-
-static bool module_init_layout_section(const char *sname)
-{
-#ifndef CONFIG_MODULE_UNLOAD
- if (module_exit_section(sname))
- return true;
-#endif
- return module_init_section(sname);
-}
-
-/*
- * Lay out the SHF_ALLOC sections in a way not dissimilar to how ld
- * might -- code, read-only data, read-write data, small data. Tally
- * sizes, and place the offsets into sh_entsize fields: high bit means it
- * belongs in init.
- */
-static void layout_sections(struct module *mod, struct load_info *info)
-{
- static unsigned long const masks[][2] = {
- /*
- * NOTE: all executable code must be the first section
- * in this array; otherwise modify the text_size
- * finder in the two loops below
- */
- { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL },
- { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL },
- { SHF_RO_AFTER_INIT | SHF_ALLOC, ARCH_SHF_SMALL },
- { SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL },
- { ARCH_SHF_SMALL | SHF_ALLOC, 0 }
- };
- unsigned int m, i;
-
- for (i = 0; i < info->hdr->e_shnum; i++)
- info->sechdrs[i].sh_entsize = ~0UL;
-
- pr_debug("Core section allocation order:\n");
- for (m = 0; m < ARRAY_SIZE(masks); ++m) {
- for (i = 0; i < info->hdr->e_shnum; ++i) {
- Elf_Shdr *s = &info->sechdrs[i];
- const char *sname = info->secstrings + s->sh_name;
-
- if ((s->sh_flags & masks[m][0]) != masks[m][0]
- || (s->sh_flags & masks[m][1])
- || s->sh_entsize != ~0UL
- || module_init_layout_section(sname))
- continue;
- s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i);
- pr_debug("\t%s\n", sname);
- }
- switch (m) {
- case 0: /* executable */
- mod->core_layout.size = debug_align(mod->core_layout.size);
- mod->core_layout.text_size = mod->core_layout.size;
- break;
- case 1: /* RO: text and ro-data */
- mod->core_layout.size = debug_align(mod->core_layout.size);
- mod->core_layout.ro_size = mod->core_layout.size;
- break;
- case 2: /* RO after init */
- mod->core_layout.size = debug_align(mod->core_layout.size);
- mod->core_layout.ro_after_init_size = mod->core_layout.size;
- break;
- case 4: /* whole core */
- mod->core_layout.size = debug_align(mod->core_layout.size);
- break;
- }
- }
-
- pr_debug("Init section allocation order:\n");
- for (m = 0; m < ARRAY_SIZE(masks); ++m) {
- for (i = 0; i < info->hdr->e_shnum; ++i) {
- Elf_Shdr *s = &info->sechdrs[i];
- const char *sname = info->secstrings + s->sh_name;
-
- if ((s->sh_flags & masks[m][0]) != masks[m][0]
- || (s->sh_flags & masks[m][1])
- || s->sh_entsize != ~0UL
- || !module_init_layout_section(sname))
- continue;
- s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i)
- | INIT_OFFSET_MASK);
- pr_debug("\t%s\n", sname);
- }
- switch (m) {
- case 0: /* executable */
- mod->init_layout.size = debug_align(mod->init_layout.size);
- mod->init_layout.text_size = mod->init_layout.size;
- break;
- case 1: /* RO: text and ro-data */
- mod->init_layout.size = debug_align(mod->init_layout.size);
- mod->init_layout.ro_size = mod->init_layout.size;
- break;
- case 2:
- /*
- * RO after init doesn't apply to init_layout (only
- * core_layout), so it just takes the value of ro_size.
- */
- mod->init_layout.ro_after_init_size = mod->init_layout.ro_size;
- break;
- case 4: /* whole init */
- mod->init_layout.size = debug_align(mod->init_layout.size);
- break;
- }
- }
-}
-
-static void set_license(struct module *mod, const char *license)
-{
- if (!license)
- license = "unspecified";
-
- if (!license_is_gpl_compatible(license)) {
- if (!test_taint(TAINT_PROPRIETARY_MODULE))
- pr_warn("%s: module license '%s' taints kernel.\n",
- mod->name, license);
- add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
- LOCKDEP_NOW_UNRELIABLE);
- }
-}
-
-/* Parse tag=value strings from .modinfo section */
-static char *next_string(char *string, unsigned long *secsize)
-{
- /* Skip non-zero chars */
- while (string[0]) {
- string++;
- if ((*secsize)-- <= 1)
- return NULL;
- }
-
- /* Skip any zero padding. */
- while (!string[0]) {
- string++;
- if ((*secsize)-- <= 1)
- return NULL;
- }
- return string;
-}
-
-static char *get_next_modinfo(const struct load_info *info, const char *tag,
- char *prev)
-{
- char *p;
- unsigned int taglen = strlen(tag);
- Elf_Shdr *infosec = &info->sechdrs[info->index.info];
- unsigned long size = infosec->sh_size;
-
- /*
- * get_modinfo() calls made before rewrite_section_headers()
- * must use sh_offset, as sh_addr isn't set!
- */
- char *modinfo = (char *)info->hdr + infosec->sh_offset;
-
- if (prev) {
- size -= prev - modinfo;
- modinfo = next_string(prev, &size);
- }
-
- for (p = modinfo; p; p = next_string(p, &size)) {
- if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
- return p + taglen + 1;
- }
- return NULL;
-}
-
-static char *get_modinfo(const struct load_info *info, const char *tag)
-{
- return get_next_modinfo(info, tag, NULL);
-}
-
-static void setup_modinfo(struct module *mod, struct load_info *info)
-{
- struct module_attribute *attr;
- int i;
-
- for (i = 0; (attr = modinfo_attrs[i]); i++) {
- if (attr->setup)
- attr->setup(mod, get_modinfo(info, attr->attr.name));
- }
-}
-
-static void free_modinfo(struct module *mod)
-{
- struct module_attribute *attr;
- int i;
-
- for (i = 0; (attr = modinfo_attrs[i]); i++) {
- if (attr->free)
- attr->free(mod);
- }
-}
-
-#ifdef CONFIG_KALLSYMS
-
-/* Lookup exported symbol in given range of kernel_symbols */
-static const struct kernel_symbol *lookup_exported_symbol(const char *name,
- const struct kernel_symbol *start,
- const struct kernel_symbol *stop)
-{
- return bsearch(name, start, stop - start,
- sizeof(struct kernel_symbol), cmp_name);
-}
-
-static int is_exported(const char *name, unsigned long value,
- const struct module *mod)
-{
- const struct kernel_symbol *ks;
- if (!mod)
- ks = lookup_exported_symbol(name, __start___ksymtab, __stop___ksymtab);
- else
- ks = lookup_exported_symbol(name, mod->syms, mod->syms + mod->num_syms);
-
- return ks != NULL && kernel_symbol_value(ks) == value;
-}
-
-/* As per nm */
-static char elf_type(const Elf_Sym *sym, const struct load_info *info)
-{
- const Elf_Shdr *sechdrs = info->sechdrs;
-
- if (ELF_ST_BIND(sym->st_info) == STB_WEAK) {
- if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT)
- return 'v';
- else
- return 'w';
- }
- if (sym->st_shndx == SHN_UNDEF)
- return 'U';
- if (sym->st_shndx == SHN_ABS || sym->st_shndx == info->index.pcpu)
- return 'a';
- if (sym->st_shndx >= SHN_LORESERVE)
- return '?';
- if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR)
- return 't';
- if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC
- && sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) {
- if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE))
- return 'r';
- else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
- return 'g';
- else
- return 'd';
- }
- if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
- if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
- return 's';
- else
- return 'b';
- }
- if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name,
- ".debug")) {
- return 'n';
- }
- return '?';
-}
-
-static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
- unsigned int shnum, unsigned int pcpundx)
-{
- const Elf_Shdr *sec;
-
- if (src->st_shndx == SHN_UNDEF
- || src->st_shndx >= shnum
- || !src->st_name)
- return false;
-
-#ifdef CONFIG_KALLSYMS_ALL
- if (src->st_shndx == pcpundx)
- return true;
-#endif
-
- sec = sechdrs + src->st_shndx;
- if (!(sec->sh_flags & SHF_ALLOC)
-#ifndef CONFIG_KALLSYMS_ALL
- || !(sec->sh_flags & SHF_EXECINSTR)
-#endif
- || (sec->sh_entsize & INIT_OFFSET_MASK))
- return false;
-
- return true;
-}
-
-/*
- * We only allocate and copy the strings needed by the parts of symtab
- * we keep. This is simple, but has the effect of making multiple
- * copies of duplicates. We could be more sophisticated, see
- * linux-kernel thread starting with
- * <73defb5e4bca04a6431392cc341112b1@localhost>.
- */
-static void layout_symtab(struct module *mod, struct load_info *info)
-{
- Elf_Shdr *symsect = info->sechdrs + info->index.sym;
- Elf_Shdr *strsect = info->sechdrs + info->index.str;
- const Elf_Sym *src;
- unsigned int i, nsrc, ndst, strtab_size = 0;
-
- /* Put symbol section at end of init part of module. */
- symsect->sh_flags |= SHF_ALLOC;
- symsect->sh_entsize = get_offset(mod, &mod->init_layout.size, symsect,
- info->index.sym) | INIT_OFFSET_MASK;
- pr_debug("\t%s\n", info->secstrings + symsect->sh_name);
-
- src = (void *)info->hdr + symsect->sh_offset;
- nsrc = symsect->sh_size / sizeof(*src);
-
- /* Compute total space required for the core symbols' strtab. */
- for (ndst = i = 0; i < nsrc; i++) {
- if (i == 0 || is_livepatch_module(mod) ||
- is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
- info->index.pcpu)) {
- strtab_size += strlen(&info->strtab[src[i].st_name])+1;
- ndst++;
- }
- }
-
- /* Append room for core symbols at end of core part. */
- info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1);
- info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym);
- mod->core_layout.size += strtab_size;
- info->core_typeoffs = mod->core_layout.size;
- mod->core_layout.size += ndst * sizeof(char);
- mod->core_layout.size = debug_align(mod->core_layout.size);
-
- /* Put string table section at end of init part of module. */
- strsect->sh_flags |= SHF_ALLOC;
- strsect->sh_entsize = get_offset(mod, &mod->init_layout.size, strsect,
- info->index.str) | INIT_OFFSET_MASK;
- pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
-
- /* We'll tack temporary mod_kallsyms on the end. */
- mod->init_layout.size = ALIGN(mod->init_layout.size,
- __alignof__(struct mod_kallsyms));
- info->mod_kallsyms_init_off = mod->init_layout.size;
- mod->init_layout.size += sizeof(struct mod_kallsyms);
- info->init_typeoffs = mod->init_layout.size;
- mod->init_layout.size += nsrc * sizeof(char);
- mod->init_layout.size = debug_align(mod->init_layout.size);
-}
-
-/*
- * We use the full symtab and strtab which layout_symtab arranged to
- * be appended to the init section. Later we switch to the cut-down
- * core-only ones.
- */
-static void add_kallsyms(struct module *mod, const struct load_info *info)
-{
- unsigned int i, ndst;
- const Elf_Sym *src;
- Elf_Sym *dst;
- char *s;
- Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
-
- /* Set up to point into init section. */
- mod->kallsyms = mod->init_layout.base + info->mod_kallsyms_init_off;
-
- mod->kallsyms->symtab = (void *)symsec->sh_addr;
- mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
- /* Make sure we get permanent strtab: don't use info->strtab. */
- mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
- mod->kallsyms->typetab = mod->init_layout.base + info->init_typeoffs;
-
- /*
- * Now populate the cut down core kallsyms for after init
- * and set types up while we still have access to sections.
- */
- mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs;
- mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs;
- mod->core_kallsyms.typetab = mod->core_layout.base + info->core_typeoffs;
- src = mod->kallsyms->symtab;
- for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) {
- mod->kallsyms->typetab[i] = elf_type(src + i, info);
- if (i == 0 || is_livepatch_module(mod) ||
- is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
- info->index.pcpu)) {
- mod->core_kallsyms.typetab[ndst] =
- mod->kallsyms->typetab[i];
- dst[ndst] = src[i];
- dst[ndst++].st_name = s - mod->core_kallsyms.strtab;
- s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name],
- KSYM_NAME_LEN) + 1;
- }
- }
- mod->core_kallsyms.num_symtab = ndst;
-}
-#else
-static inline void layout_symtab(struct module *mod, struct load_info *info)
-{
-}
-
-static void add_kallsyms(struct module *mod, const struct load_info *info)
-{
-}
-#endif /* CONFIG_KALLSYMS */
-
-#if IS_ENABLED(CONFIG_KALLSYMS) && IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
-static void init_build_id(struct module *mod, const struct load_info *info)
-{
- const Elf_Shdr *sechdr;
- unsigned int i;
-
- for (i = 0; i < info->hdr->e_shnum; i++) {
- sechdr = &info->sechdrs[i];
- if (!sect_empty(sechdr) && sechdr->sh_type == SHT_NOTE &&
- !build_id_parse_buf((void *)sechdr->sh_addr, mod->build_id,
- sechdr->sh_size))
- break;
- }
-}
-#else
-static void init_build_id(struct module *mod, const struct load_info *info)
-{
-}
-#endif
-
-static void dynamic_debug_setup(struct module *mod, struct _ddebug *debug, unsigned int num)
-{
- if (!debug)
- return;
- ddebug_add_module(debug, num, mod->name);
-}
-
-static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug)
-{
- if (debug)
- ddebug_remove_module(mod->name);
-}
-
-void * __weak module_alloc(unsigned long size)
-{
- return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
- GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
- NUMA_NO_NODE, __builtin_return_address(0));
-}
-
-bool __weak module_init_section(const char *name)
-{
- return strstarts(name, ".init");
-}
-
-bool __weak module_exit_section(const char *name)
-{
- return strstarts(name, ".exit");
-}
-
-#ifdef CONFIG_DEBUG_KMEMLEAK
-static void kmemleak_load_module(const struct module *mod,
- const struct load_info *info)
-{
- unsigned int i;
-
- /* only scan the sections containing data */
- kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
-
- for (i = 1; i < info->hdr->e_shnum; i++) {
- /* Scan all writable sections that's not executable */
- if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) ||
- !(info->sechdrs[i].sh_flags & SHF_WRITE) ||
- (info->sechdrs[i].sh_flags & SHF_EXECINSTR))
- continue;
-
- kmemleak_scan_area((void *)info->sechdrs[i].sh_addr,
- info->sechdrs[i].sh_size, GFP_KERNEL);
- }
-}
-#else
-static inline void kmemleak_load_module(const struct module *mod,
- const struct load_info *info)
-{
-}
-#endif
-
-#ifdef CONFIG_MODULE_SIG
-static int module_sig_check(struct load_info *info, int flags)
-{
- int err = -ENODATA;
- const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
- const char *reason;
- const void *mod = info->hdr;
- bool mangled_module = flags & (MODULE_INIT_IGNORE_MODVERSIONS |
- MODULE_INIT_IGNORE_VERMAGIC);
- /*
- * Do not allow mangled modules as a module with version information
- * removed is no longer the module that was signed.
- */
- if (!mangled_module &&
- info->len > markerlen &&
- memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
- /* We truncate the module to discard the signature */
- info->len -= markerlen;
- err = mod_verify_sig(mod, info);
- if (!err) {
- info->sig_ok = true;
- return 0;
- }
- }
-
- /*
- * We don't permit modules to be loaded into the trusted kernels
- * without a valid signature on them, but if we're not enforcing,
- * certain errors are non-fatal.
- */
- switch (err) {
- case -ENODATA:
- reason = "unsigned module";
- break;
- case -ENOPKG:
- reason = "module with unsupported crypto";
- break;
- case -ENOKEY:
- reason = "module with unavailable key";
- break;
-
- default:
- /*
- * All other errors are fatal, including lack of memory,
- * unparseable signatures, and signature check failures --
- * even if signatures aren't required.
- */
- return err;
- }
-
- if (is_module_sig_enforced()) {
- pr_notice("Loading of %s is rejected\n", reason);
- return -EKEYREJECTED;
- }
-
- return security_locked_down(LOCKDOWN_MODULE_SIGNATURE);
-}
-#else /* !CONFIG_MODULE_SIG */
-static int module_sig_check(struct load_info *info, int flags)
-{
- return 0;
-}
-#endif /* !CONFIG_MODULE_SIG */
-
-static int validate_section_offset(struct load_info *info, Elf_Shdr *shdr)
-{
-#if defined(CONFIG_64BIT)
- unsigned long long secend;
-#else
- unsigned long secend;
-#endif
-
- /*
- * Check for both overflow and offset/size being
- * too large.
- */
- secend = shdr->sh_offset + shdr->sh_size;
- if (secend < shdr->sh_offset || secend > info->len)
- return -ENOEXEC;
-
- return 0;
-}
-
-/*
- * Sanity checks against invalid binaries, wrong arch, weird elf version.
- *
- * Also do basic validity checks against section offsets and sizes, the
- * section name string table, and the indices used for it (sh_name).
- */
-static int elf_validity_check(struct load_info *info)
-{
- unsigned int i;
- Elf_Shdr *shdr, *strhdr;
- int err;
-
- if (info->len < sizeof(*(info->hdr))) {
- pr_err("Invalid ELF header len %lu\n", info->len);
- goto no_exec;
- }
-
- if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0) {
- pr_err("Invalid ELF header magic: != %s\n", ELFMAG);
- goto no_exec;
- }
- if (info->hdr->e_type != ET_REL) {
- pr_err("Invalid ELF header type: %u != %u\n",
- info->hdr->e_type, ET_REL);
- goto no_exec;
- }
- if (!elf_check_arch(info->hdr)) {
- pr_err("Invalid architecture in ELF header: %u\n",
- info->hdr->e_machine);
- goto no_exec;
- }
- if (info->hdr->e_shentsize != sizeof(Elf_Shdr)) {
- pr_err("Invalid ELF section header size\n");
- goto no_exec;
- }
-
- /*
- * e_shnum is 16 bits, and sizeof(Elf_Shdr) is
- * known and small. So e_shnum * sizeof(Elf_Shdr)
- * will not overflow unsigned long on any platform.
- */
- if (info->hdr->e_shoff >= info->len
- || (info->hdr->e_shnum * sizeof(Elf_Shdr) >
- info->len - info->hdr->e_shoff)) {
- pr_err("Invalid ELF section header overflow\n");
- goto no_exec;
- }
-
- info->sechdrs = (void *)info->hdr + info->hdr->e_shoff;
-
- /*
- * Verify if the section name table index is valid.
- */
- if (info->hdr->e_shstrndx == SHN_UNDEF
- || info->hdr->e_shstrndx >= info->hdr->e_shnum) {
- pr_err("Invalid ELF section name index: %d || e_shstrndx (%d) >= e_shnum (%d)\n",
- info->hdr->e_shstrndx, info->hdr->e_shstrndx,
- info->hdr->e_shnum);
- goto no_exec;
- }
-
- strhdr = &info->sechdrs[info->hdr->e_shstrndx];
- err = validate_section_offset(info, strhdr);
- if (err < 0) {
- pr_err("Invalid ELF section hdr(type %u)\n", strhdr->sh_type);
- return err;
- }
-
- /*
- * The section name table must be NUL-terminated, as required
- * by the spec. This makes strcmp and pr_* calls that access
- * strings in the section safe.
- */
- info->secstrings = (void *)info->hdr + strhdr->sh_offset;
- if (info->secstrings[strhdr->sh_size - 1] != '\0') {
- pr_err("ELF Spec violation: section name table isn't null terminated\n");
- goto no_exec;
- }
-
- /*
- * The code assumes that section 0 has a length of zero and
- * an addr of zero, so check for it.
- */
- if (info->sechdrs[0].sh_type != SHT_NULL
- || info->sechdrs[0].sh_size != 0
- || info->sechdrs[0].sh_addr != 0) {
- pr_err("ELF Spec violation: section 0 type(%d)!=SH_NULL or non-zero len or addr\n",
- info->sechdrs[0].sh_type);
- goto no_exec;
- }
-
- for (i = 1; i < info->hdr->e_shnum; i++) {
- shdr = &info->sechdrs[i];
- switch (shdr->sh_type) {
- case SHT_NULL:
- case SHT_NOBITS:
- continue;
- case SHT_SYMTAB:
- if (shdr->sh_link == SHN_UNDEF
- || shdr->sh_link >= info->hdr->e_shnum) {
- pr_err("Invalid ELF sh_link!=SHN_UNDEF(%d) or (sh_link(%d) >= hdr->e_shnum(%d)\n",
- shdr->sh_link, shdr->sh_link,
- info->hdr->e_shnum);
- goto no_exec;
- }
- fallthrough;
- default:
- err = validate_section_offset(info, shdr);
- if (err < 0) {
- pr_err("Invalid ELF section in module (section %u type %u)\n",
- i, shdr->sh_type);
- return err;
- }
-
- if (shdr->sh_flags & SHF_ALLOC) {
- if (shdr->sh_name >= strhdr->sh_size) {
- pr_err("Invalid ELF section name in module (section %u type %u)\n",
- i, shdr->sh_type);
- return -ENOEXEC;
- }
- }
- break;
- }
- }
-
- return 0;
-
-no_exec:
- return -ENOEXEC;
-}
-
-#define COPY_CHUNK_SIZE (16*PAGE_SIZE)
-
-static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned long len)
-{
- do {
- unsigned long n = min(len, COPY_CHUNK_SIZE);
-
- if (copy_from_user(dst, usrc, n) != 0)
- return -EFAULT;
- cond_resched();
- dst += n;
- usrc += n;
- len -= n;
- } while (len);
- return 0;
-}
-
-#ifdef CONFIG_LIVEPATCH
-static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
-{
- if (get_modinfo(info, "livepatch")) {
- mod->klp = true;
- add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK);
- pr_notice_once("%s: tainting kernel with TAINT_LIVEPATCH\n",
- mod->name);
- }
-
- return 0;
-}
-#else /* !CONFIG_LIVEPATCH */
-static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
-{
- if (get_modinfo(info, "livepatch")) {
- pr_err("%s: module is marked as livepatch module, but livepatch support is disabled",
- mod->name);
- return -ENOEXEC;
- }
-
- return 0;
-}
-#endif /* CONFIG_LIVEPATCH */
-
-static void check_modinfo_retpoline(struct module *mod, struct load_info *info)
-{
- if (retpoline_module_ok(get_modinfo(info, "retpoline")))
- return;
-
- pr_warn("%s: loading module not compiled with retpoline compiler.\n",
- mod->name);
-}
-
-/* Sets info->hdr and info->len. */
-static int copy_module_from_user(const void __user *umod, unsigned long len,
- struct load_info *info)
-{
- int err;
-
- info->len = len;
- if (info->len < sizeof(*(info->hdr)))
- return -ENOEXEC;
-
- err = security_kernel_load_data(LOADING_MODULE, true);
- if (err)
- return err;
-
- /* Suck in entire file: we'll want most of it. */
- info->hdr = __vmalloc(info->len, GFP_KERNEL | __GFP_NOWARN);
- if (!info->hdr)
- return -ENOMEM;
-
- if (copy_chunked_from_user(info->hdr, umod, info->len) != 0) {
- err = -EFAULT;
- goto out;
- }
-
- err = security_kernel_post_load_data((char *)info->hdr, info->len,
- LOADING_MODULE, "init_module");
-out:
- if (err)
- vfree(info->hdr);
-
- return err;
-}
-
-static void free_copy(struct load_info *info, int flags)
-{
- if (flags & MODULE_INIT_COMPRESSED_FILE)
- module_decompress_cleanup(info);
- else
- vfree(info->hdr);
-}
-
-static int rewrite_section_headers(struct load_info *info, int flags)
-{
- unsigned int i;
-
- /* This should always be true, but let's be sure. */
- info->sechdrs[0].sh_addr = 0;
-
- for (i = 1; i < info->hdr->e_shnum; i++) {
- Elf_Shdr *shdr = &info->sechdrs[i];
-
- /*
- * Mark all sections sh_addr with their address in the
- * temporary image.
- */
- shdr->sh_addr = (size_t)info->hdr + shdr->sh_offset;
-
- }
-
- /* Track but don't keep modinfo and version sections. */
- info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
- info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
-
- return 0;
-}
-
-/*
- * Set up our basic convenience variables (pointers to section headers,
- * search for module section index etc), and do some basic section
- * verification.
- *
- * Set info->mod to the temporary copy of the module in info->hdr. The final one
- * will be allocated in move_module().
- */
-static int setup_load_info(struct load_info *info, int flags)
-{
- unsigned int i;
-
- /* Try to find a name early so we can log errors with a module name */
- info->index.info = find_sec(info, ".modinfo");
- if (info->index.info)
- info->name = get_modinfo(info, "name");
-
- /* Find internal symbols and strings. */
- for (i = 1; i < info->hdr->e_shnum; i++) {
- if (info->sechdrs[i].sh_type == SHT_SYMTAB) {
- info->index.sym = i;
- info->index.str = info->sechdrs[i].sh_link;
- info->strtab = (char *)info->hdr
- + info->sechdrs[info->index.str].sh_offset;
- break;
- }
- }
-
- if (info->index.sym == 0) {
- pr_warn("%s: module has no symbols (stripped?)\n",
- info->name ?: "(missing .modinfo section or name field)");
- return -ENOEXEC;
- }
-
- info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
- if (!info->index.mod) {
- pr_warn("%s: No module found in object\n",
- info->name ?: "(missing .modinfo section or name field)");
- return -ENOEXEC;
- }
- /* This is temporary: point mod into copy of data. */
- info->mod = (void *)info->hdr + info->sechdrs[info->index.mod].sh_offset;
-
- /*
- * If we didn't load the .modinfo 'name' field earlier, fall back to
- * on-disk struct mod 'name' field.
- */
- if (!info->name)
- info->name = info->mod->name;
-
- if (flags & MODULE_INIT_IGNORE_MODVERSIONS)
- info->index.vers = 0; /* Pretend no __versions section! */
- else
- info->index.vers = find_sec(info, "__versions");
-
- info->index.pcpu = find_pcpusec(info);
-
- return 0;
-}
-
-static int check_modinfo(struct module *mod, struct load_info *info, int flags)
-{
- const char *modmagic = get_modinfo(info, "vermagic");
- int err;
-
- if (flags & MODULE_INIT_IGNORE_VERMAGIC)
- modmagic = NULL;
-
- /* This is allowed: modprobe --force will invalidate it. */
- if (!modmagic) {
- err = try_to_force_load(mod, "bad vermagic");
- if (err)
- return err;
- } else if (!same_magic(modmagic, vermagic, info->index.vers)) {
- pr_err("%s: version magic '%s' should be '%s'\n",
- info->name, modmagic, vermagic);
- return -ENOEXEC;
- }
-
- if (!get_modinfo(info, "intree")) {
- if (!test_taint(TAINT_OOT_MODULE))
- pr_warn("%s: loading out-of-tree module taints kernel.\n",
- mod->name);
- add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
- }
-
- check_modinfo_retpoline(mod, info);
-
- if (get_modinfo(info, "staging")) {
- add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
- pr_warn("%s: module is from the staging directory, the quality "
- "is unknown, you have been warned.\n", mod->name);
- }
-
- err = check_modinfo_livepatch(mod, info);
- if (err)
- return err;
-
- /* Set up license info based on the info section */
- set_license(mod, get_modinfo(info, "license"));
-
- return 0;
-}
-
-static int find_module_sections(struct module *mod, struct load_info *info)
-{
- mod->kp = section_objs(info, "__param",
- sizeof(*mod->kp), &mod->num_kp);
- mod->syms = section_objs(info, "__ksymtab",
- sizeof(*mod->syms), &mod->num_syms);
- mod->crcs = section_addr(info, "__kcrctab");
- mod->gpl_syms = section_objs(info, "__ksymtab_gpl",
- sizeof(*mod->gpl_syms),
- &mod->num_gpl_syms);
- mod->gpl_crcs = section_addr(info, "__kcrctab_gpl");
-
-#ifdef CONFIG_CONSTRUCTORS
- mod->ctors = section_objs(info, ".ctors",
- sizeof(*mod->ctors), &mod->num_ctors);
- if (!mod->ctors)
- mod->ctors = section_objs(info, ".init_array",
- sizeof(*mod->ctors), &mod->num_ctors);
- else if (find_sec(info, ".init_array")) {
- /*
- * This shouldn't happen with same compiler and binutils
- * building all parts of the module.
- */
- pr_warn("%s: has both .ctors and .init_array.\n",
- mod->name);
- return -EINVAL;
- }
-#endif
-
- mod->noinstr_text_start = section_objs(info, ".noinstr.text", 1,
- &mod->noinstr_text_size);
-
-#ifdef CONFIG_TRACEPOINTS
- mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs",
- sizeof(*mod->tracepoints_ptrs),
- &mod->num_tracepoints);
-#endif
-#ifdef CONFIG_TREE_SRCU
- mod->srcu_struct_ptrs = section_objs(info, "___srcu_struct_ptrs",
- sizeof(*mod->srcu_struct_ptrs),
- &mod->num_srcu_structs);
-#endif
-#ifdef CONFIG_BPF_EVENTS
- mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map",
- sizeof(*mod->bpf_raw_events),
- &mod->num_bpf_raw_events);
-#endif
-#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
- mod->btf_data = any_section_objs(info, ".BTF", 1, &mod->btf_data_size);
-#endif
-#ifdef CONFIG_JUMP_LABEL
- mod->jump_entries = section_objs(info, "__jump_table",
- sizeof(*mod->jump_entries),
- &mod->num_jump_entries);
-#endif
-#ifdef CONFIG_EVENT_TRACING
- mod->trace_events = section_objs(info, "_ftrace_events",
- sizeof(*mod->trace_events),
- &mod->num_trace_events);
- mod->trace_evals = section_objs(info, "_ftrace_eval_map",
- sizeof(*mod->trace_evals),
- &mod->num_trace_evals);
-#endif
-#ifdef CONFIG_TRACING
- mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
- sizeof(*mod->trace_bprintk_fmt_start),
- &mod->num_trace_bprintk_fmt);
-#endif
-#ifdef CONFIG_FTRACE_MCOUNT_RECORD
- /* sechdrs[0].sh_size is always zero */
- mod->ftrace_callsites = section_objs(info, FTRACE_CALLSITE_SECTION,
- sizeof(*mod->ftrace_callsites),
- &mod->num_ftrace_callsites);
-#endif
-#ifdef CONFIG_FUNCTION_ERROR_INJECTION
- mod->ei_funcs = section_objs(info, "_error_injection_whitelist",
- sizeof(*mod->ei_funcs),
- &mod->num_ei_funcs);
-#endif
-#ifdef CONFIG_KPROBES
- mod->kprobes_text_start = section_objs(info, ".kprobes.text", 1,
- &mod->kprobes_text_size);
- mod->kprobe_blacklist = section_objs(info, "_kprobe_blacklist",
- sizeof(unsigned long),
- &mod->num_kprobe_blacklist);
-#endif
-#ifdef CONFIG_PRINTK_INDEX
- mod->printk_index_start = section_objs(info, ".printk_index",
- sizeof(*mod->printk_index_start),
- &mod->printk_index_size);
-#endif
-#ifdef CONFIG_HAVE_STATIC_CALL_INLINE
- mod->static_call_sites = section_objs(info, ".static_call_sites",
- sizeof(*mod->static_call_sites),
- &mod->num_static_call_sites);
-#endif
- mod->extable = section_objs(info, "__ex_table",
- sizeof(*mod->extable), &mod->num_exentries);
-
- if (section_addr(info, "__obsparm"))
- pr_warn("%s: Ignoring obsolete parameters\n", mod->name);
-
- info->debug = section_objs(info, "__dyndbg",
- sizeof(*info->debug), &info->num_debug);
-
- return 0;
-}
-
-static int move_module(struct module *mod, struct load_info *info)
-{
- int i;
- void *ptr;
-
- /* Do the allocs. */
- ptr = module_alloc(mod->core_layout.size);
- /*
- * The pointer to this block is stored in the module structure
- * which is inside the block. Just mark it as not being a
- * leak.
- */
- kmemleak_not_leak(ptr);
- if (!ptr)
- return -ENOMEM;
-
- memset(ptr, 0, mod->core_layout.size);
- mod->core_layout.base = ptr;
-
- if (mod->init_layout.size) {
- ptr = module_alloc(mod->init_layout.size);
- /*
- * The pointer to this block is stored in the module structure
- * which is inside the block. This block doesn't need to be
- * scanned as it contains data and code that will be freed
- * after the module is initialized.
- */
- kmemleak_ignore(ptr);
- if (!ptr) {
- module_memfree(mod->core_layout.base);
- return -ENOMEM;
- }
- memset(ptr, 0, mod->init_layout.size);
- mod->init_layout.base = ptr;
- } else
- mod->init_layout.base = NULL;
-
- /* Transfer each section which specifies SHF_ALLOC */
- pr_debug("final section addresses:\n");
- for (i = 0; i < info->hdr->e_shnum; i++) {
- void *dest;
- Elf_Shdr *shdr = &info->sechdrs[i];
-
- if (!(shdr->sh_flags & SHF_ALLOC))
- continue;
-
- if (shdr->sh_entsize & INIT_OFFSET_MASK)
- dest = mod->init_layout.base
- + (shdr->sh_entsize & ~INIT_OFFSET_MASK);
- else
- dest = mod->core_layout.base + shdr->sh_entsize;
-
- if (shdr->sh_type != SHT_NOBITS)
- memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
- /* Update sh_addr to point to copy in image. */
- shdr->sh_addr = (unsigned long)dest;
- pr_debug("\t0x%lx %s\n",
- (long)shdr->sh_addr, info->secstrings + shdr->sh_name);
- }
-
- return 0;
-}
-
-static int check_module_license_and_versions(struct module *mod)
-{
- int prev_taint = test_taint(TAINT_PROPRIETARY_MODULE);
-
- /*
- * ndiswrapper is under GPL by itself, but loads proprietary modules.
- * Don't use add_taint_module(), as it would prevent ndiswrapper from
- * using GPL-only symbols it needs.
- */
- if (strcmp(mod->name, "ndiswrapper") == 0)
- add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE);
-
- /* driverloader was caught wrongly pretending to be under GPL */
- if (strcmp(mod->name, "driverloader") == 0)
- add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
- LOCKDEP_NOW_UNRELIABLE);
-
- /* lve claims to be GPL but upstream won't provide source */
- if (strcmp(mod->name, "lve") == 0)
- add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
- LOCKDEP_NOW_UNRELIABLE);
-
- if (!prev_taint && test_taint(TAINT_PROPRIETARY_MODULE))
- pr_warn("%s: module license taints kernel.\n", mod->name);
-
-#ifdef CONFIG_MODVERSIONS
- if ((mod->num_syms && !mod->crcs) ||
- (mod->num_gpl_syms && !mod->gpl_crcs)) {
- return try_to_force_load(mod,
- "no versions for exported symbols");
- }
-#endif
- return 0;
-}
-
-static void flush_module_icache(const struct module *mod)
-{
- /*
- * Flush the instruction cache, since we've played with text.
- * Do it before processing of module parameters, so the module
- * can provide parameter accessor functions of its own.
- */
- if (mod->init_layout.base)
- flush_icache_range((unsigned long)mod->init_layout.base,
- (unsigned long)mod->init_layout.base
- + mod->init_layout.size);
- flush_icache_range((unsigned long)mod->core_layout.base,
- (unsigned long)mod->core_layout.base + mod->core_layout.size);
-}
-
-int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
- Elf_Shdr *sechdrs,
- char *secstrings,
- struct module *mod)
-{
- return 0;
-}
-
-/* module_blacklist is a comma-separated list of module names */
-static char *module_blacklist;
-static bool blacklisted(const char *module_name)
-{
- const char *p;
- size_t len;
-
- if (!module_blacklist)
- return false;
-
- for (p = module_blacklist; *p; p += len) {
- len = strcspn(p, ",");
- if (strlen(module_name) == len && !memcmp(module_name, p, len))
- return true;
- if (p[len] == ',')
- len++;
- }
- return false;
-}
-core_param(module_blacklist, module_blacklist, charp, 0400);
-
-static struct module *layout_and_allocate(struct load_info *info, int flags)
-{
- struct module *mod;
- unsigned int ndx;
- int err;
-
- err = check_modinfo(info->mod, info, flags);
- if (err)
- return ERR_PTR(err);
-
- /* Allow arches to frob section contents and sizes. */
- err = module_frob_arch_sections(info->hdr, info->sechdrs,
- info->secstrings, info->mod);
- if (err < 0)
- return ERR_PTR(err);
-
- err = module_enforce_rwx_sections(info->hdr, info->sechdrs,
- info->secstrings, info->mod);
- if (err < 0)
- return ERR_PTR(err);
-
- /* We will do a special allocation for per-cpu sections later. */
- info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;
-
- /*
- * Mark ro_after_init section with SHF_RO_AFTER_INIT so that
- * layout_sections() can put it in the right place.
- * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set.
- */
- ndx = find_sec(info, ".data..ro_after_init");
- if (ndx)
- info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
- /*
- * Mark the __jump_table section as ro_after_init as well: these data
- * structures are never modified, with the exception of entries that
- * refer to code in the __init section, which are annotated as such
- * at module load time.
- */
- ndx = find_sec(info, "__jump_table");
- if (ndx)
- info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
-
- /*
- * Determine total sizes, and put offsets in sh_entsize. For now
- * this is done generically; there doesn't appear to be any
- * special cases for the architectures.
- */
- layout_sections(info->mod, info);
- layout_symtab(info->mod, info);
-
- /* Allocate and move to the final place */
- err = move_module(info->mod, info);
- if (err)
- return ERR_PTR(err);
-
- /* Module has been copied to its final place now: return it. */
- mod = (void *)info->sechdrs[info->index.mod].sh_addr;
- kmemleak_load_module(mod, info);
- return mod;
-}
-
-/* mod is no longer valid after this! */
-static void module_deallocate(struct module *mod, struct load_info *info)
-{
- percpu_modfree(mod);
- module_arch_freeing_init(mod);
- module_memfree(mod->init_layout.base);
- module_memfree(mod->core_layout.base);
-}
-
-int __weak module_finalize(const Elf_Ehdr *hdr,
- const Elf_Shdr *sechdrs,
- struct module *me)
-{
- return 0;
-}
-
-static int post_relocation(struct module *mod, const struct load_info *info)
-{
- /* Sort exception table now relocations are done. */
- sort_extable(mod->extable, mod->extable + mod->num_exentries);
-
- /* Copy relocated percpu area over. */
- percpu_modcopy(mod, (void *)info->sechdrs[info->index.pcpu].sh_addr,
- info->sechdrs[info->index.pcpu].sh_size);
-
- /* Setup kallsyms-specific fields. */
- add_kallsyms(mod, info);
-
- /* Arch-specific module finalizing. */
- return module_finalize(info->hdr, info->sechdrs, mod);
-}
-
-/* Is this module of this name done loading? No locks held. */
-static bool finished_loading(const char *name)
-{
- struct module *mod;
- bool ret;
-
- /*
- * The module_mutex should not be a heavily contended lock;
- * if we get the occasional sleep here, we'll go an extra iteration
- * in the wait_event_interruptible(), which is harmless.
- */
- sched_annotate_sleep();
- mutex_lock(&module_mutex);
- mod = find_module_all(name, strlen(name), true);
- ret = !mod || mod->state == MODULE_STATE_LIVE;
- mutex_unlock(&module_mutex);
-
- return ret;
-}
-
-/* Call module constructors. */
-static void do_mod_ctors(struct module *mod)
-{
-#ifdef CONFIG_CONSTRUCTORS
- unsigned long i;
-
- for (i = 0; i < mod->num_ctors; i++)
- mod->ctors[i]();
-#endif
-}
-
-/* For freeing module_init on success, in case kallsyms traversing */
-struct mod_initfree {
- struct llist_node node;
- void *module_init;
-};
-
-static void do_free_init(struct work_struct *w)
-{
- struct llist_node *pos, *n, *list;
- struct mod_initfree *initfree;
-
- list = llist_del_all(&init_free_list);
-
- synchronize_rcu();
-
- llist_for_each_safe(pos, n, list) {
- initfree = container_of(pos, struct mod_initfree, node);
- module_memfree(initfree->module_init);
- kfree(initfree);
- }
-}
-
-/*
- * This is where the real work happens.
- *
- * Keep it uninlined to provide a reliable breakpoint target, e.g. for the gdb
- * helper command 'lx-symbols'.
- */
-static noinline int do_init_module(struct module *mod)
-{
- int ret = 0;
- struct mod_initfree *freeinit;
-
- freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL);
- if (!freeinit) {
- ret = -ENOMEM;
- goto fail;
- }
- freeinit->module_init = mod->init_layout.base;
-
- /*
- * We want to find out whether @mod uses async during init. Clear
- * PF_USED_ASYNC. async_schedule*() will set it.
- */
- current->flags &= ~PF_USED_ASYNC;
-
- do_mod_ctors(mod);
- /* Start the module */
- if (mod->init != NULL)
- ret = do_one_initcall(mod->init);
- if (ret < 0) {
- goto fail_free_freeinit;
- }
- if (ret > 0) {
- pr_warn("%s: '%s'->init suspiciously returned %d, it should "
- "follow 0/-E convention\n"
- "%s: loading module anyway...\n",
- __func__, mod->name, ret, __func__);
- dump_stack();
- }
-
- /* Now it's a first class citizen! */
- mod->state = MODULE_STATE_LIVE;
- blocking_notifier_call_chain(&module_notify_list,
- MODULE_STATE_LIVE, mod);
-
- /* Delay uevent until module has finished its init routine */
- kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
-
- /*
- * We need to finish all async code before the module init sequence
- * is done. This has potential to deadlock. For example, a newly
- * detected block device can trigger request_module() of the
- * default iosched from async probing task. Once userland helper
- * reaches here, async_synchronize_full() will wait on the async
- * task waiting on request_module() and deadlock.
- *
- * This deadlock is avoided by perfomring async_synchronize_full()
- * iff module init queued any async jobs. This isn't a full
- * solution as it will deadlock the same if module loading from
- * async jobs nests more than once; however, due to the various
- * constraints, this hack seems to be the best option for now.
- * Please refer to the following thread for details.
- *
- * http://thread.gmane.org/gmane.linux.kernel/1420814
- */
- if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC))
- async_synchronize_full();
-
- ftrace_free_mem(mod, mod->init_layout.base, mod->init_layout.base +
- mod->init_layout.size);
- mutex_lock(&module_mutex);
- /* Drop initial reference. */
- module_put(mod);
- trim_init_extable(mod);
-#ifdef CONFIG_KALLSYMS
- /* Switch to core kallsyms now init is done: kallsyms may be walking! */
- rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);
-#endif
- module_enable_ro(mod, true);
- mod_tree_remove_init(mod);
- module_arch_freeing_init(mod);
- mod->init_layout.base = NULL;
- mod->init_layout.size = 0;
- mod->init_layout.ro_size = 0;
- mod->init_layout.ro_after_init_size = 0;
- mod->init_layout.text_size = 0;
-#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
- /* .BTF is not SHF_ALLOC and will get removed, so sanitize pointer */
- mod->btf_data = NULL;
-#endif
- /*
- * We want to free module_init, but be aware that kallsyms may be
- * walking this with preempt disabled. In all the failure paths, we
- * call synchronize_rcu(), but we don't want to slow down the success
- * path. module_memfree() cannot be called in an interrupt, so do the
- * work and call synchronize_rcu() in a work queue.
- *
- * Note that module_alloc() on most architectures creates W+X page
- * mappings which won't be cleaned up until do_free_init() runs. Any
- * code such as mark_rodata_ro() which depends on those mappings to
- * be cleaned up needs to sync with the queued work - ie
- * rcu_barrier()
- */
- if (llist_add(&freeinit->node, &init_free_list))
- schedule_work(&init_free_wq);
-
- mutex_unlock(&module_mutex);
- wake_up_all(&module_wq);
-
- return 0;
-
-fail_free_freeinit:
- kfree(freeinit);
-fail:
- /* Try to protect us from buggy refcounters. */
- mod->state = MODULE_STATE_GOING;
- synchronize_rcu();
- module_put(mod);
- blocking_notifier_call_chain(&module_notify_list,
- MODULE_STATE_GOING, mod);
- klp_module_going(mod);
- ftrace_release_mod(mod);
- free_module(mod);
- wake_up_all(&module_wq);
- return ret;
-}
-
-static int may_init_module(void)
-{
- if (!capable(CAP_SYS_MODULE) || modules_disabled)
- return -EPERM;
-
- return 0;
-}
-
-/*
- * We try to place it in the list now to make sure it's unique before
- * we dedicate too many resources. In particular, temporary percpu
- * memory exhaustion.
- */
-static int add_unformed_module(struct module *mod)
-{
- int err;
- struct module *old;
-
- mod->state = MODULE_STATE_UNFORMED;
-
-again:
- mutex_lock(&module_mutex);
- old = find_module_all(mod->name, strlen(mod->name), true);
- if (old != NULL) {
- if (old->state != MODULE_STATE_LIVE) {
- /* Wait in case it fails to load. */
- mutex_unlock(&module_mutex);
- err = wait_event_interruptible(module_wq,
- finished_loading(mod->name));
- if (err)
- goto out_unlocked;
- goto again;
- }
- err = -EEXIST;
- goto out;
- }
- mod_update_bounds(mod);
- list_add_rcu(&mod->list, &modules);
- mod_tree_insert(mod);
- err = 0;
-
-out:
- mutex_unlock(&module_mutex);
-out_unlocked:
- return err;
-}
-
-static int complete_formation(struct module *mod, struct load_info *info)
-{
- int err;
-
- mutex_lock(&module_mutex);
-
- /* Find duplicate symbols (must be called under lock). */
- err = verify_exported_symbols(mod);
- if (err < 0)
- goto out;
-
- /* This relies on module_mutex for list integrity. */
- module_bug_finalize(info->hdr, info->sechdrs, mod);
-
- module_enable_ro(mod, false);
- module_enable_nx(mod);
- module_enable_x(mod);
-
- /*
- * Mark state as coming so strong_try_module_get() ignores us,
- * but kallsyms etc. can see us.
- */
- mod->state = MODULE_STATE_COMING;
- mutex_unlock(&module_mutex);
-
- return 0;
-
-out:
- mutex_unlock(&module_mutex);
- return err;
-}
-
-static int prepare_coming_module(struct module *mod)
-{
- int err;
-
- ftrace_module_enable(mod);
- err = klp_module_coming(mod);
- if (err)
- return err;
-
- err = blocking_notifier_call_chain_robust(&module_notify_list,
- MODULE_STATE_COMING, MODULE_STATE_GOING, mod);
- err = notifier_to_errno(err);
- if (err)
- klp_module_going(mod);
-
- return err;
-}
-
-static int unknown_module_param_cb(char *param, char *val, const char *modname,
- void *arg)
-{
- struct module *mod = arg;
- int ret;
-
- if (strcmp(param, "async_probe") == 0) {
- mod->async_probe_requested = true;
- return 0;
- }
-
- /* Check for magic 'dyndbg' arg */
- ret = ddebug_dyndbg_module_param_cb(param, val, modname);
- if (ret != 0)
- pr_warn("%s: unknown parameter '%s' ignored\n", modname, param);
- return 0;
-}
-
-static void cfi_init(struct module *mod);
-
-/*
- * Allocate and load the module: note that size of section 0 is always
- * zero, and we rely on this for optional sections.
- */
-static int load_module(struct load_info *info, const char __user *uargs,
- int flags)
-{
- struct module *mod;
- long err = 0;
- char *after_dashes;
-
- /*
- * Do the signature check (if any) first. All that
- * the signature check needs is info->len, it does
- * not need any of the section info. That can be
- * set up later. This will minimize the chances
- * of a corrupt module causing problems before
- * we even get to the signature check.
- *
- * The check will also adjust info->len by stripping
- * off the sig length at the end of the module, making
- * checks against info->len more correct.
- */
- err = module_sig_check(info, flags);
- if (err)
- goto free_copy;
-
- /*
- * Do basic sanity checks against the ELF header and
- * sections.
- */
- err = elf_validity_check(info);
- if (err)
- goto free_copy;
-
- /*
- * Everything checks out, so set up the section info
- * in the info structure.
- */
- err = setup_load_info(info, flags);
- if (err)
- goto free_copy;
-
- /*
- * Now that we know we have the correct module name, check
- * if it's blacklisted.
- */
- if (blacklisted(info->name)) {
- err = -EPERM;
- pr_err("Module %s is blacklisted\n", info->name);
- goto free_copy;
- }
-
- err = rewrite_section_headers(info, flags);
- if (err)
- goto free_copy;
-
- /* Check module struct version now, before we try to use module. */
- if (!check_modstruct_version(info, info->mod)) {
- err = -ENOEXEC;
- goto free_copy;
- }
-
- /* Figure out module layout, and allocate all the memory. */
- mod = layout_and_allocate(info, flags);
- if (IS_ERR(mod)) {
- err = PTR_ERR(mod);
- goto free_copy;
- }
-
- audit_log_kern_module(mod->name);
-
- /* Reserve our place in the list. */
- err = add_unformed_module(mod);
- if (err)
- goto free_module;
-
-#ifdef CONFIG_MODULE_SIG
- mod->sig_ok = info->sig_ok;
- if (!mod->sig_ok) {
- pr_notice_once("%s: module verification failed: signature "
- "and/or required key missing - tainting "
- "kernel\n", mod->name);
- add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK);
- }
-#endif
-
- /* To avoid stressing percpu allocator, do this once we're unique. */
- err = percpu_modalloc(mod, info);
- if (err)
- goto unlink_mod;
-
- /* Now module is in final location, initialize linked lists, etc. */
- err = module_unload_init(mod);
- if (err)
- goto unlink_mod;
-
- init_param_lock(mod);
-
- /*
- * Now we've got everything in the final locations, we can
- * find optional sections.
- */
- err = find_module_sections(mod, info);
- if (err)
- goto free_unload;
-
- err = check_module_license_and_versions(mod);
- if (err)
- goto free_unload;
-
- /* Set up MODINFO_ATTR fields */
- setup_modinfo(mod, info);
-
- /* Fix up syms, so that st_value is a pointer to location. */
- err = simplify_symbols(mod, info);
- if (err < 0)
- goto free_modinfo;
-
- err = apply_relocations(mod, info);
- if (err < 0)
- goto free_modinfo;
-
- err = post_relocation(mod, info);
- if (err < 0)
- goto free_modinfo;
-
- flush_module_icache(mod);
-
- /* Setup CFI for the module. */
- cfi_init(mod);
-
- /* Now copy in args */
- mod->args = strndup_user(uargs, ~0UL >> 1);
- if (IS_ERR(mod->args)) {
- err = PTR_ERR(mod->args);
- goto free_arch_cleanup;
- }
-
- init_build_id(mod, info);
- dynamic_debug_setup(mod, info->debug, info->num_debug);
-
- /* Ftrace init must be called in the MODULE_STATE_UNFORMED state */
- ftrace_module_init(mod);
-
- /* Finally it's fully formed, ready to start executing. */
- err = complete_formation(mod, info);
- if (err)
- goto ddebug_cleanup;
-
- err = prepare_coming_module(mod);
- if (err)
- goto bug_cleanup;
-
- /* Module is ready to execute: parsing args may do that. */
- after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
- -32768, 32767, mod,
- unknown_module_param_cb);
- if (IS_ERR(after_dashes)) {
- err = PTR_ERR(after_dashes);
- goto coming_cleanup;
- } else if (after_dashes) {
- pr_warn("%s: parameters '%s' after `--' ignored\n",
- mod->name, after_dashes);
- }
-
- /* Link in to sysfs. */
- err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
- if (err < 0)
- goto coming_cleanup;
-
- if (is_livepatch_module(mod)) {
- err = copy_module_elf(mod, info);
- if (err < 0)
- goto sysfs_cleanup;
- }
-
- /* Get rid of temporary copy. */
- free_copy(info, flags);
-
- /* Done! */
- trace_module_load(mod);
-
- return do_init_module(mod);
-
- sysfs_cleanup:
- mod_sysfs_teardown(mod);
- coming_cleanup:
- mod->state = MODULE_STATE_GOING;
- destroy_params(mod->kp, mod->num_kp);
- blocking_notifier_call_chain(&module_notify_list,
- MODULE_STATE_GOING, mod);
- klp_module_going(mod);
- bug_cleanup:
- mod->state = MODULE_STATE_GOING;
- /* module_bug_cleanup needs module_mutex protection */
- mutex_lock(&module_mutex);
- module_bug_cleanup(mod);
- mutex_unlock(&module_mutex);
-
- ddebug_cleanup:
- ftrace_release_mod(mod);
- dynamic_debug_remove(mod, info->debug);
- synchronize_rcu();
- kfree(mod->args);
- free_arch_cleanup:
- cfi_cleanup(mod);
- module_arch_cleanup(mod);
- free_modinfo:
- free_modinfo(mod);
- free_unload:
- module_unload_free(mod);
- unlink_mod:
- mutex_lock(&module_mutex);
- /* Unlink carefully: kallsyms could be walking list. */
- list_del_rcu(&mod->list);
- mod_tree_remove(mod);
- wake_up_all(&module_wq);
- /* Wait for RCU-sched synchronizing before releasing mod->list. */
- synchronize_rcu();
- mutex_unlock(&module_mutex);
- free_module:
- /* Free lock-classes; relies on the preceding sync_rcu() */
- lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
-
- module_deallocate(mod, info);
- free_copy:
- free_copy(info, flags);
- return err;
-}
-
-SYSCALL_DEFINE3(init_module, void __user *, umod,
- unsigned long, len, const char __user *, uargs)
-{
- int err;
- struct load_info info = { };
-
- err = may_init_module();
- if (err)
- return err;
-
- pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n",
- umod, len, uargs);
-
- err = copy_module_from_user(umod, len, &info);
- if (err)
- return err;
-
- return load_module(&info, uargs, 0);
-}
-
-SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
-{
- struct load_info info = { };
- void *buf = NULL;
- int len;
- int err;
-
- err = may_init_module();
- if (err)
- return err;
-
- pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags);
-
- if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS
- |MODULE_INIT_IGNORE_VERMAGIC
- |MODULE_INIT_COMPRESSED_FILE))
- return -EINVAL;
-
- len = kernel_read_file_from_fd(fd, 0, &buf, INT_MAX, NULL,
- READING_MODULE);
- if (len < 0)
- return len;
-
- if (flags & MODULE_INIT_COMPRESSED_FILE) {
- err = module_decompress(&info, buf, len);
- vfree(buf); /* compressed data is no longer needed */
- if (err)
- return err;
- } else {
- info.hdr = buf;
- info.len = len;
- }
-
- return load_module(&info, uargs, flags);
-}
-
-static inline int within(unsigned long addr, void *start, unsigned long size)
-{
- return ((void *)addr >= start && (void *)addr < start + size);
-}
-
-#ifdef CONFIG_KALLSYMS
-/*
- * This ignores the intensely annoying "mapping symbols" found
- * in ARM ELF files: $a, $t and $d.
- */
-static inline int is_arm_mapping_symbol(const char *str)
-{
- if (str[0] == '.' && str[1] == 'L')
- return true;
- return str[0] == '$' && strchr("axtd", str[1])
- && (str[2] == '\0' || str[2] == '.');
-}
-
-static const char *kallsyms_symbol_name(struct mod_kallsyms *kallsyms, unsigned int symnum)
-{
- return kallsyms->strtab + kallsyms->symtab[symnum].st_name;
-}
-
-/*
- * Given a module and address, find the corresponding symbol and return its name
- * while providing its size and offset if needed.
- */
-static const char *find_kallsyms_symbol(struct module *mod,
- unsigned long addr,
- unsigned long *size,
- unsigned long *offset)
-{
- unsigned int i, best = 0;
- unsigned long nextval, bestval;
- struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
-
- /* At worse, next value is at end of module */
- if (within_module_init(addr, mod))
- nextval = (unsigned long)mod->init_layout.base+mod->init_layout.text_size;
- else
- nextval = (unsigned long)mod->core_layout.base+mod->core_layout.text_size;
-
- bestval = kallsyms_symbol_value(&kallsyms->symtab[best]);
-
- /*
- * Scan for closest preceding symbol, and next symbol. (ELF
- * starts real symbols at 1).
- */
- for (i = 1; i < kallsyms->num_symtab; i++) {
- const Elf_Sym *sym = &kallsyms->symtab[i];
- unsigned long thisval = kallsyms_symbol_value(sym);
-
- if (sym->st_shndx == SHN_UNDEF)
- continue;
-
- /*
- * We ignore unnamed symbols: they're uninformative
- * and inserted at a whim.
- */
- if (*kallsyms_symbol_name(kallsyms, i) == '\0'
- || is_arm_mapping_symbol(kallsyms_symbol_name(kallsyms, i)))
- continue;
-
- if (thisval <= addr && thisval > bestval) {
- best = i;
- bestval = thisval;
- }
- if (thisval > addr && thisval < nextval)
- nextval = thisval;
- }
-
- if (!best)
- return NULL;
-
- if (size)
- *size = nextval - bestval;
- if (offset)
- *offset = addr - bestval;
-
- return kallsyms_symbol_name(kallsyms, best);
-}
-
-void * __weak dereference_module_function_descriptor(struct module *mod,
- void *ptr)
-{
- return ptr;
-}
-
-/*
- * For kallsyms to ask for address resolution. NULL means not found. Careful
- * not to lock to avoid deadlock on oopses, simply disable preemption.
- */
-const char *module_address_lookup(unsigned long addr,
- unsigned long *size,
- unsigned long *offset,
- char **modname,
- const unsigned char **modbuildid,
- char *namebuf)
-{
- const char *ret = NULL;
- struct module *mod;
-
- preempt_disable();
- mod = __module_address(addr);
- if (mod) {
- if (modname)
- *modname = mod->name;
- if (modbuildid) {
-#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
- *modbuildid = mod->build_id;
-#else
- *modbuildid = NULL;
-#endif
- }
-
- ret = find_kallsyms_symbol(mod, addr, size, offset);
- }
- /* Make a copy in here where it's safe */
- if (ret) {
- strncpy(namebuf, ret, KSYM_NAME_LEN - 1);
- ret = namebuf;
- }
- preempt_enable();
-
- return ret;
-}
-
-int lookup_module_symbol_name(unsigned long addr, char *symname)
-{
- struct module *mod;
-
- preempt_disable();
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- if (within_module(addr, mod)) {
- const char *sym;
-
- sym = find_kallsyms_symbol(mod, addr, NULL, NULL);
- if (!sym)
- goto out;
-
- strlcpy(symname, sym, KSYM_NAME_LEN);
- preempt_enable();
- return 0;
- }
- }
-out:
- preempt_enable();
- return -ERANGE;
-}
-
-int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
- unsigned long *offset, char *modname, char *name)
-{
- struct module *mod;
-
- preempt_disable();
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- if (within_module(addr, mod)) {
- const char *sym;
-
- sym = find_kallsyms_symbol(mod, addr, size, offset);
- if (!sym)
- goto out;
- if (modname)
- strlcpy(modname, mod->name, MODULE_NAME_LEN);
- if (name)
- strlcpy(name, sym, KSYM_NAME_LEN);
- preempt_enable();
- return 0;
- }
- }
-out:
- preempt_enable();
- return -ERANGE;
-}
-
-int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
- char *name, char *module_name, int *exported)
-{
- struct module *mod;
-
- preempt_disable();
- list_for_each_entry_rcu(mod, &modules, list) {
- struct mod_kallsyms *kallsyms;
-
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- kallsyms = rcu_dereference_sched(mod->kallsyms);
- if (symnum < kallsyms->num_symtab) {
- const Elf_Sym *sym = &kallsyms->symtab[symnum];
-
- *value = kallsyms_symbol_value(sym);
- *type = kallsyms->typetab[symnum];
- strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN);
- strlcpy(module_name, mod->name, MODULE_NAME_LEN);
- *exported = is_exported(name, *value, mod);
- preempt_enable();
- return 0;
- }
- symnum -= kallsyms->num_symtab;
- }
- preempt_enable();
- return -ERANGE;
-}
-
-/* Given a module and name of symbol, find and return the symbol's value */
-static unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
-{
- unsigned int i;
- struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
-
- for (i = 0; i < kallsyms->num_symtab; i++) {
- const Elf_Sym *sym = &kallsyms->symtab[i];
-
- if (strcmp(name, kallsyms_symbol_name(kallsyms, i)) == 0 &&
- sym->st_shndx != SHN_UNDEF)
- return kallsyms_symbol_value(sym);
- }
- return 0;
-}
-
-/* Look for this name: can be of form module:name. */
-unsigned long module_kallsyms_lookup_name(const char *name)
-{
- struct module *mod;
- char *colon;
- unsigned long ret = 0;
-
- /* Don't lock: we're in enough trouble already. */
- preempt_disable();
- if ((colon = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) {
- if ((mod = find_module_all(name, colon - name, false)) != NULL)
- ret = find_kallsyms_symbol_value(mod, colon+1);
- } else {
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- if ((ret = find_kallsyms_symbol_value(mod, name)) != 0)
- break;
- }
- }
- preempt_enable();
- return ret;
-}
-
-#ifdef CONFIG_LIVEPATCH
-int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
- struct module *, unsigned long),
- void *data)
-{
- struct module *mod;
- unsigned int i;
- int ret = 0;
-
- mutex_lock(&module_mutex);
- list_for_each_entry(mod, &modules, list) {
- /* We hold module_mutex: no need for rcu_dereference_sched */
- struct mod_kallsyms *kallsyms = mod->kallsyms;
-
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- for (i = 0; i < kallsyms->num_symtab; i++) {
- const Elf_Sym *sym = &kallsyms->symtab[i];
-
- if (sym->st_shndx == SHN_UNDEF)
- continue;
-
- ret = fn(data, kallsyms_symbol_name(kallsyms, i),
- mod, kallsyms_symbol_value(sym));
- if (ret != 0)
- goto out;
-
- cond_resched();
- }
- }
-out:
- mutex_unlock(&module_mutex);
- return ret;
-}
-#endif /* CONFIG_LIVEPATCH */
-#endif /* CONFIG_KALLSYMS */
-
-static void cfi_init(struct module *mod)
-{
-#ifdef CONFIG_CFI_CLANG
- initcall_t *init;
- exitcall_t *exit;
-
- rcu_read_lock_sched();
- mod->cfi_check = (cfi_check_fn)
- find_kallsyms_symbol_value(mod, "__cfi_check");
- init = (initcall_t *)
- find_kallsyms_symbol_value(mod, "__cfi_jt_init_module");
- exit = (exitcall_t *)
- find_kallsyms_symbol_value(mod, "__cfi_jt_cleanup_module");
- rcu_read_unlock_sched();
-
- /* Fix init/exit functions to point to the CFI jump table */
- if (init)
- mod->init = *init;
-#ifdef CONFIG_MODULE_UNLOAD
- if (exit)
- mod->exit = *exit;
-#endif
-
- cfi_module_add(mod, module_addr_min);
-#endif
-}
-
-static void cfi_cleanup(struct module *mod)
-{
-#ifdef CONFIG_CFI_CLANG
- cfi_module_remove(mod, module_addr_min);
-#endif
-}
-
-/* Maximum number of characters written by module_flags() */
-#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4)
-
-/* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */
-static char *module_flags(struct module *mod, char *buf)
-{
- int bx = 0;
-
- BUG_ON(mod->state == MODULE_STATE_UNFORMED);
- if (mod->taints ||
- mod->state == MODULE_STATE_GOING ||
- mod->state == MODULE_STATE_COMING) {
- buf[bx++] = '(';
- bx += module_flags_taint(mod, buf + bx);
- /* Show a - for module-is-being-unloaded */
- if (mod->state == MODULE_STATE_GOING)
- buf[bx++] = '-';
- /* Show a + for module-is-being-loaded */
- if (mod->state == MODULE_STATE_COMING)
- buf[bx++] = '+';
- buf[bx++] = ')';
- }
- buf[bx] = '\0';
-
- return buf;
-}
-
-#ifdef CONFIG_PROC_FS
-/* Called by the /proc file system to return a list of modules. */
-static void *m_start(struct seq_file *m, loff_t *pos)
-{
- mutex_lock(&module_mutex);
- return seq_list_start(&modules, *pos);
-}
-
-static void *m_next(struct seq_file *m, void *p, loff_t *pos)
-{
- return seq_list_next(p, &modules, pos);
-}
-
-static void m_stop(struct seq_file *m, void *p)
-{
- mutex_unlock(&module_mutex);
-}
-
-static int m_show(struct seq_file *m, void *p)
-{
- struct module *mod = list_entry(p, struct module, list);
- char buf[MODULE_FLAGS_BUF_SIZE];
- void *value;
-
- /* We always ignore unformed modules. */
- if (mod->state == MODULE_STATE_UNFORMED)
- return 0;
-
- seq_printf(m, "%s %u",
- mod->name, mod->init_layout.size + mod->core_layout.size);
- print_unload_info(m, mod);
-
- /* Informative for users. */
- seq_printf(m, " %s",
- mod->state == MODULE_STATE_GOING ? "Unloading" :
- mod->state == MODULE_STATE_COMING ? "Loading" :
- "Live");
- /* Used by oprofile and other similar tools. */
- value = m->private ? NULL : mod->core_layout.base;
- seq_printf(m, " 0x%px", value);
-
- /* Taints info */
- if (mod->taints)
- seq_printf(m, " %s", module_flags(mod, buf));
-
- seq_puts(m, "\n");
- return 0;
-}
-
-/*
- * Format: modulename size refcount deps address
- *
- * Where refcount is a number or -, and deps is a comma-separated list
- * of depends or -.
- */
-static const struct seq_operations modules_op = {
- .start = m_start,
- .next = m_next,
- .stop = m_stop,
- .show = m_show
-};
-
-/*
- * This also sets the "private" pointer to non-NULL if the
- * kernel pointers should be hidden (so you can just test
- * "m->private" to see if you should keep the values private).
- *
- * We use the same logic as for /proc/kallsyms.
- */
-static int modules_open(struct inode *inode, struct file *file)
-{
- int err = seq_open(file, &modules_op);
-
- if (!err) {
- struct seq_file *m = file->private_data;
- m->private = kallsyms_show_value(file->f_cred) ? NULL : (void *)8ul;
- }
-
- return err;
-}
-
-static const struct proc_ops modules_proc_ops = {
- .proc_flags = PROC_ENTRY_PERMANENT,
- .proc_open = modules_open,
- .proc_read = seq_read,
- .proc_lseek = seq_lseek,
- .proc_release = seq_release,
-};
-
-static int __init proc_modules_init(void)
-{
- proc_create("modules", 0, NULL, &modules_proc_ops);
- return 0;
-}
-module_init(proc_modules_init);
-#endif
-
-/* Given an address, look for it in the module exception tables. */
-const struct exception_table_entry *search_module_extables(unsigned long addr)
-{
- const struct exception_table_entry *e = NULL;
- struct module *mod;
-
- preempt_disable();
- mod = __module_address(addr);
- if (!mod)
- goto out;
-
- if (!mod->num_exentries)
- goto out;
-
- e = search_extable(mod->extable,
- mod->num_exentries,
- addr);
-out:
- preempt_enable();
-
- /*
- * Now, if we found one, we are running inside it now, hence
- * we cannot unload the module, hence no refcnt needed.
- */
- return e;
-}
-
-/**
- * is_module_address() - is this address inside a module?
- * @addr: the address to check.
- *
- * See is_module_text_address() if you simply want to see if the address
- * is code (not data).
- */
-bool is_module_address(unsigned long addr)
-{
- bool ret;
-
- preempt_disable();
- ret = __module_address(addr) != NULL;
- preempt_enable();
-
- return ret;
-}
-
-/**
- * __module_address() - get the module which contains an address.
- * @addr: the address.
- *
- * Must be called with preempt disabled or module mutex held so that
- * module doesn't get freed during this.
- */
-struct module *__module_address(unsigned long addr)
-{
- struct module *mod;
-
- if (addr < module_addr_min || addr > module_addr_max)
- return NULL;
-
- module_assert_mutex_or_preempt();
-
- mod = mod_find(addr);
- if (mod) {
- BUG_ON(!within_module(addr, mod));
- if (mod->state == MODULE_STATE_UNFORMED)
- mod = NULL;
- }
- return mod;
-}
-
-/**
- * is_module_text_address() - is this address inside module code?
- * @addr: the address to check.
- *
- * See is_module_address() if you simply want to see if the address is
- * anywhere in a module. See kernel_text_address() for testing if an
- * address corresponds to kernel or module code.
- */
-bool is_module_text_address(unsigned long addr)
-{
- bool ret;
-
- preempt_disable();
- ret = __module_text_address(addr) != NULL;
- preempt_enable();
-
- return ret;
-}
-
-/**
- * __module_text_address() - get the module whose code contains an address.
- * @addr: the address.
- *
- * Must be called with preempt disabled or module mutex held so that
- * module doesn't get freed during this.
- */
-struct module *__module_text_address(unsigned long addr)
-{
- struct module *mod = __module_address(addr);
- if (mod) {
- /* Make sure it's within the text section. */
- if (!within(addr, mod->init_layout.base, mod->init_layout.text_size)
- && !within(addr, mod->core_layout.base, mod->core_layout.text_size))
- mod = NULL;
- }
- return mod;
-}
-
-/* Don't grab lock, we're oopsing. */
-void print_modules(void)
-{
- struct module *mod;
- char buf[MODULE_FLAGS_BUF_SIZE];
-
- printk(KERN_DEFAULT "Modules linked in:");
- /* Most callers should already have preempt disabled, but make sure */
- preempt_disable();
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- pr_cont(" %s%s", mod->name, module_flags(mod, buf));
- }
- preempt_enable();
- if (last_unloaded_module[0])
- pr_cont(" [last unloaded: %s]", last_unloaded_module);
- pr_cont("\n");
-}
-
-#ifdef CONFIG_MODVERSIONS
-/*
- * Generate the signature for all relevant module structures here.
- * If these change, we don't want to try to parse the module.
- */
-void module_layout(struct module *mod,
- struct modversion_info *ver,
- struct kernel_param *kp,
- struct kernel_symbol *ks,
- struct tracepoint * const *tp)
-{
-}
-EXPORT_SYMBOL(module_layout);
-#endif
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
new file mode 100644
index 000000000000..2a1beebf1d37
--- /dev/null
+++ b/kernel/module/Kconfig
@@ -0,0 +1,465 @@
+# SPDX-License-Identifier: GPL-2.0-only
+menuconfig MODULES
+ bool "Enable loadable module support"
+ modules
+ select EXECMEM
+ help
+ Kernel modules are small pieces of compiled code which can
+ be inserted in the running kernel, rather than being
+ permanently built into the kernel. You use the "modprobe"
+ tool to add (and sometimes remove) them. If you say Y here,
+ many parts of the kernel can be built as modules (by
+ answering M instead of Y where indicated): this is most
+ useful for infrequently used options which are not required
+ for booting. For more information, see the man pages for
+ modprobe, lsmod, modinfo, insmod and rmmod.
+
+ If you say Y here, you will need to run "make
+ modules_install" to put the modules under /lib/modules/
+ where modprobe can find them (you may need to be root to do
+ this).
+
+ If unsure, say Y.
+
+if MODULES
+
+config MODULE_DEBUGFS
+ bool
+
+config MODULE_DEBUG
+ bool "Module debugging"
+ depends on DEBUG_FS
+ help
+ Allows you to enable / disable features which can help you debug
+ modules. You don't need these options on production systems.
+
+if MODULE_DEBUG
+
+config MODULE_STATS
+ bool "Module statistics"
+ depends on DEBUG_FS
+ select MODULE_DEBUGFS
+ help
+ This option allows you to maintain a record of module statistics.
+ For example, size of all modules, average size, text size, a list
+ of failed modules and the size for each of those. For failed
+ modules we keep track of modules which failed due to either the
+ existing module taking too long to load or that module was already
+ loaded.
+
+ You should enable this if you are debugging production loads
+ and want to see if userspace or the kernel is doing stupid things
+ with loading modules when it shouldn't or if you want to help
+ optimize userspace / kernel space module autoloading schemes.
+ You might want to do this because failed modules tend to use
+ up significant amount of memory, and so you'd be doing everyone a
+ favor in avoiding these failures proactively.
+
+ This functionality is also useful for those experimenting with
+ module .text ELF section optimization.
+
+ If unsure, say N.
+
+config MODULE_DEBUG_AUTOLOAD_DUPS
+ bool "Debug duplicate modules with auto-loading"
+ help
+ Module autoloading allows in-kernel code to request modules through
+ the *request_module*() API calls. This in turn just calls userspace
+ modprobe. Although modprobe checks to see if a module is already
+ loaded before trying to load a module there is a small time window in
+ which multiple duplicate requests can end up in userspace and multiple
+ modprobe calls race calling finit_module() around the same time for
+ duplicate modules. The finit_module() system call can consume in the
+ worst case more than twice the respective module size in virtual
+ memory for each duplicate module requests. Although duplicate module
+ requests are non-fatal virtual memory is a limited resource and each
+ duplicate module request ends up just unnecessarily straining virtual
+ memory.
+
+ This debugging facility will create pr_warn() splats for duplicate
+ module requests to help identify if module auto-loading may be the
+ culprit to your early boot virtual memory pressure. Since virtual
+ memory abuse caused by duplicate module requests could render a
+ system unusable this functionality will also converge races in
+ requests for the same module to a single request. You can boot with
+ the module.enable_dups_trace=1 kernel parameter to use WARN_ON()
+ instead of the pr_warn().
+
+ If the first module request used request_module_nowait() we cannot
+ use that as the anchor to wait for duplicate module requests, since
+ users of request_module() do want a proper return value. If a call
+ for the same module happened earlier with request_module() though,
+ then a duplicate request_module_nowait() would be detected. The
+ non-wait request_module() call is synchronous and waits until modprobe
+ completes. Subsequent auto-loading requests for the same module do
+ not trigger a new finit_module() calls and do not strain virtual
+ memory, and so as soon as modprobe successfully completes we remove
+ tracking for duplicates for that module.
+
+ Enable this functionality to try to debug virtual memory abuse during
+ boot on systems which are failing to boot or if you suspect you may be
+ straining virtual memory during boot, and you want to identify if the
+ abuse was due to module auto-loading. These issues are currently only
+ known to occur on systems with many CPUs (over 400) and is likely the
+ result of udev issuing duplicate module requests for each CPU, and so
+ module auto-loading is not the culprit. There may very well still be
+ many duplicate module auto-loading requests which could be optimized
+ for and this debugging facility can be used to help identify them.
+
+ Only enable this for debugging system functionality, never have it
+ enabled on real systems.
+
+config MODULE_DEBUG_AUTOLOAD_DUPS_TRACE
+ bool "Force full stack trace when duplicates are found"
+ depends on MODULE_DEBUG_AUTOLOAD_DUPS
+ help
+ Enabling this will force a full stack trace for duplicate module
+ auto-loading requests using WARN_ON() instead of pr_warn(). You
+ should keep this disabled at all times unless you are a developer
+ and are doing a manual inspection and want to debug exactly why
+ these duplicates occur.
+
+endif # MODULE_DEBUG
+
+config MODULE_FORCE_LOAD
+ bool "Forced module loading"
+ default n
+ help
+ Allow loading of modules without version information (ie. modprobe
+ --force). Forced module loading sets the 'F' (forced) taint flag and
+ is usually a really bad idea.
+
+config MODULE_UNLOAD
+ bool "Module unloading"
+ help
+ Without this option you will not be able to unload any
+ modules (note that some modules may not be unloadable
+ anyway), which makes your kernel smaller, faster
+ and simpler. If unsure, say Y.
+
+config MODULE_FORCE_UNLOAD
+ bool "Forced module unloading"
+ depends on MODULE_UNLOAD
+ help
+ This option allows you to force a module to unload, even if the
+ kernel believes it is unsafe: the kernel will remove the module
+ without waiting for anyone to stop using it (using the -f option to
+ rmmod). This is mainly for kernel developers and desperate users.
+ If unsure, say N.
+
+config MODULE_UNLOAD_TAINT_TRACKING
+ bool "Tainted module unload tracking"
+ depends on MODULE_UNLOAD
+ select MODULE_DEBUGFS
+ help
+ This option allows you to maintain a record of each unloaded
+ module that tainted the kernel. In addition to displaying a
+ list of linked (or loaded) modules e.g. on detection of a bad
+ page (see bad_page()), the aforementioned details are also
+ shown. If unsure, say N.
+
+config MODVERSIONS
+ bool "Module versioning support"
+ depends on !COMPILE_TEST
+ help
+ Usually, you have to use modules compiled with your kernel.
+ Saying Y here makes it sometimes possible to use modules
+ compiled for different kernels, by adding enough information
+ to the modules to (hopefully) spot any changes which would
+ make them incompatible with the kernel you are running. If
+ unsure, say N.
+
+choice
+ prompt "Module versioning implementation"
+ depends on MODVERSIONS
+ help
+ Select the tool used to calculate symbol versions for modules.
+
+ If unsure, select GENKSYMS.
+
+config GENKSYMS
+ bool "genksyms (from source code)"
+ help
+ Calculate symbol versions from pre-processed source code using
+ genksyms.
+
+ If unsure, say Y.
+
+config GENDWARFKSYMS
+ bool "gendwarfksyms (from debugging information)"
+ depends on DEBUG_INFO
+ # Requires full debugging information, split DWARF not supported.
+ depends on !DEBUG_INFO_REDUCED && !DEBUG_INFO_SPLIT
+ # Requires ELF object files.
+ depends on !LTO
+ # To avoid conflicts with the discarded __gendwarfksyms_ptr symbols on
+ # X86, requires pahole before commit 47dcb534e253 ("btf_encoder: Stop
+ # indexing symbols for VARs") or after commit 9810758003ce ("btf_encoder:
+ # Verify 0 address DWARF variables are in ELF section").
+ depends on !X86 || !DEBUG_INFO_BTF || PAHOLE_VERSION < 128 || PAHOLE_VERSION > 129
+ help
+ Calculate symbol versions from DWARF debugging information using
+ gendwarfksyms. Requires DEBUG_INFO to be enabled.
+
+ If unsure, say N.
+endchoice
+
+config ASM_MODVERSIONS
+ bool
+ default HAVE_ASM_MODVERSIONS && MODVERSIONS
+ help
+ This enables module versioning for exported symbols also from
+ assembly. This can be enabled only when the target architecture
+ supports it.
+
+config EXTENDED_MODVERSIONS
+ bool "Extended Module Versioning Support"
+ depends on MODVERSIONS
+ help
+ This enables extended MODVERSIONs support, allowing long symbol
+ names to be versioned.
+
+ The most likely reason you would enable this is to enable Rust
+ support. If unsure, say N.
+
+config BASIC_MODVERSIONS
+ bool "Basic Module Versioning Support"
+ depends on MODVERSIONS
+ default y
+ help
+ This enables basic MODVERSIONS support, allowing older tools or
+ kernels to potentially load modules.
+
+ Disabling this may cause older `modprobe` or `kmod` to be unable
+ to read MODVERSIONS information from built modules. With this
+ disabled, older kernels may treat this module as unversioned.
+
+ This is enabled by default when MODVERSIONS are enabled.
+ If unsure, say Y.
+
+config MODULE_SRCVERSION_ALL
+ bool "Source checksum for all modules"
+ help
+ Modules which contain a MODULE_VERSION get an extra "srcversion"
+ field inserted into their modinfo section, which contains a
+ sum of the source files which made it. This helps maintainers
+ see exactly which source was used to build a module (since
+ others sometimes change the module source without updating
+ the version). With this option, such a "srcversion" field
+ will be created for all modules. If unsure, say N.
+
+config MODULE_SIG
+ bool "Module signature verification"
+ select MODULE_SIG_FORMAT
+ help
+ Check modules for valid signatures upon load: the signature
+ is simply appended to the module. For more information see
+ <file:Documentation/admin-guide/module-signing.rst>.
+
+ Note that this option adds the OpenSSL development packages as a
+ kernel build dependency so that the signing tool can use its crypto
+ library.
+
+ You should enable this option if you wish to use either
+ CONFIG_SECURITY_LOCKDOWN_LSM or lockdown functionality imposed via
+ another LSM - otherwise unsigned modules will be loadable regardless
+ of the lockdown policy.
+
+ !!!WARNING!!! If you enable this option, you MUST make sure that the
+ module DOES NOT get stripped after being signed. This includes the
+ debuginfo strip done by some packagers (such as rpmbuild) and
+ inclusion into an initramfs that wants the module size reduced.
+
+config MODULE_SIG_FORCE
+ bool "Require modules to be validly signed"
+ depends on MODULE_SIG
+ help
+ Reject unsigned modules or signed modules for which we don't have a
+ key. Without this, such modules will simply taint the kernel.
+
+config MODULE_SIG_ALL
+ bool "Automatically sign all modules"
+ default y
+ depends on MODULE_SIG || IMA_APPRAISE_MODSIG
+ help
+ Sign all modules during make modules_install. Without this option,
+ modules must be signed manually, using the scripts/sign-file tool.
+
+comment "Do not forget to sign required modules with scripts/sign-file"
+ depends on MODULE_SIG_FORCE && !MODULE_SIG_ALL
+
+choice
+ prompt "Hash algorithm to sign modules"
+ depends on MODULE_SIG || IMA_APPRAISE_MODSIG
+ default MODULE_SIG_SHA512
+ help
+ This determines which sort of hashing algorithm will be used during
+ signature generation. This algorithm _must_ be built into the kernel
+ directly so that signature verification can take place. It is not
+ possible to load a signed module containing the algorithm to check
+ the signature on that module.
+
+config MODULE_SIG_SHA1
+ bool "SHA-1"
+ select CRYPTO_SHA1
+
+config MODULE_SIG_SHA256
+ bool "SHA-256"
+ select CRYPTO_SHA256
+
+config MODULE_SIG_SHA384
+ bool "SHA-384"
+ select CRYPTO_SHA512
+
+config MODULE_SIG_SHA512
+ bool "SHA-512"
+ select CRYPTO_SHA512
+
+config MODULE_SIG_SHA3_256
+ bool "SHA3-256"
+ select CRYPTO_SHA3
+
+config MODULE_SIG_SHA3_384
+ bool "SHA3-384"
+ select CRYPTO_SHA3
+
+config MODULE_SIG_SHA3_512
+ bool "SHA3-512"
+ select CRYPTO_SHA3
+
+endchoice
+
+config MODULE_SIG_HASH
+ string
+ depends on MODULE_SIG || IMA_APPRAISE_MODSIG
+ default "sha1" if MODULE_SIG_SHA1
+ default "sha256" if MODULE_SIG_SHA256
+ default "sha384" if MODULE_SIG_SHA384
+ default "sha512" if MODULE_SIG_SHA512
+ default "sha3-256" if MODULE_SIG_SHA3_256
+ default "sha3-384" if MODULE_SIG_SHA3_384
+ default "sha3-512" if MODULE_SIG_SHA3_512
+
+config MODULE_COMPRESS
+ bool "Module compression"
+ help
+ Enable module compression to reduce on-disk size of module binaries.
+ This is fully compatible with signed modules.
+
+ The tool used to work with modules needs to support the selected
+ compression type. kmod MAY support gzip, xz and zstd. Other tools
+ might have a limited selection of the supported types.
+
+ Note that for modules inside an initrd or initramfs, it's more
+ efficient to compress the whole ramdisk instead.
+
+ If unsure, say N.
+
+choice
+ prompt "Module compression type"
+ depends on MODULE_COMPRESS
+ help
+ Choose the supported algorithm for module compression.
+
+config MODULE_COMPRESS_GZIP
+ bool "GZIP"
+ help
+ Support modules compressed with GZIP. The installed modules are
+ suffixed with .ko.gz.
+
+config MODULE_COMPRESS_XZ
+ bool "XZ"
+ help
+ Support modules compressed with XZ. The installed modules are
+ suffixed with .ko.xz.
+
+config MODULE_COMPRESS_ZSTD
+ bool "ZSTD"
+ help
+ Support modules compressed with ZSTD. The installed modules are
+ suffixed with .ko.zst.
+
+endchoice
+
+config MODULE_COMPRESS_ALL
+ bool "Automatically compress all modules"
+ default y
+ depends on MODULE_COMPRESS
+ help
+ Compress all modules during 'make modules_install'.
+
+ Your build system needs to provide the appropriate compression tool
+ for the selected compression type. External modules will also be
+ compressed in the same way during the installation.
+
+config MODULE_DECOMPRESS
+ bool "Support in-kernel module decompression"
+ depends on MODULE_COMPRESS
+ select ZLIB_INFLATE if MODULE_COMPRESS_GZIP
+ select XZ_DEC if MODULE_COMPRESS_XZ
+ select ZSTD_DECOMPRESS if MODULE_COMPRESS_ZSTD
+ help
+ Support for decompressing kernel modules by the kernel itself
+ instead of relying on userspace to perform this task. Useful when
+ load pinning security policy is enabled.
+
+ If unsure, say N.
+
+config MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
+ bool "Allow loading of modules with missing namespace imports"
+ help
+ Symbols exported with EXPORT_SYMBOL_NS*() are considered exported in
+ a namespace. A module that makes use of a symbol exported with such a
+ namespace is required to import the namespace via MODULE_IMPORT_NS("").
+ There is no technical reason to enforce correct namespace imports,
+ but it creates consistency between symbols defining namespaces and
+ users importing namespaces they make use of. This option relaxes this
+ requirement and lifts the enforcement when loading a module.
+
+ If unsure, say N.
+
+config MODPROBE_PATH
+ string "Path to modprobe binary"
+ default "/sbin/modprobe"
+ help
+ When kernel code requests a module, it does so by calling
+ the "modprobe" userspace utility. This option allows you to
+ set the path where that binary is found. This can be changed
+ at runtime via the sysctl file
+ /proc/sys/kernel/modprobe. Setting this to the empty string
+ removes the kernel's ability to request modules (but
+ userspace can still load modules explicitly).
+
+config TRIM_UNUSED_KSYMS
+ bool "Trim unused exported kernel symbols"
+ help
+ The kernel and some modules make many symbols available for
+ other modules to use via EXPORT_SYMBOL() and variants. Depending
+ on the set of modules being selected in your kernel configuration,
+ many of those exported symbols might never be used.
+
+ This option allows for unused exported symbols to be dropped from
+ the build. In turn, this provides the compiler more opportunities
+ (especially when using LTO) for optimizing the code and reducing
+ binary size. This might have some security advantages as well.
+
+ If unsure, or if you need to build out-of-tree modules, say N.
+
+config UNUSED_KSYMS_WHITELIST
+ string "Whitelist of symbols to keep in ksymtab"
+ depends on TRIM_UNUSED_KSYMS
+ help
+ By default, all unused exported symbols will be un-exported from the
+ build when TRIM_UNUSED_KSYMS is selected.
+
+ UNUSED_KSYMS_WHITELIST allows to whitelist symbols that must be kept
+ exported at all times, even in absence of in-tree users. The value to
+ set here is the path to a text file containing the list of symbols,
+ one per line. The path can be absolute, or relative to the kernel
+ source or obj tree.
+
+config MODULES_TREE_LOOKUP
+ def_bool y
+ depends on PERF_EVENTS || TRACING || CFI
+
+endif # MODULES
diff --git a/kernel/module/Makefile b/kernel/module/Makefile
new file mode 100644
index 000000000000..50ffcc413b54
--- /dev/null
+++ b/kernel/module/Makefile
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for linux kernel module support
+#
+
+# These are called from save_stack_trace() on slub debug path,
+# and produce insane amounts of uninteresting coverage.
+KCOV_INSTRUMENT_main.o := n
+
+obj-y += main.o
+obj-y += strict_rwx.o
+obj-y += kmod.o
+obj-$(CONFIG_MODULE_DEBUG_AUTOLOAD_DUPS) += dups.o
+obj-$(CONFIG_MODULE_DECOMPRESS) += decompress.o
+obj-$(CONFIG_MODULE_SIG) += signing.o
+obj-$(CONFIG_LIVEPATCH) += livepatch.o
+obj-$(CONFIG_MODULES_TREE_LOOKUP) += tree_lookup.o
+obj-$(CONFIG_DEBUG_KMEMLEAK) += debug_kmemleak.o
+obj-$(CONFIG_KALLSYMS) += kallsyms.o
+obj-$(CONFIG_PROC_FS) += procfs.o
+obj-$(CONFIG_SYSFS) += sysfs.o
+obj-$(CONFIG_KGDB_KDB) += kdb.o
+obj-$(CONFIG_MODVERSIONS) += version.o
+obj-$(CONFIG_MODULE_UNLOAD_TAINT_TRACKING) += tracking.o
+obj-$(CONFIG_MODULE_STATS) += stats.o
diff --git a/kernel/module/debug_kmemleak.c b/kernel/module/debug_kmemleak.c
new file mode 100644
index 000000000000..df873dad049d
--- /dev/null
+++ b/kernel/module/debug_kmemleak.c
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module kmemleak support
+ *
+ * Copyright (C) 2009 Catalin Marinas
+ */
+
+#include <linux/module.h>
+#include <linux/kmemleak.h>
+#include "internal.h"
+
+void kmemleak_load_module(const struct module *mod,
+ const struct load_info *info)
+{
+ /* only scan writable, non-executable sections */
+ for_each_mod_mem_type(type) {
+ if (type != MOD_DATA && type != MOD_INIT_DATA &&
+ !mod->mem[type].is_rox)
+ kmemleak_no_scan(mod->mem[type].base);
+ }
+}
diff --git a/kernel/module_decompress.c b/kernel/module/decompress.c
index b01c69c2ff99..474e68f0f063 100644
--- a/kernel/module_decompress.c
+++ b/kernel/module/decompress.c
@@ -12,7 +12,7 @@
#include <linux/sysfs.h>
#include <linux/vmalloc.h>
-#include "module-internal.h"
+#include "internal.h"
static int module_extend_max_pages(struct load_info *info, unsigned int extent)
{
@@ -50,7 +50,7 @@ static struct page *module_get_next_page(struct load_info *info)
return page;
}
-#ifdef CONFIG_MODULE_COMPRESS_GZIP
+#if defined(CONFIG_MODULE_COMPRESS_GZIP)
#include <linux/zlib.h>
#define MODULE_COMPRESSION gzip
#define MODULE_DECOMPRESS_FN module_gzip_decompress
@@ -100,7 +100,7 @@ static ssize_t module_gzip_decompress(struct load_info *info,
s.next_in = buf + gzip_hdr_len;
s.avail_in = size - gzip_hdr_len;
- s.workspace = kmalloc(zlib_inflate_workspacesize(), GFP_KERNEL);
+ s.workspace = kvmalloc(zlib_inflate_workspacesize(), GFP_KERNEL);
if (!s.workspace)
return -ENOMEM;
@@ -113,15 +113,16 @@ static ssize_t module_gzip_decompress(struct load_info *info,
do {
struct page *page = module_get_next_page(info);
- if (!page) {
- retval = -ENOMEM;
+
+ if (IS_ERR(page)) {
+ retval = PTR_ERR(page);
goto out_inflate_end;
}
- s.next_out = kmap(page);
+ s.next_out = kmap_local_page(page);
s.avail_out = PAGE_SIZE;
rc = zlib_inflate(&s, 0);
- kunmap(page);
+ kunmap_local(s.next_out);
new_size += PAGE_SIZE - s.avail_out;
} while (rc == Z_OK);
@@ -137,10 +138,10 @@ static ssize_t module_gzip_decompress(struct load_info *info,
out_inflate_end:
zlib_inflateEnd(&s);
out:
- kfree(s.workspace);
+ kvfree(s.workspace);
return retval;
}
-#elif CONFIG_MODULE_COMPRESS_XZ
+#elif defined(CONFIG_MODULE_COMPRESS_XZ)
#include <linux/xz.h>
#define MODULE_COMPRESSION xz
#define MODULE_DECOMPRESS_FN module_xz_decompress
@@ -171,16 +172,17 @@ static ssize_t module_xz_decompress(struct load_info *info,
do {
struct page *page = module_get_next_page(info);
- if (!page) {
- retval = -ENOMEM;
+
+ if (IS_ERR(page)) {
+ retval = PTR_ERR(page);
goto out;
}
- xz_buf.out = kmap(page);
+ xz_buf.out = kmap_local_page(page);
xz_buf.out_pos = 0;
xz_buf.out_size = PAGE_SIZE;
xz_ret = xz_dec_run(xz_dec, &xz_buf);
- kunmap(page);
+ kunmap_local(xz_buf.out);
new_size += xz_buf.out_pos;
} while (xz_buf.out_pos == PAGE_SIZE && xz_ret == XZ_OK);
@@ -197,6 +199,94 @@ static ssize_t module_xz_decompress(struct load_info *info,
xz_dec_end(xz_dec);
return retval;
}
+#elif defined(CONFIG_MODULE_COMPRESS_ZSTD)
+#include <linux/zstd.h>
+#define MODULE_COMPRESSION zstd
+#define MODULE_DECOMPRESS_FN module_zstd_decompress
+
+static ssize_t module_zstd_decompress(struct load_info *info,
+ const void *buf, size_t size)
+{
+ static const u8 signature[] = { 0x28, 0xb5, 0x2f, 0xfd };
+ ZSTD_outBuffer zstd_dec;
+ ZSTD_inBuffer zstd_buf;
+ zstd_frame_header header;
+ size_t wksp_size;
+ void *wksp = NULL;
+ ZSTD_DStream *dstream;
+ size_t ret;
+ size_t new_size = 0;
+ int retval;
+
+ if (size < sizeof(signature) ||
+ memcmp(buf, signature, sizeof(signature))) {
+ pr_err("not a zstd compressed module\n");
+ return -EINVAL;
+ }
+
+ zstd_buf.src = buf;
+ zstd_buf.pos = 0;
+ zstd_buf.size = size;
+
+ ret = zstd_get_frame_header(&header, zstd_buf.src, zstd_buf.size);
+ if (ret != 0) {
+ pr_err("ZSTD-compressed data has an incomplete frame header\n");
+ retval = -EINVAL;
+ goto out;
+ }
+ if (header.windowSize > (1 << ZSTD_WINDOWLOG_MAX)) {
+ pr_err("ZSTD-compressed data has too large a window size\n");
+ retval = -EINVAL;
+ goto out;
+ }
+
+ wksp_size = zstd_dstream_workspace_bound(header.windowSize);
+ wksp = kvmalloc(wksp_size, GFP_KERNEL);
+ if (!wksp) {
+ retval = -ENOMEM;
+ goto out;
+ }
+
+ dstream = zstd_init_dstream(header.windowSize, wksp, wksp_size);
+ if (!dstream) {
+ pr_err("Can't initialize ZSTD stream\n");
+ retval = -ENOMEM;
+ goto out;
+ }
+
+ do {
+ struct page *page = module_get_next_page(info);
+
+ if (IS_ERR(page)) {
+ retval = PTR_ERR(page);
+ goto out;
+ }
+
+ zstd_dec.dst = kmap_local_page(page);
+ zstd_dec.pos = 0;
+ zstd_dec.size = PAGE_SIZE;
+
+ ret = zstd_decompress_stream(dstream, &zstd_dec, &zstd_buf);
+ kunmap_local(zstd_dec.dst);
+ retval = zstd_get_error_code(ret);
+ if (retval)
+ break;
+
+ new_size += zstd_dec.pos;
+ } while (zstd_dec.pos == PAGE_SIZE && ret != 0);
+
+ if (retval) {
+ pr_err("ZSTD-decompression failed with status %d\n", retval);
+ retval = -EINVAL;
+ goto out;
+ }
+
+ retval = new_size;
+
+ out:
+ kvfree(wksp);
+ return retval;
+}
#else
#error "Unexpected configuration for CONFIG_MODULE_DECOMPRESS"
#endif
@@ -207,6 +297,10 @@ int module_decompress(struct load_info *info, const void *buf, size_t size)
ssize_t data_size;
int error;
+#if defined(CONFIG_MODULE_STATS)
+ info->compressed_len = size;
+#endif
+
/*
* Start with number of pages twice as big as needed for
* compressed data.
@@ -250,11 +344,13 @@ void module_decompress_cleanup(struct load_info *info)
info->max_pages = info->used_pages = 0;
}
+#ifdef CONFIG_SYSFS
static ssize_t compression_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sysfs_emit(buf, "%s\n", __stringify(MODULE_COMPRESSION));
+ return sysfs_emit(buf, __stringify(MODULE_COMPRESSION) "\n");
}
+
static struct kobj_attribute module_compression_attr = __ATTR_RO(compression);
static int __init module_decompress_sysfs_init(void)
@@ -269,3 +365,4 @@ static int __init module_decompress_sysfs_init(void)
return 0;
}
late_initcall(module_decompress_sysfs_init);
+#endif
diff --git a/kernel/module/dups.c b/kernel/module/dups.c
new file mode 100644
index 000000000000..bd2149fbe117
--- /dev/null
+++ b/kernel/module/dups.c
@@ -0,0 +1,247 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * kmod dups - the kernel module autoloader duplicate suppressor
+ *
+ * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
+ */
+
+#define pr_fmt(fmt) "module: " fmt
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/binfmts.h>
+#include <linux/syscalls.h>
+#include <linux/unistd.h>
+#include <linux/kmod.h>
+#include <linux/slab.h>
+#include <linux/completion.h>
+#include <linux/cred.h>
+#include <linux/file.h>
+#include <linux/workqueue.h>
+#include <linux/security.h>
+#include <linux/mount.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/resource.h>
+#include <linux/notifier.h>
+#include <linux/suspend.h>
+#include <linux/rwsem.h>
+#include <linux/ptrace.h>
+#include <linux/async.h>
+#include <linux/uaccess.h>
+
+#include "internal.h"
+
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "module."
+static bool enable_dups_trace = IS_ENABLED(CONFIG_MODULE_DEBUG_AUTOLOAD_DUPS_TRACE);
+module_param(enable_dups_trace, bool_enable_only, 0644);
+
+/*
+ * Protects dup_kmod_reqs list, adds / removals with RCU.
+ */
+static DEFINE_MUTEX(kmod_dup_mutex);
+static LIST_HEAD(dup_kmod_reqs);
+
+struct kmod_dup_req {
+ struct list_head list;
+ char name[MODULE_NAME_LEN];
+ struct completion first_req_done;
+ struct work_struct complete_work;
+ struct delayed_work delete_work;
+ int dup_ret;
+};
+
+static struct kmod_dup_req *kmod_dup_request_lookup(char *module_name)
+{
+ struct kmod_dup_req *kmod_req;
+
+ list_for_each_entry_rcu(kmod_req, &dup_kmod_reqs, list,
+ lockdep_is_held(&kmod_dup_mutex)) {
+ if (strlen(kmod_req->name) == strlen(module_name) &&
+ !memcmp(kmod_req->name, module_name, strlen(module_name))) {
+ return kmod_req;
+ }
+ }
+
+ return NULL;
+}
+
+static void kmod_dup_request_delete(struct work_struct *work)
+{
+ struct kmod_dup_req *kmod_req;
+ kmod_req = container_of(to_delayed_work(work), struct kmod_dup_req, delete_work);
+
+ /*
+ * The typical situation is a module successully loaded. In that
+ * situation the module will be present already in userspace. If
+ * new requests come in after that, userspace will already know the
+ * module is loaded so will just return 0 right away. There is still
+ * a small chance right after we delete this entry new request_module()
+ * calls may happen after that, they can happen. These heuristics
+ * are to protect finit_module() abuse for auto-loading, if modules
+ * are still tryign to auto-load even if a module is already loaded,
+ * that's on them, and those inneficiencies should not be fixed by
+ * kmod. The inneficies there are a call to modprobe and modprobe
+ * just returning 0.
+ */
+ mutex_lock(&kmod_dup_mutex);
+ list_del_rcu(&kmod_req->list);
+ synchronize_rcu();
+ mutex_unlock(&kmod_dup_mutex);
+ kfree(kmod_req);
+}
+
+static void kmod_dup_request_complete(struct work_struct *work)
+{
+ struct kmod_dup_req *kmod_req;
+
+ kmod_req = container_of(work, struct kmod_dup_req, complete_work);
+
+ /*
+ * This will ensure that the kernel will let all the waiters get
+ * informed its time to check the return value. It's time to
+ * go home.
+ */
+ complete_all(&kmod_req->first_req_done);
+
+ /*
+ * Now that we have allowed prior request_module() calls to go on
+ * with life, let's schedule deleting this entry. We don't have
+ * to do it right away, but we *eventually* want to do it so to not
+ * let this linger forever as this is just a boot optimization for
+ * possible abuses of vmalloc() incurred by finit_module() thrashing.
+ */
+ queue_delayed_work(system_wq, &kmod_req->delete_work, 60 * HZ);
+}
+
+bool kmod_dup_request_exists_wait(char *module_name, bool wait, int *dup_ret)
+{
+ struct kmod_dup_req *kmod_req, *new_kmod_req;
+ int ret;
+
+ /*
+ * Pre-allocate the entry in case we have to use it later
+ * to avoid contention with the mutex.
+ */
+ new_kmod_req = kzalloc(sizeof(*new_kmod_req), GFP_KERNEL);
+ if (!new_kmod_req)
+ return false;
+
+ memcpy(new_kmod_req->name, module_name, strlen(module_name));
+ INIT_WORK(&new_kmod_req->complete_work, kmod_dup_request_complete);
+ INIT_DELAYED_WORK(&new_kmod_req->delete_work, kmod_dup_request_delete);
+ init_completion(&new_kmod_req->first_req_done);
+
+ mutex_lock(&kmod_dup_mutex);
+
+ kmod_req = kmod_dup_request_lookup(module_name);
+ if (!kmod_req) {
+ /*
+ * If the first request that came through for a module
+ * was with request_module_nowait() we cannot wait for it
+ * and share its return value with other users which may
+ * have used request_module() and need a proper return value
+ * so just skip using them as an anchor.
+ *
+ * If a prior request to this one came through with
+ * request_module() though, then a request_module_nowait()
+ * would benefit from duplicate detection.
+ */
+ if (!wait) {
+ kfree(new_kmod_req);
+ pr_debug("New request_module_nowait() for %s -- cannot track duplicates for this request\n", module_name);
+ mutex_unlock(&kmod_dup_mutex);
+ return false;
+ }
+
+ /*
+ * There was no duplicate, just add the request so we can
+ * keep tab on duplicates later.
+ */
+ pr_debug("New request_module() for %s\n", module_name);
+ list_add_rcu(&new_kmod_req->list, &dup_kmod_reqs);
+ mutex_unlock(&kmod_dup_mutex);
+ return false;
+ }
+ mutex_unlock(&kmod_dup_mutex);
+
+ /* We are dealing with a duplicate request now */
+ kfree(new_kmod_req);
+
+ /*
+ * To fix these try to use try_then_request_module() instead as that
+ * will check if the component you are looking for is present or not.
+ * You could also just queue a single request to load the module once,
+ * instead of having each and everything you need try to request for
+ * the module.
+ *
+ * Duplicate request_module() calls can cause quite a bit of wasted
+ * vmalloc() space when racing with userspace.
+ */
+ if (enable_dups_trace)
+ WARN(1, "module-autoload: duplicate request for module %s\n", module_name);
+ else
+ pr_warn("module-autoload: duplicate request for module %s\n", module_name);
+
+ if (!wait) {
+ /*
+ * If request_module_nowait() was used then the user just
+ * wanted to issue the request and if another module request
+ * was already its way with the same name we don't care for
+ * the return value either. Let duplicate request_module_nowait()
+ * calls bail out right away.
+ */
+ *dup_ret = 0;
+ return true;
+ }
+
+ /*
+ * If a duplicate request_module() was used they *may* care for
+ * the return value, so we have no other option but to wait for
+ * the first caller to complete. If the first caller used
+ * the request_module_nowait() call, subsquent callers will
+ * deal with the comprmise of getting a successful call with this
+ * optimization enabled ...
+ */
+ ret = wait_for_completion_state(&kmod_req->first_req_done,
+ TASK_KILLABLE);
+ if (ret) {
+ *dup_ret = ret;
+ return true;
+ }
+
+ /* Now the duplicate request has the same exact return value as the first request */
+ *dup_ret = kmod_req->dup_ret;
+
+ return true;
+}
+
+void kmod_dup_request_announce(char *module_name, int ret)
+{
+ struct kmod_dup_req *kmod_req;
+
+ mutex_lock(&kmod_dup_mutex);
+
+ kmod_req = kmod_dup_request_lookup(module_name);
+ if (!kmod_req)
+ goto out;
+
+ kmod_req->dup_ret = ret;
+
+ /*
+ * If we complete() here we may allow duplicate threads
+ * to continue before the first one that submitted the
+ * request. We're in no rush also, given that each and
+ * every bounce back to userspace is slow we avoid that
+ * with a slight delay here. So queueue up the completion
+ * and let duplicates suffer, just wait a tad bit longer.
+ * There is no rush. But we also don't want to hold the
+ * caller up forever or introduce any boot delays.
+ */
+ queue_work(system_wq, &kmod_req->complete_work);
+
+out:
+ mutex_unlock(&kmod_dup_mutex);
+}
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
new file mode 100644
index 000000000000..618202578b42
--- /dev/null
+++ b/kernel/module/internal.h
@@ -0,0 +1,425 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Module internals
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
+ */
+
+#include <linux/elf.h>
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/mm.h>
+
+#ifndef ARCH_SHF_SMALL
+#define ARCH_SHF_SMALL 0
+#endif
+
+/*
+ * Use highest 4 bits of sh_entsize to store the mod_mem_type of this
+ * section. This leaves 28 bits for offset on 32-bit systems, which is
+ * about 256 MiB (WARN_ON_ONCE if we exceed that).
+ */
+
+#define SH_ENTSIZE_TYPE_BITS 4
+#define SH_ENTSIZE_TYPE_SHIFT (BITS_PER_LONG - SH_ENTSIZE_TYPE_BITS)
+#define SH_ENTSIZE_TYPE_MASK ((1UL << SH_ENTSIZE_TYPE_BITS) - 1)
+#define SH_ENTSIZE_OFFSET_MASK ((1UL << (BITS_PER_LONG - SH_ENTSIZE_TYPE_BITS)) - 1)
+
+/* Maximum number of characters written by module_flags() */
+#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4)
+
+struct kernel_symbol {
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+ int value_offset;
+ int name_offset;
+ int namespace_offset;
+#else
+ unsigned long value;
+ const char *name;
+ const char *namespace;
+#endif
+};
+
+extern struct mutex module_mutex;
+extern struct list_head modules;
+
+extern const struct module_attribute *const modinfo_attrs[];
+extern const size_t modinfo_attrs_count;
+
+/* Provided by the linker */
+extern const struct kernel_symbol __start___ksymtab[];
+extern const struct kernel_symbol __stop___ksymtab[];
+extern const struct kernel_symbol __start___ksymtab_gpl[];
+extern const struct kernel_symbol __stop___ksymtab_gpl[];
+extern const u32 __start___kcrctab[];
+extern const u32 __start___kcrctab_gpl[];
+
+#define KMOD_PATH_LEN 256
+extern char modprobe_path[];
+
+struct load_info {
+ const char *name;
+ /* pointer to module in temporary copy, freed at end of load_module() */
+ struct module *mod;
+ Elf_Ehdr *hdr;
+ unsigned long len;
+ Elf_Shdr *sechdrs;
+ char *secstrings, *strtab;
+ unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs;
+ bool sig_ok;
+#ifdef CONFIG_KALLSYMS
+ unsigned long mod_kallsyms_init_off;
+#endif
+#ifdef CONFIG_MODULE_DECOMPRESS
+#ifdef CONFIG_MODULE_STATS
+ unsigned long compressed_len;
+#endif
+ struct page **pages;
+ unsigned int max_pages;
+ unsigned int used_pages;
+#endif
+ struct {
+ unsigned int sym;
+ unsigned int str;
+ unsigned int mod;
+ unsigned int vers;
+ unsigned int info;
+ unsigned int pcpu;
+ unsigned int vers_ext_crc;
+ unsigned int vers_ext_name;
+ } index;
+};
+
+enum mod_license {
+ NOT_GPL_ONLY,
+ GPL_ONLY,
+};
+
+struct find_symbol_arg {
+ /* Input */
+ const char *name;
+ bool gplok;
+ bool warn;
+
+ /* Output */
+ struct module *owner;
+ const u32 *crc;
+ const struct kernel_symbol *sym;
+ enum mod_license license;
+};
+
+/* modules using other modules */
+struct module_use {
+ struct list_head source_list;
+ struct list_head target_list;
+ struct module *source, *target;
+};
+
+int mod_verify_sig(const void *mod, struct load_info *info);
+int try_to_force_load(struct module *mod, const char *reason);
+bool find_symbol(struct find_symbol_arg *fsa);
+struct module *find_module_all(const char *name, size_t len, bool even_unformed);
+int cmp_name(const void *name, const void *sym);
+long module_get_offset_and_type(struct module *mod, enum mod_mem_type type,
+ Elf_Shdr *sechdr, unsigned int section);
+char *module_flags(struct module *mod, char *buf, bool show_state);
+size_t module_flags_taint(unsigned long taints, char *buf);
+
+char *module_next_tag_pair(char *string, unsigned long *secsize);
+
+#define for_each_modinfo_entry(entry, info, name) \
+ for (entry = get_modinfo(info, name); entry; entry = get_next_modinfo(info, name, entry))
+
+static inline unsigned long kernel_symbol_value(const struct kernel_symbol *sym)
+{
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+ return (unsigned long)offset_to_ptr(&sym->value_offset);
+#else
+ return sym->value;
+#endif
+}
+
+#ifdef CONFIG_LIVEPATCH
+int copy_module_elf(struct module *mod, struct load_info *info);
+void free_module_elf(struct module *mod);
+#else /* !CONFIG_LIVEPATCH */
+static inline int copy_module_elf(struct module *mod, struct load_info *info)
+{
+ return 0;
+}
+
+static inline void free_module_elf(struct module *mod) { }
+#endif /* CONFIG_LIVEPATCH */
+
+static inline bool set_livepatch_module(struct module *mod)
+{
+#ifdef CONFIG_LIVEPATCH
+ mod->klp = true;
+ return true;
+#else
+ return false;
+#endif
+}
+
+/**
+ * enum fail_dup_mod_reason - state at which a duplicate module was detected
+ *
+ * @FAIL_DUP_MOD_BECOMING: the module is read properly, passes all checks but
+ * we've determined that another module with the same name is already loaded
+ * or being processed on our &modules list. This happens on early_mod_check()
+ * right before layout_and_allocate(). The kernel would have already
+ * vmalloc()'d space for the entire module through finit_module(). If
+ * decompression was used two vmap() spaces were used. These failures can
+ * happen when userspace has not seen the module present on the kernel and
+ * tries to load the module multiple times at same time.
+ * @FAIL_DUP_MOD_LOAD: the module has been read properly, passes all validation
+ * checks and the kernel determines that the module was unique and because
+ * of this allocated yet another private kernel copy of the module space in
+ * layout_and_allocate() but after this determined in add_unformed_module()
+ * that another module with the same name is already loaded or being processed.
+ * These failures should be mitigated as much as possible and are indicative
+ * of really fast races in loading modules. Without module decompression
+ * they waste twice as much vmap space. With module decompression three
+ * times the module's size vmap space is wasted.
+ */
+enum fail_dup_mod_reason {
+ FAIL_DUP_MOD_BECOMING = 0,
+ FAIL_DUP_MOD_LOAD,
+};
+
+#ifdef CONFIG_MODULE_DEBUGFS
+extern struct dentry *mod_debugfs_root;
+#endif
+
+#ifdef CONFIG_MODULE_STATS
+
+#define mod_stat_add_long(count, var) atomic_long_add(count, var)
+#define mod_stat_inc(name) atomic_inc(name)
+
+extern atomic_long_t total_mod_size;
+extern atomic_long_t total_text_size;
+extern atomic_long_t invalid_kread_bytes;
+extern atomic_long_t invalid_decompress_bytes;
+
+extern atomic_t modcount;
+extern atomic_t failed_kreads;
+extern atomic_t failed_decompress;
+struct mod_fail_load {
+ struct list_head list;
+ char name[MODULE_NAME_LEN];
+ atomic_long_t count;
+ unsigned long dup_fail_mask;
+};
+
+int try_add_failed_module(const char *name, enum fail_dup_mod_reason reason);
+void mod_stat_bump_invalid(struct load_info *info, int flags);
+void mod_stat_bump_becoming(struct load_info *info, int flags);
+
+#else
+
+#define mod_stat_add_long(name, var)
+#define mod_stat_inc(name)
+
+static inline int try_add_failed_module(const char *name,
+ enum fail_dup_mod_reason reason)
+{
+ return 0;
+}
+
+static inline void mod_stat_bump_invalid(struct load_info *info, int flags)
+{
+}
+
+static inline void mod_stat_bump_becoming(struct load_info *info, int flags)
+{
+}
+
+#endif /* CONFIG_MODULE_STATS */
+
+#ifdef CONFIG_MODULE_DEBUG_AUTOLOAD_DUPS
+bool kmod_dup_request_exists_wait(char *module_name, bool wait, int *dup_ret);
+void kmod_dup_request_announce(char *module_name, int ret);
+#else
+static inline bool kmod_dup_request_exists_wait(char *module_name, bool wait, int *dup_ret)
+{
+ return false;
+}
+
+static inline void kmod_dup_request_announce(char *module_name, int ret)
+{
+}
+#endif
+
+#ifdef CONFIG_MODULE_UNLOAD_TAINT_TRACKING
+struct mod_unload_taint {
+ struct list_head list;
+ char name[MODULE_NAME_LEN];
+ unsigned long taints;
+ u64 count;
+};
+
+int try_add_tainted_module(struct module *mod);
+void print_unloaded_tainted_modules(void);
+#else /* !CONFIG_MODULE_UNLOAD_TAINT_TRACKING */
+static inline int try_add_tainted_module(struct module *mod)
+{
+ return 0;
+}
+
+static inline void print_unloaded_tainted_modules(void)
+{
+}
+#endif /* CONFIG_MODULE_UNLOAD_TAINT_TRACKING */
+
+#ifdef CONFIG_MODULE_DECOMPRESS
+int module_decompress(struct load_info *info, const void *buf, size_t size);
+void module_decompress_cleanup(struct load_info *info);
+#else
+static inline int module_decompress(struct load_info *info,
+ const void *buf, size_t size)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void module_decompress_cleanup(struct load_info *info)
+{
+}
+#endif
+
+struct mod_tree_root {
+#ifdef CONFIG_MODULES_TREE_LOOKUP
+ struct latch_tree_root root;
+#endif
+ unsigned long addr_min;
+ unsigned long addr_max;
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+ unsigned long data_addr_min;
+ unsigned long data_addr_max;
+#endif
+};
+
+extern struct mod_tree_root mod_tree;
+
+#ifdef CONFIG_MODULES_TREE_LOOKUP
+void mod_tree_insert(struct module *mod);
+void mod_tree_remove_init(struct module *mod);
+void mod_tree_remove(struct module *mod);
+struct module *mod_find(unsigned long addr, struct mod_tree_root *tree);
+#else /* !CONFIG_MODULES_TREE_LOOKUP */
+
+static inline void mod_tree_insert(struct module *mod) { }
+static inline void mod_tree_remove_init(struct module *mod) { }
+static inline void mod_tree_remove(struct module *mod) { }
+static inline struct module *mod_find(unsigned long addr, struct mod_tree_root *tree)
+{
+ struct module *mod;
+
+ list_for_each_entry_rcu(mod, &modules, list,
+ lockdep_is_held(&module_mutex)) {
+ if (within_module(addr, mod))
+ return mod;
+ }
+
+ return NULL;
+}
+#endif /* CONFIG_MODULES_TREE_LOOKUP */
+
+int module_enable_rodata_ro(const struct module *mod);
+int module_enable_rodata_ro_after_init(const struct module *mod);
+int module_enable_data_nx(const struct module *mod);
+int module_enable_text_rox(const struct module *mod);
+int module_enforce_rwx_sections(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs,
+ const char *secstrings,
+ const struct module *mod);
+void module_mark_ro_after_init(const Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
+ const char *secstrings);
+
+#ifdef CONFIG_MODULE_SIG
+int module_sig_check(struct load_info *info, int flags);
+#else /* !CONFIG_MODULE_SIG */
+static inline int module_sig_check(struct load_info *info, int flags)
+{
+ return 0;
+}
+#endif /* !CONFIG_MODULE_SIG */
+
+#ifdef CONFIG_DEBUG_KMEMLEAK
+void kmemleak_load_module(const struct module *mod, const struct load_info *info);
+#else /* !CONFIG_DEBUG_KMEMLEAK */
+static inline void kmemleak_load_module(const struct module *mod,
+ const struct load_info *info) { }
+#endif /* CONFIG_DEBUG_KMEMLEAK */
+
+#ifdef CONFIG_KALLSYMS
+void init_build_id(struct module *mod, const struct load_info *info);
+void layout_symtab(struct module *mod, struct load_info *info);
+void add_kallsyms(struct module *mod, const struct load_info *info);
+
+static inline bool sect_empty(const Elf_Shdr *sect)
+{
+ return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
+}
+#else /* !CONFIG_KALLSYMS */
+static inline void init_build_id(struct module *mod, const struct load_info *info) { }
+static inline void layout_symtab(struct module *mod, struct load_info *info) { }
+static inline void add_kallsyms(struct module *mod, const struct load_info *info) { }
+#endif /* CONFIG_KALLSYMS */
+
+#ifdef CONFIG_SYSFS
+int mod_sysfs_setup(struct module *mod, const struct load_info *info,
+ struct kernel_param *kparam, unsigned int num_params);
+void mod_sysfs_teardown(struct module *mod);
+void init_param_lock(struct module *mod);
+#else /* !CONFIG_SYSFS */
+static inline int mod_sysfs_setup(struct module *mod,
+ const struct load_info *info,
+ struct kernel_param *kparam,
+ unsigned int num_params)
+{
+ return 0;
+}
+
+static inline void mod_sysfs_teardown(struct module *mod) { }
+static inline void init_param_lock(struct module *mod) { }
+#endif /* CONFIG_SYSFS */
+
+#ifdef CONFIG_MODVERSIONS
+int check_version(const struct load_info *info,
+ const char *symname, struct module *mod, const u32 *crc);
+void module_layout(struct module *mod, struct modversion_info *ver, struct kernel_param *kp,
+ struct kernel_symbol *ks, struct tracepoint * const *tp);
+int check_modstruct_version(const struct load_info *info, struct module *mod);
+int same_magic(const char *amagic, const char *bmagic, bool has_crcs);
+struct modversion_info_ext {
+ size_t remaining;
+ const u32 *crc;
+ const char *name;
+};
+void modversion_ext_start(const struct load_info *info, struct modversion_info_ext *ver);
+void modversion_ext_advance(struct modversion_info_ext *ver);
+#define for_each_modversion_info_ext(ver, info) \
+ for (modversion_ext_start(info, &ver); ver.remaining > 0; modversion_ext_advance(&ver))
+#else /* !CONFIG_MODVERSIONS */
+static inline int check_version(const struct load_info *info,
+ const char *symname,
+ struct module *mod,
+ const u32 *crc)
+{
+ return 1;
+}
+
+static inline int check_modstruct_version(const struct load_info *info,
+ struct module *mod)
+{
+ return 1;
+}
+
+static inline int same_magic(const char *amagic, const char *bmagic, bool has_crcs)
+{
+ return strcmp(amagic, bmagic) == 0;
+}
+#endif /* CONFIG_MODVERSIONS */
diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
new file mode 100644
index 000000000000..00a60796327c
--- /dev/null
+++ b/kernel/module/kallsyms.c
@@ -0,0 +1,501 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module kallsyms support
+ *
+ * Copyright (C) 2010 Rusty Russell
+ */
+
+#include <linux/module.h>
+#include <linux/module_symbol.h>
+#include <linux/kallsyms.h>
+#include <linux/buildid.h>
+#include <linux/bsearch.h>
+#include "internal.h"
+
+/* Lookup exported symbol in given range of kernel_symbols */
+static const struct kernel_symbol *lookup_exported_symbol(const char *name,
+ const struct kernel_symbol *start,
+ const struct kernel_symbol *stop)
+{
+ return bsearch(name, start, stop - start,
+ sizeof(struct kernel_symbol), cmp_name);
+}
+
+static int is_exported(const char *name, unsigned long value,
+ const struct module *mod)
+{
+ const struct kernel_symbol *ks;
+
+ if (!mod)
+ ks = lookup_exported_symbol(name, __start___ksymtab, __stop___ksymtab);
+ else
+ ks = lookup_exported_symbol(name, mod->syms, mod->syms + mod->num_syms);
+
+ return ks && kernel_symbol_value(ks) == value;
+}
+
+/* As per nm */
+static char elf_type(const Elf_Sym *sym, const struct load_info *info)
+{
+ const Elf_Shdr *sechdrs = info->sechdrs;
+
+ if (ELF_ST_BIND(sym->st_info) == STB_WEAK) {
+ if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT)
+ return 'v';
+ else
+ return 'w';
+ }
+ if (sym->st_shndx == SHN_UNDEF)
+ return 'U';
+ if (sym->st_shndx == SHN_ABS || sym->st_shndx == info->index.pcpu)
+ return 'a';
+ if (sym->st_shndx >= SHN_LORESERVE)
+ return '?';
+ if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR)
+ return 't';
+ if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC &&
+ sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) {
+ if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE))
+ return 'r';
+ else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
+ return 'g';
+ else
+ return 'd';
+ }
+ if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
+ if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
+ return 's';
+ else
+ return 'b';
+ }
+ if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name,
+ ".debug")) {
+ return 'n';
+ }
+ return '?';
+}
+
+static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
+ unsigned int shnum, unsigned int pcpundx)
+{
+ const Elf_Shdr *sec;
+ enum mod_mem_type type;
+
+ if (src->st_shndx == SHN_UNDEF ||
+ src->st_shndx >= shnum ||
+ !src->st_name)
+ return false;
+
+#ifdef CONFIG_KALLSYMS_ALL
+ if (src->st_shndx == pcpundx)
+ return true;
+#endif
+
+ sec = sechdrs + src->st_shndx;
+ type = sec->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT;
+ if (!(sec->sh_flags & SHF_ALLOC)
+#ifndef CONFIG_KALLSYMS_ALL
+ || !(sec->sh_flags & SHF_EXECINSTR)
+#endif
+ || mod_mem_type_is_init(type))
+ return false;
+
+ return true;
+}
+
+/*
+ * We only allocate and copy the strings needed by the parts of symtab
+ * we keep. This is simple, but has the effect of making multiple
+ * copies of duplicates. We could be more sophisticated, see
+ * linux-kernel thread starting with
+ * <73defb5e4bca04a6431392cc341112b1@localhost>.
+ */
+void layout_symtab(struct module *mod, struct load_info *info)
+{
+ Elf_Shdr *symsect = info->sechdrs + info->index.sym;
+ Elf_Shdr *strsect = info->sechdrs + info->index.str;
+ const Elf_Sym *src;
+ unsigned int i, nsrc, ndst, strtab_size = 0;
+ struct module_memory *mod_mem_data = &mod->mem[MOD_DATA];
+ struct module_memory *mod_mem_init_data = &mod->mem[MOD_INIT_DATA];
+
+ /* Put symbol section at end of init part of module. */
+ symsect->sh_flags |= SHF_ALLOC;
+ symsect->sh_entsize = module_get_offset_and_type(mod, MOD_INIT_DATA,
+ symsect, info->index.sym);
+ pr_debug("\t%s\n", info->secstrings + symsect->sh_name);
+
+ src = (void *)info->hdr + symsect->sh_offset;
+ nsrc = symsect->sh_size / sizeof(*src);
+
+ /* Compute total space required for the core symbols' strtab. */
+ for (ndst = i = 0; i < nsrc; i++) {
+ if (i == 0 || is_livepatch_module(mod) ||
+ is_core_symbol(src + i, info->sechdrs, info->hdr->e_shnum,
+ info->index.pcpu)) {
+ strtab_size += strlen(&info->strtab[src[i].st_name]) + 1;
+ ndst++;
+ }
+ }
+
+ /* Append room for core symbols at end of core part. */
+ info->symoffs = ALIGN(mod_mem_data->size, symsect->sh_addralign ?: 1);
+ info->stroffs = mod_mem_data->size = info->symoffs + ndst * sizeof(Elf_Sym);
+ mod_mem_data->size += strtab_size;
+ /* Note add_kallsyms() computes strtab_size as core_typeoffs - stroffs */
+ info->core_typeoffs = mod_mem_data->size;
+ mod_mem_data->size += ndst * sizeof(char);
+
+ /* Put string table section at end of init part of module. */
+ strsect->sh_flags |= SHF_ALLOC;
+ strsect->sh_entsize = module_get_offset_and_type(mod, MOD_INIT_DATA,
+ strsect, info->index.str);
+ pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
+
+ /* We'll tack temporary mod_kallsyms on the end. */
+ mod_mem_init_data->size = ALIGN(mod_mem_init_data->size,
+ __alignof__(struct mod_kallsyms));
+ info->mod_kallsyms_init_off = mod_mem_init_data->size;
+
+ mod_mem_init_data->size += sizeof(struct mod_kallsyms);
+ info->init_typeoffs = mod_mem_init_data->size;
+ mod_mem_init_data->size += nsrc * sizeof(char);
+}
+
+/*
+ * We use the full symtab and strtab which layout_symtab arranged to
+ * be appended to the init section. Later we switch to the cut-down
+ * core-only ones.
+ */
+void add_kallsyms(struct module *mod, const struct load_info *info)
+{
+ unsigned int i, ndst;
+ const Elf_Sym *src;
+ Elf_Sym *dst;
+ char *s;
+ Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
+ unsigned long strtab_size;
+ void *data_base = mod->mem[MOD_DATA].base;
+ void *init_data_base = mod->mem[MOD_INIT_DATA].base;
+ struct mod_kallsyms *kallsyms;
+
+ kallsyms = init_data_base + info->mod_kallsyms_init_off;
+
+ kallsyms->symtab = (void *)symsec->sh_addr;
+ kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
+ /* Make sure we get permanent strtab: don't use info->strtab. */
+ kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
+ kallsyms->typetab = init_data_base + info->init_typeoffs;
+
+ /*
+ * Now populate the cut down core kallsyms for after init
+ * and set types up while we still have access to sections.
+ */
+ mod->core_kallsyms.symtab = dst = data_base + info->symoffs;
+ mod->core_kallsyms.strtab = s = data_base + info->stroffs;
+ mod->core_kallsyms.typetab = data_base + info->core_typeoffs;
+ strtab_size = info->core_typeoffs - info->stroffs;
+ src = kallsyms->symtab;
+ for (ndst = i = 0; i < kallsyms->num_symtab; i++) {
+ kallsyms->typetab[i] = elf_type(src + i, info);
+ if (i == 0 || is_livepatch_module(mod) ||
+ is_core_symbol(src + i, info->sechdrs, info->hdr->e_shnum,
+ info->index.pcpu)) {
+ ssize_t ret;
+
+ mod->core_kallsyms.typetab[ndst] =
+ kallsyms->typetab[i];
+ dst[ndst] = src[i];
+ dst[ndst++].st_name = s - mod->core_kallsyms.strtab;
+ ret = strscpy(s, &kallsyms->strtab[src[i].st_name],
+ strtab_size);
+ if (ret < 0)
+ break;
+ s += ret + 1;
+ strtab_size -= ret + 1;
+ }
+ }
+
+ /* Set up to point into init section. */
+ rcu_assign_pointer(mod->kallsyms, kallsyms);
+ mod->core_kallsyms.num_symtab = ndst;
+}
+
+#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
+void init_build_id(struct module *mod, const struct load_info *info)
+{
+ const Elf_Shdr *sechdr;
+ unsigned int i;
+
+ for (i = 0; i < info->hdr->e_shnum; i++) {
+ sechdr = &info->sechdrs[i];
+ if (!sect_empty(sechdr) && sechdr->sh_type == SHT_NOTE &&
+ !build_id_parse_buf((void *)sechdr->sh_addr, mod->build_id,
+ sechdr->sh_size))
+ break;
+ }
+}
+#else
+void init_build_id(struct module *mod, const struct load_info *info)
+{
+}
+#endif
+
+static const char *kallsyms_symbol_name(struct mod_kallsyms *kallsyms, unsigned int symnum)
+{
+ return kallsyms->strtab + kallsyms->symtab[symnum].st_name;
+}
+
+/*
+ * Given a module and address, find the corresponding symbol and return its name
+ * while providing its size and offset if needed.
+ */
+static const char *find_kallsyms_symbol(struct module *mod,
+ unsigned long addr,
+ unsigned long *size,
+ unsigned long *offset)
+{
+ unsigned int i, best = 0;
+ unsigned long nextval, bestval;
+ struct mod_kallsyms *kallsyms = rcu_dereference(mod->kallsyms);
+ struct module_memory *mod_mem;
+
+ /* At worse, next value is at end of module */
+ if (within_module_init(addr, mod))
+ mod_mem = &mod->mem[MOD_INIT_TEXT];
+ else
+ mod_mem = &mod->mem[MOD_TEXT];
+
+ nextval = (unsigned long)mod_mem->base + mod_mem->size;
+
+ bestval = kallsyms_symbol_value(&kallsyms->symtab[best]);
+
+ /*
+ * Scan for closest preceding symbol, and next symbol. (ELF
+ * starts real symbols at 1).
+ */
+ for (i = 1; i < kallsyms->num_symtab; i++) {
+ const Elf_Sym *sym = &kallsyms->symtab[i];
+ unsigned long thisval = kallsyms_symbol_value(sym);
+
+ if (sym->st_shndx == SHN_UNDEF)
+ continue;
+
+ /*
+ * We ignore unnamed symbols: they're uninformative
+ * and inserted at a whim.
+ */
+ if (*kallsyms_symbol_name(kallsyms, i) == '\0' ||
+ is_mapping_symbol(kallsyms_symbol_name(kallsyms, i)))
+ continue;
+
+ if (thisval <= addr && thisval > bestval) {
+ best = i;
+ bestval = thisval;
+ }
+ if (thisval > addr && thisval < nextval)
+ nextval = thisval;
+ }
+
+ if (!best)
+ return NULL;
+
+ if (size)
+ *size = nextval - bestval;
+ if (offset)
+ *offset = addr - bestval;
+
+ return kallsyms_symbol_name(kallsyms, best);
+}
+
+void * __weak dereference_module_function_descriptor(struct module *mod,
+ void *ptr)
+{
+ return ptr;
+}
+
+/*
+ * For kallsyms to ask for address resolution. NULL means not found. Careful
+ * not to lock to avoid deadlock on oopses, RCU is enough.
+ */
+int module_address_lookup(unsigned long addr,
+ unsigned long *size,
+ unsigned long *offset,
+ char **modname,
+ const unsigned char **modbuildid,
+ char *namebuf)
+{
+ const char *sym;
+ int ret = 0;
+ struct module *mod;
+
+ guard(rcu)();
+ mod = __module_address(addr);
+ if (mod) {
+ if (modname)
+ *modname = mod->name;
+ if (modbuildid) {
+#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
+ *modbuildid = mod->build_id;
+#else
+ *modbuildid = NULL;
+#endif
+ }
+
+ sym = find_kallsyms_symbol(mod, addr, size, offset);
+
+ if (sym)
+ ret = strscpy(namebuf, sym, KSYM_NAME_LEN);
+ }
+ return ret;
+}
+
+int lookup_module_symbol_name(unsigned long addr, char *symname)
+{
+ struct module *mod;
+
+ guard(rcu)();
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ if (within_module(addr, mod)) {
+ const char *sym;
+
+ sym = find_kallsyms_symbol(mod, addr, NULL, NULL);
+ if (!sym)
+ goto out;
+
+ strscpy(symname, sym, KSYM_NAME_LEN);
+ return 0;
+ }
+ }
+out:
+ return -ERANGE;
+}
+
+int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+ char *name, char *module_name, int *exported)
+{
+ struct module *mod;
+
+ guard(rcu)();
+ list_for_each_entry_rcu(mod, &modules, list) {
+ struct mod_kallsyms *kallsyms;
+
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ kallsyms = rcu_dereference(mod->kallsyms);
+ if (symnum < kallsyms->num_symtab) {
+ const Elf_Sym *sym = &kallsyms->symtab[symnum];
+
+ *value = kallsyms_symbol_value(sym);
+ *type = kallsyms->typetab[symnum];
+ strscpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN);
+ strscpy(module_name, mod->name, MODULE_NAME_LEN);
+ *exported = is_exported(name, *value, mod);
+ return 0;
+ }
+ symnum -= kallsyms->num_symtab;
+ }
+ return -ERANGE;
+}
+
+/* Given a module and name of symbol, find and return the symbol's value */
+static unsigned long __find_kallsyms_symbol_value(struct module *mod, const char *name)
+{
+ unsigned int i;
+ struct mod_kallsyms *kallsyms = rcu_dereference(mod->kallsyms);
+
+ for (i = 0; i < kallsyms->num_symtab; i++) {
+ const Elf_Sym *sym = &kallsyms->symtab[i];
+
+ if (strcmp(name, kallsyms_symbol_name(kallsyms, i)) == 0 &&
+ sym->st_shndx != SHN_UNDEF)
+ return kallsyms_symbol_value(sym);
+ }
+ return 0;
+}
+
+static unsigned long __module_kallsyms_lookup_name(const char *name)
+{
+ struct module *mod;
+ char *colon;
+
+ colon = strnchr(name, MODULE_NAME_LEN, ':');
+ if (colon) {
+ mod = find_module_all(name, colon - name, false);
+ if (mod)
+ return __find_kallsyms_symbol_value(mod, colon + 1);
+ return 0;
+ }
+
+ list_for_each_entry_rcu(mod, &modules, list) {
+ unsigned long ret;
+
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ ret = __find_kallsyms_symbol_value(mod, name);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/* Look for this name: can be of form module:name. */
+unsigned long module_kallsyms_lookup_name(const char *name)
+{
+ /* Don't lock: we're in enough trouble already. */
+ guard(rcu)();
+ return __module_kallsyms_lookup_name(name);
+}
+
+unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
+{
+ guard(rcu)();
+ return __find_kallsyms_symbol_value(mod, name);
+}
+
+int module_kallsyms_on_each_symbol(const char *modname,
+ int (*fn)(void *, const char *, unsigned long),
+ void *data)
+{
+ struct module *mod;
+ unsigned int i;
+ int ret = 0;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(mod, &modules, list) {
+ struct mod_kallsyms *kallsyms;
+
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+
+ if (modname && strcmp(modname, mod->name))
+ continue;
+
+ kallsyms = rcu_dereference_check(mod->kallsyms,
+ lockdep_is_held(&module_mutex));
+
+ for (i = 0; i < kallsyms->num_symtab; i++) {
+ const Elf_Sym *sym = &kallsyms->symtab[i];
+
+ if (sym->st_shndx == SHN_UNDEF)
+ continue;
+
+ ret = fn(data, kallsyms_symbol_name(kallsyms, i),
+ kallsyms_symbol_value(sym));
+ if (ret != 0)
+ goto out;
+ }
+
+ /*
+ * The given module is found, the subsequent modules do not
+ * need to be compared.
+ */
+ if (modname)
+ break;
+ }
+out:
+ mutex_unlock(&module_mutex);
+ return ret;
+}
diff --git a/kernel/module/kdb.c b/kernel/module/kdb.c
new file mode 100644
index 000000000000..995c32d3698f
--- /dev/null
+++ b/kernel/module/kdb.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module kdb support
+ *
+ * Copyright (C) 2010 Jason Wessel
+ */
+
+#include <linux/module.h>
+#include <linux/kdb.h>
+#include "internal.h"
+
+/*
+ * kdb_lsmod - This function implements the 'lsmod' command. Lists
+ * currently loaded kernel modules.
+ * Mostly taken from userland lsmod.
+ */
+int kdb_lsmod(int argc, const char **argv)
+{
+ struct module *mod;
+
+ if (argc != 0)
+ return KDB_ARGCOUNT;
+
+ kdb_printf("Module Size modstruct Used by\n");
+ list_for_each_entry(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+
+ kdb_printf("%-20s%8u", mod->name, mod->mem[MOD_TEXT].size);
+ kdb_printf("/%8u", mod->mem[MOD_RODATA].size);
+ kdb_printf("/%8u", mod->mem[MOD_RO_AFTER_INIT].size);
+ kdb_printf("/%8u", mod->mem[MOD_DATA].size);
+
+ kdb_printf(" 0x%px ", (void *)mod);
+#ifdef CONFIG_MODULE_UNLOAD
+ kdb_printf("%4d ", module_refcount(mod));
+#endif
+ if (mod->state == MODULE_STATE_GOING)
+ kdb_printf(" (Unloading)");
+ else if (mod->state == MODULE_STATE_COMING)
+ kdb_printf(" (Loading)");
+ else
+ kdb_printf(" (Live)");
+ kdb_printf(" 0x%px", mod->mem[MOD_TEXT].base);
+ kdb_printf("/0x%px", mod->mem[MOD_RODATA].base);
+ kdb_printf("/0x%px", mod->mem[MOD_RO_AFTER_INIT].base);
+ kdb_printf("/0x%px", mod->mem[MOD_DATA].base);
+
+#ifdef CONFIG_MODULE_UNLOAD
+ {
+ struct module_use *use;
+
+ kdb_printf(" [ ");
+ list_for_each_entry(use, &mod->source_list,
+ source_list)
+ kdb_printf("%s ", use->target->name);
+ kdb_printf("]\n");
+ }
+#endif
+ }
+
+ return 0;
+}
diff --git a/kernel/kmod.c b/kernel/module/kmod.c
index b717134ebe17..25f253812512 100644
--- a/kernel/kmod.c
+++ b/kernel/module/kmod.c
@@ -1,6 +1,9 @@
/*
* kmod - the kernel module loader
+ *
+ * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
*/
+
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
@@ -12,7 +15,6 @@
#include <linux/completion.h>
#include <linux/cred.h>
#include <linux/file.h>
-#include <linux/fdtable.h>
#include <linux/workqueue.h>
#include <linux/security.h>
#include <linux/mount.h>
@@ -27,6 +29,7 @@
#include <linux/uaccess.h>
#include <trace/events/module.h>
+#include "internal.h"
/*
* Assuming:
@@ -40,8 +43,7 @@
* effect. Systems like these are very unlikely if modules are enabled.
*/
#define MAX_KMOD_CONCURRENT 50
-static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT);
-static DECLARE_WAIT_QUEUE_HEAD(kmod_wq);
+static DEFINE_SEMAPHORE(kmod_concurrent_max, MAX_KMOD_CONCURRENT);
/*
* This is a restriction on having *all* MAX_KMOD_CONCURRENT threads
@@ -66,7 +68,7 @@ static void free_modprobe_argv(struct subprocess_info *info)
kfree(info->argv);
}
-static int call_modprobe(char *module_name, int wait)
+static int call_modprobe(char *orig_module_name, int wait)
{
struct subprocess_info *info;
static char *envp[] = {
@@ -75,12 +77,14 @@ static int call_modprobe(char *module_name, int wait)
"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
NULL
};
+ char *module_name;
+ int ret;
char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL);
if (!argv)
goto out;
- module_name = kstrdup(module_name, GFP_KERNEL);
+ module_name = kstrdup(orig_module_name, GFP_KERNEL);
if (!module_name)
goto free_argv;
@@ -95,13 +99,16 @@ static int call_modprobe(char *module_name, int wait)
if (!info)
goto free_module_name;
- return call_usermodehelper_exec(info, wait | UMH_KILLABLE);
+ ret = call_usermodehelper_exec(info, wait | UMH_KILLABLE);
+ kmod_dup_request_announce(orig_module_name, ret);
+ return ret;
free_module_name:
kfree(module_name);
free_argv:
kfree(argv);
out:
+ kmod_dup_request_announce(orig_module_name, -ENOMEM);
return -ENOMEM;
}
@@ -125,7 +132,7 @@ int __request_module(bool wait, const char *fmt, ...)
{
va_list args;
char module_name[MODULE_NAME_LEN];
- int ret;
+ int ret, dup_ret;
/*
* We don't allow synchronous module loading from async. Module
@@ -148,29 +155,24 @@ int __request_module(bool wait, const char *fmt, ...)
if (ret)
return ret;
- if (atomic_dec_if_positive(&kmod_concurrent_max) < 0) {
- pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...",
- atomic_read(&kmod_concurrent_max),
- MAX_KMOD_CONCURRENT, module_name);
- ret = wait_event_killable_timeout(kmod_wq,
- atomic_dec_if_positive(&kmod_concurrent_max) >= 0,
- MAX_KMOD_ALL_BUSY_TIMEOUT * HZ);
- if (!ret) {
- pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now",
- module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT);
- return -ETIME;
- } else if (ret == -ERESTARTSYS) {
- pr_warn_ratelimited("request_module: sigkill sent for modprobe %s, giving up", module_name);
- return ret;
- }
+ ret = down_timeout(&kmod_concurrent_max, MAX_KMOD_ALL_BUSY_TIMEOUT * HZ);
+ if (ret) {
+ pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now",
+ module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT);
+ return ret;
}
trace_module_request(module_name, wait, _RET_IP_);
+ if (kmod_dup_request_exists_wait(module_name, wait, &dup_ret)) {
+ ret = dup_ret;
+ goto out;
+ }
+
ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
- atomic_inc(&kmod_concurrent_max);
- wake_up(&kmod_wq);
+out:
+ up(&kmod_concurrent_max);
return ret;
}
diff --git a/kernel/module/livepatch.c b/kernel/module/livepatch.c
new file mode 100644
index 000000000000..a89f01e1d6b7
--- /dev/null
+++ b/kernel/module/livepatch.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module livepatch support
+ *
+ * Copyright (C) 2016 Jessica Yu <jeyu@redhat.com>
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+/*
+ * Persist ELF information about a module. Copy the ELF header,
+ * section header table, section string table, and symtab section
+ * index from info to mod->klp_info.
+ */
+int copy_module_elf(struct module *mod, struct load_info *info)
+{
+ unsigned int size, symndx;
+ int ret;
+
+ size = sizeof(*mod->klp_info);
+ mod->klp_info = kmalloc(size, GFP_KERNEL);
+ if (!mod->klp_info)
+ return -ENOMEM;
+
+ /* ELF header */
+ size = sizeof(mod->klp_info->hdr);
+ memcpy(&mod->klp_info->hdr, info->hdr, size);
+
+ /* ELF section header table */
+ size = sizeof(*info->sechdrs) * info->hdr->e_shnum;
+ mod->klp_info->sechdrs = kmemdup(info->sechdrs, size, GFP_KERNEL);
+ if (!mod->klp_info->sechdrs) {
+ ret = -ENOMEM;
+ goto free_info;
+ }
+
+ /* ELF section name string table */
+ size = info->sechdrs[info->hdr->e_shstrndx].sh_size;
+ mod->klp_info->secstrings = kmemdup(info->secstrings, size, GFP_KERNEL);
+ if (!mod->klp_info->secstrings) {
+ ret = -ENOMEM;
+ goto free_sechdrs;
+ }
+
+ /* ELF symbol section index */
+ symndx = info->index.sym;
+ mod->klp_info->symndx = symndx;
+
+ /*
+ * For livepatch modules, core_kallsyms.symtab is a complete
+ * copy of the original symbol table. Adjust sh_addr to point
+ * to core_kallsyms.symtab since the copy of the symtab in module
+ * init memory is freed at the end of do_init_module().
+ */
+ mod->klp_info->sechdrs[symndx].sh_addr = (unsigned long)mod->core_kallsyms.symtab;
+
+ return 0;
+
+free_sechdrs:
+ kfree(mod->klp_info->sechdrs);
+free_info:
+ kfree(mod->klp_info);
+ return ret;
+}
+
+void free_module_elf(struct module *mod)
+{
+ kfree(mod->klp_info->sechdrs);
+ kfree(mod->klp_info->secstrings);
+ kfree(mod->klp_info);
+}
diff --git a/kernel/module/main.c b/kernel/module/main.c
new file mode 100644
index 000000000000..710ee30b3bea
--- /dev/null
+++ b/kernel/module/main.c
@@ -0,0 +1,3923 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2002 Richard Henderson
+ * Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
+ * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
+ */
+
+#define INCLUDE_VERMAGIC
+
+#include <linux/export.h>
+#include <linux/extable.h>
+#include <linux/moduleloader.h>
+#include <linux/module_signature.h>
+#include <linux/trace_events.h>
+#include <linux/init.h>
+#include <linux/kallsyms.h>
+#include <linux/buildid.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/kernel_read_file.h>
+#include <linux/kstrtox.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/elf.h>
+#include <linux/seq_file.h>
+#include <linux/syscalls.h>
+#include <linux/fcntl.h>
+#include <linux/rcupdate.h>
+#include <linux/capability.h>
+#include <linux/cpu.h>
+#include <linux/moduleparam.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/vermagic.h>
+#include <linux/notifier.h>
+#include <linux/sched.h>
+#include <linux/device.h>
+#include <linux/string.h>
+#include <linux/mutex.h>
+#include <linux/rculist.h>
+#include <linux/uaccess.h>
+#include <asm/cacheflush.h>
+#include <linux/set_memory.h>
+#include <asm/mmu_context.h>
+#include <linux/license.h>
+#include <asm/sections.h>
+#include <linux/tracepoint.h>
+#include <linux/ftrace.h>
+#include <linux/livepatch.h>
+#include <linux/async.h>
+#include <linux/percpu.h>
+#include <linux/kmemleak.h>
+#include <linux/jump_label.h>
+#include <linux/pfn.h>
+#include <linux/bsearch.h>
+#include <linux/dynamic_debug.h>
+#include <linux/audit.h>
+#include <linux/cfi.h>
+#include <linux/codetag.h>
+#include <linux/debugfs.h>
+#include <linux/execmem.h>
+#include <uapi/linux/module.h>
+#include "internal.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/module.h>
+
+/*
+ * Mutex protects:
+ * 1) List of modules (also safely readable within RCU read section),
+ * 2) module_use links,
+ * 3) mod_tree.addr_min/mod_tree.addr_max.
+ * (delete and add uses RCU list operations).
+ */
+DEFINE_MUTEX(module_mutex);
+LIST_HEAD(modules);
+
+/* Work queue for freeing init sections in success case */
+static void do_free_init(struct work_struct *w);
+static DECLARE_WORK(init_free_wq, do_free_init);
+static LLIST_HEAD(init_free_list);
+
+struct mod_tree_root mod_tree __cacheline_aligned = {
+ .addr_min = -1UL,
+};
+
+struct symsearch {
+ const struct kernel_symbol *start, *stop;
+ const u32 *crcs;
+ enum mod_license license;
+};
+
+/*
+ * Bounds of module memory, for speeding up __module_address.
+ * Protected by module_mutex.
+ */
+static void __mod_update_bounds(enum mod_mem_type type __maybe_unused, void *base,
+ unsigned int size, struct mod_tree_root *tree)
+{
+ unsigned long min = (unsigned long)base;
+ unsigned long max = min + size;
+
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+ if (mod_mem_type_is_core_data(type)) {
+ if (min < tree->data_addr_min)
+ tree->data_addr_min = min;
+ if (max > tree->data_addr_max)
+ tree->data_addr_max = max;
+ return;
+ }
+#endif
+ if (min < tree->addr_min)
+ tree->addr_min = min;
+ if (max > tree->addr_max)
+ tree->addr_max = max;
+}
+
+static void mod_update_bounds(struct module *mod)
+{
+ for_each_mod_mem_type(type) {
+ struct module_memory *mod_mem = &mod->mem[type];
+
+ if (mod_mem->size)
+ __mod_update_bounds(type, mod_mem->base, mod_mem->size, &mod_tree);
+ }
+}
+
+/* Block module loading/unloading? */
+static int modules_disabled;
+core_param(nomodule, modules_disabled, bint, 0);
+
+static const struct ctl_table module_sysctl_table[] = {
+ {
+ .procname = "modprobe",
+ .data = &modprobe_path,
+ .maxlen = KMOD_PATH_LEN,
+ .mode = 0644,
+ .proc_handler = proc_dostring,
+ },
+ {
+ .procname = "modules_disabled",
+ .data = &modules_disabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ /* only handle a transition from default "0" to "1" */
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_ONE,
+ },
+};
+
+static int __init init_module_sysctl(void)
+{
+ register_sysctl_init("kernel", module_sysctl_table);
+ return 0;
+}
+
+subsys_initcall(init_module_sysctl);
+
+/* Waiting for a module to finish initializing? */
+static DECLARE_WAIT_QUEUE_HEAD(module_wq);
+
+static BLOCKING_NOTIFIER_HEAD(module_notify_list);
+
+int register_module_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&module_notify_list, nb);
+}
+EXPORT_SYMBOL(register_module_notifier);
+
+int unregister_module_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&module_notify_list, nb);
+}
+EXPORT_SYMBOL(unregister_module_notifier);
+
+/*
+ * We require a truly strong try_module_get(): 0 means success.
+ * Otherwise an error is returned due to ongoing or failed
+ * initialization etc.
+ */
+static inline int strong_try_module_get(struct module *mod)
+{
+ BUG_ON(mod && mod->state == MODULE_STATE_UNFORMED);
+ if (mod && mod->state == MODULE_STATE_COMING)
+ return -EBUSY;
+ if (try_module_get(mod))
+ return 0;
+ else
+ return -ENOENT;
+}
+
+static inline void add_taint_module(struct module *mod, unsigned flag,
+ enum lockdep_ok lockdep_ok)
+{
+ add_taint(flag, lockdep_ok);
+ set_bit(flag, &mod->taints);
+}
+
+/*
+ * Like strncmp(), except s/-/_/g as per scripts/Makefile.lib:name-fix-token rule.
+ */
+static int mod_strncmp(const char *str_a, const char *str_b, size_t n)
+{
+ for (int i = 0; i < n; i++) {
+ char a = str_a[i];
+ char b = str_b[i];
+ int d;
+
+ if (a == '-') a = '_';
+ if (b == '-') b = '_';
+
+ d = a - b;
+ if (d)
+ return d;
+
+ if (!a)
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * A thread that wants to hold a reference to a module only while it
+ * is running can call this to safely exit.
+ */
+void __noreturn __module_put_and_kthread_exit(struct module *mod, long code)
+{
+ module_put(mod);
+ kthread_exit(code);
+}
+EXPORT_SYMBOL(__module_put_and_kthread_exit);
+
+/* Find a module section: 0 means not found. */
+static unsigned int find_sec(const struct load_info *info, const char *name)
+{
+ unsigned int i;
+
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ Elf_Shdr *shdr = &info->sechdrs[i];
+ /* Alloc bit cleared means "ignore it." */
+ if ((shdr->sh_flags & SHF_ALLOC)
+ && strcmp(info->secstrings + shdr->sh_name, name) == 0)
+ return i;
+ }
+ return 0;
+}
+
+/**
+ * find_any_unique_sec() - Find a unique section index by name
+ * @info: Load info for the module to scan
+ * @name: Name of the section we're looking for
+ *
+ * Locates a unique section by name. Ignores SHF_ALLOC.
+ *
+ * Return: Section index if found uniquely, zero if absent, negative count
+ * of total instances if multiple were found.
+ */
+static int find_any_unique_sec(const struct load_info *info, const char *name)
+{
+ unsigned int idx;
+ unsigned int count = 0;
+ int i;
+
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ if (strcmp(info->secstrings + info->sechdrs[i].sh_name,
+ name) == 0) {
+ count++;
+ idx = i;
+ }
+ }
+ if (count == 1) {
+ return idx;
+ } else if (count == 0) {
+ return 0;
+ } else {
+ return -count;
+ }
+}
+
+/* Find a module section, or NULL. */
+static void *section_addr(const struct load_info *info, const char *name)
+{
+ /* Section 0 has sh_addr 0. */
+ return (void *)info->sechdrs[find_sec(info, name)].sh_addr;
+}
+
+/* Find a module section, or NULL. Fill in number of "objects" in section. */
+static void *section_objs(const struct load_info *info,
+ const char *name,
+ size_t object_size,
+ unsigned int *num)
+{
+ unsigned int sec = find_sec(info, name);
+
+ /* Section 0 has sh_addr 0 and sh_size 0. */
+ *num = info->sechdrs[sec].sh_size / object_size;
+ return (void *)info->sechdrs[sec].sh_addr;
+}
+
+/* Find a module section: 0 means not found. Ignores SHF_ALLOC flag. */
+static unsigned int find_any_sec(const struct load_info *info, const char *name)
+{
+ unsigned int i;
+
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ Elf_Shdr *shdr = &info->sechdrs[i];
+ if (strcmp(info->secstrings + shdr->sh_name, name) == 0)
+ return i;
+ }
+ return 0;
+}
+
+/*
+ * Find a module section, or NULL. Fill in number of "objects" in section.
+ * Ignores SHF_ALLOC flag.
+ */
+static __maybe_unused void *any_section_objs(const struct load_info *info,
+ const char *name,
+ size_t object_size,
+ unsigned int *num)
+{
+ unsigned int sec = find_any_sec(info, name);
+
+ /* Section 0 has sh_addr 0 and sh_size 0. */
+ *num = info->sechdrs[sec].sh_size / object_size;
+ return (void *)info->sechdrs[sec].sh_addr;
+}
+
+#ifndef CONFIG_MODVERSIONS
+#define symversion(base, idx) NULL
+#else
+#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
+#endif
+
+static const char *kernel_symbol_name(const struct kernel_symbol *sym)
+{
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+ return offset_to_ptr(&sym->name_offset);
+#else
+ return sym->name;
+#endif
+}
+
+static const char *kernel_symbol_namespace(const struct kernel_symbol *sym)
+{
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+ if (!sym->namespace_offset)
+ return NULL;
+ return offset_to_ptr(&sym->namespace_offset);
+#else
+ return sym->namespace;
+#endif
+}
+
+int cmp_name(const void *name, const void *sym)
+{
+ return strcmp(name, kernel_symbol_name(sym));
+}
+
+static bool find_exported_symbol_in_section(const struct symsearch *syms,
+ struct module *owner,
+ struct find_symbol_arg *fsa)
+{
+ struct kernel_symbol *sym;
+
+ if (!fsa->gplok && syms->license == GPL_ONLY)
+ return false;
+
+ sym = bsearch(fsa->name, syms->start, syms->stop - syms->start,
+ sizeof(struct kernel_symbol), cmp_name);
+ if (!sym)
+ return false;
+
+ fsa->owner = owner;
+ fsa->crc = symversion(syms->crcs, sym - syms->start);
+ fsa->sym = sym;
+ fsa->license = syms->license;
+
+ return true;
+}
+
+/*
+ * Find an exported symbol and return it, along with, (optional) crc and
+ * (optional) module which owns it. Needs RCU or module_mutex.
+ */
+bool find_symbol(struct find_symbol_arg *fsa)
+{
+ static const struct symsearch arr[] = {
+ { __start___ksymtab, __stop___ksymtab, __start___kcrctab,
+ NOT_GPL_ONLY },
+ { __start___ksymtab_gpl, __stop___ksymtab_gpl,
+ __start___kcrctab_gpl,
+ GPL_ONLY },
+ };
+ struct module *mod;
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(arr); i++)
+ if (find_exported_symbol_in_section(&arr[i], NULL, fsa))
+ return true;
+
+ list_for_each_entry_rcu(mod, &modules, list,
+ lockdep_is_held(&module_mutex)) {
+ struct symsearch arr[] = {
+ { mod->syms, mod->syms + mod->num_syms, mod->crcs,
+ NOT_GPL_ONLY },
+ { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms,
+ mod->gpl_crcs,
+ GPL_ONLY },
+ };
+
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+
+ for (i = 0; i < ARRAY_SIZE(arr); i++)
+ if (find_exported_symbol_in_section(&arr[i], mod, fsa))
+ return true;
+ }
+
+ pr_debug("Failed to find symbol %s\n", fsa->name);
+ return false;
+}
+
+/*
+ * Search for module by name: must hold module_mutex (or RCU for read-only
+ * access).
+ */
+struct module *find_module_all(const char *name, size_t len,
+ bool even_unformed)
+{
+ struct module *mod;
+
+ list_for_each_entry_rcu(mod, &modules, list,
+ lockdep_is_held(&module_mutex)) {
+ if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ if (strlen(mod->name) == len && !memcmp(mod->name, name, len))
+ return mod;
+ }
+ return NULL;
+}
+
+struct module *find_module(const char *name)
+{
+ return find_module_all(name, strlen(name), false);
+}
+
+#ifdef CONFIG_SMP
+
+static inline void __percpu *mod_percpu(struct module *mod)
+{
+ return mod->percpu;
+}
+
+static int percpu_modalloc(struct module *mod, struct load_info *info)
+{
+ Elf_Shdr *pcpusec = &info->sechdrs[info->index.pcpu];
+ unsigned long align = pcpusec->sh_addralign;
+
+ if (!pcpusec->sh_size)
+ return 0;
+
+ if (align > PAGE_SIZE) {
+ pr_warn("%s: per-cpu alignment %li > %li\n",
+ mod->name, align, PAGE_SIZE);
+ align = PAGE_SIZE;
+ }
+
+ mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align);
+ if (!mod->percpu) {
+ pr_warn("%s: Could not allocate %lu bytes percpu data\n",
+ mod->name, (unsigned long)pcpusec->sh_size);
+ return -ENOMEM;
+ }
+ mod->percpu_size = pcpusec->sh_size;
+ return 0;
+}
+
+static void percpu_modfree(struct module *mod)
+{
+ free_percpu(mod->percpu);
+}
+
+static unsigned int find_pcpusec(struct load_info *info)
+{
+ return find_sec(info, ".data..percpu");
+}
+
+static void percpu_modcopy(struct module *mod,
+ const void *from, unsigned long size)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
+}
+
+bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
+{
+ struct module *mod;
+ unsigned int cpu;
+
+ guard(rcu)();
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ if (!mod->percpu_size)
+ continue;
+ for_each_possible_cpu(cpu) {
+ void *start = per_cpu_ptr(mod->percpu, cpu);
+ void *va = (void *)addr;
+
+ if (va >= start && va < start + mod->percpu_size) {
+ if (can_addr) {
+ *can_addr = (unsigned long) (va - start);
+ *can_addr += (unsigned long)
+ per_cpu_ptr(mod->percpu,
+ get_boot_cpu_id());
+ }
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+/**
+ * is_module_percpu_address() - test whether address is from module static percpu
+ * @addr: address to test
+ *
+ * Test whether @addr belongs to module static percpu area.
+ *
+ * Return: %true if @addr is from module static percpu area
+ */
+bool is_module_percpu_address(unsigned long addr)
+{
+ return __is_module_percpu_address(addr, NULL);
+}
+
+#else /* ... !CONFIG_SMP */
+
+static inline void __percpu *mod_percpu(struct module *mod)
+{
+ return NULL;
+}
+static int percpu_modalloc(struct module *mod, struct load_info *info)
+{
+ /* UP modules shouldn't have this section: ENOMEM isn't quite right */
+ if (info->sechdrs[info->index.pcpu].sh_size != 0)
+ return -ENOMEM;
+ return 0;
+}
+static inline void percpu_modfree(struct module *mod)
+{
+}
+static unsigned int find_pcpusec(struct load_info *info)
+{
+ return 0;
+}
+static inline void percpu_modcopy(struct module *mod,
+ const void *from, unsigned long size)
+{
+ /* pcpusec should be 0, and size of that section should be 0. */
+ BUG_ON(size != 0);
+}
+bool is_module_percpu_address(unsigned long addr)
+{
+ return false;
+}
+
+bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
+{
+ return false;
+}
+
+#endif /* CONFIG_SMP */
+
+#define MODINFO_ATTR(field) \
+static void setup_modinfo_##field(struct module *mod, const char *s) \
+{ \
+ mod->field = kstrdup(s, GFP_KERNEL); \
+} \
+static ssize_t show_modinfo_##field(const struct module_attribute *mattr, \
+ struct module_kobject *mk, char *buffer) \
+{ \
+ return scnprintf(buffer, PAGE_SIZE, "%s\n", mk->mod->field); \
+} \
+static int modinfo_##field##_exists(struct module *mod) \
+{ \
+ return mod->field != NULL; \
+} \
+static void free_modinfo_##field(struct module *mod) \
+{ \
+ kfree(mod->field); \
+ mod->field = NULL; \
+} \
+static const struct module_attribute modinfo_##field = { \
+ .attr = { .name = __stringify(field), .mode = 0444 }, \
+ .show = show_modinfo_##field, \
+ .setup = setup_modinfo_##field, \
+ .test = modinfo_##field##_exists, \
+ .free = free_modinfo_##field, \
+};
+
+MODINFO_ATTR(version);
+MODINFO_ATTR(srcversion);
+
+static struct {
+ char name[MODULE_NAME_LEN];
+ char taints[MODULE_FLAGS_BUF_SIZE];
+} last_unloaded_module;
+
+#ifdef CONFIG_MODULE_UNLOAD
+
+EXPORT_TRACEPOINT_SYMBOL(module_get);
+
+/* MODULE_REF_BASE is the base reference count by kmodule loader. */
+#define MODULE_REF_BASE 1
+
+/* Init the unload section of the module. */
+static int module_unload_init(struct module *mod)
+{
+ /*
+ * Initialize reference counter to MODULE_REF_BASE.
+ * refcnt == 0 means module is going.
+ */
+ atomic_set(&mod->refcnt, MODULE_REF_BASE);
+
+ INIT_LIST_HEAD(&mod->source_list);
+ INIT_LIST_HEAD(&mod->target_list);
+
+ /* Hold reference count during initialization. */
+ atomic_inc(&mod->refcnt);
+
+ return 0;
+}
+
+/* Does a already use b? */
+static int already_uses(struct module *a, struct module *b)
+{
+ struct module_use *use;
+
+ list_for_each_entry(use, &b->source_list, source_list) {
+ if (use->source == a)
+ return 1;
+ }
+ pr_debug("%s does not use %s!\n", a->name, b->name);
+ return 0;
+}
+
+/*
+ * Module a uses b
+ * - we add 'a' as a "source", 'b' as a "target" of module use
+ * - the module_use is added to the list of 'b' sources (so
+ * 'b' can walk the list to see who sourced them), and of 'a'
+ * targets (so 'a' can see what modules it targets).
+ */
+static int add_module_usage(struct module *a, struct module *b)
+{
+ struct module_use *use;
+
+ pr_debug("Allocating new usage for %s.\n", a->name);
+ use = kmalloc(sizeof(*use), GFP_ATOMIC);
+ if (!use)
+ return -ENOMEM;
+
+ use->source = a;
+ use->target = b;
+ list_add(&use->source_list, &b->source_list);
+ list_add(&use->target_list, &a->target_list);
+ return 0;
+}
+
+/* Module a uses b: caller needs module_mutex() */
+static int ref_module(struct module *a, struct module *b)
+{
+ int err;
+
+ if (b == NULL || already_uses(a, b))
+ return 0;
+
+ /* If module isn't available, we fail. */
+ err = strong_try_module_get(b);
+ if (err)
+ return err;
+
+ err = add_module_usage(a, b);
+ if (err) {
+ module_put(b);
+ return err;
+ }
+ return 0;
+}
+
+/* Clear the unload stuff of the module. */
+static void module_unload_free(struct module *mod)
+{
+ struct module_use *use, *tmp;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) {
+ struct module *i = use->target;
+ pr_debug("%s unusing %s\n", mod->name, i->name);
+ module_put(i);
+ list_del(&use->source_list);
+ list_del(&use->target_list);
+ kfree(use);
+ }
+ mutex_unlock(&module_mutex);
+}
+
+#ifdef CONFIG_MODULE_FORCE_UNLOAD
+static inline int try_force_unload(unsigned int flags)
+{
+ int ret = (flags & O_TRUNC);
+ if (ret)
+ add_taint(TAINT_FORCED_RMMOD, LOCKDEP_NOW_UNRELIABLE);
+ return ret;
+}
+#else
+static inline int try_force_unload(unsigned int flags)
+{
+ return 0;
+}
+#endif /* CONFIG_MODULE_FORCE_UNLOAD */
+
+/* Try to release refcount of module, 0 means success. */
+static int try_release_module_ref(struct module *mod)
+{
+ int ret;
+
+ /* Try to decrement refcnt which we set at loading */
+ ret = atomic_sub_return(MODULE_REF_BASE, &mod->refcnt);
+ BUG_ON(ret < 0);
+ if (ret)
+ /* Someone can put this right now, recover with checking */
+ ret = atomic_add_unless(&mod->refcnt, MODULE_REF_BASE, 0);
+
+ return ret;
+}
+
+static int try_stop_module(struct module *mod, int flags, int *forced)
+{
+ /* If it's not unused, quit unless we're forcing. */
+ if (try_release_module_ref(mod) != 0) {
+ *forced = try_force_unload(flags);
+ if (!(*forced))
+ return -EWOULDBLOCK;
+ }
+
+ /* Mark it as dying. */
+ mod->state = MODULE_STATE_GOING;
+
+ return 0;
+}
+
+/**
+ * module_refcount() - return the refcount or -1 if unloading
+ * @mod: the module we're checking
+ *
+ * Return:
+ * -1 if the module is in the process of unloading
+ * otherwise the number of references in the kernel to the module
+ */
+int module_refcount(struct module *mod)
+{
+ return atomic_read(&mod->refcnt) - MODULE_REF_BASE;
+}
+EXPORT_SYMBOL(module_refcount);
+
+/* This exists whether we can unload or not */
+static void free_module(struct module *mod);
+
+SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
+ unsigned int, flags)
+{
+ struct module *mod;
+ char name[MODULE_NAME_LEN];
+ char buf[MODULE_FLAGS_BUF_SIZE];
+ int ret, len, forced = 0;
+
+ if (!capable(CAP_SYS_MODULE) || modules_disabled)
+ return -EPERM;
+
+ len = strncpy_from_user(name, name_user, MODULE_NAME_LEN);
+ if (len == 0 || len == MODULE_NAME_LEN)
+ return -ENOENT;
+ if (len < 0)
+ return len;
+
+ audit_log_kern_module(name);
+
+ if (mutex_lock_interruptible(&module_mutex) != 0)
+ return -EINTR;
+
+ mod = find_module(name);
+ if (!mod) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ if (!list_empty(&mod->source_list)) {
+ /* Other modules depend on us: get rid of them first. */
+ ret = -EWOULDBLOCK;
+ goto out;
+ }
+
+ /* Doing init or already dying? */
+ if (mod->state != MODULE_STATE_LIVE) {
+ /* FIXME: if (force), slam module count damn the torpedoes */
+ pr_debug("%s already dying\n", mod->name);
+ ret = -EBUSY;
+ goto out;
+ }
+
+ /* If it has an init func, it must have an exit func to unload */
+ if (mod->init && !mod->exit) {
+ forced = try_force_unload(flags);
+ if (!forced) {
+ /* This module can't be removed */
+ ret = -EBUSY;
+ goto out;
+ }
+ }
+
+ ret = try_stop_module(mod, flags, &forced);
+ if (ret != 0)
+ goto out;
+
+ mutex_unlock(&module_mutex);
+ /* Final destruction now no one is using it. */
+ if (mod->exit != NULL)
+ mod->exit();
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_GOING, mod);
+ klp_module_going(mod);
+ ftrace_release_mod(mod);
+
+ async_synchronize_full();
+
+ /* Store the name and taints of the last unloaded module for diagnostic purposes */
+ strscpy(last_unloaded_module.name, mod->name);
+ strscpy(last_unloaded_module.taints, module_flags(mod, buf, false));
+
+ free_module(mod);
+ /* someone could wait for the module in add_unformed_module() */
+ wake_up_all(&module_wq);
+ return 0;
+out:
+ mutex_unlock(&module_mutex);
+ return ret;
+}
+
+void __symbol_put(const char *symbol)
+{
+ struct find_symbol_arg fsa = {
+ .name = symbol,
+ .gplok = true,
+ };
+
+ guard(rcu)();
+ BUG_ON(!find_symbol(&fsa));
+ module_put(fsa.owner);
+}
+EXPORT_SYMBOL(__symbol_put);
+
+/* Note this assumes addr is a function, which it currently always is. */
+void symbol_put_addr(void *addr)
+{
+ struct module *modaddr;
+ unsigned long a = (unsigned long)dereference_function_descriptor(addr);
+
+ if (core_kernel_text(a))
+ return;
+
+ /*
+ * Even though we hold a reference on the module; we still need to
+ * RCU read section in order to safely traverse the data structure.
+ */
+ guard(rcu)();
+ modaddr = __module_text_address(a);
+ BUG_ON(!modaddr);
+ module_put(modaddr);
+}
+EXPORT_SYMBOL_GPL(symbol_put_addr);
+
+static ssize_t show_refcnt(const struct module_attribute *mattr,
+ struct module_kobject *mk, char *buffer)
+{
+ return sprintf(buffer, "%i\n", module_refcount(mk->mod));
+}
+
+static const struct module_attribute modinfo_refcnt =
+ __ATTR(refcnt, 0444, show_refcnt, NULL);
+
+void __module_get(struct module *module)
+{
+ if (module) {
+ atomic_inc(&module->refcnt);
+ trace_module_get(module, _RET_IP_);
+ }
+}
+EXPORT_SYMBOL(__module_get);
+
+bool try_module_get(struct module *module)
+{
+ bool ret = true;
+
+ if (module) {
+ /* Note: here, we can fail to get a reference */
+ if (likely(module_is_live(module) &&
+ atomic_inc_not_zero(&module->refcnt) != 0))
+ trace_module_get(module, _RET_IP_);
+ else
+ ret = false;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(try_module_get);
+
+void module_put(struct module *module)
+{
+ int ret;
+
+ if (module) {
+ ret = atomic_dec_if_positive(&module->refcnt);
+ WARN_ON(ret < 0); /* Failed to put refcount */
+ trace_module_put(module, _RET_IP_);
+ }
+}
+EXPORT_SYMBOL(module_put);
+
+#else /* !CONFIG_MODULE_UNLOAD */
+static inline void module_unload_free(struct module *mod)
+{
+}
+
+static int ref_module(struct module *a, struct module *b)
+{
+ return strong_try_module_get(b);
+}
+
+static inline int module_unload_init(struct module *mod)
+{
+ return 0;
+}
+#endif /* CONFIG_MODULE_UNLOAD */
+
+size_t module_flags_taint(unsigned long taints, char *buf)
+{
+ size_t l = 0;
+ int i;
+
+ for (i = 0; i < TAINT_FLAGS_COUNT; i++) {
+ if (test_bit(i, &taints))
+ buf[l++] = taint_flags[i].c_true;
+ }
+
+ return l;
+}
+
+static ssize_t show_initstate(const struct module_attribute *mattr,
+ struct module_kobject *mk, char *buffer)
+{
+ const char *state = "unknown";
+
+ switch (mk->mod->state) {
+ case MODULE_STATE_LIVE:
+ state = "live";
+ break;
+ case MODULE_STATE_COMING:
+ state = "coming";
+ break;
+ case MODULE_STATE_GOING:
+ state = "going";
+ break;
+ default:
+ BUG();
+ }
+ return sprintf(buffer, "%s\n", state);
+}
+
+static const struct module_attribute modinfo_initstate =
+ __ATTR(initstate, 0444, show_initstate, NULL);
+
+static ssize_t store_uevent(const struct module_attribute *mattr,
+ struct module_kobject *mk,
+ const char *buffer, size_t count)
+{
+ int rc;
+
+ rc = kobject_synth_uevent(&mk->kobj, buffer, count);
+ return rc ? rc : count;
+}
+
+const struct module_attribute module_uevent =
+ __ATTR(uevent, 0200, NULL, store_uevent);
+
+static ssize_t show_coresize(const struct module_attribute *mattr,
+ struct module_kobject *mk, char *buffer)
+{
+ unsigned int size = mk->mod->mem[MOD_TEXT].size;
+
+ if (!IS_ENABLED(CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC)) {
+ for_class_mod_mem_type(type, core_data)
+ size += mk->mod->mem[type].size;
+ }
+ return sprintf(buffer, "%u\n", size);
+}
+
+static const struct module_attribute modinfo_coresize =
+ __ATTR(coresize, 0444, show_coresize, NULL);
+
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+static ssize_t show_datasize(const struct module_attribute *mattr,
+ struct module_kobject *mk, char *buffer)
+{
+ unsigned int size = 0;
+
+ for_class_mod_mem_type(type, core_data)
+ size += mk->mod->mem[type].size;
+ return sprintf(buffer, "%u\n", size);
+}
+
+static const struct module_attribute modinfo_datasize =
+ __ATTR(datasize, 0444, show_datasize, NULL);
+#endif
+
+static ssize_t show_initsize(const struct module_attribute *mattr,
+ struct module_kobject *mk, char *buffer)
+{
+ unsigned int size = 0;
+
+ for_class_mod_mem_type(type, init)
+ size += mk->mod->mem[type].size;
+ return sprintf(buffer, "%u\n", size);
+}
+
+static const struct module_attribute modinfo_initsize =
+ __ATTR(initsize, 0444, show_initsize, NULL);
+
+static ssize_t show_taint(const struct module_attribute *mattr,
+ struct module_kobject *mk, char *buffer)
+{
+ size_t l;
+
+ l = module_flags_taint(mk->mod->taints, buffer);
+ buffer[l++] = '\n';
+ return l;
+}
+
+static const struct module_attribute modinfo_taint =
+ __ATTR(taint, 0444, show_taint, NULL);
+
+const struct module_attribute *const modinfo_attrs[] = {
+ &module_uevent,
+ &modinfo_version,
+ &modinfo_srcversion,
+ &modinfo_initstate,
+ &modinfo_coresize,
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+ &modinfo_datasize,
+#endif
+ &modinfo_initsize,
+ &modinfo_taint,
+#ifdef CONFIG_MODULE_UNLOAD
+ &modinfo_refcnt,
+#endif
+ NULL,
+};
+
+const size_t modinfo_attrs_count = ARRAY_SIZE(modinfo_attrs);
+
+static const char vermagic[] = VERMAGIC_STRING;
+
+int try_to_force_load(struct module *mod, const char *reason)
+{
+#ifdef CONFIG_MODULE_FORCE_LOAD
+ if (!test_taint(TAINT_FORCED_MODULE))
+ pr_warn("%s: %s: kernel tainted.\n", mod->name, reason);
+ add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE);
+ return 0;
+#else
+ return -ENOEXEC;
+#endif
+}
+
+/* Parse tag=value strings from .modinfo section */
+char *module_next_tag_pair(char *string, unsigned long *secsize)
+{
+ /* Skip non-zero chars */
+ while (string[0]) {
+ string++;
+ if ((*secsize)-- <= 1)
+ return NULL;
+ }
+
+ /* Skip any zero padding. */
+ while (!string[0]) {
+ string++;
+ if ((*secsize)-- <= 1)
+ return NULL;
+ }
+ return string;
+}
+
+static char *get_next_modinfo(const struct load_info *info, const char *tag,
+ char *prev)
+{
+ char *p;
+ unsigned int taglen = strlen(tag);
+ Elf_Shdr *infosec = &info->sechdrs[info->index.info];
+ unsigned long size = infosec->sh_size;
+
+ /*
+ * get_modinfo() calls made before rewrite_section_headers()
+ * must use sh_offset, as sh_addr isn't set!
+ */
+ char *modinfo = (char *)info->hdr + infosec->sh_offset;
+
+ if (prev) {
+ size -= prev - modinfo;
+ modinfo = module_next_tag_pair(prev, &size);
+ }
+
+ for (p = modinfo; p; p = module_next_tag_pair(p, &size)) {
+ if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
+ return p + taglen + 1;
+ }
+ return NULL;
+}
+
+static char *get_modinfo(const struct load_info *info, const char *tag)
+{
+ return get_next_modinfo(info, tag, NULL);
+}
+
+/**
+ * verify_module_namespace() - does @modname have access to this symbol's @namespace
+ * @namespace: export symbol namespace
+ * @modname: module name
+ *
+ * If @namespace is prefixed with "module:" to indicate it is a module namespace
+ * then test if @modname matches any of the comma separated patterns.
+ *
+ * The patterns only support tail-glob.
+ */
+static bool verify_module_namespace(const char *namespace, const char *modname)
+{
+ size_t len, modlen = strlen(modname);
+ const char *prefix = "module:";
+ const char *sep;
+ bool glob;
+
+ if (!strstarts(namespace, prefix))
+ return false;
+
+ for (namespace += strlen(prefix); *namespace; namespace = sep) {
+ sep = strchrnul(namespace, ',');
+ len = sep - namespace;
+
+ glob = false;
+ if (sep[-1] == '*') {
+ len--;
+ glob = true;
+ }
+
+ if (*sep)
+ sep++;
+
+ if (mod_strncmp(namespace, modname, len) == 0 && (glob || len == modlen))
+ return true;
+ }
+
+ return false;
+}
+
+static int verify_namespace_is_imported(const struct load_info *info,
+ const struct kernel_symbol *sym,
+ struct module *mod)
+{
+ const char *namespace;
+ char *imported_namespace;
+
+ namespace = kernel_symbol_namespace(sym);
+ if (namespace && namespace[0]) {
+
+ if (verify_module_namespace(namespace, mod->name))
+ return 0;
+
+ for_each_modinfo_entry(imported_namespace, info, "import_ns") {
+ if (strcmp(namespace, imported_namespace) == 0)
+ return 0;
+ }
+#ifdef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
+ pr_warn(
+#else
+ pr_err(
+#endif
+ "%s: module uses symbol (%s) from namespace %s, but does not import it.\n",
+ mod->name, kernel_symbol_name(sym), namespace);
+#ifndef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
+ return -EINVAL;
+#endif
+ }
+ return 0;
+}
+
+static bool inherit_taint(struct module *mod, struct module *owner, const char *name)
+{
+ if (!owner || !test_bit(TAINT_PROPRIETARY_MODULE, &owner->taints))
+ return true;
+
+ if (mod->using_gplonly_symbols) {
+ pr_err("%s: module using GPL-only symbols uses symbols %s from proprietary module %s.\n",
+ mod->name, name, owner->name);
+ return false;
+ }
+
+ if (!test_bit(TAINT_PROPRIETARY_MODULE, &mod->taints)) {
+ pr_warn("%s: module uses symbols %s from proprietary module %s, inheriting taint.\n",
+ mod->name, name, owner->name);
+ set_bit(TAINT_PROPRIETARY_MODULE, &mod->taints);
+ }
+ return true;
+}
+
+/* Resolve a symbol for this module. I.e. if we find one, record usage. */
+static const struct kernel_symbol *resolve_symbol(struct module *mod,
+ const struct load_info *info,
+ const char *name,
+ char ownername[])
+{
+ struct find_symbol_arg fsa = {
+ .name = name,
+ .gplok = !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)),
+ .warn = true,
+ };
+ int err;
+
+ /*
+ * The module_mutex should not be a heavily contended lock;
+ * if we get the occasional sleep here, we'll go an extra iteration
+ * in the wait_event_interruptible(), which is harmless.
+ */
+ sched_annotate_sleep();
+ mutex_lock(&module_mutex);
+ if (!find_symbol(&fsa))
+ goto unlock;
+
+ if (fsa.license == GPL_ONLY)
+ mod->using_gplonly_symbols = true;
+
+ if (!inherit_taint(mod, fsa.owner, name)) {
+ fsa.sym = NULL;
+ goto getname;
+ }
+
+ if (!check_version(info, name, mod, fsa.crc)) {
+ fsa.sym = ERR_PTR(-EINVAL);
+ goto getname;
+ }
+
+ err = verify_namespace_is_imported(info, fsa.sym, mod);
+ if (err) {
+ fsa.sym = ERR_PTR(err);
+ goto getname;
+ }
+
+ err = ref_module(mod, fsa.owner);
+ if (err) {
+ fsa.sym = ERR_PTR(err);
+ goto getname;
+ }
+
+getname:
+ /* We must make copy under the lock if we failed to get ref. */
+ strscpy(ownername, module_name(fsa.owner), MODULE_NAME_LEN);
+unlock:
+ mutex_unlock(&module_mutex);
+ return fsa.sym;
+}
+
+static const struct kernel_symbol *
+resolve_symbol_wait(struct module *mod,
+ const struct load_info *info,
+ const char *name)
+{
+ const struct kernel_symbol *ksym;
+ char owner[MODULE_NAME_LEN];
+
+ if (wait_event_interruptible_timeout(module_wq,
+ !IS_ERR(ksym = resolve_symbol(mod, info, name, owner))
+ || PTR_ERR(ksym) != -EBUSY,
+ 30 * HZ) <= 0) {
+ pr_warn("%s: gave up waiting for init of module %s.\n",
+ mod->name, owner);
+ }
+ return ksym;
+}
+
+void __weak module_arch_cleanup(struct module *mod)
+{
+}
+
+void __weak module_arch_freeing_init(struct module *mod)
+{
+}
+
+static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
+{
+ unsigned int size = PAGE_ALIGN(mod->mem[type].size);
+ enum execmem_type execmem_type;
+ void *ptr;
+
+ mod->mem[type].size = size;
+
+ if (mod_mem_type_is_data(type))
+ execmem_type = EXECMEM_MODULE_DATA;
+ else
+ execmem_type = EXECMEM_MODULE_TEXT;
+
+ ptr = execmem_alloc_rw(execmem_type, size);
+ if (!ptr)
+ return -ENOMEM;
+
+ mod->mem[type].is_rox = execmem_is_rox(execmem_type);
+
+ /*
+ * The pointer to these blocks of memory are stored on the module
+ * structure and we keep that around so long as the module is
+ * around. We only free that memory when we unload the module.
+ * Just mark them as not being a leak then. The .init* ELF
+ * sections *do* get freed after boot so we *could* treat them
+ * slightly differently with kmemleak_ignore() and only grey
+ * them out as they work as typical memory allocations which
+ * *do* eventually get freed, but let's just keep things simple
+ * and avoid *any* false positives.
+ */
+ if (!mod->mem[type].is_rox)
+ kmemleak_not_leak(ptr);
+
+ memset(ptr, 0, size);
+ mod->mem[type].base = ptr;
+
+ return 0;
+}
+
+static void module_memory_restore_rox(struct module *mod)
+{
+ for_class_mod_mem_type(type, text) {
+ struct module_memory *mem = &mod->mem[type];
+
+ if (mem->is_rox)
+ execmem_restore_rox(mem->base, mem->size);
+ }
+}
+
+static void module_memory_free(struct module *mod, enum mod_mem_type type)
+{
+ struct module_memory *mem = &mod->mem[type];
+
+ execmem_free(mem->base);
+}
+
+static void free_mod_mem(struct module *mod)
+{
+ for_each_mod_mem_type(type) {
+ struct module_memory *mod_mem = &mod->mem[type];
+
+ if (type == MOD_DATA)
+ continue;
+
+ /* Free lock-classes; relies on the preceding sync_rcu(). */
+ lockdep_free_key_range(mod_mem->base, mod_mem->size);
+ if (mod_mem->size)
+ module_memory_free(mod, type);
+ }
+
+ /* MOD_DATA hosts mod, so free it at last */
+ lockdep_free_key_range(mod->mem[MOD_DATA].base, mod->mem[MOD_DATA].size);
+ module_memory_free(mod, MOD_DATA);
+}
+
+/* Free a module, remove from lists, etc. */
+static void free_module(struct module *mod)
+{
+ trace_module_free(mod);
+
+ codetag_unload_module(mod);
+
+ mod_sysfs_teardown(mod);
+
+ /*
+ * We leave it in list to prevent duplicate loads, but make sure
+ * that noone uses it while it's being deconstructed.
+ */
+ mutex_lock(&module_mutex);
+ mod->state = MODULE_STATE_UNFORMED;
+ mutex_unlock(&module_mutex);
+
+ /* Arch-specific cleanup. */
+ module_arch_cleanup(mod);
+
+ /* Module unload stuff */
+ module_unload_free(mod);
+
+ /* Free any allocated parameters. */
+ destroy_params(mod->kp, mod->num_kp);
+
+ if (is_livepatch_module(mod))
+ free_module_elf(mod);
+
+ /* Now we can delete it from the lists */
+ mutex_lock(&module_mutex);
+ /* Unlink carefully: kallsyms could be walking list. */
+ list_del_rcu(&mod->list);
+ mod_tree_remove(mod);
+ /* Remove this module from bug list, this uses list_del_rcu */
+ module_bug_cleanup(mod);
+ /* Wait for RCU synchronizing before releasing mod->list and buglist. */
+ synchronize_rcu();
+ if (try_add_tainted_module(mod))
+ pr_err("%s: adding tainted module to the unloaded tainted modules list failed.\n",
+ mod->name);
+ mutex_unlock(&module_mutex);
+
+ /* This may be empty, but that's OK */
+ module_arch_freeing_init(mod);
+ kfree(mod->args);
+ percpu_modfree(mod);
+
+ free_mod_mem(mod);
+}
+
+void *__symbol_get(const char *symbol)
+{
+ struct find_symbol_arg fsa = {
+ .name = symbol,
+ .gplok = true,
+ .warn = true,
+ };
+
+ scoped_guard(rcu) {
+ if (!find_symbol(&fsa))
+ return NULL;
+ if (fsa.license != GPL_ONLY) {
+ pr_warn("failing symbol_get of non-GPLONLY symbol %s.\n",
+ symbol);
+ return NULL;
+ }
+ if (strong_try_module_get(fsa.owner))
+ return NULL;
+ }
+ return (void *)kernel_symbol_value(fsa.sym);
+}
+EXPORT_SYMBOL_GPL(__symbol_get);
+
+/*
+ * Ensure that an exported symbol [global namespace] does not already exist
+ * in the kernel or in some other module's exported symbol table.
+ *
+ * You must hold the module_mutex.
+ */
+static int verify_exported_symbols(struct module *mod)
+{
+ unsigned int i;
+ const struct kernel_symbol *s;
+ struct {
+ const struct kernel_symbol *sym;
+ unsigned int num;
+ } arr[] = {
+ { mod->syms, mod->num_syms },
+ { mod->gpl_syms, mod->num_gpl_syms },
+ };
+
+ for (i = 0; i < ARRAY_SIZE(arr); i++) {
+ for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
+ struct find_symbol_arg fsa = {
+ .name = kernel_symbol_name(s),
+ .gplok = true,
+ };
+ if (find_symbol(&fsa)) {
+ pr_err("%s: exports duplicate symbol %s"
+ " (owned by %s)\n",
+ mod->name, kernel_symbol_name(s),
+ module_name(fsa.owner));
+ return -ENOEXEC;
+ }
+ }
+ }
+ return 0;
+}
+
+static bool ignore_undef_symbol(Elf_Half emachine, const char *name)
+{
+ /*
+ * On x86, PIC code and Clang non-PIC code may have call foo@PLT. GNU as
+ * before 2.37 produces an unreferenced _GLOBAL_OFFSET_TABLE_ on x86-64.
+ * i386 has a similar problem but may not deserve a fix.
+ *
+ * If we ever have to ignore many symbols, consider refactoring the code to
+ * only warn if referenced by a relocation.
+ */
+ if (emachine == EM_386 || emachine == EM_X86_64)
+ return !strcmp(name, "_GLOBAL_OFFSET_TABLE_");
+ return false;
+}
+
+/* Change all symbols so that st_value encodes the pointer directly. */
+static int simplify_symbols(struct module *mod, const struct load_info *info)
+{
+ Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
+ Elf_Sym *sym = (void *)symsec->sh_addr;
+ unsigned long secbase;
+ unsigned int i;
+ int ret = 0;
+ const struct kernel_symbol *ksym;
+
+ for (i = 1; i < symsec->sh_size / sizeof(Elf_Sym); i++) {
+ const char *name = info->strtab + sym[i].st_name;
+
+ switch (sym[i].st_shndx) {
+ case SHN_COMMON:
+ /* Ignore common symbols */
+ if (!strncmp(name, "__gnu_lto", 9))
+ break;
+
+ /*
+ * We compiled with -fno-common. These are not
+ * supposed to happen.
+ */
+ pr_debug("Common symbol: %s\n", name);
+ pr_warn("%s: please compile with -fno-common\n",
+ mod->name);
+ ret = -ENOEXEC;
+ break;
+
+ case SHN_ABS:
+ /* Don't need to do anything */
+ pr_debug("Absolute symbol: 0x%08lx %s\n",
+ (long)sym[i].st_value, name);
+ break;
+
+ case SHN_LIVEPATCH:
+ /* Livepatch symbols are resolved by livepatch */
+ break;
+
+ case SHN_UNDEF:
+ ksym = resolve_symbol_wait(mod, info, name);
+ /* Ok if resolved. */
+ if (ksym && !IS_ERR(ksym)) {
+ sym[i].st_value = kernel_symbol_value(ksym);
+ break;
+ }
+
+ /* Ok if weak or ignored. */
+ if (!ksym &&
+ (ELF_ST_BIND(sym[i].st_info) == STB_WEAK ||
+ ignore_undef_symbol(info->hdr->e_machine, name)))
+ break;
+
+ ret = PTR_ERR(ksym) ?: -ENOENT;
+ pr_warn("%s: Unknown symbol %s (err %d)\n",
+ mod->name, name, ret);
+ break;
+
+ default:
+ /* Divert to percpu allocation if a percpu var. */
+ if (sym[i].st_shndx == info->index.pcpu)
+ secbase = (unsigned long)mod_percpu(mod);
+ else
+ secbase = info->sechdrs[sym[i].st_shndx].sh_addr;
+ sym[i].st_value += secbase;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static int apply_relocations(struct module *mod, const struct load_info *info)
+{
+ unsigned int i;
+ int err = 0;
+
+ /* Now do relocations. */
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ unsigned int infosec = info->sechdrs[i].sh_info;
+
+ /* Not a valid relocation section? */
+ if (infosec >= info->hdr->e_shnum)
+ continue;
+
+ /*
+ * Don't bother with non-allocated sections.
+ * An exception is the percpu section, which has separate allocations
+ * for individual CPUs. We relocate the percpu section in the initial
+ * ELF template and subsequently copy it to the per-CPU destinations.
+ */
+ if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC) &&
+ (!infosec || infosec != info->index.pcpu))
+ continue;
+
+ if (info->sechdrs[i].sh_flags & SHF_RELA_LIVEPATCH)
+ err = klp_apply_section_relocs(mod, info->sechdrs,
+ info->secstrings,
+ info->strtab,
+ info->index.sym, i,
+ NULL);
+ else if (info->sechdrs[i].sh_type == SHT_REL)
+ err = apply_relocate(info->sechdrs, info->strtab,
+ info->index.sym, i, mod);
+ else if (info->sechdrs[i].sh_type == SHT_RELA)
+ err = apply_relocate_add(info->sechdrs, info->strtab,
+ info->index.sym, i, mod);
+ if (err < 0)
+ break;
+ }
+ return err;
+}
+
+/* Additional bytes needed by arch in front of individual sections */
+unsigned int __weak arch_mod_section_prepend(struct module *mod,
+ unsigned int section)
+{
+ /* default implementation just returns zero */
+ return 0;
+}
+
+long module_get_offset_and_type(struct module *mod, enum mod_mem_type type,
+ Elf_Shdr *sechdr, unsigned int section)
+{
+ long offset;
+ long mask = ((unsigned long)(type) & SH_ENTSIZE_TYPE_MASK) << SH_ENTSIZE_TYPE_SHIFT;
+
+ mod->mem[type].size += arch_mod_section_prepend(mod, section);
+ offset = ALIGN(mod->mem[type].size, sechdr->sh_addralign ?: 1);
+ mod->mem[type].size = offset + sechdr->sh_size;
+
+ WARN_ON_ONCE(offset & mask);
+ return offset | mask;
+}
+
+bool module_init_layout_section(const char *sname)
+{
+#ifndef CONFIG_MODULE_UNLOAD
+ if (module_exit_section(sname))
+ return true;
+#endif
+ return module_init_section(sname);
+}
+
+static void __layout_sections(struct module *mod, struct load_info *info, bool is_init)
+{
+ unsigned int m, i;
+
+ /*
+ * { Mask of required section header flags,
+ * Mask of excluded section header flags }
+ */
+ static const unsigned long masks[][2] = {
+ { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL },
+ { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL },
+ { SHF_RO_AFTER_INIT | SHF_ALLOC, ARCH_SHF_SMALL },
+ { SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL },
+ { ARCH_SHF_SMALL | SHF_ALLOC, 0 }
+ };
+ static const int core_m_to_mem_type[] = {
+ MOD_TEXT,
+ MOD_RODATA,
+ MOD_RO_AFTER_INIT,
+ MOD_DATA,
+ MOD_DATA,
+ };
+ static const int init_m_to_mem_type[] = {
+ MOD_INIT_TEXT,
+ MOD_INIT_RODATA,
+ MOD_INVALID,
+ MOD_INIT_DATA,
+ MOD_INIT_DATA,
+ };
+
+ for (m = 0; m < ARRAY_SIZE(masks); ++m) {
+ enum mod_mem_type type = is_init ? init_m_to_mem_type[m] : core_m_to_mem_type[m];
+
+ for (i = 0; i < info->hdr->e_shnum; ++i) {
+ Elf_Shdr *s = &info->sechdrs[i];
+ const char *sname = info->secstrings + s->sh_name;
+
+ if ((s->sh_flags & masks[m][0]) != masks[m][0]
+ || (s->sh_flags & masks[m][1])
+ || s->sh_entsize != ~0UL
+ || is_init != module_init_layout_section(sname))
+ continue;
+
+ if (WARN_ON_ONCE(type == MOD_INVALID))
+ continue;
+
+ /*
+ * Do not allocate codetag memory as we load it into
+ * preallocated contiguous memory.
+ */
+ if (codetag_needs_module_section(mod, sname, s->sh_size)) {
+ /*
+ * s->sh_entsize won't be used but populate the
+ * type field to avoid confusion.
+ */
+ s->sh_entsize = ((unsigned long)(type) & SH_ENTSIZE_TYPE_MASK)
+ << SH_ENTSIZE_TYPE_SHIFT;
+ continue;
+ }
+
+ s->sh_entsize = module_get_offset_and_type(mod, type, s, i);
+ pr_debug("\t%s\n", sname);
+ }
+ }
+}
+
+/*
+ * Lay out the SHF_ALLOC sections in a way not dissimilar to how ld
+ * might -- code, read-only data, read-write data, small data. Tally
+ * sizes, and place the offsets into sh_entsize fields: high bit means it
+ * belongs in init.
+ */
+static void layout_sections(struct module *mod, struct load_info *info)
+{
+ unsigned int i;
+
+ for (i = 0; i < info->hdr->e_shnum; i++)
+ info->sechdrs[i].sh_entsize = ~0UL;
+
+ pr_debug("Core section allocation order for %s:\n", mod->name);
+ __layout_sections(mod, info, false);
+
+ pr_debug("Init section allocation order for %s:\n", mod->name);
+ __layout_sections(mod, info, true);
+}
+
+static void module_license_taint_check(struct module *mod, const char *license)
+{
+ if (!license)
+ license = "unspecified";
+
+ if (!license_is_gpl_compatible(license)) {
+ if (!test_taint(TAINT_PROPRIETARY_MODULE))
+ pr_warn("%s: module license '%s' taints kernel.\n",
+ mod->name, license);
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
+ LOCKDEP_NOW_UNRELIABLE);
+ }
+}
+
+static int setup_modinfo(struct module *mod, struct load_info *info)
+{
+ const struct module_attribute *attr;
+ char *imported_namespace;
+ int i;
+
+ for (i = 0; (attr = modinfo_attrs[i]); i++) {
+ if (attr->setup)
+ attr->setup(mod, get_modinfo(info, attr->attr.name));
+ }
+
+ for_each_modinfo_entry(imported_namespace, info, "import_ns") {
+ /*
+ * 'module:' prefixed namespaces are implicit, disallow
+ * explicit imports.
+ */
+ if (strstarts(imported_namespace, "module:")) {
+ pr_err("%s: module tries to import module namespace: %s\n",
+ mod->name, imported_namespace);
+ return -EPERM;
+ }
+ }
+
+ return 0;
+}
+
+static void free_modinfo(struct module *mod)
+{
+ const struct module_attribute *attr;
+ int i;
+
+ for (i = 0; (attr = modinfo_attrs[i]); i++) {
+ if (attr->free)
+ attr->free(mod);
+ }
+}
+
+bool __weak module_init_section(const char *name)
+{
+ return strstarts(name, ".init");
+}
+
+bool __weak module_exit_section(const char *name)
+{
+ return strstarts(name, ".exit");
+}
+
+static int validate_section_offset(const struct load_info *info, Elf_Shdr *shdr)
+{
+#if defined(CONFIG_64BIT)
+ unsigned long long secend;
+#else
+ unsigned long secend;
+#endif
+
+ /*
+ * Check for both overflow and offset/size being
+ * too large.
+ */
+ secend = shdr->sh_offset + shdr->sh_size;
+ if (secend < shdr->sh_offset || secend > info->len)
+ return -ENOEXEC;
+
+ return 0;
+}
+
+/**
+ * elf_validity_ehdr() - Checks an ELF header for module validity
+ * @info: Load info containing the ELF header to check
+ *
+ * Checks whether an ELF header could belong to a valid module. Checks:
+ *
+ * * ELF header is within the data the user provided
+ * * ELF magic is present
+ * * It is relocatable (not final linked, not core file, etc.)
+ * * The header's machine type matches what the architecture expects.
+ * * Optional arch-specific hook for other properties
+ * - module_elf_check_arch() is currently only used by PPC to check
+ * ELF ABI version, but may be used by others in the future.
+ *
+ * Return: %0 if valid, %-ENOEXEC on failure.
+ */
+static int elf_validity_ehdr(const struct load_info *info)
+{
+ if (info->len < sizeof(*(info->hdr))) {
+ pr_err("Invalid ELF header len %lu\n", info->len);
+ return -ENOEXEC;
+ }
+ if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0) {
+ pr_err("Invalid ELF header magic: != %s\n", ELFMAG);
+ return -ENOEXEC;
+ }
+ if (info->hdr->e_type != ET_REL) {
+ pr_err("Invalid ELF header type: %u != %u\n",
+ info->hdr->e_type, ET_REL);
+ return -ENOEXEC;
+ }
+ if (!elf_check_arch(info->hdr)) {
+ pr_err("Invalid architecture in ELF header: %u\n",
+ info->hdr->e_machine);
+ return -ENOEXEC;
+ }
+ if (!module_elf_check_arch(info->hdr)) {
+ pr_err("Invalid module architecture in ELF header: %u\n",
+ info->hdr->e_machine);
+ return -ENOEXEC;
+ }
+ return 0;
+}
+
+/**
+ * elf_validity_cache_sechdrs() - Cache section headers if valid
+ * @info: Load info to compute section headers from
+ *
+ * Checks:
+ *
+ * * ELF header is valid (see elf_validity_ehdr())
+ * * Section headers are the size we expect
+ * * Section array fits in the user provided data
+ * * Section index 0 is NULL
+ * * Section contents are inbounds
+ *
+ * Then updates @info with a &load_info->sechdrs pointer if valid.
+ *
+ * Return: %0 if valid, negative error code if validation failed.
+ */
+static int elf_validity_cache_sechdrs(struct load_info *info)
+{
+ Elf_Shdr *sechdrs;
+ Elf_Shdr *shdr;
+ int i;
+ int err;
+
+ err = elf_validity_ehdr(info);
+ if (err < 0)
+ return err;
+
+ if (info->hdr->e_shentsize != sizeof(Elf_Shdr)) {
+ pr_err("Invalid ELF section header size\n");
+ return -ENOEXEC;
+ }
+
+ /*
+ * e_shnum is 16 bits, and sizeof(Elf_Shdr) is
+ * known and small. So e_shnum * sizeof(Elf_Shdr)
+ * will not overflow unsigned long on any platform.
+ */
+ if (info->hdr->e_shoff >= info->len
+ || (info->hdr->e_shnum * sizeof(Elf_Shdr) >
+ info->len - info->hdr->e_shoff)) {
+ pr_err("Invalid ELF section header overflow\n");
+ return -ENOEXEC;
+ }
+
+ sechdrs = (void *)info->hdr + info->hdr->e_shoff;
+
+ /*
+ * The code assumes that section 0 has a length of zero and
+ * an addr of zero, so check for it.
+ */
+ if (sechdrs[0].sh_type != SHT_NULL
+ || sechdrs[0].sh_size != 0
+ || sechdrs[0].sh_addr != 0) {
+ pr_err("ELF Spec violation: section 0 type(%d)!=SH_NULL or non-zero len or addr\n",
+ sechdrs[0].sh_type);
+ return -ENOEXEC;
+ }
+
+ /* Validate contents are inbounds */
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ shdr = &sechdrs[i];
+ switch (shdr->sh_type) {
+ case SHT_NULL:
+ case SHT_NOBITS:
+ /* No contents, offset/size don't mean anything */
+ continue;
+ default:
+ err = validate_section_offset(info, shdr);
+ if (err < 0) {
+ pr_err("Invalid ELF section in module (section %u type %u)\n",
+ i, shdr->sh_type);
+ return err;
+ }
+ }
+ }
+
+ info->sechdrs = sechdrs;
+
+ return 0;
+}
+
+/**
+ * elf_validity_cache_secstrings() - Caches section names if valid
+ * @info: Load info to cache section names from. Must have valid sechdrs.
+ *
+ * Specifically checks:
+ *
+ * * Section name table index is inbounds of section headers
+ * * Section name table is not empty
+ * * Section name table is NUL terminated
+ * * All section name offsets are inbounds of the section
+ *
+ * Then updates @info with a &load_info->secstrings pointer if valid.
+ *
+ * Return: %0 if valid, negative error code if validation failed.
+ */
+static int elf_validity_cache_secstrings(struct load_info *info)
+{
+ Elf_Shdr *strhdr, *shdr;
+ char *secstrings;
+ int i;
+
+ /*
+ * Verify if the section name table index is valid.
+ */
+ if (info->hdr->e_shstrndx == SHN_UNDEF
+ || info->hdr->e_shstrndx >= info->hdr->e_shnum) {
+ pr_err("Invalid ELF section name index: %d || e_shstrndx (%d) >= e_shnum (%d)\n",
+ info->hdr->e_shstrndx, info->hdr->e_shstrndx,
+ info->hdr->e_shnum);
+ return -ENOEXEC;
+ }
+
+ strhdr = &info->sechdrs[info->hdr->e_shstrndx];
+
+ /*
+ * The section name table must be NUL-terminated, as required
+ * by the spec. This makes strcmp and pr_* calls that access
+ * strings in the section safe.
+ */
+ secstrings = (void *)info->hdr + strhdr->sh_offset;
+ if (strhdr->sh_size == 0) {
+ pr_err("empty section name table\n");
+ return -ENOEXEC;
+ }
+ if (secstrings[strhdr->sh_size - 1] != '\0') {
+ pr_err("ELF Spec violation: section name table isn't null terminated\n");
+ return -ENOEXEC;
+ }
+
+ for (i = 0; i < info->hdr->e_shnum; i++) {
+ shdr = &info->sechdrs[i];
+ /* SHT_NULL means sh_name has an undefined value */
+ if (shdr->sh_type == SHT_NULL)
+ continue;
+ if (shdr->sh_name >= strhdr->sh_size) {
+ pr_err("Invalid ELF section name in module (section %u type %u)\n",
+ i, shdr->sh_type);
+ return -ENOEXEC;
+ }
+ }
+
+ info->secstrings = secstrings;
+ return 0;
+}
+
+/**
+ * elf_validity_cache_index_info() - Validate and cache modinfo section
+ * @info: Load info to populate the modinfo index on.
+ * Must have &load_info->sechdrs and &load_info->secstrings populated
+ *
+ * Checks that if there is a .modinfo section, it is unique.
+ * Then, it caches its index in &load_info->index.info.
+ * Finally, it tries to populate the name to improve error messages.
+ *
+ * Return: %0 if valid, %-ENOEXEC if multiple modinfo sections were found.
+ */
+static int elf_validity_cache_index_info(struct load_info *info)
+{
+ int info_idx;
+
+ info_idx = find_any_unique_sec(info, ".modinfo");
+
+ if (info_idx == 0)
+ /* Early return, no .modinfo */
+ return 0;
+
+ if (info_idx < 0) {
+ pr_err("Only one .modinfo section must exist.\n");
+ return -ENOEXEC;
+ }
+
+ info->index.info = info_idx;
+ /* Try to find a name early so we can log errors with a module name */
+ info->name = get_modinfo(info, "name");
+
+ return 0;
+}
+
+/**
+ * elf_validity_cache_index_mod() - Validates and caches this_module section
+ * @info: Load info to cache this_module on.
+ * Must have &load_info->sechdrs and &load_info->secstrings populated
+ *
+ * The ".gnu.linkonce.this_module" ELF section is special. It is what modpost
+ * uses to refer to __this_module and let's use rely on THIS_MODULE to point
+ * to &__this_module properly. The kernel's modpost declares it on each
+ * modules's *.mod.c file. If the struct module of the kernel changes a full
+ * kernel rebuild is required.
+ *
+ * We have a few expectations for this special section, this function
+ * validates all this for us:
+ *
+ * * The section has contents
+ * * The section is unique
+ * * We expect the kernel to always have to allocate it: SHF_ALLOC
+ * * The section size must match the kernel's run time's struct module
+ * size
+ *
+ * If all checks pass, the index will be cached in &load_info->index.mod
+ *
+ * Return: %0 on validation success, %-ENOEXEC on failure
+ */
+static int elf_validity_cache_index_mod(struct load_info *info)
+{
+ Elf_Shdr *shdr;
+ int mod_idx;
+
+ mod_idx = find_any_unique_sec(info, ".gnu.linkonce.this_module");
+ if (mod_idx <= 0) {
+ pr_err("module %s: Exactly one .gnu.linkonce.this_module section must exist.\n",
+ info->name ?: "(missing .modinfo section or name field)");
+ return -ENOEXEC;
+ }
+
+ shdr = &info->sechdrs[mod_idx];
+
+ if (shdr->sh_type == SHT_NOBITS) {
+ pr_err("module %s: .gnu.linkonce.this_module section must have a size set\n",
+ info->name ?: "(missing .modinfo section or name field)");
+ return -ENOEXEC;
+ }
+
+ if (!(shdr->sh_flags & SHF_ALLOC)) {
+ pr_err("module %s: .gnu.linkonce.this_module must occupy memory during process execution\n",
+ info->name ?: "(missing .modinfo section or name field)");
+ return -ENOEXEC;
+ }
+
+ if (shdr->sh_size != sizeof(struct module)) {
+ pr_err("module %s: .gnu.linkonce.this_module section size must match the kernel's built struct module size at run time\n",
+ info->name ?: "(missing .modinfo section or name field)");
+ return -ENOEXEC;
+ }
+
+ info->index.mod = mod_idx;
+
+ return 0;
+}
+
+/**
+ * elf_validity_cache_index_sym() - Validate and cache symtab index
+ * @info: Load info to cache symtab index in.
+ * Must have &load_info->sechdrs and &load_info->secstrings populated.
+ *
+ * Checks that there is exactly one symbol table, then caches its index in
+ * &load_info->index.sym.
+ *
+ * Return: %0 if valid, %-ENOEXEC on failure.
+ */
+static int elf_validity_cache_index_sym(struct load_info *info)
+{
+ unsigned int sym_idx;
+ unsigned int num_sym_secs = 0;
+ int i;
+
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ if (info->sechdrs[i].sh_type == SHT_SYMTAB) {
+ num_sym_secs++;
+ sym_idx = i;
+ }
+ }
+
+ if (num_sym_secs != 1) {
+ pr_warn("%s: module has no symbols (stripped?)\n",
+ info->name ?: "(missing .modinfo section or name field)");
+ return -ENOEXEC;
+ }
+
+ info->index.sym = sym_idx;
+
+ return 0;
+}
+
+/**
+ * elf_validity_cache_index_str() - Validate and cache strtab index
+ * @info: Load info to cache strtab index in.
+ * Must have &load_info->sechdrs and &load_info->secstrings populated.
+ * Must have &load_info->index.sym populated.
+ *
+ * Looks at the symbol table's associated string table, makes sure it is
+ * in-bounds, and caches it.
+ *
+ * Return: %0 if valid, %-ENOEXEC on failure.
+ */
+static int elf_validity_cache_index_str(struct load_info *info)
+{
+ unsigned int str_idx = info->sechdrs[info->index.sym].sh_link;
+
+ if (str_idx == SHN_UNDEF || str_idx >= info->hdr->e_shnum) {
+ pr_err("Invalid ELF sh_link!=SHN_UNDEF(%d) or (sh_link(%d) >= hdr->e_shnum(%d)\n",
+ str_idx, str_idx, info->hdr->e_shnum);
+ return -ENOEXEC;
+ }
+
+ info->index.str = str_idx;
+ return 0;
+}
+
+/**
+ * elf_validity_cache_index_versions() - Validate and cache version indices
+ * @info: Load info to cache version indices in.
+ * Must have &load_info->sechdrs and &load_info->secstrings populated.
+ * @flags: Load flags, relevant to suppress version loading, see
+ * uapi/linux/module.h
+ *
+ * If we're ignoring modversions based on @flags, zero all version indices
+ * and return validity. Othewrise check:
+ *
+ * * If "__version_ext_crcs" is present, "__version_ext_names" is present
+ * * There is a name present for every crc
+ *
+ * Then populate:
+ *
+ * * &load_info->index.vers
+ * * &load_info->index.vers_ext_crc
+ * * &load_info->index.vers_ext_names
+ *
+ * if present.
+ *
+ * Return: %0 if valid, %-ENOEXEC on failure.
+ */
+static int elf_validity_cache_index_versions(struct load_info *info, int flags)
+{
+ unsigned int vers_ext_crc;
+ unsigned int vers_ext_name;
+ size_t crc_count;
+ size_t remaining_len;
+ size_t name_size;
+ char *name;
+
+ /* If modversions were suppressed, pretend we didn't find any */
+ if (flags & MODULE_INIT_IGNORE_MODVERSIONS) {
+ info->index.vers = 0;
+ info->index.vers_ext_crc = 0;
+ info->index.vers_ext_name = 0;
+ return 0;
+ }
+
+ vers_ext_crc = find_sec(info, "__version_ext_crcs");
+ vers_ext_name = find_sec(info, "__version_ext_names");
+
+ /* If we have one field, we must have the other */
+ if (!!vers_ext_crc != !!vers_ext_name) {
+ pr_err("extended version crc+name presence does not match");
+ return -ENOEXEC;
+ }
+
+ /*
+ * If we have extended version information, we should have the same
+ * number of entries in every section.
+ */
+ if (vers_ext_crc) {
+ crc_count = info->sechdrs[vers_ext_crc].sh_size / sizeof(u32);
+ name = (void *)info->hdr +
+ info->sechdrs[vers_ext_name].sh_offset;
+ remaining_len = info->sechdrs[vers_ext_name].sh_size;
+
+ while (crc_count--) {
+ name_size = strnlen(name, remaining_len) + 1;
+ if (name_size > remaining_len) {
+ pr_err("more extended version crcs than names");
+ return -ENOEXEC;
+ }
+ remaining_len -= name_size;
+ name += name_size;
+ }
+ }
+
+ info->index.vers = find_sec(info, "__versions");
+ info->index.vers_ext_crc = vers_ext_crc;
+ info->index.vers_ext_name = vers_ext_name;
+ return 0;
+}
+
+/**
+ * elf_validity_cache_index() - Resolve, validate, cache section indices
+ * @info: Load info to read from and update.
+ * &load_info->sechdrs and &load_info->secstrings must be populated.
+ * @flags: Load flags, relevant to suppress version loading, see
+ * uapi/linux/module.h
+ *
+ * Populates &load_info->index, validating as it goes.
+ * See child functions for per-field validation:
+ *
+ * * elf_validity_cache_index_info()
+ * * elf_validity_cache_index_mod()
+ * * elf_validity_cache_index_sym()
+ * * elf_validity_cache_index_str()
+ * * elf_validity_cache_index_versions()
+ *
+ * If CONFIG_SMP is enabled, load the percpu section by name with no
+ * validation.
+ *
+ * Return: 0 on success, negative error code if an index failed validation.
+ */
+static int elf_validity_cache_index(struct load_info *info, int flags)
+{
+ int err;
+
+ err = elf_validity_cache_index_info(info);
+ if (err < 0)
+ return err;
+ err = elf_validity_cache_index_mod(info);
+ if (err < 0)
+ return err;
+ err = elf_validity_cache_index_sym(info);
+ if (err < 0)
+ return err;
+ err = elf_validity_cache_index_str(info);
+ if (err < 0)
+ return err;
+ err = elf_validity_cache_index_versions(info, flags);
+ if (err < 0)
+ return err;
+
+ info->index.pcpu = find_pcpusec(info);
+
+ return 0;
+}
+
+/**
+ * elf_validity_cache_strtab() - Validate and cache symbol string table
+ * @info: Load info to read from and update.
+ * Must have &load_info->sechdrs and &load_info->secstrings populated.
+ * Must have &load_info->index populated.
+ *
+ * Checks:
+ *
+ * * The string table is not empty.
+ * * The string table starts and ends with NUL (required by ELF spec).
+ * * Every &Elf_Sym->st_name offset in the symbol table is inbounds of the
+ * string table.
+ *
+ * And caches the pointer as &load_info->strtab in @info.
+ *
+ * Return: 0 on success, negative error code if a check failed.
+ */
+static int elf_validity_cache_strtab(struct load_info *info)
+{
+ Elf_Shdr *str_shdr = &info->sechdrs[info->index.str];
+ Elf_Shdr *sym_shdr = &info->sechdrs[info->index.sym];
+ char *strtab = (char *)info->hdr + str_shdr->sh_offset;
+ Elf_Sym *syms = (void *)info->hdr + sym_shdr->sh_offset;
+ int i;
+
+ if (str_shdr->sh_size == 0) {
+ pr_err("empty symbol string table\n");
+ return -ENOEXEC;
+ }
+ if (strtab[0] != '\0') {
+ pr_err("symbol string table missing leading NUL\n");
+ return -ENOEXEC;
+ }
+ if (strtab[str_shdr->sh_size - 1] != '\0') {
+ pr_err("symbol string table isn't NUL terminated\n");
+ return -ENOEXEC;
+ }
+
+ /*
+ * Now that we know strtab is correctly structured, check symbol
+ * starts are inbounds before they're used later.
+ */
+ for (i = 0; i < sym_shdr->sh_size / sizeof(*syms); i++) {
+ if (syms[i].st_name >= str_shdr->sh_size) {
+ pr_err("symbol name out of bounds in string table");
+ return -ENOEXEC;
+ }
+ }
+
+ info->strtab = strtab;
+ return 0;
+}
+
+/*
+ * Check userspace passed ELF module against our expectations, and cache
+ * useful variables for further processing as we go.
+ *
+ * This does basic validity checks against section offsets and sizes, the
+ * section name string table, and the indices used for it (sh_name).
+ *
+ * As a last step, since we're already checking the ELF sections we cache
+ * useful variables which will be used later for our convenience:
+ *
+ * o pointers to section headers
+ * o cache the modinfo symbol section
+ * o cache the string symbol section
+ * o cache the module section
+ *
+ * As a last step we set info->mod to the temporary copy of the module in
+ * info->hdr. The final one will be allocated in move_module(). Any
+ * modifications we make to our copy of the module will be carried over
+ * to the final minted module.
+ */
+static int elf_validity_cache_copy(struct load_info *info, int flags)
+{
+ int err;
+
+ err = elf_validity_cache_sechdrs(info);
+ if (err < 0)
+ return err;
+ err = elf_validity_cache_secstrings(info);
+ if (err < 0)
+ return err;
+ err = elf_validity_cache_index(info, flags);
+ if (err < 0)
+ return err;
+ err = elf_validity_cache_strtab(info);
+ if (err < 0)
+ return err;
+
+ /* This is temporary: point mod into copy of data. */
+ info->mod = (void *)info->hdr + info->sechdrs[info->index.mod].sh_offset;
+
+ /*
+ * If we didn't load the .modinfo 'name' field earlier, fall back to
+ * on-disk struct mod 'name' field.
+ */
+ if (!info->name)
+ info->name = info->mod->name;
+
+ return 0;
+}
+
+#define COPY_CHUNK_SIZE (16*PAGE_SIZE)
+
+static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned long len)
+{
+ do {
+ unsigned long n = min(len, COPY_CHUNK_SIZE);
+
+ if (copy_from_user(dst, usrc, n) != 0)
+ return -EFAULT;
+ cond_resched();
+ dst += n;
+ usrc += n;
+ len -= n;
+ } while (len);
+ return 0;
+}
+
+static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
+{
+ if (!get_modinfo(info, "livepatch"))
+ /* Nothing more to do */
+ return 0;
+
+ if (set_livepatch_module(mod))
+ return 0;
+
+ pr_err("%s: module is marked as livepatch module, but livepatch support is disabled",
+ mod->name);
+ return -ENOEXEC;
+}
+
+static void check_modinfo_retpoline(struct module *mod, struct load_info *info)
+{
+ if (retpoline_module_ok(get_modinfo(info, "retpoline")))
+ return;
+
+ pr_warn("%s: loading module not compiled with retpoline compiler.\n",
+ mod->name);
+}
+
+/* Sets info->hdr and info->len. */
+static int copy_module_from_user(const void __user *umod, unsigned long len,
+ struct load_info *info)
+{
+ int err;
+
+ info->len = len;
+ if (info->len < sizeof(*(info->hdr)))
+ return -ENOEXEC;
+
+ err = security_kernel_load_data(LOADING_MODULE, true);
+ if (err)
+ return err;
+
+ /* Suck in entire file: we'll want most of it. */
+ info->hdr = __vmalloc(info->len, GFP_KERNEL | __GFP_NOWARN);
+ if (!info->hdr)
+ return -ENOMEM;
+
+ if (copy_chunked_from_user(info->hdr, umod, info->len) != 0) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ err = security_kernel_post_load_data((char *)info->hdr, info->len,
+ LOADING_MODULE, "init_module");
+out:
+ if (err)
+ vfree(info->hdr);
+
+ return err;
+}
+
+static void free_copy(struct load_info *info, int flags)
+{
+ if (flags & MODULE_INIT_COMPRESSED_FILE)
+ module_decompress_cleanup(info);
+ else
+ vfree(info->hdr);
+}
+
+static int rewrite_section_headers(struct load_info *info, int flags)
+{
+ unsigned int i;
+
+ /* This should always be true, but let's be sure. */
+ info->sechdrs[0].sh_addr = 0;
+
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ Elf_Shdr *shdr = &info->sechdrs[i];
+
+ /*
+ * Mark all sections sh_addr with their address in the
+ * temporary image.
+ */
+ shdr->sh_addr = (size_t)info->hdr + shdr->sh_offset;
+
+ }
+
+ /* Track but don't keep modinfo and version sections. */
+ info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
+ info->sechdrs[info->index.vers_ext_crc].sh_flags &=
+ ~(unsigned long)SHF_ALLOC;
+ info->sechdrs[info->index.vers_ext_name].sh_flags &=
+ ~(unsigned long)SHF_ALLOC;
+ info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
+
+ return 0;
+}
+
+static const char *const module_license_offenders[] = {
+ /* driverloader was caught wrongly pretending to be under GPL */
+ "driverloader",
+
+ /* lve claims to be GPL but upstream won't provide source */
+ "lve",
+};
+
+/*
+ * These calls taint the kernel depending certain module circumstances */
+static void module_augment_kernel_taints(struct module *mod, struct load_info *info)
+{
+ int prev_taint = test_taint(TAINT_PROPRIETARY_MODULE);
+ size_t i;
+
+ if (!get_modinfo(info, "intree")) {
+ if (!test_taint(TAINT_OOT_MODULE))
+ pr_warn("%s: loading out-of-tree module taints kernel.\n",
+ mod->name);
+ add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
+ }
+
+ check_modinfo_retpoline(mod, info);
+
+ if (get_modinfo(info, "staging")) {
+ add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
+ pr_warn("%s: module is from the staging directory, the quality "
+ "is unknown, you have been warned.\n", mod->name);
+ }
+
+ if (is_livepatch_module(mod)) {
+ add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK);
+ pr_notice_once("%s: tainting kernel with TAINT_LIVEPATCH\n",
+ mod->name);
+ }
+
+ module_license_taint_check(mod, get_modinfo(info, "license"));
+
+ if (get_modinfo(info, "test")) {
+ if (!test_taint(TAINT_TEST))
+ pr_warn("%s: loading test module taints kernel.\n",
+ mod->name);
+ add_taint_module(mod, TAINT_TEST, LOCKDEP_STILL_OK);
+ }
+#ifdef CONFIG_MODULE_SIG
+ mod->sig_ok = info->sig_ok;
+ if (!mod->sig_ok) {
+ pr_notice_once("%s: module verification failed: signature "
+ "and/or required key missing - tainting "
+ "kernel\n", mod->name);
+ add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK);
+ }
+#endif
+
+ /*
+ * ndiswrapper is under GPL by itself, but loads proprietary modules.
+ * Don't use add_taint_module(), as it would prevent ndiswrapper from
+ * using GPL-only symbols it needs.
+ */
+ if (strcmp(mod->name, "ndiswrapper") == 0)
+ add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE);
+
+ for (i = 0; i < ARRAY_SIZE(module_license_offenders); ++i) {
+ if (strcmp(mod->name, module_license_offenders[i]) == 0)
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
+ LOCKDEP_NOW_UNRELIABLE);
+ }
+
+ if (!prev_taint && test_taint(TAINT_PROPRIETARY_MODULE))
+ pr_warn("%s: module license taints kernel.\n", mod->name);
+
+}
+
+static int check_modinfo(struct module *mod, struct load_info *info, int flags)
+{
+ const char *modmagic = get_modinfo(info, "vermagic");
+ int err;
+
+ if (flags & MODULE_INIT_IGNORE_VERMAGIC)
+ modmagic = NULL;
+
+ /* This is allowed: modprobe --force will invalidate it. */
+ if (!modmagic) {
+ err = try_to_force_load(mod, "bad vermagic");
+ if (err)
+ return err;
+ } else if (!same_magic(modmagic, vermagic, info->index.vers)) {
+ pr_err("%s: version magic '%s' should be '%s'\n",
+ info->name, modmagic, vermagic);
+ return -ENOEXEC;
+ }
+
+ err = check_modinfo_livepatch(mod, info);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static int find_module_sections(struct module *mod, struct load_info *info)
+{
+ mod->kp = section_objs(info, "__param",
+ sizeof(*mod->kp), &mod->num_kp);
+ mod->syms = section_objs(info, "__ksymtab",
+ sizeof(*mod->syms), &mod->num_syms);
+ mod->crcs = section_addr(info, "__kcrctab");
+ mod->gpl_syms = section_objs(info, "__ksymtab_gpl",
+ sizeof(*mod->gpl_syms),
+ &mod->num_gpl_syms);
+ mod->gpl_crcs = section_addr(info, "__kcrctab_gpl");
+
+#ifdef CONFIG_CONSTRUCTORS
+ mod->ctors = section_objs(info, ".ctors",
+ sizeof(*mod->ctors), &mod->num_ctors);
+ if (!mod->ctors)
+ mod->ctors = section_objs(info, ".init_array",
+ sizeof(*mod->ctors), &mod->num_ctors);
+ else if (find_sec(info, ".init_array")) {
+ /*
+ * This shouldn't happen with same compiler and binutils
+ * building all parts of the module.
+ */
+ pr_warn("%s: has both .ctors and .init_array.\n",
+ mod->name);
+ return -EINVAL;
+ }
+#endif
+
+ mod->noinstr_text_start = section_objs(info, ".noinstr.text", 1,
+ &mod->noinstr_text_size);
+
+#ifdef CONFIG_TRACEPOINTS
+ mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs",
+ sizeof(*mod->tracepoints_ptrs),
+ &mod->num_tracepoints);
+#endif
+#ifdef CONFIG_TREE_SRCU
+ mod->srcu_struct_ptrs = section_objs(info, "___srcu_struct_ptrs",
+ sizeof(*mod->srcu_struct_ptrs),
+ &mod->num_srcu_structs);
+#endif
+#ifdef CONFIG_BPF_EVENTS
+ mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map",
+ sizeof(*mod->bpf_raw_events),
+ &mod->num_bpf_raw_events);
+#endif
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+ mod->btf_data = any_section_objs(info, ".BTF", 1, &mod->btf_data_size);
+ mod->btf_base_data = any_section_objs(info, ".BTF.base", 1,
+ &mod->btf_base_data_size);
+#endif
+#ifdef CONFIG_JUMP_LABEL
+ mod->jump_entries = section_objs(info, "__jump_table",
+ sizeof(*mod->jump_entries),
+ &mod->num_jump_entries);
+#endif
+#ifdef CONFIG_EVENT_TRACING
+ mod->trace_events = section_objs(info, "_ftrace_events",
+ sizeof(*mod->trace_events),
+ &mod->num_trace_events);
+ mod->trace_evals = section_objs(info, "_ftrace_eval_map",
+ sizeof(*mod->trace_evals),
+ &mod->num_trace_evals);
+#endif
+#ifdef CONFIG_TRACING
+ mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
+ sizeof(*mod->trace_bprintk_fmt_start),
+ &mod->num_trace_bprintk_fmt);
+#endif
+#ifdef CONFIG_DYNAMIC_FTRACE
+ /* sechdrs[0].sh_size is always zero */
+ mod->ftrace_callsites = section_objs(info, FTRACE_CALLSITE_SECTION,
+ sizeof(*mod->ftrace_callsites),
+ &mod->num_ftrace_callsites);
+#endif
+#ifdef CONFIG_FUNCTION_ERROR_INJECTION
+ mod->ei_funcs = section_objs(info, "_error_injection_whitelist",
+ sizeof(*mod->ei_funcs),
+ &mod->num_ei_funcs);
+#endif
+#ifdef CONFIG_KPROBES
+ mod->kprobes_text_start = section_objs(info, ".kprobes.text", 1,
+ &mod->kprobes_text_size);
+ mod->kprobe_blacklist = section_objs(info, "_kprobe_blacklist",
+ sizeof(unsigned long),
+ &mod->num_kprobe_blacklist);
+#endif
+#ifdef CONFIG_PRINTK_INDEX
+ mod->printk_index_start = section_objs(info, ".printk_index",
+ sizeof(*mod->printk_index_start),
+ &mod->printk_index_size);
+#endif
+#ifdef CONFIG_HAVE_STATIC_CALL_INLINE
+ mod->static_call_sites = section_objs(info, ".static_call_sites",
+ sizeof(*mod->static_call_sites),
+ &mod->num_static_call_sites);
+#endif
+#if IS_ENABLED(CONFIG_KUNIT)
+ mod->kunit_suites = section_objs(info, ".kunit_test_suites",
+ sizeof(*mod->kunit_suites),
+ &mod->num_kunit_suites);
+ mod->kunit_init_suites = section_objs(info, ".kunit_init_test_suites",
+ sizeof(*mod->kunit_init_suites),
+ &mod->num_kunit_init_suites);
+#endif
+
+ mod->extable = section_objs(info, "__ex_table",
+ sizeof(*mod->extable), &mod->num_exentries);
+
+ if (section_addr(info, "__obsparm"))
+ pr_warn("%s: Ignoring obsolete parameters\n", mod->name);
+
+#ifdef CONFIG_DYNAMIC_DEBUG_CORE
+ mod->dyndbg_info.descs = section_objs(info, "__dyndbg",
+ sizeof(*mod->dyndbg_info.descs),
+ &mod->dyndbg_info.num_descs);
+ mod->dyndbg_info.classes = section_objs(info, "__dyndbg_classes",
+ sizeof(*mod->dyndbg_info.classes),
+ &mod->dyndbg_info.num_classes);
+#endif
+
+ return 0;
+}
+
+static int move_module(struct module *mod, struct load_info *info)
+{
+ int i, ret;
+ enum mod_mem_type t = MOD_MEM_NUM_TYPES;
+ bool codetag_section_found = false;
+
+ for_each_mod_mem_type(type) {
+ if (!mod->mem[type].size) {
+ mod->mem[type].base = NULL;
+ continue;
+ }
+
+ ret = module_memory_alloc(mod, type);
+ if (ret) {
+ t = type;
+ goto out_err;
+ }
+ }
+
+ /* Transfer each section which specifies SHF_ALLOC */
+ pr_debug("Final section addresses for %s:\n", mod->name);
+ for (i = 0; i < info->hdr->e_shnum; i++) {
+ void *dest;
+ Elf_Shdr *shdr = &info->sechdrs[i];
+ const char *sname;
+
+ if (!(shdr->sh_flags & SHF_ALLOC))
+ continue;
+
+ sname = info->secstrings + shdr->sh_name;
+ /*
+ * Load codetag sections separately as they might still be used
+ * after module unload.
+ */
+ if (codetag_needs_module_section(mod, sname, shdr->sh_size)) {
+ dest = codetag_alloc_module_section(mod, sname, shdr->sh_size,
+ arch_mod_section_prepend(mod, i), shdr->sh_addralign);
+ if (WARN_ON(!dest)) {
+ ret = -EINVAL;
+ goto out_err;
+ }
+ if (IS_ERR(dest)) {
+ ret = PTR_ERR(dest);
+ goto out_err;
+ }
+ codetag_section_found = true;
+ } else {
+ enum mod_mem_type type = shdr->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT;
+ unsigned long offset = shdr->sh_entsize & SH_ENTSIZE_OFFSET_MASK;
+
+ dest = mod->mem[type].base + offset;
+ }
+
+ if (shdr->sh_type != SHT_NOBITS) {
+ /*
+ * Our ELF checker already validated this, but let's
+ * be pedantic and make the goal clearer. We actually
+ * end up copying over all modifications made to the
+ * userspace copy of the entire struct module.
+ */
+ if (i == info->index.mod &&
+ (WARN_ON_ONCE(shdr->sh_size != sizeof(struct module)))) {
+ ret = -ENOEXEC;
+ goto out_err;
+ }
+ memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
+ }
+ /*
+ * Update the userspace copy's ELF section address to point to
+ * our newly allocated memory as a pure convenience so that
+ * users of info can keep taking advantage and using the newly
+ * minted official memory area.
+ */
+ shdr->sh_addr = (unsigned long)dest;
+ pr_debug("\t0x%lx 0x%.8lx %s\n", (long)shdr->sh_addr,
+ (long)shdr->sh_size, info->secstrings + shdr->sh_name);
+ }
+
+ return 0;
+out_err:
+ module_memory_restore_rox(mod);
+ while (t--)
+ module_memory_free(mod, t);
+ if (codetag_section_found)
+ codetag_free_module_sections(mod);
+
+ return ret;
+}
+
+static int check_export_symbol_versions(struct module *mod)
+{
+#ifdef CONFIG_MODVERSIONS
+ if ((mod->num_syms && !mod->crcs) ||
+ (mod->num_gpl_syms && !mod->gpl_crcs)) {
+ return try_to_force_load(mod,
+ "no versions for exported symbols");
+ }
+#endif
+ return 0;
+}
+
+static void flush_module_icache(const struct module *mod)
+{
+ /*
+ * Flush the instruction cache, since we've played with text.
+ * Do it before processing of module parameters, so the module
+ * can provide parameter accessor functions of its own.
+ */
+ for_each_mod_mem_type(type) {
+ const struct module_memory *mod_mem = &mod->mem[type];
+
+ if (mod_mem->size) {
+ flush_icache_range((unsigned long)mod_mem->base,
+ (unsigned long)mod_mem->base + mod_mem->size);
+ }
+ }
+}
+
+bool __weak module_elf_check_arch(Elf_Ehdr *hdr)
+{
+ return true;
+}
+
+int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
+ Elf_Shdr *sechdrs,
+ char *secstrings,
+ struct module *mod)
+{
+ return 0;
+}
+
+/* module_blacklist is a comma-separated list of module names */
+static char *module_blacklist;
+static bool blacklisted(const char *module_name)
+{
+ const char *p;
+ size_t len;
+
+ if (!module_blacklist)
+ return false;
+
+ for (p = module_blacklist; *p; p += len) {
+ len = strcspn(p, ",");
+ if (strlen(module_name) == len && !memcmp(module_name, p, len))
+ return true;
+ if (p[len] == ',')
+ len++;
+ }
+ return false;
+}
+core_param(module_blacklist, module_blacklist, charp, 0400);
+
+static struct module *layout_and_allocate(struct load_info *info, int flags)
+{
+ struct module *mod;
+ int err;
+
+ /* Allow arches to frob section contents and sizes. */
+ err = module_frob_arch_sections(info->hdr, info->sechdrs,
+ info->secstrings, info->mod);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ err = module_enforce_rwx_sections(info->hdr, info->sechdrs,
+ info->secstrings, info->mod);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ /* We will do a special allocation for per-cpu sections later. */
+ info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;
+
+ /*
+ * Mark relevant sections as SHF_RO_AFTER_INIT so layout_sections() can
+ * put them in the right place.
+ * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set.
+ */
+ module_mark_ro_after_init(info->hdr, info->sechdrs, info->secstrings);
+
+ /*
+ * Determine total sizes, and put offsets in sh_entsize. For now
+ * this is done generically; there doesn't appear to be any
+ * special cases for the architectures.
+ */
+ layout_sections(info->mod, info);
+ layout_symtab(info->mod, info);
+
+ /* Allocate and move to the final place */
+ err = move_module(info->mod, info);
+ if (err)
+ return ERR_PTR(err);
+
+ /* Module has been copied to its final place now: return it. */
+ mod = (void *)info->sechdrs[info->index.mod].sh_addr;
+ kmemleak_load_module(mod, info);
+ codetag_module_replaced(info->mod, mod);
+
+ return mod;
+}
+
+/* mod is no longer valid after this! */
+static void module_deallocate(struct module *mod, struct load_info *info)
+{
+ percpu_modfree(mod);
+ module_arch_freeing_init(mod);
+ codetag_free_module_sections(mod);
+
+ free_mod_mem(mod);
+}
+
+int __weak module_finalize(const Elf_Ehdr *hdr,
+ const Elf_Shdr *sechdrs,
+ struct module *me)
+{
+ return 0;
+}
+
+static int post_relocation(struct module *mod, const struct load_info *info)
+{
+ /* Sort exception table now relocations are done. */
+ sort_extable(mod->extable, mod->extable + mod->num_exentries);
+
+ /* Copy relocated percpu area over. */
+ percpu_modcopy(mod, (void *)info->sechdrs[info->index.pcpu].sh_addr,
+ info->sechdrs[info->index.pcpu].sh_size);
+
+ /* Setup kallsyms-specific fields. */
+ add_kallsyms(mod, info);
+
+ /* Arch-specific module finalizing. */
+ return module_finalize(info->hdr, info->sechdrs, mod);
+}
+
+/* Call module constructors. */
+static void do_mod_ctors(struct module *mod)
+{
+#ifdef CONFIG_CONSTRUCTORS
+ unsigned long i;
+
+ for (i = 0; i < mod->num_ctors; i++)
+ mod->ctors[i]();
+#endif
+}
+
+/* For freeing module_init on success, in case kallsyms traversing */
+struct mod_initfree {
+ struct llist_node node;
+ void *init_text;
+ void *init_data;
+ void *init_rodata;
+};
+
+static void do_free_init(struct work_struct *w)
+{
+ struct llist_node *pos, *n, *list;
+ struct mod_initfree *initfree;
+
+ list = llist_del_all(&init_free_list);
+
+ synchronize_rcu();
+
+ llist_for_each_safe(pos, n, list) {
+ initfree = container_of(pos, struct mod_initfree, node);
+ execmem_free(initfree->init_text);
+ execmem_free(initfree->init_data);
+ execmem_free(initfree->init_rodata);
+ kfree(initfree);
+ }
+}
+
+void flush_module_init_free_work(void)
+{
+ flush_work(&init_free_wq);
+}
+
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "module."
+/* Default value for module->async_probe_requested */
+static bool async_probe;
+module_param(async_probe, bool, 0644);
+
+/*
+ * This is where the real work happens.
+ *
+ * Keep it uninlined to provide a reliable breakpoint target, e.g. for the gdb
+ * helper command 'lx-symbols'.
+ */
+static noinline int do_init_module(struct module *mod)
+{
+ int ret = 0;
+ struct mod_initfree *freeinit;
+#if defined(CONFIG_MODULE_STATS)
+ unsigned int text_size = 0, total_size = 0;
+
+ for_each_mod_mem_type(type) {
+ const struct module_memory *mod_mem = &mod->mem[type];
+ if (mod_mem->size) {
+ total_size += mod_mem->size;
+ if (type == MOD_TEXT || type == MOD_INIT_TEXT)
+ text_size += mod_mem->size;
+ }
+ }
+#endif
+
+ freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL);
+ if (!freeinit) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ freeinit->init_text = mod->mem[MOD_INIT_TEXT].base;
+ freeinit->init_data = mod->mem[MOD_INIT_DATA].base;
+ freeinit->init_rodata = mod->mem[MOD_INIT_RODATA].base;
+
+ do_mod_ctors(mod);
+ /* Start the module */
+ if (mod->init != NULL)
+ ret = do_one_initcall(mod->init);
+ if (ret < 0) {
+ goto fail_free_freeinit;
+ }
+ if (ret > 0) {
+ pr_warn("%s: '%s'->init suspiciously returned %d, it should "
+ "follow 0/-E convention\n"
+ "%s: loading module anyway...\n",
+ __func__, mod->name, ret, __func__);
+ dump_stack();
+ }
+
+ /* Now it's a first class citizen! */
+ mod->state = MODULE_STATE_LIVE;
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_LIVE, mod);
+
+ /* Delay uevent until module has finished its init routine */
+ kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
+
+ /*
+ * We need to finish all async code before the module init sequence
+ * is done. This has potential to deadlock if synchronous module
+ * loading is requested from async (which is not allowed!).
+ *
+ * See commit 0fdff3ec6d87 ("async, kmod: warn on synchronous
+ * request_module() from async workers") for more details.
+ */
+ if (!mod->async_probe_requested)
+ async_synchronize_full();
+
+ ftrace_free_mem(mod, mod->mem[MOD_INIT_TEXT].base,
+ mod->mem[MOD_INIT_TEXT].base + mod->mem[MOD_INIT_TEXT].size);
+ mutex_lock(&module_mutex);
+ /* Drop initial reference. */
+ module_put(mod);
+ trim_init_extable(mod);
+#ifdef CONFIG_KALLSYMS
+ /* Switch to core kallsyms now init is done: kallsyms may be walking! */
+ rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);
+#endif
+ ret = module_enable_rodata_ro_after_init(mod);
+ if (ret)
+ pr_warn("%s: module_enable_rodata_ro_after_init() returned %d, "
+ "ro_after_init data might still be writable\n",
+ mod->name, ret);
+
+ mod_tree_remove_init(mod);
+ module_arch_freeing_init(mod);
+ for_class_mod_mem_type(type, init) {
+ mod->mem[type].base = NULL;
+ mod->mem[type].size = 0;
+ }
+
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+ /* .BTF is not SHF_ALLOC and will get removed, so sanitize pointers */
+ mod->btf_data = NULL;
+ mod->btf_base_data = NULL;
+#endif
+ /*
+ * We want to free module_init, but be aware that kallsyms may be
+ * walking this within an RCU read section. In all the failure paths, we
+ * call synchronize_rcu(), but we don't want to slow down the success
+ * path. execmem_free() cannot be called in an interrupt, so do the
+ * work and call synchronize_rcu() in a work queue.
+ *
+ * Note that execmem_alloc() on most architectures creates W+X page
+ * mappings which won't be cleaned up until do_free_init() runs. Any
+ * code such as mark_rodata_ro() which depends on those mappings to
+ * be cleaned up needs to sync with the queued work by invoking
+ * flush_module_init_free_work().
+ */
+ if (llist_add(&freeinit->node, &init_free_list))
+ schedule_work(&init_free_wq);
+
+ mutex_unlock(&module_mutex);
+ wake_up_all(&module_wq);
+
+ mod_stat_add_long(text_size, &total_text_size);
+ mod_stat_add_long(total_size, &total_mod_size);
+
+ mod_stat_inc(&modcount);
+
+ return 0;
+
+fail_free_freeinit:
+ kfree(freeinit);
+fail:
+ /* Try to protect us from buggy refcounters. */
+ mod->state = MODULE_STATE_GOING;
+ synchronize_rcu();
+ module_put(mod);
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_GOING, mod);
+ klp_module_going(mod);
+ ftrace_release_mod(mod);
+ free_module(mod);
+ wake_up_all(&module_wq);
+
+ return ret;
+}
+
+static int may_init_module(void)
+{
+ if (!capable(CAP_SYS_MODULE) || modules_disabled)
+ return -EPERM;
+
+ return 0;
+}
+
+/* Is this module of this name done loading? No locks held. */
+static bool finished_loading(const char *name)
+{
+ struct module *mod;
+ bool ret;
+
+ /*
+ * The module_mutex should not be a heavily contended lock;
+ * if we get the occasional sleep here, we'll go an extra iteration
+ * in the wait_event_interruptible(), which is harmless.
+ */
+ sched_annotate_sleep();
+ mutex_lock(&module_mutex);
+ mod = find_module_all(name, strlen(name), true);
+ ret = !mod || mod->state == MODULE_STATE_LIVE
+ || mod->state == MODULE_STATE_GOING;
+ mutex_unlock(&module_mutex);
+
+ return ret;
+}
+
+/* Must be called with module_mutex held */
+static int module_patient_check_exists(const char *name,
+ enum fail_dup_mod_reason reason)
+{
+ struct module *old;
+ int err = 0;
+
+ old = find_module_all(name, strlen(name), true);
+ if (old == NULL)
+ return 0;
+
+ if (old->state == MODULE_STATE_COMING ||
+ old->state == MODULE_STATE_UNFORMED) {
+ /* Wait in case it fails to load. */
+ mutex_unlock(&module_mutex);
+ err = wait_event_interruptible(module_wq,
+ finished_loading(name));
+ mutex_lock(&module_mutex);
+ if (err)
+ return err;
+
+ /* The module might have gone in the meantime. */
+ old = find_module_all(name, strlen(name), true);
+ }
+
+ if (try_add_failed_module(name, reason))
+ pr_warn("Could not add fail-tracking for module: %s\n", name);
+
+ /*
+ * We are here only when the same module was being loaded. Do
+ * not try to load it again right now. It prevents long delays
+ * caused by serialized module load failures. It might happen
+ * when more devices of the same type trigger load of
+ * a particular module.
+ */
+ if (old && old->state == MODULE_STATE_LIVE)
+ return -EEXIST;
+ return -EBUSY;
+}
+
+/*
+ * We try to place it in the list now to make sure it's unique before
+ * we dedicate too many resources. In particular, temporary percpu
+ * memory exhaustion.
+ */
+static int add_unformed_module(struct module *mod)
+{
+ int err;
+
+ mod->state = MODULE_STATE_UNFORMED;
+
+ mutex_lock(&module_mutex);
+ err = module_patient_check_exists(mod->name, FAIL_DUP_MOD_LOAD);
+ if (err)
+ goto out;
+
+ mod_update_bounds(mod);
+ list_add_rcu(&mod->list, &modules);
+ mod_tree_insert(mod);
+ err = 0;
+
+out:
+ mutex_unlock(&module_mutex);
+ return err;
+}
+
+static int complete_formation(struct module *mod, struct load_info *info)
+{
+ int err;
+
+ mutex_lock(&module_mutex);
+
+ /* Find duplicate symbols (must be called under lock). */
+ err = verify_exported_symbols(mod);
+ if (err < 0)
+ goto out;
+
+ /* These rely on module_mutex for list integrity. */
+ module_bug_finalize(info->hdr, info->sechdrs, mod);
+ module_cfi_finalize(info->hdr, info->sechdrs, mod);
+
+ err = module_enable_rodata_ro(mod);
+ if (err)
+ goto out_strict_rwx;
+ err = module_enable_data_nx(mod);
+ if (err)
+ goto out_strict_rwx;
+ err = module_enable_text_rox(mod);
+ if (err)
+ goto out_strict_rwx;
+
+ /*
+ * Mark state as coming so strong_try_module_get() ignores us,
+ * but kallsyms etc. can see us.
+ */
+ mod->state = MODULE_STATE_COMING;
+ mutex_unlock(&module_mutex);
+
+ return 0;
+
+out_strict_rwx:
+ module_bug_cleanup(mod);
+out:
+ mutex_unlock(&module_mutex);
+ return err;
+}
+
+static int prepare_coming_module(struct module *mod)
+{
+ int err;
+
+ ftrace_module_enable(mod);
+ err = klp_module_coming(mod);
+ if (err)
+ return err;
+
+ err = blocking_notifier_call_chain_robust(&module_notify_list,
+ MODULE_STATE_COMING, MODULE_STATE_GOING, mod);
+ err = notifier_to_errno(err);
+ if (err)
+ klp_module_going(mod);
+
+ return err;
+}
+
+static int unknown_module_param_cb(char *param, char *val, const char *modname,
+ void *arg)
+{
+ struct module *mod = arg;
+ int ret;
+
+ if (strcmp(param, "async_probe") == 0) {
+ if (kstrtobool(val, &mod->async_probe_requested))
+ mod->async_probe_requested = true;
+ return 0;
+ }
+
+ /* Check for magic 'dyndbg' arg */
+ ret = ddebug_dyndbg_module_param_cb(param, val, modname);
+ if (ret != 0)
+ pr_warn("%s: unknown parameter '%s' ignored\n", modname, param);
+ return 0;
+}
+
+/* Module within temporary copy, this doesn't do any allocation */
+static int early_mod_check(struct load_info *info, int flags)
+{
+ int err;
+
+ /*
+ * Now that we know we have the correct module name, check
+ * if it's blacklisted.
+ */
+ if (blacklisted(info->name)) {
+ pr_err("Module %s is blacklisted\n", info->name);
+ return -EPERM;
+ }
+
+ err = rewrite_section_headers(info, flags);
+ if (err)
+ return err;
+
+ /* Check module struct version now, before we try to use module. */
+ if (!check_modstruct_version(info, info->mod))
+ return -ENOEXEC;
+
+ err = check_modinfo(info->mod, info, flags);
+ if (err)
+ return err;
+
+ mutex_lock(&module_mutex);
+ err = module_patient_check_exists(info->mod->name, FAIL_DUP_MOD_BECOMING);
+ mutex_unlock(&module_mutex);
+
+ return err;
+}
+
+/*
+ * Allocate and load the module: note that size of section 0 is always
+ * zero, and we rely on this for optional sections.
+ */
+static int load_module(struct load_info *info, const char __user *uargs,
+ int flags)
+{
+ struct module *mod;
+ bool module_allocated = false;
+ long err = 0;
+ char *after_dashes;
+
+ /*
+ * Do the signature check (if any) first. All that
+ * the signature check needs is info->len, it does
+ * not need any of the section info. That can be
+ * set up later. This will minimize the chances
+ * of a corrupt module causing problems before
+ * we even get to the signature check.
+ *
+ * The check will also adjust info->len by stripping
+ * off the sig length at the end of the module, making
+ * checks against info->len more correct.
+ */
+ err = module_sig_check(info, flags);
+ if (err)
+ goto free_copy;
+
+ /*
+ * Do basic sanity checks against the ELF header and
+ * sections. Cache useful sections and set the
+ * info->mod to the userspace passed struct module.
+ */
+ err = elf_validity_cache_copy(info, flags);
+ if (err)
+ goto free_copy;
+
+ err = early_mod_check(info, flags);
+ if (err)
+ goto free_copy;
+
+ /* Figure out module layout, and allocate all the memory. */
+ mod = layout_and_allocate(info, flags);
+ if (IS_ERR(mod)) {
+ err = PTR_ERR(mod);
+ goto free_copy;
+ }
+
+ module_allocated = true;
+
+ audit_log_kern_module(info->name);
+
+ /* Reserve our place in the list. */
+ err = add_unformed_module(mod);
+ if (err)
+ goto free_module;
+
+ /*
+ * We are tainting your kernel if your module gets into
+ * the modules linked list somehow.
+ */
+ module_augment_kernel_taints(mod, info);
+
+ /* To avoid stressing percpu allocator, do this once we're unique. */
+ err = percpu_modalloc(mod, info);
+ if (err)
+ goto unlink_mod;
+
+ /* Now module is in final location, initialize linked lists, etc. */
+ err = module_unload_init(mod);
+ if (err)
+ goto unlink_mod;
+
+ init_param_lock(mod);
+
+ /*
+ * Now we've got everything in the final locations, we can
+ * find optional sections.
+ */
+ err = find_module_sections(mod, info);
+ if (err)
+ goto free_unload;
+
+ err = check_export_symbol_versions(mod);
+ if (err)
+ goto free_unload;
+
+ /* Set up MODINFO_ATTR fields */
+ err = setup_modinfo(mod, info);
+ if (err)
+ goto free_modinfo;
+
+ /* Fix up syms, so that st_value is a pointer to location. */
+ err = simplify_symbols(mod, info);
+ if (err < 0)
+ goto free_modinfo;
+
+ err = apply_relocations(mod, info);
+ if (err < 0)
+ goto free_modinfo;
+
+ err = post_relocation(mod, info);
+ if (err < 0)
+ goto free_modinfo;
+
+ flush_module_icache(mod);
+
+ /* Now copy in args */
+ mod->args = strndup_user(uargs, ~0UL >> 1);
+ if (IS_ERR(mod->args)) {
+ err = PTR_ERR(mod->args);
+ goto free_arch_cleanup;
+ }
+
+ init_build_id(mod, info);
+
+ /* Ftrace init must be called in the MODULE_STATE_UNFORMED state */
+ ftrace_module_init(mod);
+
+ /* Finally it's fully formed, ready to start executing. */
+ err = complete_formation(mod, info);
+ if (err)
+ goto ddebug_cleanup;
+
+ err = prepare_coming_module(mod);
+ if (err)
+ goto bug_cleanup;
+
+ mod->async_probe_requested = async_probe;
+
+ /* Module is ready to execute: parsing args may do that. */
+ after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
+ -32768, 32767, mod,
+ unknown_module_param_cb);
+ if (IS_ERR(after_dashes)) {
+ err = PTR_ERR(after_dashes);
+ goto coming_cleanup;
+ } else if (after_dashes) {
+ pr_warn("%s: parameters '%s' after `--' ignored\n",
+ mod->name, after_dashes);
+ }
+
+ /* Link in to sysfs. */
+ err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
+ if (err < 0)
+ goto coming_cleanup;
+
+ if (is_livepatch_module(mod)) {
+ err = copy_module_elf(mod, info);
+ if (err < 0)
+ goto sysfs_cleanup;
+ }
+
+ if (codetag_load_module(mod))
+ goto sysfs_cleanup;
+
+ /* Get rid of temporary copy. */
+ free_copy(info, flags);
+
+ /* Done! */
+ trace_module_load(mod);
+
+ return do_init_module(mod);
+
+ sysfs_cleanup:
+ mod_sysfs_teardown(mod);
+ coming_cleanup:
+ mod->state = MODULE_STATE_GOING;
+ destroy_params(mod->kp, mod->num_kp);
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_GOING, mod);
+ klp_module_going(mod);
+ bug_cleanup:
+ mod->state = MODULE_STATE_GOING;
+ /* module_bug_cleanup needs module_mutex protection */
+ mutex_lock(&module_mutex);
+ module_bug_cleanup(mod);
+ mutex_unlock(&module_mutex);
+
+ ddebug_cleanup:
+ ftrace_release_mod(mod);
+ synchronize_rcu();
+ kfree(mod->args);
+ free_arch_cleanup:
+ module_arch_cleanup(mod);
+ free_modinfo:
+ free_modinfo(mod);
+ free_unload:
+ module_unload_free(mod);
+ unlink_mod:
+ mutex_lock(&module_mutex);
+ /* Unlink carefully: kallsyms could be walking list. */
+ list_del_rcu(&mod->list);
+ mod_tree_remove(mod);
+ wake_up_all(&module_wq);
+ /* Wait for RCU-sched synchronizing before releasing mod->list. */
+ synchronize_rcu();
+ mutex_unlock(&module_mutex);
+ free_module:
+ mod_stat_bump_invalid(info, flags);
+ /* Free lock-classes; relies on the preceding sync_rcu() */
+ for_class_mod_mem_type(type, core_data) {
+ lockdep_free_key_range(mod->mem[type].base,
+ mod->mem[type].size);
+ }
+
+ module_memory_restore_rox(mod);
+ module_deallocate(mod, info);
+ free_copy:
+ /*
+ * The info->len is always set. We distinguish between
+ * failures once the proper module was allocated and
+ * before that.
+ */
+ if (!module_allocated) {
+ audit_log_kern_module(info->name ? info->name : "?");
+ mod_stat_bump_becoming(info, flags);
+ }
+ free_copy(info, flags);
+ return err;
+}
+
+SYSCALL_DEFINE3(init_module, void __user *, umod,
+ unsigned long, len, const char __user *, uargs)
+{
+ int err;
+ struct load_info info = { };
+
+ err = may_init_module();
+ if (err)
+ return err;
+
+ pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n",
+ umod, len, uargs);
+
+ err = copy_module_from_user(umod, len, &info);
+ if (err) {
+ mod_stat_inc(&failed_kreads);
+ mod_stat_add_long(len, &invalid_kread_bytes);
+ return err;
+ }
+
+ return load_module(&info, uargs, 0);
+}
+
+struct idempotent {
+ const void *cookie;
+ struct hlist_node entry;
+ struct completion complete;
+ int ret;
+};
+
+#define IDEM_HASH_BITS 8
+static struct hlist_head idem_hash[1 << IDEM_HASH_BITS];
+static DEFINE_SPINLOCK(idem_lock);
+
+static bool idempotent(struct idempotent *u, const void *cookie)
+{
+ int hash = hash_ptr(cookie, IDEM_HASH_BITS);
+ struct hlist_head *head = idem_hash + hash;
+ struct idempotent *existing;
+ bool first;
+
+ u->ret = -EINTR;
+ u->cookie = cookie;
+ init_completion(&u->complete);
+
+ spin_lock(&idem_lock);
+ first = true;
+ hlist_for_each_entry(existing, head, entry) {
+ if (existing->cookie != cookie)
+ continue;
+ first = false;
+ break;
+ }
+ hlist_add_head(&u->entry, idem_hash + hash);
+ spin_unlock(&idem_lock);
+
+ return !first;
+}
+
+/*
+ * We were the first one with 'cookie' on the list, and we ended
+ * up completing the operation. We now need to walk the list,
+ * remove everybody - which includes ourselves - fill in the return
+ * value, and then complete the operation.
+ */
+static int idempotent_complete(struct idempotent *u, int ret)
+{
+ const void *cookie = u->cookie;
+ int hash = hash_ptr(cookie, IDEM_HASH_BITS);
+ struct hlist_head *head = idem_hash + hash;
+ struct hlist_node *next;
+ struct idempotent *pos;
+
+ spin_lock(&idem_lock);
+ hlist_for_each_entry_safe(pos, next, head, entry) {
+ if (pos->cookie != cookie)
+ continue;
+ hlist_del_init(&pos->entry);
+ pos->ret = ret;
+ complete(&pos->complete);
+ }
+ spin_unlock(&idem_lock);
+ return ret;
+}
+
+/*
+ * Wait for the idempotent worker.
+ *
+ * If we get interrupted, we need to remove ourselves from the
+ * the idempotent list, and the completion may still come in.
+ *
+ * The 'idem_lock' protects against the race, and 'idem.ret' was
+ * initialized to -EINTR and is thus always the right return
+ * value even if the idempotent work then completes between
+ * the wait_for_completion and the cleanup.
+ */
+static int idempotent_wait_for_completion(struct idempotent *u)
+{
+ if (wait_for_completion_interruptible(&u->complete)) {
+ spin_lock(&idem_lock);
+ if (!hlist_unhashed(&u->entry))
+ hlist_del(&u->entry);
+ spin_unlock(&idem_lock);
+ }
+ return u->ret;
+}
+
+static int init_module_from_file(struct file *f, const char __user * uargs, int flags)
+{
+ bool compressed = !!(flags & MODULE_INIT_COMPRESSED_FILE);
+ struct load_info info = { };
+ void *buf = NULL;
+ int len;
+ int err;
+
+ len = kernel_read_file(f, 0, &buf, INT_MAX, NULL,
+ compressed ? READING_MODULE_COMPRESSED :
+ READING_MODULE);
+ if (len < 0) {
+ mod_stat_inc(&failed_kreads);
+ return len;
+ }
+
+ if (compressed) {
+ err = module_decompress(&info, buf, len);
+ vfree(buf); /* compressed data is no longer needed */
+ if (err) {
+ mod_stat_inc(&failed_decompress);
+ mod_stat_add_long(len, &invalid_decompress_bytes);
+ return err;
+ }
+ err = security_kernel_post_read_file(f, (char *)info.hdr, info.len,
+ READING_MODULE);
+ if (err) {
+ mod_stat_inc(&failed_kreads);
+ free_copy(&info, flags);
+ return err;
+ }
+ } else {
+ info.hdr = buf;
+ info.len = len;
+ }
+
+ return load_module(&info, uargs, flags);
+}
+
+static int idempotent_init_module(struct file *f, const char __user * uargs, int flags)
+{
+ struct idempotent idem;
+
+ if (!(f->f_mode & FMODE_READ))
+ return -EBADF;
+
+ /* Are we the winners of the race and get to do this? */
+ if (!idempotent(&idem, file_inode(f))) {
+ int ret = init_module_from_file(f, uargs, flags);
+ return idempotent_complete(&idem, ret);
+ }
+
+ /*
+ * Somebody else won the race and is loading the module.
+ */
+ return idempotent_wait_for_completion(&idem);
+}
+
+SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
+{
+ int err = may_init_module();
+ if (err)
+ return err;
+
+ pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags);
+
+ if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS
+ |MODULE_INIT_IGNORE_VERMAGIC
+ |MODULE_INIT_COMPRESSED_FILE))
+ return -EINVAL;
+
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;
+ return idempotent_init_module(fd_file(f), uargs, flags);
+}
+
+/* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */
+char *module_flags(struct module *mod, char *buf, bool show_state)
+{
+ int bx = 0;
+
+ BUG_ON(mod->state == MODULE_STATE_UNFORMED);
+ if (!mod->taints && !show_state)
+ goto out;
+ if (mod->taints ||
+ mod->state == MODULE_STATE_GOING ||
+ mod->state == MODULE_STATE_COMING) {
+ buf[bx++] = '(';
+ bx += module_flags_taint(mod->taints, buf + bx);
+ /* Show a - for module-is-being-unloaded */
+ if (mod->state == MODULE_STATE_GOING && show_state)
+ buf[bx++] = '-';
+ /* Show a + for module-is-being-loaded */
+ if (mod->state == MODULE_STATE_COMING && show_state)
+ buf[bx++] = '+';
+ buf[bx++] = ')';
+ }
+out:
+ buf[bx] = '\0';
+
+ return buf;
+}
+
+/* Given an address, look for it in the module exception tables. */
+const struct exception_table_entry *search_module_extables(unsigned long addr)
+{
+ struct module *mod;
+
+ guard(rcu)();
+ mod = __module_address(addr);
+ if (!mod)
+ return NULL;
+
+ if (!mod->num_exentries)
+ return NULL;
+ /*
+ * The address passed here belongs to a module that is currently
+ * invoked (we are running inside it). Therefore its module::refcnt
+ * needs already be >0 to ensure that it is not removed at this stage.
+ * All other user need to invoke this function within a RCU read
+ * section.
+ */
+ return search_extable(mod->extable, mod->num_exentries, addr);
+}
+
+/**
+ * is_module_address() - is this address inside a module?
+ * @addr: the address to check.
+ *
+ * See is_module_text_address() if you simply want to see if the address
+ * is code (not data).
+ */
+bool is_module_address(unsigned long addr)
+{
+ guard(rcu)();
+ return __module_address(addr) != NULL;
+}
+
+/**
+ * __module_address() - get the module which contains an address.
+ * @addr: the address.
+ *
+ * Must be called within RCU read section or module mutex held so that
+ * module doesn't get freed during this.
+ */
+struct module *__module_address(unsigned long addr)
+{
+ struct module *mod;
+
+ if (addr >= mod_tree.addr_min && addr <= mod_tree.addr_max)
+ goto lookup;
+
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+ if (addr >= mod_tree.data_addr_min && addr <= mod_tree.data_addr_max)
+ goto lookup;
+#endif
+
+ return NULL;
+
+lookup:
+ mod = mod_find(addr, &mod_tree);
+ if (mod) {
+ BUG_ON(!within_module(addr, mod));
+ if (mod->state == MODULE_STATE_UNFORMED)
+ mod = NULL;
+ }
+ return mod;
+}
+
+/**
+ * is_module_text_address() - is this address inside module code?
+ * @addr: the address to check.
+ *
+ * See is_module_address() if you simply want to see if the address is
+ * anywhere in a module. See kernel_text_address() for testing if an
+ * address corresponds to kernel or module code.
+ */
+bool is_module_text_address(unsigned long addr)
+{
+ guard(rcu)();
+ return __module_text_address(addr) != NULL;
+}
+
+void module_for_each_mod(int(*func)(struct module *mod, void *data), void *data)
+{
+ struct module *mod;
+
+ guard(rcu)();
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ if (func(mod, data))
+ break;
+ }
+}
+
+/**
+ * __module_text_address() - get the module whose code contains an address.
+ * @addr: the address.
+ *
+ * Must be called within RCU read section or module mutex held so that
+ * module doesn't get freed during this.
+ */
+struct module *__module_text_address(unsigned long addr)
+{
+ struct module *mod = __module_address(addr);
+ if (mod) {
+ /* Make sure it's within the text section. */
+ if (!within_module_mem_type(addr, mod, MOD_TEXT) &&
+ !within_module_mem_type(addr, mod, MOD_INIT_TEXT))
+ mod = NULL;
+ }
+ return mod;
+}
+
+/* Don't grab lock, we're oopsing. */
+void print_modules(void)
+{
+ struct module *mod;
+ char buf[MODULE_FLAGS_BUF_SIZE];
+
+ printk(KERN_DEFAULT "Modules linked in:");
+ /* Most callers should already have preempt disabled, but make sure */
+ guard(rcu)();
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ pr_cont(" %s%s", mod->name, module_flags(mod, buf, true));
+ }
+
+ print_unloaded_tainted_modules();
+ if (last_unloaded_module.name[0])
+ pr_cont(" [last unloaded: %s%s]", last_unloaded_module.name,
+ last_unloaded_module.taints);
+ pr_cont("\n");
+}
+
+#ifdef CONFIG_MODULE_DEBUGFS
+struct dentry *mod_debugfs_root;
+
+static int module_debugfs_init(void)
+{
+ mod_debugfs_root = debugfs_create_dir("modules", NULL);
+ return 0;
+}
+module_init(module_debugfs_init);
+#endif
diff --git a/kernel/module/procfs.c b/kernel/module/procfs.c
new file mode 100644
index 000000000000..0a4841e88adb
--- /dev/null
+++ b/kernel/module/procfs.c
@@ -0,0 +1,152 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module proc support
+ *
+ * Copyright (C) 2008 Alexey Dobriyan
+ */
+
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/mutex.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include "internal.h"
+
+#ifdef CONFIG_MODULE_UNLOAD
+static inline void print_unload_info(struct seq_file *m, struct module *mod)
+{
+ struct module_use *use;
+ int printed_something = 0;
+
+ seq_printf(m, " %i ", module_refcount(mod));
+
+ /*
+ * Always include a trailing , so userspace can differentiate
+ * between this and the old multi-field proc format.
+ */
+ list_for_each_entry(use, &mod->source_list, source_list) {
+ printed_something = 1;
+ seq_printf(m, "%s,", use->source->name);
+ }
+
+ if (mod->init && !mod->exit) {
+ printed_something = 1;
+ seq_puts(m, "[permanent],");
+ }
+
+ if (!printed_something)
+ seq_puts(m, "-");
+}
+#else /* !CONFIG_MODULE_UNLOAD */
+static inline void print_unload_info(struct seq_file *m, struct module *mod)
+{
+ /* We don't know the usage count, or what modules are using. */
+ seq_puts(m, " - -");
+}
+#endif /* CONFIG_MODULE_UNLOAD */
+
+/* Called by the /proc file system to return a list of modules. */
+static void *m_start(struct seq_file *m, loff_t *pos)
+{
+ mutex_lock(&module_mutex);
+ return seq_list_start(&modules, *pos);
+}
+
+static void *m_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ return seq_list_next(p, &modules, pos);
+}
+
+static void m_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&module_mutex);
+}
+
+static unsigned int module_total_size(struct module *mod)
+{
+ int size = 0;
+
+ for_each_mod_mem_type(type)
+ size += mod->mem[type].size;
+ return size;
+}
+
+static int m_show(struct seq_file *m, void *p)
+{
+ struct module *mod = list_entry(p, struct module, list);
+ char buf[MODULE_FLAGS_BUF_SIZE];
+ void *value;
+ unsigned int size;
+
+ /* We always ignore unformed modules. */
+ if (mod->state == MODULE_STATE_UNFORMED)
+ return 0;
+
+ size = module_total_size(mod);
+ seq_printf(m, "%s %u", mod->name, size);
+ print_unload_info(m, mod);
+
+ /* Informative for users. */
+ seq_printf(m, " %s",
+ mod->state == MODULE_STATE_GOING ? "Unloading" :
+ mod->state == MODULE_STATE_COMING ? "Loading" :
+ "Live");
+ /* Used by oprofile and other similar tools. */
+ value = m->private ? NULL : mod->mem[MOD_TEXT].base;
+ seq_printf(m, " 0x%px", value);
+
+ /* Taints info */
+ if (mod->taints)
+ seq_printf(m, " %s", module_flags(mod, buf, true));
+
+ seq_puts(m, "\n");
+ return 0;
+}
+
+/*
+ * Format: modulename size refcount deps address
+ *
+ * Where refcount is a number or -, and deps is a comma-separated list
+ * of depends or -.
+ */
+static const struct seq_operations modules_op = {
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = m_show
+};
+
+/*
+ * This also sets the "private" pointer to non-NULL if the
+ * kernel pointers should be hidden (so you can just test
+ * "m->private" to see if you should keep the values private).
+ *
+ * We use the same logic as for /proc/kallsyms.
+ */
+static int modules_open(struct inode *inode, struct file *file)
+{
+ int err = seq_open(file, &modules_op);
+
+ if (!err) {
+ struct seq_file *m = file->private_data;
+
+ m->private = kallsyms_show_value(file->f_cred) ? NULL : (void *)8ul;
+ }
+
+ return err;
+}
+
+static const struct proc_ops modules_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
+ .proc_open = modules_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
+};
+
+static int __init proc_modules_init(void)
+{
+ proc_create("modules", 0, NULL, &modules_proc_ops);
+ return 0;
+}
+module_init(proc_modules_init);
diff --git a/kernel/module/signing.c b/kernel/module/signing.c
new file mode 100644
index 000000000000..a2ff4242e623
--- /dev/null
+++ b/kernel/module/signing.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Module signature checker
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/module_signature.h>
+#include <linux/string.h>
+#include <linux/verification.h>
+#include <linux/security.h>
+#include <crypto/public_key.h>
+#include <uapi/linux/module.h>
+#include "internal.h"
+
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "module."
+
+static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE);
+module_param(sig_enforce, bool_enable_only, 0644);
+
+/*
+ * Export sig_enforce kernel cmdline parameter to allow other subsystems rely
+ * on that instead of directly to CONFIG_MODULE_SIG_FORCE config.
+ */
+bool is_module_sig_enforced(void)
+{
+ return sig_enforce;
+}
+EXPORT_SYMBOL(is_module_sig_enforced);
+
+void set_module_sig_enforced(void)
+{
+ sig_enforce = true;
+}
+
+/*
+ * Verify the signature on a module.
+ */
+int mod_verify_sig(const void *mod, struct load_info *info)
+{
+ struct module_signature ms;
+ size_t sig_len, modlen = info->len;
+ int ret;
+
+ pr_devel("==>%s(,%zu)\n", __func__, modlen);
+
+ if (modlen <= sizeof(ms))
+ return -EBADMSG;
+
+ memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms));
+
+ ret = mod_check_sig(&ms, modlen, "module");
+ if (ret)
+ return ret;
+
+ sig_len = be32_to_cpu(ms.sig_len);
+ modlen -= sig_len + sizeof(ms);
+ info->len = modlen;
+
+ return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len,
+ VERIFY_USE_SECONDARY_KEYRING,
+ VERIFYING_MODULE_SIGNATURE,
+ NULL, NULL);
+}
+
+int module_sig_check(struct load_info *info, int flags)
+{
+ int err = -ENODATA;
+ const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
+ const char *reason;
+ const void *mod = info->hdr;
+ bool mangled_module = flags & (MODULE_INIT_IGNORE_MODVERSIONS |
+ MODULE_INIT_IGNORE_VERMAGIC);
+ /*
+ * Do not allow mangled modules as a module with version information
+ * removed is no longer the module that was signed.
+ */
+ if (!mangled_module &&
+ info->len > markerlen &&
+ memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
+ /* We truncate the module to discard the signature */
+ info->len -= markerlen;
+ err = mod_verify_sig(mod, info);
+ if (!err) {
+ info->sig_ok = true;
+ return 0;
+ }
+ }
+
+ /*
+ * We don't permit modules to be loaded into the trusted kernels
+ * without a valid signature on them, but if we're not enforcing,
+ * certain errors are non-fatal.
+ */
+ switch (err) {
+ case -ENODATA:
+ reason = "unsigned module";
+ break;
+ case -ENOPKG:
+ reason = "module with unsupported crypto";
+ break;
+ case -ENOKEY:
+ reason = "module with unavailable key";
+ break;
+
+ default:
+ /*
+ * All other errors are fatal, including lack of memory,
+ * unparseable signatures, and signature check failures --
+ * even if signatures aren't required.
+ */
+ return err;
+ }
+
+ if (is_module_sig_enforced()) {
+ pr_notice("Loading of %s is rejected\n", reason);
+ return -EKEYREJECTED;
+ }
+
+ return security_locked_down(LOCKDOWN_MODULE_SIGNATURE);
+}
diff --git a/kernel/module/stats.c b/kernel/module/stats.c
new file mode 100644
index 000000000000..3ba0e98b3c91
--- /dev/null
+++ b/kernel/module/stats.c
@@ -0,0 +1,432 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Debugging module statistics.
+ *
+ * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
+ */
+
+#include <linux/module.h>
+#include <uapi/linux/module.h>
+#include <linux/string.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/debugfs.h>
+#include <linux/rculist.h>
+#include <linux/math.h>
+
+#include "internal.h"
+
+/**
+ * DOC: module debugging statistics overview
+ *
+ * Enabling CONFIG_MODULE_STATS enables module debugging statistics which
+ * are useful to monitor and root cause memory pressure issues with module
+ * loading. These statistics are useful to allow us to improve production
+ * workloads.
+ *
+ * The current module debugging statistics supported help keep track of module
+ * loading failures to enable improvements either for kernel module auto-loading
+ * usage (request_module()) or interactions with userspace. Statistics are
+ * provided to track all possible failures in the finit_module() path and memory
+ * wasted in this process space. Each of the failure counters are associated
+ * to a type of module loading failure which is known to incur a certain amount
+ * of memory allocation loss. In the worst case loading a module will fail after
+ * a 3 step memory allocation process:
+ *
+ * a) memory allocated with kernel_read_file_from_fd()
+ * b) module decompression processes the file read from
+ * kernel_read_file_from_fd(), and vmap() is used to map
+ * the decompressed module to a new local buffer which represents
+ * a copy of the decompressed module passed from userspace. The buffer
+ * from kernel_read_file_from_fd() is freed right away.
+ * c) layout_and_allocate() allocates space for the final resting
+ * place where we would keep the module if it were to be processed
+ * successfully.
+ *
+ * If a failure occurs after these three different allocations only one
+ * counter will be incremented with the summation of the allocated bytes freed
+ * incurred during this failure. Likewise, if module loading failed only after
+ * step b) a separate counter is used and incremented for the bytes freed and
+ * not used during both of those allocations.
+ *
+ * Virtual memory space can be limited, for example on x86 virtual memory size
+ * defaults to 128 MiB. We should strive to limit and avoid wasting virtual
+ * memory allocations when possible. These module debugging statistics help
+ * to evaluate how much memory is being wasted on bootup due to module loading
+ * failures.
+ *
+ * All counters are designed to be incremental. Atomic counters are used so to
+ * remain simple and avoid delays and deadlocks.
+ */
+
+/**
+ * DOC: dup_failed_modules - tracks duplicate failed modules
+ *
+ * Linked list of modules which failed to be loaded because an already existing
+ * module with the same name was already being processed or already loaded.
+ * The finit_module() system call incurs heavy virtual memory allocations. In
+ * the worst case an finit_module() system call can end up allocating virtual
+ * memory 3 times:
+ *
+ * 1) kernel_read_file_from_fd() call uses vmalloc()
+ * 2) optional module decompression uses vmap()
+ * 3) layout_and allocate() can use vzalloc() or an arch specific variation of
+ * vmalloc to deal with ELF sections requiring special permissions
+ *
+ * In practice on a typical boot today most finit_module() calls fail due to
+ * the module with the same name already being loaded or about to be processed.
+ * All virtual memory allocated to these failed modules will be freed with
+ * no functional use.
+ *
+ * To help with this the dup_failed_modules allows us to track modules which
+ * failed to load due to the fact that a module was already loaded or being
+ * processed. There are only two points at which we can fail such calls,
+ * we list them below along with the number of virtual memory allocation
+ * calls:
+ *
+ * a) FAIL_DUP_MOD_BECOMING: at the end of early_mod_check() before
+ * layout_and_allocate().
+ * - with module decompression: 2 virtual memory allocation calls
+ * - without module decompression: 1 virtual memory allocation calls
+ * b) FAIL_DUP_MOD_LOAD: after layout_and_allocate() on add_unformed_module()
+ * - with module decompression 3 virtual memory allocation calls
+ * - without module decompression 2 virtual memory allocation calls
+ *
+ * We should strive to get this list to be as small as possible. If this list
+ * is not empty it is a reflection of possible work or optimizations possible
+ * either in-kernel or in userspace.
+ */
+static LIST_HEAD(dup_failed_modules);
+
+/**
+ * DOC: module statistics debugfs counters
+ *
+ * The total amount of wasted virtual memory allocation space during module
+ * loading can be computed by adding the total from the summation:
+ *
+ * * @invalid_kread_bytes +
+ * @invalid_decompress_bytes +
+ * @invalid_becoming_bytes +
+ * @invalid_mod_bytes
+ *
+ * The following debugfs counters are available to inspect module loading
+ * failures:
+ *
+ * * total_mod_size: total bytes ever used by all modules we've dealt with on
+ * this system
+ * * total_text_size: total bytes of the .text and .init.text ELF section
+ * sizes we've dealt with on this system
+ * * invalid_kread_bytes: bytes allocated and then freed on failures which
+ * happen due to the initial kernel_read_file_from_fd(). kernel_read_file_from_fd()
+ * uses vmalloc(). These should typically not happen unless your system is
+ * under memory pressure.
+ * * invalid_decompress_bytes: number of bytes allocated and freed due to
+ * memory allocations in the module decompression path that use vmap().
+ * These typically should not happen unless your system is under memory
+ * pressure.
+ * * invalid_becoming_bytes: total number of bytes allocated and freed used
+ * to read the kernel module userspace wants us to read before we
+ * promote it to be processed to be added to our @modules linked list. These
+ * failures can happen if we had a check in between a successful kernel_read_file_from_fd()
+ * call and right before we allocate the our private memory for the module
+ * which would be kept if the module is successfully loaded. The most common
+ * reason for this failure is when userspace is racing to load a module
+ * which it does not yet see loaded. The first module to succeed in
+ * add_unformed_module() will add a module to our &modules list and
+ * subsequent loads of modules with the same name will error out at the
+ * end of early_mod_check(). The check for module_patient_check_exists()
+ * at the end of early_mod_check() prevents duplicate allocations
+ * on layout_and_allocate() for modules already being processed. These
+ * duplicate failed modules are non-fatal, however they typically are
+ * indicative of userspace not seeing a module in userspace loaded yet and
+ * unnecessarily trying to load a module before the kernel even has a chance
+ * to begin to process prior requests. Although duplicate failures can be
+ * non-fatal, we should try to reduce vmalloc() pressure proactively, so
+ * ideally after boot this will be close to as 0 as possible. If module
+ * decompression was used we also add to this counter the cost of the
+ * initial kernel_read_file_from_fd() of the compressed module. If module
+ * decompression was not used the value represents the total allocated and
+ * freed bytes in kernel_read_file_from_fd() calls for these type of
+ * failures. These failures can occur because:
+ *
+ * * module_sig_check() - module signature checks
+ * * elf_validity_cache_copy() - some ELF validation issue
+ * * early_mod_check():
+ *
+ * * blacklisting
+ * * failed to rewrite section headers
+ * * version magic
+ * * live patch requirements didn't check out
+ * * the module was detected as being already present
+ *
+ * * invalid_mod_bytes: these are the total number of bytes allocated and
+ * freed due to failures after we did all the sanity checks of the module
+ * which userspace passed to us and after our first check that the module
+ * is unique. A module can still fail to load if we detect the module is
+ * loaded after we allocate space for it with layout_and_allocate(), we do
+ * this check right before processing the module as live and run its
+ * initialization routines. Note that you have a failure of this type it
+ * also means the respective kernel_read_file_from_fd() memory space was
+ * also freed and not used, and so we increment this counter with twice
+ * the size of the module. Additionally if you used module decompression
+ * the size of the compressed module is also added to this counter.
+ *
+ * * modcount: how many modules we've loaded in our kernel life time
+ * * failed_kreads: how many modules failed due to failed kernel_read_file_from_fd()
+ * * failed_decompress: how many failed module decompression attempts we've had.
+ * These really should not happen unless your compression / decompression
+ * might be broken.
+ * * failed_becoming: how many modules failed after we kernel_read_file_from_fd()
+ * it and before we allocate memory for it with layout_and_allocate(). This
+ * counter is never incremented if you manage to validate the module and
+ * call layout_and_allocate() for it.
+ * * failed_load_modules: how many modules failed once we've allocated our
+ * private space for our module using layout_and_allocate(). These failures
+ * should hopefully mostly be dealt with already. Races in theory could
+ * still exist here, but it would just mean the kernel had started processing
+ * two threads concurrently up to early_mod_check() and one thread won.
+ * These failures are good signs the kernel or userspace is doing something
+ * seriously stupid or that could be improved. We should strive to fix these,
+ * but it is perhaps not easy to fix them. A recent example are the modules
+ * requests incurred for frequency modules, a separate module request was
+ * being issued for each CPU on a system.
+ */
+
+atomic_long_t total_mod_size;
+atomic_long_t total_text_size;
+atomic_long_t invalid_kread_bytes;
+atomic_long_t invalid_decompress_bytes;
+static atomic_long_t invalid_becoming_bytes;
+static atomic_long_t invalid_mod_bytes;
+atomic_t modcount;
+atomic_t failed_kreads;
+atomic_t failed_decompress;
+static atomic_t failed_becoming;
+static atomic_t failed_load_modules;
+
+static const char *mod_fail_to_str(struct mod_fail_load *mod_fail)
+{
+ if (test_bit(FAIL_DUP_MOD_BECOMING, &mod_fail->dup_fail_mask) &&
+ test_bit(FAIL_DUP_MOD_LOAD, &mod_fail->dup_fail_mask))
+ return "Becoming & Load";
+ if (test_bit(FAIL_DUP_MOD_BECOMING, &mod_fail->dup_fail_mask))
+ return "Becoming";
+ if (test_bit(FAIL_DUP_MOD_LOAD, &mod_fail->dup_fail_mask))
+ return "Load";
+ return "Bug-on-stats";
+}
+
+void mod_stat_bump_invalid(struct load_info *info, int flags)
+{
+ atomic_long_add(info->len * 2, &invalid_mod_bytes);
+ atomic_inc(&failed_load_modules);
+#if defined(CONFIG_MODULE_DECOMPRESS)
+ if (flags & MODULE_INIT_COMPRESSED_FILE)
+ atomic_long_add(info->compressed_len, &invalid_mod_bytes);
+#endif
+}
+
+void mod_stat_bump_becoming(struct load_info *info, int flags)
+{
+ atomic_inc(&failed_becoming);
+ atomic_long_add(info->len, &invalid_becoming_bytes);
+#if defined(CONFIG_MODULE_DECOMPRESS)
+ if (flags & MODULE_INIT_COMPRESSED_FILE)
+ atomic_long_add(info->compressed_len, &invalid_becoming_bytes);
+#endif
+}
+
+int try_add_failed_module(const char *name, enum fail_dup_mod_reason reason)
+{
+ struct mod_fail_load *mod_fail;
+
+ list_for_each_entry_rcu(mod_fail, &dup_failed_modules, list,
+ lockdep_is_held(&module_mutex)) {
+ if (!strcmp(mod_fail->name, name)) {
+ atomic_long_inc(&mod_fail->count);
+ __set_bit(reason, &mod_fail->dup_fail_mask);
+ goto out;
+ }
+ }
+
+ mod_fail = kzalloc(sizeof(*mod_fail), GFP_KERNEL);
+ if (!mod_fail)
+ return -ENOMEM;
+ memcpy(mod_fail->name, name, strlen(name));
+ __set_bit(reason, &mod_fail->dup_fail_mask);
+ atomic_long_inc(&mod_fail->count);
+ list_add_rcu(&mod_fail->list, &dup_failed_modules);
+out:
+ return 0;
+}
+
+/*
+ * At 64 bytes per module and assuming a 1024 bytes preamble we can fit the
+ * 112 module prints within 8k.
+ *
+ * 1024 + (64*112) = 8k
+ */
+#define MAX_PREAMBLE 1024
+#define MAX_FAILED_MOD_PRINT 112
+#define MAX_BYTES_PER_MOD 64
+static ssize_t read_file_mod_stats(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct mod_fail_load *mod_fail;
+ unsigned int len, size, count_failed = 0;
+ char *buf;
+ int ret;
+ u32 live_mod_count, fkreads, fdecompress, fbecoming, floads;
+ unsigned long total_size, text_size, ikread_bytes, ibecoming_bytes,
+ idecompress_bytes, imod_bytes, total_virtual_lost;
+
+ live_mod_count = atomic_read(&modcount);
+ fkreads = atomic_read(&failed_kreads);
+ fdecompress = atomic_read(&failed_decompress);
+ fbecoming = atomic_read(&failed_becoming);
+ floads = atomic_read(&failed_load_modules);
+
+ total_size = atomic_long_read(&total_mod_size);
+ text_size = atomic_long_read(&total_text_size);
+ ikread_bytes = atomic_long_read(&invalid_kread_bytes);
+ idecompress_bytes = atomic_long_read(&invalid_decompress_bytes);
+ ibecoming_bytes = atomic_long_read(&invalid_becoming_bytes);
+ imod_bytes = atomic_long_read(&invalid_mod_bytes);
+
+ total_virtual_lost = ikread_bytes + idecompress_bytes + ibecoming_bytes + imod_bytes;
+
+ size = MAX_PREAMBLE + min((unsigned int)(floads + fbecoming),
+ (unsigned int)MAX_FAILED_MOD_PRINT) * MAX_BYTES_PER_MOD;
+ buf = kzalloc(size, GFP_KERNEL);
+ if (buf == NULL)
+ return -ENOMEM;
+
+ /* The beginning of our debug preamble */
+ len = scnprintf(buf, size, "%25s\t%u\n", "Mods ever loaded", live_mod_count);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on kread", fkreads);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on decompress",
+ fdecompress);
+ len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on becoming", fbecoming);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on load", floads);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Total module size", total_size);
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Total mod text size", text_size);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed kread bytes", ikread_bytes);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed decompress bytes",
+ idecompress_bytes);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed becoming bytes", ibecoming_bytes);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed kmod bytes", imod_bytes);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Virtual mem wasted bytes", total_virtual_lost);
+
+ if (live_mod_count && total_size) {
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Average mod size",
+ DIV_ROUND_UP(total_size, live_mod_count));
+ }
+
+ if (live_mod_count && text_size) {
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Average mod text size",
+ DIV_ROUND_UP(text_size, live_mod_count));
+ }
+
+ /*
+ * We use WARN_ON_ONCE() for the counters to ensure we always have parity
+ * for keeping tabs on a type of failure with one type of byte counter.
+ * The counters for imod_bytes does not increase for fkreads failures
+ * for example, and so on.
+ */
+
+ WARN_ON_ONCE(ikread_bytes && !fkreads);
+ if (fkreads && ikread_bytes) {
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Avg fail kread bytes",
+ DIV_ROUND_UP(ikread_bytes, fkreads));
+ }
+
+ WARN_ON_ONCE(ibecoming_bytes && !fbecoming);
+ if (fbecoming && ibecoming_bytes) {
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Avg fail becoming bytes",
+ DIV_ROUND_UP(ibecoming_bytes, fbecoming));
+ }
+
+ WARN_ON_ONCE(idecompress_bytes && !fdecompress);
+ if (fdecompress && idecompress_bytes) {
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Avg fail decomp bytes",
+ DIV_ROUND_UP(idecompress_bytes, fdecompress));
+ }
+
+ WARN_ON_ONCE(imod_bytes && !floads);
+ if (floads && imod_bytes) {
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Average fail load bytes",
+ DIV_ROUND_UP(imod_bytes, floads));
+ }
+
+ /* End of our debug preamble header. */
+
+ /* Catch when we've gone beyond our expected preamble */
+ WARN_ON_ONCE(len >= MAX_PREAMBLE);
+
+ if (list_empty(&dup_failed_modules))
+ goto out;
+
+ len += scnprintf(buf + len, size - len, "Duplicate failed modules:\n");
+ len += scnprintf(buf + len, size - len, "%25s\t%15s\t%25s\n",
+ "Module-name", "How-many-times", "Reason");
+ mutex_lock(&module_mutex);
+
+
+ list_for_each_entry_rcu(mod_fail, &dup_failed_modules, list) {
+ if (WARN_ON_ONCE(++count_failed >= MAX_FAILED_MOD_PRINT))
+ goto out_unlock;
+ len += scnprintf(buf + len, size - len, "%25s\t%15lu\t%25s\n", mod_fail->name,
+ atomic_long_read(&mod_fail->count), mod_fail_to_str(mod_fail));
+ }
+out_unlock:
+ mutex_unlock(&module_mutex);
+out:
+ ret = simple_read_from_buffer(user_buf, count, ppos, buf, len);
+ kfree(buf);
+ return ret;
+}
+#undef MAX_PREAMBLE
+#undef MAX_FAILED_MOD_PRINT
+#undef MAX_BYTES_PER_MOD
+
+static const struct file_operations fops_mod_stats = {
+ .read = read_file_mod_stats,
+ .open = simple_open,
+ .owner = THIS_MODULE,
+ .llseek = default_llseek,
+};
+
+#define mod_debug_add_ulong(name) debugfs_create_ulong(#name, 0400, mod_debugfs_root, (unsigned long *) &name.counter)
+#define mod_debug_add_atomic(name) debugfs_create_atomic_t(#name, 0400, mod_debugfs_root, &name)
+static int __init module_stats_init(void)
+{
+ mod_debug_add_ulong(total_mod_size);
+ mod_debug_add_ulong(total_text_size);
+ mod_debug_add_ulong(invalid_kread_bytes);
+ mod_debug_add_ulong(invalid_decompress_bytes);
+ mod_debug_add_ulong(invalid_becoming_bytes);
+ mod_debug_add_ulong(invalid_mod_bytes);
+
+ mod_debug_add_atomic(modcount);
+ mod_debug_add_atomic(failed_kreads);
+ mod_debug_add_atomic(failed_decompress);
+ mod_debug_add_atomic(failed_becoming);
+ mod_debug_add_atomic(failed_load_modules);
+
+ debugfs_create_file("stats", 0400, mod_debugfs_root, mod_debugfs_root, &fops_mod_stats);
+
+ return 0;
+}
+#undef mod_debug_add_ulong
+#undef mod_debug_add_atomic
+module_init(module_stats_init);
diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c
new file mode 100644
index 000000000000..8fd438529fbc
--- /dev/null
+++ b/kernel/module/strict_rwx.c
@@ -0,0 +1,151 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module strict rwx
+ *
+ * Copyright (C) 2015 Rusty Russell
+ */
+
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/set_memory.h>
+#include <linux/execmem.h>
+#include "internal.h"
+
+static int module_set_memory(const struct module *mod, enum mod_mem_type type,
+ int (*set_memory)(unsigned long start, int num_pages))
+{
+ const struct module_memory *mod_mem = &mod->mem[type];
+
+ if (!mod_mem->base)
+ return 0;
+
+ set_vm_flush_reset_perms(mod_mem->base);
+ return set_memory((unsigned long)mod_mem->base, mod_mem->size >> PAGE_SHIFT);
+}
+
+/*
+ * Since some arches are moving towards PAGE_KERNEL module allocations instead
+ * of PAGE_KERNEL_EXEC, keep module_enable_x() independent of
+ * CONFIG_STRICT_MODULE_RWX because they are needed regardless of whether we
+ * are strict.
+ */
+int module_enable_text_rox(const struct module *mod)
+{
+ for_class_mod_mem_type(type, text) {
+ const struct module_memory *mem = &mod->mem[type];
+ int ret;
+
+ if (mem->is_rox)
+ ret = execmem_restore_rox(mem->base, mem->size);
+ else if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+ ret = module_set_memory(mod, type, set_memory_rox);
+ else
+ ret = module_set_memory(mod, type, set_memory_x);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+int module_enable_rodata_ro(const struct module *mod)
+{
+ int ret;
+
+ if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX) || !rodata_enabled)
+ return 0;
+
+ ret = module_set_memory(mod, MOD_RODATA, set_memory_ro);
+ if (ret)
+ return ret;
+ ret = module_set_memory(mod, MOD_INIT_RODATA, set_memory_ro);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+int module_enable_rodata_ro_after_init(const struct module *mod)
+{
+ if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX) || !rodata_enabled)
+ return 0;
+
+ return module_set_memory(mod, MOD_RO_AFTER_INIT, set_memory_ro);
+}
+
+int module_enable_data_nx(const struct module *mod)
+{
+ if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+ return 0;
+
+ for_class_mod_mem_type(type, data) {
+ int ret = module_set_memory(mod, type, set_memory_nx);
+
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+int module_enforce_rwx_sections(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs,
+ const char *secstrings,
+ const struct module *mod)
+{
+ const unsigned long shf_wx = SHF_WRITE | SHF_EXECINSTR;
+ int i;
+
+ if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+ return 0;
+
+ for (i = 0; i < hdr->e_shnum; i++) {
+ if ((sechdrs[i].sh_flags & shf_wx) == shf_wx) {
+ pr_err("%s: section %s (index %d) has invalid WRITE|EXEC flags\n",
+ mod->name, secstrings + sechdrs[i].sh_name, i);
+ return -ENOEXEC;
+ }
+ }
+
+ return 0;
+}
+
+static const char *const ro_after_init[] = {
+ /*
+ * Section .data..ro_after_init holds data explicitly annotated by
+ * __ro_after_init.
+ */
+ ".data..ro_after_init",
+
+ /*
+ * Section __jump_table holds data structures that are never modified,
+ * with the exception of entries that refer to code in the __init
+ * section, which are marked as such at module load time.
+ */
+ "__jump_table",
+
+#ifdef CONFIG_HAVE_STATIC_CALL_INLINE
+ /*
+ * Section .static_call_sites holds data structures that need to be
+ * sorted and processed at module load time but are never modified
+ * afterwards.
+ */
+ ".static_call_sites",
+#endif
+};
+
+void module_mark_ro_after_init(const Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
+ const char *secstrings)
+{
+ int i, j;
+
+ for (i = 1; i < hdr->e_shnum; i++) {
+ Elf_Shdr *shdr = &sechdrs[i];
+
+ for (j = 0; j < ARRAY_SIZE(ro_after_init); j++) {
+ if (strcmp(secstrings + shdr->sh_name,
+ ro_after_init[j]) == 0) {
+ shdr->sh_flags |= SHF_RO_AFTER_INIT;
+ break;
+ }
+ }
+ }
+}
diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c
new file mode 100644
index 000000000000..c7622ff5226a
--- /dev/null
+++ b/kernel/module/sysfs.c
@@ -0,0 +1,440 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module sysfs support
+ *
+ * Copyright (C) 2008 Rusty Russell
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/sysfs.h>
+#include <linux/slab.h>
+#include <linux/kallsyms.h>
+#include <linux/mutex.h>
+#include "internal.h"
+
+/*
+ * /sys/module/foo/sections stuff
+ * J. Corbet <corbet@lwn.net>
+ */
+#ifdef CONFIG_KALLSYMS
+struct module_sect_attrs {
+ struct attribute_group grp;
+ struct bin_attribute attrs[];
+};
+
+#define MODULE_SECT_READ_SIZE (3 /* "0x", "\n" */ + (BITS_PER_LONG / 4))
+static ssize_t module_sect_read(struct file *file, struct kobject *kobj,
+ const struct bin_attribute *battr,
+ char *buf, loff_t pos, size_t count)
+{
+ char bounce[MODULE_SECT_READ_SIZE + 1];
+ size_t wrote;
+
+ if (pos != 0)
+ return -EINVAL;
+
+ /*
+ * Since we're a binary read handler, we must account for the
+ * trailing NUL byte that sprintf will write: if "buf" is
+ * too small to hold the NUL, or the NUL is exactly the last
+ * byte, the read will look like it got truncated by one byte.
+ * Since there is no way to ask sprintf nicely to not write
+ * the NUL, we have to use a bounce buffer.
+ */
+ wrote = scnprintf(bounce, sizeof(bounce), "0x%px\n",
+ kallsyms_show_value(file->f_cred)
+ ? battr->private : NULL);
+ count = min(count, wrote);
+ memcpy(buf, bounce, count);
+
+ return count;
+}
+
+static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
+{
+ const struct bin_attribute *const *bin_attr;
+
+ for (bin_attr = sect_attrs->grp.bin_attrs; *bin_attr; bin_attr++)
+ kfree((*bin_attr)->attr.name);
+ kfree(sect_attrs->grp.bin_attrs);
+ kfree(sect_attrs);
+}
+
+static int add_sect_attrs(struct module *mod, const struct load_info *info)
+{
+ struct module_sect_attrs *sect_attrs;
+ const struct bin_attribute **gattr;
+ struct bin_attribute *sattr;
+ unsigned int nloaded = 0, i;
+ int ret;
+
+ /* Count loaded sections and allocate structures */
+ for (i = 0; i < info->hdr->e_shnum; i++)
+ if (!sect_empty(&info->sechdrs[i]))
+ nloaded++;
+ sect_attrs = kzalloc(struct_size(sect_attrs, attrs, nloaded), GFP_KERNEL);
+ if (!sect_attrs)
+ return -ENOMEM;
+
+ gattr = kcalloc(nloaded + 1, sizeof(*gattr), GFP_KERNEL);
+ if (!gattr) {
+ kfree(sect_attrs);
+ return -ENOMEM;
+ }
+
+ /* Setup section attributes. */
+ sect_attrs->grp.name = "sections";
+ sect_attrs->grp.bin_attrs = gattr;
+
+ sattr = &sect_attrs->attrs[0];
+ for (i = 0; i < info->hdr->e_shnum; i++) {
+ Elf_Shdr *sec = &info->sechdrs[i];
+
+ if (sect_empty(sec))
+ continue;
+ sysfs_bin_attr_init(sattr);
+ sattr->attr.name =
+ kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL);
+ if (!sattr->attr.name) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ sattr->read = module_sect_read;
+ sattr->private = (void *)sec->sh_addr;
+ sattr->size = MODULE_SECT_READ_SIZE;
+ sattr->attr.mode = 0400;
+ *(gattr++) = sattr++;
+ }
+
+ ret = sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp);
+ if (ret)
+ goto out;
+
+ mod->sect_attrs = sect_attrs;
+ return 0;
+out:
+ free_sect_attrs(sect_attrs);
+ return ret;
+}
+
+static void remove_sect_attrs(struct module *mod)
+{
+ if (mod->sect_attrs) {
+ sysfs_remove_group(&mod->mkobj.kobj,
+ &mod->sect_attrs->grp);
+ /*
+ * We are positive that no one is using any sect attrs
+ * at this point. Deallocate immediately.
+ */
+ free_sect_attrs(mod->sect_attrs);
+ mod->sect_attrs = NULL;
+ }
+}
+
+/*
+ * /sys/module/foo/notes/.section.name gives contents of SHT_NOTE sections.
+ */
+
+struct module_notes_attrs {
+ struct attribute_group grp;
+ struct bin_attribute attrs[];
+};
+
+static void free_notes_attrs(struct module_notes_attrs *notes_attrs)
+{
+ kfree(notes_attrs->grp.bin_attrs);
+ kfree(notes_attrs);
+}
+
+static int add_notes_attrs(struct module *mod, const struct load_info *info)
+{
+ unsigned int notes, loaded, i;
+ struct module_notes_attrs *notes_attrs;
+ const struct bin_attribute **gattr;
+ struct bin_attribute *nattr;
+ int ret;
+
+ /* Count notes sections and allocate structures. */
+ notes = 0;
+ for (i = 0; i < info->hdr->e_shnum; i++)
+ if (!sect_empty(&info->sechdrs[i]) &&
+ info->sechdrs[i].sh_type == SHT_NOTE)
+ ++notes;
+
+ if (notes == 0)
+ return 0;
+
+ notes_attrs = kzalloc(struct_size(notes_attrs, attrs, notes),
+ GFP_KERNEL);
+ if (!notes_attrs)
+ return -ENOMEM;
+
+ gattr = kcalloc(notes + 1, sizeof(*gattr), GFP_KERNEL);
+ if (!gattr) {
+ kfree(notes_attrs);
+ return -ENOMEM;
+ }
+
+ notes_attrs->grp.name = "notes";
+ notes_attrs->grp.bin_attrs = gattr;
+
+ nattr = &notes_attrs->attrs[0];
+ for (loaded = i = 0; i < info->hdr->e_shnum; ++i) {
+ if (sect_empty(&info->sechdrs[i]))
+ continue;
+ if (info->sechdrs[i].sh_type == SHT_NOTE) {
+ sysfs_bin_attr_init(nattr);
+ nattr->attr.name = mod->sect_attrs->attrs[loaded].attr.name;
+ nattr->attr.mode = 0444;
+ nattr->size = info->sechdrs[i].sh_size;
+ nattr->private = (void *)info->sechdrs[i].sh_addr;
+ nattr->read = sysfs_bin_attr_simple_read;
+ *(gattr++) = nattr++;
+ }
+ ++loaded;
+ }
+
+ ret = sysfs_create_group(&mod->mkobj.kobj, &notes_attrs->grp);
+ if (ret)
+ goto out;
+
+ mod->notes_attrs = notes_attrs;
+ return 0;
+
+out:
+ free_notes_attrs(notes_attrs);
+ return ret;
+}
+
+static void remove_notes_attrs(struct module *mod)
+{
+ if (mod->notes_attrs) {
+ sysfs_remove_group(&mod->mkobj.kobj,
+ &mod->notes_attrs->grp);
+ /*
+ * We are positive that no one is using any notes attrs
+ * at this point. Deallocate immediately.
+ */
+ free_notes_attrs(mod->notes_attrs);
+ mod->notes_attrs = NULL;
+ }
+}
+
+#else /* !CONFIG_KALLSYMS */
+static inline int add_sect_attrs(struct module *mod, const struct load_info *info)
+{
+ return 0;
+}
+static inline void remove_sect_attrs(struct module *mod) { }
+static inline int add_notes_attrs(struct module *mod, const struct load_info *info)
+{
+ return 0;
+}
+static inline void remove_notes_attrs(struct module *mod) { }
+#endif /* CONFIG_KALLSYMS */
+
+static void del_usage_links(struct module *mod)
+{
+#ifdef CONFIG_MODULE_UNLOAD
+ struct module_use *use;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(use, &mod->target_list, target_list)
+ sysfs_remove_link(use->target->holders_dir, mod->name);
+ mutex_unlock(&module_mutex);
+#endif
+}
+
+static int add_usage_links(struct module *mod)
+{
+ int ret = 0;
+#ifdef CONFIG_MODULE_UNLOAD
+ struct module_use *use;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(use, &mod->target_list, target_list) {
+ ret = sysfs_create_link(use->target->holders_dir,
+ &mod->mkobj.kobj, mod->name);
+ if (ret)
+ break;
+ }
+ mutex_unlock(&module_mutex);
+ if (ret)
+ del_usage_links(mod);
+#endif
+ return ret;
+}
+
+static void module_remove_modinfo_attrs(struct module *mod, int end)
+{
+ const struct module_attribute *attr;
+ int i;
+
+ for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) {
+ if (end >= 0 && i > end)
+ break;
+ /* pick a field to test for end of list */
+ if (!attr->attr.name)
+ break;
+ sysfs_remove_file(&mod->mkobj.kobj, &attr->attr);
+ if (attr->free)
+ attr->free(mod);
+ }
+ kfree(mod->modinfo_attrs);
+}
+
+static int module_add_modinfo_attrs(struct module *mod)
+{
+ const struct module_attribute *attr;
+ struct module_attribute *temp_attr;
+ int error = 0;
+ int i;
+
+ mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) *
+ (modinfo_attrs_count + 1)),
+ GFP_KERNEL);
+ if (!mod->modinfo_attrs)
+ return -ENOMEM;
+
+ temp_attr = mod->modinfo_attrs;
+ for (i = 0; (attr = modinfo_attrs[i]); i++) {
+ if (!attr->test || attr->test(mod)) {
+ memcpy(temp_attr, attr, sizeof(*temp_attr));
+ sysfs_attr_init(&temp_attr->attr);
+ error = sysfs_create_file(&mod->mkobj.kobj,
+ &temp_attr->attr);
+ if (error)
+ goto error_out;
+ ++temp_attr;
+ }
+ }
+
+ return 0;
+
+error_out:
+ if (i > 0)
+ module_remove_modinfo_attrs(mod, --i);
+ else
+ kfree(mod->modinfo_attrs);
+ return error;
+}
+
+static void mod_kobject_put(struct module *mod)
+{
+ DECLARE_COMPLETION_ONSTACK(c);
+
+ mod->mkobj.kobj_completion = &c;
+ kobject_put(&mod->mkobj.kobj);
+ wait_for_completion(&c);
+}
+
+static int mod_sysfs_init(struct module *mod)
+{
+ int err;
+ struct kobject *kobj;
+
+ if (!module_kset) {
+ pr_err("%s: module sysfs not initialized\n", mod->name);
+ err = -EINVAL;
+ goto out;
+ }
+
+ kobj = kset_find_obj(module_kset, mod->name);
+ if (kobj) {
+ pr_err("%s: module is already loaded\n", mod->name);
+ kobject_put(kobj);
+ err = -EINVAL;
+ goto out;
+ }
+
+ mod->mkobj.mod = mod;
+
+ memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
+ mod->mkobj.kobj.kset = module_kset;
+ err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL,
+ "%s", mod->name);
+ if (err)
+ mod_kobject_put(mod);
+
+out:
+ return err;
+}
+
+int mod_sysfs_setup(struct module *mod,
+ const struct load_info *info,
+ struct kernel_param *kparam,
+ unsigned int num_params)
+{
+ int err;
+
+ err = mod_sysfs_init(mod);
+ if (err)
+ goto out;
+
+ mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj);
+ if (!mod->holders_dir) {
+ err = -ENOMEM;
+ goto out_unreg;
+ }
+
+ err = module_param_sysfs_setup(mod, kparam, num_params);
+ if (err)
+ goto out_unreg_holders;
+
+ err = module_add_modinfo_attrs(mod);
+ if (err)
+ goto out_unreg_param;
+
+ err = add_usage_links(mod);
+ if (err)
+ goto out_unreg_modinfo_attrs;
+
+ err = add_sect_attrs(mod, info);
+ if (err)
+ goto out_del_usage_links;
+
+ err = add_notes_attrs(mod, info);
+ if (err)
+ goto out_unreg_sect_attrs;
+
+ return 0;
+
+out_unreg_sect_attrs:
+ remove_sect_attrs(mod);
+out_del_usage_links:
+ del_usage_links(mod);
+out_unreg_modinfo_attrs:
+ module_remove_modinfo_attrs(mod, -1);
+out_unreg_param:
+ module_param_sysfs_remove(mod);
+out_unreg_holders:
+ kobject_put(mod->holders_dir);
+out_unreg:
+ mod_kobject_put(mod);
+out:
+ return err;
+}
+
+static void mod_sysfs_fini(struct module *mod)
+{
+ remove_notes_attrs(mod);
+ remove_sect_attrs(mod);
+ mod_kobject_put(mod);
+}
+
+void mod_sysfs_teardown(struct module *mod)
+{
+ del_usage_links(mod);
+ module_remove_modinfo_attrs(mod, -1);
+ module_param_sysfs_remove(mod);
+ kobject_put(mod->mkobj.drivers_dir);
+ kobject_put(mod->holders_dir);
+ mod_sysfs_fini(mod);
+}
+
+void init_param_lock(struct module *mod)
+{
+ mutex_init(&mod->param_lock);
+}
diff --git a/kernel/module/tracking.c b/kernel/module/tracking.c
new file mode 100644
index 000000000000..4fefec5b683c
--- /dev/null
+++ b/kernel/module/tracking.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module taint unload tracking support
+ *
+ * Copyright (C) 2022 Aaron Tomlin
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/debugfs.h>
+#include <linux/rculist.h>
+#include "internal.h"
+
+static LIST_HEAD(unloaded_tainted_modules);
+extern struct dentry *mod_debugfs_root;
+
+int try_add_tainted_module(struct module *mod)
+{
+ struct mod_unload_taint *mod_taint;
+
+ if (!mod->taints)
+ goto out;
+
+ list_for_each_entry_rcu(mod_taint, &unloaded_tainted_modules, list,
+ lockdep_is_held(&module_mutex)) {
+ if (!strcmp(mod_taint->name, mod->name) &&
+ mod_taint->taints & mod->taints) {
+ mod_taint->count++;
+ goto out;
+ }
+ }
+
+ mod_taint = kmalloc(sizeof(*mod_taint), GFP_KERNEL);
+ if (unlikely(!mod_taint))
+ return -ENOMEM;
+ strscpy(mod_taint->name, mod->name, MODULE_NAME_LEN);
+ mod_taint->taints = mod->taints;
+ list_add_rcu(&mod_taint->list, &unloaded_tainted_modules);
+ mod_taint->count = 1;
+out:
+ return 0;
+}
+
+void print_unloaded_tainted_modules(void)
+{
+ struct mod_unload_taint *mod_taint;
+ char buf[MODULE_FLAGS_BUF_SIZE];
+
+ if (!list_empty(&unloaded_tainted_modules)) {
+ printk(KERN_DEFAULT "Unloaded tainted modules:");
+ list_for_each_entry_rcu(mod_taint, &unloaded_tainted_modules,
+ list) {
+ size_t l;
+
+ l = module_flags_taint(mod_taint->taints, buf);
+ buf[l++] = '\0';
+ pr_cont(" %s(%s):%llu", mod_taint->name, buf,
+ mod_taint->count);
+ }
+ }
+}
+
+#ifdef CONFIG_DEBUG_FS
+static void *unloaded_tainted_modules_seq_start(struct seq_file *m, loff_t *pos)
+ __acquires(rcu)
+{
+ rcu_read_lock();
+ return seq_list_start_rcu(&unloaded_tainted_modules, *pos);
+}
+
+static void *unloaded_tainted_modules_seq_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ return seq_list_next_rcu(p, &unloaded_tainted_modules, pos);
+}
+
+static void unloaded_tainted_modules_seq_stop(struct seq_file *m, void *p)
+ __releases(rcu)
+{
+ rcu_read_unlock();
+}
+
+static int unloaded_tainted_modules_seq_show(struct seq_file *m, void *p)
+{
+ struct mod_unload_taint *mod_taint;
+ char buf[MODULE_FLAGS_BUF_SIZE];
+ size_t l;
+
+ mod_taint = list_entry(p, struct mod_unload_taint, list);
+ l = module_flags_taint(mod_taint->taints, buf);
+ buf[l++] = '\0';
+
+ seq_printf(m, "%s (%s) %llu", mod_taint->name, buf, mod_taint->count);
+ seq_puts(m, "\n");
+
+ return 0;
+}
+
+static const struct seq_operations unloaded_tainted_modules_seq_ops = {
+ .start = unloaded_tainted_modules_seq_start,
+ .next = unloaded_tainted_modules_seq_next,
+ .stop = unloaded_tainted_modules_seq_stop,
+ .show = unloaded_tainted_modules_seq_show,
+};
+
+static int unloaded_tainted_modules_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &unloaded_tainted_modules_seq_ops);
+}
+
+static const struct file_operations unloaded_tainted_modules_fops = {
+ .open = unloaded_tainted_modules_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int __init unloaded_tainted_modules_init(void)
+{
+ debugfs_create_file("unloaded_tainted", 0444, mod_debugfs_root, NULL,
+ &unloaded_tainted_modules_fops);
+ return 0;
+}
+module_init(unloaded_tainted_modules_init);
+#endif /* CONFIG_DEBUG_FS */
diff --git a/kernel/module/tree_lookup.c b/kernel/module/tree_lookup.c
new file mode 100644
index 000000000000..f8e8c126705c
--- /dev/null
+++ b/kernel/module/tree_lookup.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Modules tree lookup
+ *
+ * Copyright (C) 2015 Peter Zijlstra
+ * Copyright (C) 2015 Rusty Russell
+ */
+
+#include <linux/module.h>
+#include <linux/rbtree_latch.h>
+#include "internal.h"
+
+/*
+ * Use a latched RB-tree for __module_address(); this allows us to use
+ * RCU lookups of the address from any context.
+ *
+ * This is conditional on PERF_EVENTS || TRACING || CFI because those can
+ * really hit __module_address() hard by doing a lot of stack unwinding;
+ * potentially from NMI context.
+ */
+
+static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n)
+{
+ struct module_memory *mod_mem = container_of(n, struct module_memory, mtn.node);
+
+ return (unsigned long)mod_mem->base;
+}
+
+static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n)
+{
+ struct module_memory *mod_mem = container_of(n, struct module_memory, mtn.node);
+
+ return (unsigned long)mod_mem->size;
+}
+
+static __always_inline bool
+mod_tree_less(struct latch_tree_node *a, struct latch_tree_node *b)
+{
+ return __mod_tree_val(a) < __mod_tree_val(b);
+}
+
+static __always_inline int
+mod_tree_comp(void *key, struct latch_tree_node *n)
+{
+ unsigned long val = (unsigned long)key;
+ unsigned long start, end;
+
+ start = __mod_tree_val(n);
+ if (val < start)
+ return -1;
+
+ end = start + __mod_tree_size(n);
+ if (val >= end)
+ return 1;
+
+ return 0;
+}
+
+static const struct latch_tree_ops mod_tree_ops = {
+ .less = mod_tree_less,
+ .comp = mod_tree_comp,
+};
+
+static noinline void __mod_tree_insert(struct mod_tree_node *node, struct mod_tree_root *tree)
+{
+ latch_tree_insert(&node->node, &tree->root, &mod_tree_ops);
+}
+
+static void __mod_tree_remove(struct mod_tree_node *node, struct mod_tree_root *tree)
+{
+ latch_tree_erase(&node->node, &tree->root, &mod_tree_ops);
+}
+
+/*
+ * These modifications: insert, remove_init and remove; are serialized by the
+ * module_mutex.
+ */
+void mod_tree_insert(struct module *mod)
+{
+ for_each_mod_mem_type(type) {
+ mod->mem[type].mtn.mod = mod;
+ if (mod->mem[type].size)
+ __mod_tree_insert(&mod->mem[type].mtn, &mod_tree);
+ }
+}
+
+void mod_tree_remove_init(struct module *mod)
+{
+ for_class_mod_mem_type(type, init) {
+ if (mod->mem[type].size)
+ __mod_tree_remove(&mod->mem[type].mtn, &mod_tree);
+ }
+}
+
+void mod_tree_remove(struct module *mod)
+{
+ for_each_mod_mem_type(type) {
+ if (mod->mem[type].size)
+ __mod_tree_remove(&mod->mem[type].mtn, &mod_tree);
+ }
+}
+
+struct module *mod_find(unsigned long addr, struct mod_tree_root *tree)
+{
+ struct latch_tree_node *ltn;
+
+ ltn = latch_tree_find((void *)addr, &tree->root, &mod_tree_ops);
+ if (!ltn)
+ return NULL;
+
+ return container_of(ltn, struct mod_tree_node, node)->mod;
+}
diff --git a/kernel/module/version.c b/kernel/module/version.c
new file mode 100644
index 000000000000..2beefeba82d9
--- /dev/null
+++ b/kernel/module/version.c
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module version support
+ *
+ * Copyright (C) 2008 Rusty Russell
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/printk.h>
+#include "internal.h"
+
+int check_version(const struct load_info *info,
+ const char *symname,
+ struct module *mod,
+ const u32 *crc)
+{
+ Elf_Shdr *sechdrs = info->sechdrs;
+ unsigned int versindex = info->index.vers;
+ unsigned int i, num_versions;
+ struct modversion_info *versions;
+ struct modversion_info_ext version_ext;
+
+ /* Exporting module didn't supply crcs? OK, we're already tainted. */
+ if (!crc)
+ return 1;
+
+ /* If we have extended version info, rely on it */
+ if (info->index.vers_ext_crc) {
+ for_each_modversion_info_ext(version_ext, info) {
+ if (strcmp(version_ext.name, symname) != 0)
+ continue;
+ if (*version_ext.crc == *crc)
+ return 1;
+ pr_debug("Found checksum %X vs module %X\n",
+ *crc, *version_ext.crc);
+ goto bad_version;
+ }
+ pr_warn_once("%s: no extended symbol version for %s\n",
+ info->name, symname);
+ return 1;
+ }
+
+ /* No versions at all? modprobe --force does this. */
+ if (versindex == 0)
+ return try_to_force_load(mod, symname) == 0;
+
+ versions = (void *)sechdrs[versindex].sh_addr;
+ num_versions = sechdrs[versindex].sh_size
+ / sizeof(struct modversion_info);
+
+ for (i = 0; i < num_versions; i++) {
+ u32 crcval;
+
+ if (strcmp(versions[i].name, symname) != 0)
+ continue;
+
+ crcval = *crc;
+ if (versions[i].crc == crcval)
+ return 1;
+ pr_debug("Found checksum %X vs module %lX\n",
+ crcval, versions[i].crc);
+ goto bad_version;
+ }
+
+ /* Broken toolchain. Warn once, then let it go.. */
+ pr_warn_once("%s: no symbol version for %s\n", info->name, symname);
+ return 1;
+
+bad_version:
+ pr_warn("%s: disagrees about version of symbol %s\n", info->name, symname);
+ return 0;
+}
+
+int check_modstruct_version(const struct load_info *info,
+ struct module *mod)
+{
+ struct find_symbol_arg fsa = {
+ .name = "module_layout",
+ .gplok = true,
+ };
+ bool have_symbol;
+
+ /*
+ * Since this should be found in kernel (which can't be removed), no
+ * locking is necessary. Regardless use a RCU read section to keep
+ * lockdep happy.
+ */
+ scoped_guard(rcu)
+ have_symbol = find_symbol(&fsa);
+ BUG_ON(!have_symbol);
+
+ return check_version(info, "module_layout", mod, fsa.crc);
+}
+
+/* First part is kernel version, which we ignore if module has crcs. */
+int same_magic(const char *amagic, const char *bmagic,
+ bool has_crcs)
+{
+ if (has_crcs) {
+ amagic += strcspn(amagic, " ");
+ bmagic += strcspn(bmagic, " ");
+ }
+ return strcmp(amagic, bmagic) == 0;
+}
+
+void modversion_ext_start(const struct load_info *info,
+ struct modversion_info_ext *start)
+{
+ unsigned int crc_idx = info->index.vers_ext_crc;
+ unsigned int name_idx = info->index.vers_ext_name;
+ Elf_Shdr *sechdrs = info->sechdrs;
+
+ /*
+ * Both of these fields are needed for this to be useful
+ * Any future fields should be initialized to NULL if absent.
+ */
+ if (crc_idx == 0 || name_idx == 0) {
+ start->remaining = 0;
+ return;
+ }
+
+ start->crc = (const u32 *)sechdrs[crc_idx].sh_addr;
+ start->name = (const char *)sechdrs[name_idx].sh_addr;
+ start->remaining = sechdrs[crc_idx].sh_size / sizeof(*start->crc);
+}
+
+void modversion_ext_advance(struct modversion_info_ext *vers)
+{
+ vers->remaining--;
+ vers->crc++;
+ vers->name += strlen(vers->name) + 1;
+}
+
+/*
+ * Generate the signature for all relevant module structures here.
+ * If these change, we don't want to try to parse the module.
+ */
+void module_layout(struct module *mod,
+ struct modversion_info *ver,
+ struct kernel_param *kp,
+ struct kernel_symbol *ks,
+ struct tracepoint * const *tp)
+{
+}
+EXPORT_SYMBOL(module_layout);
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
deleted file mode 100644
index 8723ae70ea1f..000000000000
--- a/kernel/module_signing.c
+++ /dev/null
@@ -1,45 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/* Module signature checker
- *
- * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
-
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/module.h>
-#include <linux/module_signature.h>
-#include <linux/string.h>
-#include <linux/verification.h>
-#include <crypto/public_key.h>
-#include "module-internal.h"
-
-/*
- * Verify the signature on a module.
- */
-int mod_verify_sig(const void *mod, struct load_info *info)
-{
- struct module_signature ms;
- size_t sig_len, modlen = info->len;
- int ret;
-
- pr_devel("==>%s(,%zu)\n", __func__, modlen);
-
- if (modlen <= sizeof(ms))
- return -EBADMSG;
-
- memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms));
-
- ret = mod_check_sig(&ms, modlen, "module");
- if (ret)
- return ret;
-
- sig_len = be32_to_cpu(ms.sig_len);
- modlen -= sig_len + sizeof(ms);
- info->len = modlen;
-
- return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len,
- VERIFY_USE_SECONDARY_KEYRING,
- VERIFYING_MODULE_SIGNATURE,
- NULL, NULL);
-}
diff --git a/kernel/notifier.c b/kernel/notifier.c
index ba005ebf4730..2f9fe7c30287 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -5,14 +5,9 @@
#include <linux/notifier.h>
#include <linux/rcupdate.h>
#include <linux/vmalloc.h>
-#include <linux/reboot.h>
-/*
- * Notifier list for kernel code which wants to be called
- * at shutdown. This is used to stop any idling DMA operations
- * and the like.
- */
-BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
+#define CREATE_TRACE_POINTS
+#include <trace/events/notifier.h>
/*
* Notifier chain core routines. The exported routines below
@@ -20,7 +15,8 @@ BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
*/
static int notifier_chain_register(struct notifier_block **nl,
- struct notifier_block *n)
+ struct notifier_block *n,
+ bool unique_priority)
{
while ((*nl) != NULL) {
if (unlikely((*nl) == n)) {
@@ -30,10 +26,13 @@ static int notifier_chain_register(struct notifier_block **nl,
}
if (n->priority > (*nl)->priority)
break;
+ if (n->priority == (*nl)->priority && unique_priority)
+ return -EBUSY;
nl = &((*nl)->next);
}
n->next = *nl;
rcu_assign_pointer(*nl, n);
+ trace_notifier_register((void *)n->notifier_call);
return 0;
}
@@ -43,6 +42,7 @@ static int notifier_chain_unregister(struct notifier_block **nl,
while ((*nl) != NULL) {
if ((*nl) == n) {
rcu_assign_pointer(*nl, n->next);
+ trace_notifier_unregister((void *)n->notifier_call);
return 0;
}
nl = &((*nl)->next);
@@ -59,7 +59,7 @@ static int notifier_chain_unregister(struct notifier_block **nl,
* value of this parameter is -1.
* @nr_calls: Records the number of notifications sent. Don't care
* value of this field is NULL.
- * @returns: notifier_call_chain returns the value returned by the
+ * Return: notifier_call_chain returns the value returned by the
* last notifier function called.
*/
static int notifier_call_chain(struct notifier_block **nl,
@@ -81,6 +81,7 @@ static int notifier_call_chain(struct notifier_block **nl,
continue;
}
#endif
+ trace_notifier_run((void *)nb->notifier_call);
ret = nb->notifier_call(nb, val, v);
if (nr_calls)
@@ -102,13 +103,13 @@ NOKPROBE_SYMBOL(notifier_call_chain);
* @val_up: Value passed unmodified to the notifier function
* @val_down: Value passed unmodified to the notifier function when recovering
* from an error on @val_up
- * @v Pointer passed unmodified to the notifier function
+ * @v: Pointer passed unmodified to the notifier function
*
* NOTE: It is important the @nl chain doesn't change between the two
* invocations of notifier_call_chain() such that we visit the
* exact same notifier callbacks; this rules out any RCU usage.
*
- * Returns: the return value of the @val_up call.
+ * Return: the return value of the @val_up call.
*/
static int notifier_call_chain_robust(struct notifier_block **nl,
unsigned long val_up, unsigned long val_down,
@@ -144,13 +145,36 @@ int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
int ret;
spin_lock_irqsave(&nh->lock, flags);
- ret = notifier_chain_register(&nh->head, n);
+ ret = notifier_chain_register(&nh->head, n, false);
spin_unlock_irqrestore(&nh->lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(atomic_notifier_chain_register);
/**
+ * atomic_notifier_chain_register_unique_prio - Add notifier to an atomic notifier chain
+ * @nh: Pointer to head of the atomic notifier chain
+ * @n: New entry in notifier chain
+ *
+ * Adds a notifier to an atomic notifier chain if there is no other
+ * notifier registered using the same priority.
+ *
+ * Returns 0 on success, %-EEXIST or %-EBUSY on error.
+ */
+int atomic_notifier_chain_register_unique_prio(struct atomic_notifier_head *nh,
+ struct notifier_block *n)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&nh->lock, flags);
+ ret = notifier_chain_register(&nh->head, n, true);
+ spin_unlock_irqrestore(&nh->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(atomic_notifier_chain_register_unique_prio);
+
+/**
* atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain
* @nh: Pointer to head of the atomic notifier chain
* @n: Entry to remove from notifier chain
@@ -204,23 +228,27 @@ int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
NOKPROBE_SYMBOL(atomic_notifier_call_chain);
+/**
+ * atomic_notifier_call_chain_is_empty - Check whether notifier chain is empty
+ * @nh: Pointer to head of the atomic notifier chain
+ *
+ * Checks whether notifier chain is empty.
+ *
+ * Returns true is notifier chain is empty, false otherwise.
+ */
+bool atomic_notifier_call_chain_is_empty(struct atomic_notifier_head *nh)
+{
+ return !rcu_access_pointer(nh->head);
+}
+
/*
* Blocking notifier chain routines. All access to the chain is
* synchronized by an rwsem.
*/
-/**
- * blocking_notifier_chain_register - Add notifier to a blocking notifier chain
- * @nh: Pointer to head of the blocking notifier chain
- * @n: New entry in notifier chain
- *
- * Adds a notifier to a blocking notifier chain.
- * Must be called in process context.
- *
- * Returns 0 on success, %-EEXIST on error.
- */
-int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
- struct notifier_block *n)
+static int __blocking_notifier_chain_register(struct blocking_notifier_head *nh,
+ struct notifier_block *n,
+ bool unique_priority)
{
int ret;
@@ -230,16 +258,49 @@ int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
* such times we must not call down_write().
*/
if (unlikely(system_state == SYSTEM_BOOTING))
- return notifier_chain_register(&nh->head, n);
+ return notifier_chain_register(&nh->head, n, unique_priority);
down_write(&nh->rwsem);
- ret = notifier_chain_register(&nh->head, n);
+ ret = notifier_chain_register(&nh->head, n, unique_priority);
up_write(&nh->rwsem);
return ret;
}
+
+/**
+ * blocking_notifier_chain_register - Add notifier to a blocking notifier chain
+ * @nh: Pointer to head of the blocking notifier chain
+ * @n: New entry in notifier chain
+ *
+ * Adds a notifier to a blocking notifier chain.
+ * Must be called in process context.
+ *
+ * Returns 0 on success, %-EEXIST on error.
+ */
+int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
+ struct notifier_block *n)
+{
+ return __blocking_notifier_chain_register(nh, n, false);
+}
EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
/**
+ * blocking_notifier_chain_register_unique_prio - Add notifier to a blocking notifier chain
+ * @nh: Pointer to head of the blocking notifier chain
+ * @n: New entry in notifier chain
+ *
+ * Adds a notifier to an blocking notifier chain if there is no other
+ * notifier registered using the same priority.
+ *
+ * Returns 0 on success, %-EEXIST or %-EBUSY on error.
+ */
+int blocking_notifier_chain_register_unique_prio(struct blocking_notifier_head *nh,
+ struct notifier_block *n)
+{
+ return __blocking_notifier_chain_register(nh, n, true);
+}
+EXPORT_SYMBOL_GPL(blocking_notifier_chain_register_unique_prio);
+
+/**
* blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
* @nh: Pointer to head of the blocking notifier chain
* @n: Entry to remove from notifier chain
@@ -341,7 +402,7 @@ EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
int raw_notifier_chain_register(struct raw_notifier_head *nh,
struct notifier_block *n)
{
- return notifier_chain_register(&nh->head, n);
+ return notifier_chain_register(&nh->head, n, false);
}
EXPORT_SYMBOL_GPL(raw_notifier_chain_register);
@@ -393,7 +454,6 @@ int raw_notifier_call_chain(struct raw_notifier_head *nh,
}
EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
-#ifdef CONFIG_SRCU
/*
* SRCU notifier chain routines. Registration and unregistration
* use a mutex, and call_chain is synchronized by SRCU (no locks).
@@ -420,10 +480,10 @@ int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
* such times we must not call mutex_lock().
*/
if (unlikely(system_state == SYSTEM_BOOTING))
- return notifier_chain_register(&nh->head, n);
+ return notifier_chain_register(&nh->head, n, false);
mutex_lock(&nh->mutex);
- ret = notifier_chain_register(&nh->head, n);
+ ret = notifier_chain_register(&nh->head, n, false);
mutex_unlock(&nh->mutex);
return ret;
}
@@ -510,8 +570,6 @@ void srcu_init_notifier_head(struct srcu_notifier_head *nh)
}
EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
-#endif /* CONFIG_SRCU */
-
static ATOMIC_NOTIFIER_HEAD(die_chain);
int notrace notify_die(enum die_val val, const char *str,
diff --git a/kernel/nscommon.c b/kernel/nscommon.c
new file mode 100644
index 000000000000..bdc3c86231d3
--- /dev/null
+++ b/kernel/nscommon.c
@@ -0,0 +1,311 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
+
+#include <linux/ns_common.h>
+#include <linux/nstree.h>
+#include <linux/proc_ns.h>
+#include <linux/user_namespace.h>
+#include <linux/vfsdebug.h>
+
+#ifdef CONFIG_DEBUG_VFS
+static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops)
+{
+ switch (ns->ns_type) {
+#ifdef CONFIG_CGROUPS
+ case CLONE_NEWCGROUP:
+ VFS_WARN_ON_ONCE(ops != &cgroupns_operations);
+ break;
+#endif
+#ifdef CONFIG_IPC_NS
+ case CLONE_NEWIPC:
+ VFS_WARN_ON_ONCE(ops != &ipcns_operations);
+ break;
+#endif
+ case CLONE_NEWNS:
+ VFS_WARN_ON_ONCE(ops != &mntns_operations);
+ break;
+#ifdef CONFIG_NET_NS
+ case CLONE_NEWNET:
+ VFS_WARN_ON_ONCE(ops != &netns_operations);
+ break;
+#endif
+#ifdef CONFIG_PID_NS
+ case CLONE_NEWPID:
+ VFS_WARN_ON_ONCE(ops != &pidns_operations);
+ break;
+#endif
+#ifdef CONFIG_TIME_NS
+ case CLONE_NEWTIME:
+ VFS_WARN_ON_ONCE(ops != &timens_operations);
+ break;
+#endif
+#ifdef CONFIG_USER_NS
+ case CLONE_NEWUSER:
+ VFS_WARN_ON_ONCE(ops != &userns_operations);
+ break;
+#endif
+#ifdef CONFIG_UTS_NS
+ case CLONE_NEWUTS:
+ VFS_WARN_ON_ONCE(ops != &utsns_operations);
+ break;
+#endif
+ }
+}
+#endif
+
+int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum)
+{
+ int ret = 0;
+
+ refcount_set(&ns->__ns_ref, 1);
+ ns->stashed = NULL;
+ ns->ops = ops;
+ ns->ns_id = 0;
+ ns->ns_type = ns_type;
+ ns_tree_node_init(&ns->ns_tree_node);
+ ns_tree_node_init(&ns->ns_unified_node);
+ ns_tree_node_init(&ns->ns_owner_node);
+ ns_tree_root_init(&ns->ns_owner_root);
+
+#ifdef CONFIG_DEBUG_VFS
+ ns_debug(ns, ops);
+#endif
+
+ if (inum)
+ ns->inum = inum;
+ else
+ ret = proc_alloc_inum(&ns->inum);
+ if (ret)
+ return ret;
+ /*
+ * Tree ref starts at 0. It's incremented when namespace enters
+ * active use (installed in nsproxy) and decremented when all
+ * active uses are gone. Initial namespaces are always active.
+ */
+ if (is_ns_init_inum(ns))
+ atomic_set(&ns->__ns_ref_active, 1);
+ else
+ atomic_set(&ns->__ns_ref_active, 0);
+ return 0;
+}
+
+void __ns_common_free(struct ns_common *ns)
+{
+ proc_free_inum(ns->inum);
+}
+
+struct ns_common *__must_check ns_owner(struct ns_common *ns)
+{
+ struct user_namespace *owner;
+
+ if (unlikely(!ns->ops))
+ return NULL;
+ VFS_WARN_ON_ONCE(!ns->ops->owner);
+ owner = ns->ops->owner(ns);
+ VFS_WARN_ON_ONCE(!owner && ns != to_ns_common(&init_user_ns));
+ if (!owner)
+ return NULL;
+ /* Skip init_user_ns as it's always active */
+ if (owner == &init_user_ns)
+ return NULL;
+ return to_ns_common(owner);
+}
+
+/*
+ * The active reference count works by having each namespace that gets
+ * created take a single active reference on its owning user namespace.
+ * That single reference is only released once the child namespace's
+ * active count itself goes down.
+ *
+ * A regular namespace tree might look as follow:
+ * Legend:
+ * + : adding active reference
+ * - : dropping active reference
+ * x : always active (initial namespace)
+ *
+ *
+ * net_ns pid_ns
+ * \ /
+ * + +
+ * user_ns1 (2)
+ * |
+ * ipc_ns | uts_ns
+ * \ | /
+ * + + +
+ * user_ns2 (3)
+ * |
+ * cgroup_ns | mnt_ns
+ * \ | /
+ * x x x
+ * init_user_ns (1)
+ *
+ * If both net_ns and pid_ns put their last active reference on
+ * themselves it will cascade to user_ns1 dropping its own active
+ * reference and dropping one active reference on user_ns2:
+ *
+ * net_ns pid_ns
+ * \ /
+ * - -
+ * user_ns1 (0)
+ * |
+ * ipc_ns | uts_ns
+ * \ | /
+ * + - +
+ * user_ns2 (2)
+ * |
+ * cgroup_ns | mnt_ns
+ * \ | /
+ * x x x
+ * init_user_ns (1)
+ *
+ * The iteration stops once we reach a namespace that still has active
+ * references.
+ */
+void __ns_ref_active_put(struct ns_common *ns)
+{
+ /* Initial namespaces are always active. */
+ if (is_ns_init_id(ns))
+ return;
+
+ if (!atomic_dec_and_test(&ns->__ns_ref_active)) {
+ VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0);
+ return;
+ }
+
+ VFS_WARN_ON_ONCE(is_ns_init_id(ns));
+ VFS_WARN_ON_ONCE(!__ns_ref_read(ns));
+
+ for (;;) {
+ ns = ns_owner(ns);
+ if (!ns)
+ return;
+ VFS_WARN_ON_ONCE(is_ns_init_id(ns));
+ if (!atomic_dec_and_test(&ns->__ns_ref_active)) {
+ VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0);
+ return;
+ }
+ }
+}
+
+/*
+ * The active reference count works by having each namespace that gets
+ * created take a single active reference on its owning user namespace.
+ * That single reference is only released once the child namespace's
+ * active count itself goes down. This makes it possible to efficiently
+ * resurrect a namespace tree:
+ *
+ * A regular namespace tree might look as follow:
+ * Legend:
+ * + : adding active reference
+ * - : dropping active reference
+ * x : always active (initial namespace)
+ *
+ *
+ * net_ns pid_ns
+ * \ /
+ * + +
+ * user_ns1 (2)
+ * |
+ * ipc_ns | uts_ns
+ * \ | /
+ * + + +
+ * user_ns2 (3)
+ * |
+ * cgroup_ns | mnt_ns
+ * \ | /
+ * x x x
+ * init_user_ns (1)
+ *
+ * If both net_ns and pid_ns put their last active reference on
+ * themselves it will cascade to user_ns1 dropping its own active
+ * reference and dropping one active reference on user_ns2:
+ *
+ * net_ns pid_ns
+ * \ /
+ * - -
+ * user_ns1 (0)
+ * |
+ * ipc_ns | uts_ns
+ * \ | /
+ * + - +
+ * user_ns2 (2)
+ * |
+ * cgroup_ns | mnt_ns
+ * \ | /
+ * x x x
+ * init_user_ns (1)
+ *
+ * Assume the whole tree is dead but all namespaces are still active:
+ *
+ * net_ns pid_ns
+ * \ /
+ * - -
+ * user_ns1 (0)
+ * |
+ * ipc_ns | uts_ns
+ * \ | /
+ * - - -
+ * user_ns2 (0)
+ * |
+ * cgroup_ns | mnt_ns
+ * \ | /
+ * x x x
+ * init_user_ns (1)
+ *
+ * Now assume the net_ns gets resurrected (.e.g., via the SIOCGSKNS ioctl()):
+ *
+ * net_ns pid_ns
+ * \ /
+ * + -
+ * user_ns1 (0)
+ * |
+ * ipc_ns | uts_ns
+ * \ | /
+ * - + -
+ * user_ns2 (0)
+ * |
+ * cgroup_ns | mnt_ns
+ * \ | /
+ * x x x
+ * init_user_ns (1)
+ *
+ * If net_ns had a zero reference count and we bumped it we also need to
+ * take another reference on its owning user namespace. Similarly, if
+ * pid_ns had a zero reference count it also needs to take another
+ * reference on its owning user namespace. So both net_ns and pid_ns
+ * will each have their own reference on the owning user namespace.
+ *
+ * If the owning user namespace user_ns1 had a zero reference count then
+ * it also needs to take another reference on its owning user namespace
+ * and so on.
+ */
+void __ns_ref_active_get(struct ns_common *ns)
+{
+ int prev;
+
+ /* Initial namespaces are always active. */
+ if (is_ns_init_id(ns))
+ return;
+
+ /* If we didn't resurrect the namespace we're done. */
+ prev = atomic_fetch_add(1, &ns->__ns_ref_active);
+ VFS_WARN_ON_ONCE(prev < 0);
+ if (likely(prev))
+ return;
+
+ /*
+ * We did resurrect it. Walk the ownership hierarchy upwards
+ * until we found an owning user namespace that is active.
+ */
+ for (;;) {
+ ns = ns_owner(ns);
+ if (!ns)
+ return;
+
+ VFS_WARN_ON_ONCE(is_ns_init_id(ns));
+ prev = atomic_fetch_add(1, &ns->__ns_ref_active);
+ VFS_WARN_ON_ONCE(prev < 0);
+ if (likely(prev))
+ return;
+ }
+}
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index eec72ca962e2..259c4b4f1eeb 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -26,11 +26,12 @@
#include <linux/syscalls.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>
+#include <linux/nstree.h>
static struct kmem_cache *nsproxy_cachep;
struct nsproxy init_nsproxy = {
- .count = ATOMIC_INIT(1),
+ .count = REFCOUNT_INIT(1),
.uts_ns = &init_uts_ns,
#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
.ipc_ns = &init_ipc_ns,
@@ -55,16 +56,35 @@ static inline struct nsproxy *create_nsproxy(void)
nsproxy = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL);
if (nsproxy)
- atomic_set(&nsproxy->count, 1);
+ refcount_set(&nsproxy->count, 1);
return nsproxy;
}
+static inline void nsproxy_free(struct nsproxy *ns)
+{
+ put_mnt_ns(ns->mnt_ns);
+ put_uts_ns(ns->uts_ns);
+ put_ipc_ns(ns->ipc_ns);
+ put_pid_ns(ns->pid_ns_for_children);
+ put_time_ns(ns->time_ns);
+ put_time_ns(ns->time_ns_for_children);
+ put_cgroup_ns(ns->cgroup_ns);
+ put_net(ns->net_ns);
+ kmem_cache_free(nsproxy_cachep, ns);
+}
+
+void deactivate_nsproxy(struct nsproxy *ns)
+{
+ nsproxy_ns_active_put(ns);
+ nsproxy_free(ns);
+}
+
/*
* Create new nsproxy and all of its the associated namespaces.
* Return the newly created nsproxy. Do not attach this to the task,
* leave it to the caller to do proper locking and attach it to task.
*/
-static struct nsproxy *create_new_namespaces(unsigned long flags,
+static struct nsproxy *create_new_namespaces(u64 flags,
struct task_struct *tsk, struct user_namespace *user_ns,
struct fs_struct *new_fs)
{
@@ -128,17 +148,13 @@ out_time:
out_net:
put_cgroup_ns(new_nsp->cgroup_ns);
out_cgroup:
- if (new_nsp->pid_ns_for_children)
- put_pid_ns(new_nsp->pid_ns_for_children);
+ put_pid_ns(new_nsp->pid_ns_for_children);
out_pid:
- if (new_nsp->ipc_ns)
- put_ipc_ns(new_nsp->ipc_ns);
+ put_ipc_ns(new_nsp->ipc_ns);
out_ipc:
- if (new_nsp->uts_ns)
- put_uts_ns(new_nsp->uts_ns);
+ put_uts_ns(new_nsp->uts_ns);
out_uts:
- if (new_nsp->mnt_ns)
- put_mnt_ns(new_nsp->mnt_ns);
+ put_mnt_ns(new_nsp->mnt_ns);
out_ns:
kmem_cache_free(nsproxy_cachep, new_nsp);
return ERR_PTR(err);
@@ -148,7 +164,7 @@ out_ns:
* called from clone. This now handles copy for nsproxy and all
* namespaces therein.
*/
-int copy_namespaces(unsigned long flags, struct task_struct *tsk)
+int copy_namespaces(u64 flags, struct task_struct *tsk)
{
struct nsproxy *old_ns = tsk->nsproxy;
struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
@@ -157,7 +173,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
CLONE_NEWPID | CLONE_NEWNET |
CLONE_NEWCGROUP | CLONE_NEWTIME)))) {
- if (likely(old_ns->time_ns_for_children == old_ns->time_ns)) {
+ if ((flags & CLONE_VM) ||
+ likely(old_ns->time_ns_for_children == old_ns->time_ns)) {
get_nsproxy(old_ns);
return 0;
}
@@ -179,31 +196,14 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
if (IS_ERR(new_ns))
return PTR_ERR(new_ns);
- timens_on_fork(new_ns, tsk);
+ if ((flags & CLONE_VM) == 0)
+ timens_on_fork(new_ns, tsk);
+ nsproxy_ns_active_get(new_ns);
tsk->nsproxy = new_ns;
return 0;
}
-void free_nsproxy(struct nsproxy *ns)
-{
- if (ns->mnt_ns)
- put_mnt_ns(ns->mnt_ns);
- if (ns->uts_ns)
- put_uts_ns(ns->uts_ns);
- if (ns->ipc_ns)
- put_ipc_ns(ns->ipc_ns);
- if (ns->pid_ns_for_children)
- put_pid_ns(ns->pid_ns_for_children);
- if (ns->time_ns)
- put_time_ns(ns->time_ns);
- if (ns->time_ns_for_children)
- put_time_ns(ns->time_ns_for_children);
- put_cgroup_ns(ns->cgroup_ns);
- put_net(ns->net_ns);
- kmem_cache_free(nsproxy_cachep, ns);
-}
-
/*
* Called from unshare. Unshare all the namespaces part of nsproxy.
* On success, returns the new nsproxy.
@@ -240,6 +240,9 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
might_sleep();
+ if (new)
+ nsproxy_ns_active_get(new);
+
task_lock(p);
ns = p->nsproxy;
p->nsproxy = new;
@@ -249,11 +252,44 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
put_nsproxy(ns);
}
-void exit_task_namespaces(struct task_struct *p)
+void exit_nsproxy_namespaces(struct task_struct *p)
{
switch_task_namespaces(p, NULL);
}
+void switch_cred_namespaces(const struct cred *old, const struct cred *new)
+{
+ ns_ref_active_get(new->user_ns);
+ ns_ref_active_put(old->user_ns);
+}
+
+void get_cred_namespaces(struct task_struct *tsk)
+{
+ ns_ref_active_get(tsk->real_cred->user_ns);
+}
+
+void exit_cred_namespaces(struct task_struct *tsk)
+{
+ ns_ref_active_put(tsk->real_cred->user_ns);
+}
+
+int exec_task_namespaces(void)
+{
+ struct task_struct *tsk = current;
+ struct nsproxy *new;
+
+ if (tsk->nsproxy->time_ns_for_children == tsk->nsproxy->time_ns)
+ return 0;
+
+ new = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ timens_on_fork(new, tsk);
+ switch_task_namespaces(tsk, new);
+ return 0;
+}
+
static int check_setns_flags(unsigned long flags)
{
if (!flags || (flags & ~(CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
@@ -306,7 +342,7 @@ static void put_nsset(struct nsset *nsset)
if (nsset->fs && (flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS))
free_fs_struct(nsset->fs);
if (nsset->nsproxy)
- free_nsproxy(nsset->nsproxy);
+ nsproxy_free(nsset->nsproxy);
}
static int prepare_nsset(unsigned flags, struct nsset *nsset)
@@ -526,21 +562,20 @@ static void commit_nsset(struct nsset *nsset)
SYSCALL_DEFINE2(setns, int, fd, int, flags)
{
- struct file *file;
+ CLASS(fd, f)(fd);
struct ns_common *ns = NULL;
struct nsset nsset = {};
int err = 0;
- file = fget(fd);
- if (!file)
+ if (fd_empty(f))
return -EBADF;
- if (proc_ns_file(file)) {
- ns = get_proc_ns(file_inode(file));
- if (flags && (ns->ops->type != flags))
+ if (proc_ns_file(fd_file(f))) {
+ ns = get_proc_ns(file_inode(fd_file(f)));
+ if (flags && (ns->ns_type != flags))
err = -EINVAL;
- flags = ns->ops->type;
- } else if (!IS_ERR(pidfd_pid(file))) {
+ flags = ns->ns_type;
+ } else if (!IS_ERR(pidfd_pid(fd_file(f)))) {
err = check_setns_flags(flags);
} else {
err = -EINVAL;
@@ -552,17 +587,16 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags)
if (err)
goto out;
- if (proc_ns_file(file))
+ if (proc_ns_file(fd_file(f)))
err = validate_ns(&nsset, ns);
else
- err = validate_nsset(&nsset, file->private_data);
+ err = validate_nsset(&nsset, pidfd_pid(fd_file(f)));
if (!err) {
commit_nsset(&nsset);
perf_event_namespaces(current);
}
put_nsset(&nsset);
out:
- fput(file);
return err;
}
diff --git a/kernel/nstree.c b/kernel/nstree.c
new file mode 100644
index 000000000000..f36c59e6951d
--- /dev/null
+++ b/kernel/nstree.c
@@ -0,0 +1,813 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
+
+#include <linux/nstree.h>
+#include <linux/proc_ns.h>
+#include <linux/rculist.h>
+#include <linux/vfsdebug.h>
+#include <linux/syscalls.h>
+#include <linux/user_namespace.h>
+
+static __cacheline_aligned_in_smp DEFINE_SEQLOCK(ns_tree_lock);
+
+DEFINE_LOCK_GUARD_0(ns_tree_writer,
+ write_seqlock(&ns_tree_lock),
+ write_sequnlock(&ns_tree_lock))
+
+DEFINE_LOCK_GUARD_0(ns_tree_locked_reader,
+ read_seqlock_excl(&ns_tree_lock),
+ read_sequnlock_excl(&ns_tree_lock))
+
+static struct ns_tree_root ns_unified_root = { /* protected by ns_tree_lock */
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(ns_unified_root.ns_list_head),
+};
+
+struct ns_tree_root mnt_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(mnt_ns_tree.ns_list_head),
+};
+
+struct ns_tree_root net_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(net_ns_tree.ns_list_head),
+};
+EXPORT_SYMBOL_GPL(net_ns_tree);
+
+struct ns_tree_root uts_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(uts_ns_tree.ns_list_head),
+};
+
+struct ns_tree_root user_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(user_ns_tree.ns_list_head),
+};
+
+struct ns_tree_root ipc_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(ipc_ns_tree.ns_list_head),
+};
+
+struct ns_tree_root pid_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(pid_ns_tree.ns_list_head),
+};
+
+struct ns_tree_root cgroup_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(cgroup_ns_tree.ns_list_head),
+};
+
+struct ns_tree_root time_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(time_ns_tree.ns_list_head),
+};
+
+/**
+ * ns_tree_node_init - Initialize a namespace tree node
+ * @node: The node to initialize
+ *
+ * Initializes both the rbtree node and list entry.
+ */
+void ns_tree_node_init(struct ns_tree_node *node)
+{
+ RB_CLEAR_NODE(&node->ns_node);
+ INIT_LIST_HEAD(&node->ns_list_entry);
+}
+
+/**
+ * ns_tree_root_init - Initialize a namespace tree root
+ * @root: The root to initialize
+ *
+ * Initializes both the rbtree root and list head.
+ */
+void ns_tree_root_init(struct ns_tree_root *root)
+{
+ root->ns_rb = RB_ROOT;
+ INIT_LIST_HEAD(&root->ns_list_head);
+}
+
+/**
+ * ns_tree_node_empty - Check if a namespace tree node is empty
+ * @node: The node to check
+ *
+ * Returns true if the node is not in any tree.
+ */
+bool ns_tree_node_empty(const struct ns_tree_node *node)
+{
+ return RB_EMPTY_NODE(&node->ns_node);
+}
+
+/**
+ * ns_tree_node_add - Add a node to a namespace tree
+ * @node: The node to add
+ * @root: The tree root to add to
+ * @cmp: Comparison function for rbtree insertion
+ *
+ * Adds the node to both the rbtree and the list, maintaining sorted order.
+ * The list is maintained in the same order as the rbtree to enable efficient
+ * iteration.
+ *
+ * Returns: NULL if insertion succeeded, existing node if duplicate found
+ */
+struct rb_node *ns_tree_node_add(struct ns_tree_node *node,
+ struct ns_tree_root *root,
+ int (*cmp)(struct rb_node *, const struct rb_node *))
+{
+ struct rb_node *ret, *prev;
+
+ /* Add to rbtree */
+ ret = rb_find_add_rcu(&node->ns_node, &root->ns_rb, cmp);
+
+ /* Add to list in sorted order */
+ prev = rb_prev(&node->ns_node);
+ if (!prev) {
+ /* No previous node, add at head */
+ list_add_rcu(&node->ns_list_entry, &root->ns_list_head);
+ } else {
+ /* Add after previous node */
+ struct ns_tree_node *prev_node;
+ prev_node = rb_entry(prev, struct ns_tree_node, ns_node);
+ list_add_rcu(&node->ns_list_entry, &prev_node->ns_list_entry);
+ }
+
+ return ret;
+}
+
+/**
+ * ns_tree_node_del - Remove a node from a namespace tree
+ * @node: The node to remove
+ * @root: The tree root to remove from
+ *
+ * Removes the node from both the rbtree and the list atomically.
+ */
+void ns_tree_node_del(struct ns_tree_node *node, struct ns_tree_root *root)
+{
+ rb_erase(&node->ns_node, &root->ns_rb);
+ RB_CLEAR_NODE(&node->ns_node);
+ list_bidir_del_rcu(&node->ns_list_entry);
+}
+
+static inline struct ns_common *node_to_ns(const struct rb_node *node)
+{
+ if (!node)
+ return NULL;
+ return rb_entry(node, struct ns_common, ns_tree_node.ns_node);
+}
+
+static inline struct ns_common *node_to_ns_unified(const struct rb_node *node)
+{
+ if (!node)
+ return NULL;
+ return rb_entry(node, struct ns_common, ns_unified_node.ns_node);
+}
+
+static inline struct ns_common *node_to_ns_owner(const struct rb_node *node)
+{
+ if (!node)
+ return NULL;
+ return rb_entry(node, struct ns_common, ns_owner_node.ns_node);
+}
+
+static int ns_id_cmp(u64 id_a, u64 id_b)
+{
+ if (id_a < id_b)
+ return -1;
+ if (id_a > id_b)
+ return 1;
+ return 0;
+}
+
+static int ns_cmp(struct rb_node *a, const struct rb_node *b)
+{
+ return ns_id_cmp(node_to_ns(a)->ns_id, node_to_ns(b)->ns_id);
+}
+
+static int ns_cmp_unified(struct rb_node *a, const struct rb_node *b)
+{
+ return ns_id_cmp(node_to_ns_unified(a)->ns_id, node_to_ns_unified(b)->ns_id);
+}
+
+static int ns_cmp_owner(struct rb_node *a, const struct rb_node *b)
+{
+ return ns_id_cmp(node_to_ns_owner(a)->ns_id, node_to_ns_owner(b)->ns_id);
+}
+
+void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree_root *ns_tree)
+{
+ struct rb_node *node;
+ const struct proc_ns_operations *ops = ns->ops;
+
+ VFS_WARN_ON_ONCE(!ns->ns_id);
+
+ guard(ns_tree_writer)();
+
+ /* Add to per-type tree and list */
+ node = ns_tree_node_add(&ns->ns_tree_node, ns_tree, ns_cmp);
+
+ /* Add to unified tree and list */
+ ns_tree_node_add(&ns->ns_unified_node, &ns_unified_root, ns_cmp_unified);
+
+ /* Add to owner's tree if applicable */
+ if (ops) {
+ struct user_namespace *user_ns;
+
+ VFS_WARN_ON_ONCE(!ops->owner);
+ user_ns = ops->owner(ns);
+ if (user_ns) {
+ struct ns_common *owner = &user_ns->ns;
+ VFS_WARN_ON_ONCE(owner->ns_type != CLONE_NEWUSER);
+
+ /* Insert into owner's tree and list */
+ ns_tree_node_add(&ns->ns_owner_node, &owner->ns_owner_root, ns_cmp_owner);
+ } else {
+ /* Only the initial user namespace doesn't have an owner. */
+ VFS_WARN_ON_ONCE(ns != to_ns_common(&init_user_ns));
+ }
+ }
+
+ VFS_WARN_ON_ONCE(node);
+}
+
+void __ns_tree_remove(struct ns_common *ns, struct ns_tree_root *ns_tree)
+{
+ const struct proc_ns_operations *ops = ns->ops;
+ struct user_namespace *user_ns;
+
+ VFS_WARN_ON_ONCE(ns_tree_node_empty(&ns->ns_tree_node));
+ VFS_WARN_ON_ONCE(list_empty(&ns->ns_tree_node.ns_list_entry));
+
+ write_seqlock(&ns_tree_lock);
+
+ /* Remove from per-type tree and list */
+ ns_tree_node_del(&ns->ns_tree_node, ns_tree);
+
+ /* Remove from unified tree and list */
+ ns_tree_node_del(&ns->ns_unified_node, &ns_unified_root);
+
+ /* Remove from owner's tree if applicable */
+ if (ops) {
+ user_ns = ops->owner(ns);
+ if (user_ns) {
+ struct ns_common *owner = &user_ns->ns;
+ ns_tree_node_del(&ns->ns_owner_node, &owner->ns_owner_root);
+ }
+ }
+
+ write_sequnlock(&ns_tree_lock);
+}
+EXPORT_SYMBOL_GPL(__ns_tree_remove);
+
+static int ns_find(const void *key, const struct rb_node *node)
+{
+ const u64 ns_id = *(u64 *)key;
+ const struct ns_common *ns = node_to_ns(node);
+
+ if (ns_id < ns->ns_id)
+ return -1;
+ if (ns_id > ns->ns_id)
+ return 1;
+ return 0;
+}
+
+static int ns_find_unified(const void *key, const struct rb_node *node)
+{
+ const u64 ns_id = *(u64 *)key;
+ const struct ns_common *ns = node_to_ns_unified(node);
+
+ if (ns_id < ns->ns_id)
+ return -1;
+ if (ns_id > ns->ns_id)
+ return 1;
+ return 0;
+}
+
+static struct ns_tree_root *ns_tree_from_type(int ns_type)
+{
+ switch (ns_type) {
+ case CLONE_NEWCGROUP:
+ return &cgroup_ns_tree;
+ case CLONE_NEWIPC:
+ return &ipc_ns_tree;
+ case CLONE_NEWNS:
+ return &mnt_ns_tree;
+ case CLONE_NEWNET:
+ return &net_ns_tree;
+ case CLONE_NEWPID:
+ return &pid_ns_tree;
+ case CLONE_NEWUSER:
+ return &user_ns_tree;
+ case CLONE_NEWUTS:
+ return &uts_ns_tree;
+ case CLONE_NEWTIME:
+ return &time_ns_tree;
+ }
+
+ return NULL;
+}
+
+static struct ns_common *__ns_unified_tree_lookup_rcu(u64 ns_id)
+{
+ struct rb_node *node;
+ unsigned int seq;
+
+ do {
+ seq = read_seqbegin(&ns_tree_lock);
+ node = rb_find_rcu(&ns_id, &ns_unified_root.ns_rb, ns_find_unified);
+ if (node)
+ break;
+ } while (read_seqretry(&ns_tree_lock, seq));
+
+ return node_to_ns_unified(node);
+}
+
+static struct ns_common *__ns_tree_lookup_rcu(u64 ns_id, int ns_type)
+{
+ struct ns_tree_root *ns_tree;
+ struct rb_node *node;
+ unsigned int seq;
+
+ ns_tree = ns_tree_from_type(ns_type);
+ if (!ns_tree)
+ return NULL;
+
+ do {
+ seq = read_seqbegin(&ns_tree_lock);
+ node = rb_find_rcu(&ns_id, &ns_tree->ns_rb, ns_find);
+ if (node)
+ break;
+ } while (read_seqretry(&ns_tree_lock, seq));
+
+ return node_to_ns(node);
+}
+
+struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type)
+{
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage");
+
+ if (ns_type)
+ return __ns_tree_lookup_rcu(ns_id, ns_type);
+
+ return __ns_unified_tree_lookup_rcu(ns_id);
+}
+
+/**
+ * __ns_tree_adjoined_rcu - find the next/previous namespace in the same
+ * tree
+ * @ns: namespace to start from
+ * @ns_tree: namespace tree to search in
+ * @previous: if true find the previous namespace, otherwise the next
+ *
+ * Find the next or previous namespace in the same tree as @ns. If
+ * there is no next/previous namespace, -ENOENT is returned.
+ */
+struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns,
+ struct ns_tree_root *ns_tree, bool previous)
+{
+ struct list_head *list;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_adjoined_rcu() usage");
+
+ if (previous)
+ list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_tree_node.ns_list_entry));
+ else
+ list = rcu_dereference(list_next_rcu(&ns->ns_tree_node.ns_list_entry));
+ if (list_is_head(list, &ns_tree->ns_list_head))
+ return ERR_PTR(-ENOENT);
+
+ return list_entry_rcu(list, struct ns_common, ns_tree_node.ns_list_entry);
+}
+
+/**
+ * __ns_tree_gen_id - generate a new namespace id
+ * @ns: namespace to generate id for
+ * @id: if non-zero, this is the initial namespace and this is a fixed id
+ *
+ * Generates a new namespace id and assigns it to the namespace. All
+ * namespaces types share the same id space and thus can be compared
+ * directly. IOW, when two ids of two namespace are equal, they are
+ * identical.
+ */
+u64 __ns_tree_gen_id(struct ns_common *ns, u64 id)
+{
+ static atomic64_t namespace_cookie = ATOMIC64_INIT(NS_LAST_INIT_ID + 1);
+
+ if (id)
+ ns->ns_id = id;
+ else
+ ns->ns_id = atomic64_inc_return(&namespace_cookie);
+ return ns->ns_id;
+}
+
+struct klistns {
+ u64 __user *uns_ids;
+ u32 nr_ns_ids;
+ u64 last_ns_id;
+ u64 user_ns_id;
+ u32 ns_type;
+ struct user_namespace *user_ns;
+ bool userns_capable;
+ struct ns_common *first_ns;
+};
+
+static void __free_klistns_free(const struct klistns *kls)
+{
+ if (kls->user_ns_id != LISTNS_CURRENT_USER)
+ put_user_ns(kls->user_ns);
+ if (kls->first_ns && kls->first_ns->ops)
+ kls->first_ns->ops->put(kls->first_ns);
+}
+
+#define NS_ALL (PID_NS | USER_NS | MNT_NS | UTS_NS | IPC_NS | NET_NS | CGROUP_NS | TIME_NS)
+
+static int copy_ns_id_req(const struct ns_id_req __user *req,
+ struct ns_id_req *kreq)
+{
+ int ret;
+ size_t usize;
+
+ BUILD_BUG_ON(sizeof(struct ns_id_req) != NS_ID_REQ_SIZE_VER0);
+
+ ret = get_user(usize, &req->size);
+ if (ret)
+ return -EFAULT;
+ if (unlikely(usize > PAGE_SIZE))
+ return -E2BIG;
+ if (unlikely(usize < NS_ID_REQ_SIZE_VER0))
+ return -EINVAL;
+ memset(kreq, 0, sizeof(*kreq));
+ ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize);
+ if (ret)
+ return ret;
+ if (kreq->spare != 0)
+ return -EINVAL;
+ if (kreq->ns_type & ~NS_ALL)
+ return -EOPNOTSUPP;
+ return 0;
+}
+
+static inline int prepare_klistns(struct klistns *kls, struct ns_id_req *kreq,
+ u64 __user *ns_ids, size_t nr_ns_ids)
+{
+ kls->last_ns_id = kreq->ns_id;
+ kls->user_ns_id = kreq->user_ns_id;
+ kls->nr_ns_ids = nr_ns_ids;
+ kls->ns_type = kreq->ns_type;
+ kls->uns_ids = ns_ids;
+ return 0;
+}
+
+/*
+ * Lookup a namespace owned by owner with id >= ns_id.
+ * Returns the namespace with the smallest id that is >= ns_id.
+ */
+static struct ns_common *lookup_ns_owner_at(u64 ns_id, struct ns_common *owner)
+{
+ struct ns_common *ret = NULL;
+ struct rb_node *node;
+
+ VFS_WARN_ON_ONCE(owner->ns_type != CLONE_NEWUSER);
+
+ guard(ns_tree_locked_reader)();
+
+ node = owner->ns_owner_root.ns_rb.rb_node;
+ while (node) {
+ struct ns_common *ns;
+
+ ns = node_to_ns_owner(node);
+ if (ns_id <= ns->ns_id) {
+ ret = ns;
+ if (ns_id == ns->ns_id)
+ break;
+ node = node->rb_left;
+ } else {
+ node = node->rb_right;
+ }
+ }
+
+ if (ret)
+ ret = ns_get_unless_inactive(ret);
+ return ret;
+}
+
+static struct ns_common *lookup_ns_id(u64 mnt_ns_id, int ns_type)
+{
+ struct ns_common *ns;
+
+ guard(rcu)();
+ ns = ns_tree_lookup_rcu(mnt_ns_id, ns_type);
+ if (!ns)
+ return NULL;
+
+ if (!ns_get_unless_inactive(ns))
+ return NULL;
+
+ return ns;
+}
+
+static inline bool __must_check ns_requested(const struct klistns *kls,
+ const struct ns_common *ns)
+{
+ return !kls->ns_type || (kls->ns_type & ns->ns_type);
+}
+
+static inline bool __must_check may_list_ns(const struct klistns *kls,
+ struct ns_common *ns)
+{
+ if (kls->user_ns) {
+ if (kls->userns_capable)
+ return true;
+ } else {
+ struct ns_common *owner;
+ struct user_namespace *user_ns;
+
+ owner = ns_owner(ns);
+ if (owner)
+ user_ns = to_user_ns(owner);
+ else
+ user_ns = &init_user_ns;
+ if (ns_capable_noaudit(user_ns, CAP_SYS_ADMIN))
+ return true;
+ }
+
+ if (is_current_namespace(ns))
+ return true;
+
+ if (ns->ns_type != CLONE_NEWUSER)
+ return false;
+
+ if (ns_capable_noaudit(to_user_ns(ns), CAP_SYS_ADMIN))
+ return true;
+
+ return false;
+}
+
+static inline void ns_put(struct ns_common *ns)
+{
+ if (ns && ns->ops)
+ ns->ops->put(ns);
+}
+
+DEFINE_FREE(ns_put, struct ns_common *, if (!IS_ERR_OR_NULL(_T)) ns_put(_T))
+
+static inline struct ns_common *__must_check legitimize_ns(const struct klistns *kls,
+ struct ns_common *candidate)
+{
+ struct ns_common *ns __free(ns_put) = NULL;
+
+ if (!ns_requested(kls, candidate))
+ return NULL;
+
+ ns = ns_get_unless_inactive(candidate);
+ if (!ns)
+ return NULL;
+
+ if (!may_list_ns(kls, ns))
+ return NULL;
+
+ return no_free_ptr(ns);
+}
+
+static ssize_t do_listns_userns(struct klistns *kls)
+{
+ u64 __user *ns_ids = kls->uns_ids;
+ size_t nr_ns_ids = kls->nr_ns_ids;
+ struct ns_common *ns = NULL, *first_ns = NULL, *prev = NULL;
+ const struct list_head *head;
+ ssize_t ret;
+
+ VFS_WARN_ON_ONCE(!kls->user_ns_id);
+
+ if (kls->user_ns_id == LISTNS_CURRENT_USER)
+ ns = to_ns_common(current_user_ns());
+ else if (kls->user_ns_id)
+ ns = lookup_ns_id(kls->user_ns_id, CLONE_NEWUSER);
+ if (!ns)
+ return -EINVAL;
+ kls->user_ns = to_user_ns(ns);
+
+ /*
+ * Use the rbtree to find the first namespace we care about and
+ * then use it's list entry to iterate from there.
+ */
+ if (kls->last_ns_id) {
+ kls->first_ns = lookup_ns_owner_at(kls->last_ns_id + 1, ns);
+ if (!kls->first_ns)
+ return -ENOENT;
+ first_ns = kls->first_ns;
+ }
+
+ ret = 0;
+ head = &to_ns_common(kls->user_ns)->ns_owner_root.ns_list_head;
+ kls->userns_capable = ns_capable_noaudit(kls->user_ns, CAP_SYS_ADMIN);
+
+ rcu_read_lock();
+
+ if (!first_ns)
+ first_ns = list_entry_rcu(head->next, typeof(*first_ns), ns_owner_node.ns_list_entry);
+
+ ns = first_ns;
+ list_for_each_entry_from_rcu(ns, head, ns_owner_node.ns_list_entry) {
+ struct ns_common *valid;
+
+ if (!nr_ns_ids)
+ break;
+
+ valid = legitimize_ns(kls, ns);
+ if (!valid)
+ continue;
+
+ rcu_read_unlock();
+
+ ns_put(prev);
+ prev = valid;
+
+ if (put_user(valid->ns_id, ns_ids + ret)) {
+ ns_put(prev);
+ return -EFAULT;
+ }
+
+ nr_ns_ids--;
+ ret++;
+
+ rcu_read_lock();
+ }
+
+ rcu_read_unlock();
+ ns_put(prev);
+ return ret;
+}
+
+/*
+ * Lookup a namespace with id >= ns_id in either the unified tree or a type-specific tree.
+ * Returns the namespace with the smallest id that is >= ns_id.
+ */
+static struct ns_common *lookup_ns_id_at(u64 ns_id, int ns_type)
+{
+ struct ns_common *ret = NULL;
+ struct ns_tree_root *ns_tree = NULL;
+ struct rb_node *node;
+
+ if (ns_type) {
+ ns_tree = ns_tree_from_type(ns_type);
+ if (!ns_tree)
+ return NULL;
+ }
+
+ guard(ns_tree_locked_reader)();
+
+ if (ns_tree)
+ node = ns_tree->ns_rb.rb_node;
+ else
+ node = ns_unified_root.ns_rb.rb_node;
+
+ while (node) {
+ struct ns_common *ns;
+
+ if (ns_type)
+ ns = node_to_ns(node);
+ else
+ ns = node_to_ns_unified(node);
+
+ if (ns_id <= ns->ns_id) {
+ if (ns_type)
+ ret = node_to_ns(node);
+ else
+ ret = node_to_ns_unified(node);
+ if (ns_id == ns->ns_id)
+ break;
+ node = node->rb_left;
+ } else {
+ node = node->rb_right;
+ }
+ }
+
+ if (ret)
+ ret = ns_get_unless_inactive(ret);
+ return ret;
+}
+
+static inline struct ns_common *first_ns_common(const struct list_head *head,
+ struct ns_tree_root *ns_tree)
+{
+ if (ns_tree)
+ return list_entry_rcu(head->next, struct ns_common, ns_tree_node.ns_list_entry);
+ return list_entry_rcu(head->next, struct ns_common, ns_unified_node.ns_list_entry);
+}
+
+static inline struct ns_common *next_ns_common(struct ns_common *ns,
+ struct ns_tree_root *ns_tree)
+{
+ if (ns_tree)
+ return list_entry_rcu(ns->ns_tree_node.ns_list_entry.next, struct ns_common, ns_tree_node.ns_list_entry);
+ return list_entry_rcu(ns->ns_unified_node.ns_list_entry.next, struct ns_common, ns_unified_node.ns_list_entry);
+}
+
+static inline bool ns_common_is_head(struct ns_common *ns,
+ const struct list_head *head,
+ struct ns_tree_root *ns_tree)
+{
+ if (ns_tree)
+ return &ns->ns_tree_node.ns_list_entry == head;
+ return &ns->ns_unified_node.ns_list_entry == head;
+}
+
+static ssize_t do_listns(struct klistns *kls)
+{
+ u64 __user *ns_ids = kls->uns_ids;
+ size_t nr_ns_ids = kls->nr_ns_ids;
+ struct ns_common *ns, *first_ns = NULL, *prev = NULL;
+ struct ns_tree_root *ns_tree = NULL;
+ const struct list_head *head;
+ u32 ns_type;
+ ssize_t ret;
+
+ if (hweight32(kls->ns_type) == 1)
+ ns_type = kls->ns_type;
+ else
+ ns_type = 0;
+
+ if (ns_type) {
+ ns_tree = ns_tree_from_type(ns_type);
+ if (!ns_tree)
+ return -EINVAL;
+ }
+
+ if (kls->last_ns_id) {
+ kls->first_ns = lookup_ns_id_at(kls->last_ns_id + 1, ns_type);
+ if (!kls->first_ns)
+ return -ENOENT;
+ first_ns = kls->first_ns;
+ }
+
+ ret = 0;
+ if (ns_tree)
+ head = &ns_tree->ns_list_head;
+ else
+ head = &ns_unified_root.ns_list_head;
+
+ rcu_read_lock();
+
+ if (!first_ns)
+ first_ns = first_ns_common(head, ns_tree);
+
+ for (ns = first_ns; !ns_common_is_head(ns, head, ns_tree) && nr_ns_ids;
+ ns = next_ns_common(ns, ns_tree)) {
+ struct ns_common *valid;
+
+ valid = legitimize_ns(kls, ns);
+ if (!valid)
+ continue;
+
+ rcu_read_unlock();
+
+ ns_put(prev);
+ prev = valid;
+
+ if (put_user(valid->ns_id, ns_ids + ret)) {
+ ns_put(prev);
+ return -EFAULT;
+ }
+
+ nr_ns_ids--;
+ ret++;
+
+ rcu_read_lock();
+ }
+
+ rcu_read_unlock();
+ ns_put(prev);
+ return ret;
+}
+
+SYSCALL_DEFINE4(listns, const struct ns_id_req __user *, req,
+ u64 __user *, ns_ids, size_t, nr_ns_ids, unsigned int, flags)
+{
+ struct klistns klns __free(klistns_free) = {};
+ const size_t maxcount = 1000000;
+ struct ns_id_req kreq;
+ ssize_t ret;
+
+ if (flags)
+ return -EINVAL;
+
+ if (unlikely(nr_ns_ids > maxcount))
+ return -EOVERFLOW;
+
+ if (!access_ok(ns_ids, nr_ns_ids * sizeof(*ns_ids)))
+ return -EFAULT;
+
+ ret = copy_ns_id_req(req, &kreq);
+ if (ret)
+ return ret;
+
+ ret = prepare_klistns(&klns, &kreq, ns_ids, nr_ns_ids);
+ if (ret)
+ return ret;
+
+ if (kreq.user_ns_id)
+ return do_listns_userns(&klns);
+
+ return do_listns(&klns);
+}
diff --git a/kernel/padata.c b/kernel/padata.c
index 18d3a5c699d8..aa66d91e20f9 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -47,15 +47,20 @@ struct padata_mt_job_state {
static void padata_free_pd(struct parallel_data *pd);
static void __init padata_mt_helper(struct work_struct *work);
-static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
+static inline void padata_get_pd(struct parallel_data *pd)
{
- int cpu, target_cpu;
+ refcount_inc(&pd->refcnt);
+}
- target_cpu = cpumask_first(pd->cpumask.pcpu);
- for (cpu = 0; cpu < cpu_index; cpu++)
- target_cpu = cpumask_next(target_cpu, pd->cpumask.pcpu);
+static inline void padata_put_pd_cnt(struct parallel_data *pd, int cnt)
+{
+ if (refcount_sub_and_test(cnt, &pd->refcnt))
+ padata_free_pd(pd);
+}
- return target_cpu;
+static inline void padata_put_pd(struct parallel_data *pd)
+{
+ padata_put_pd_cnt(pd, 1);
}
static int padata_cpu_hash(struct parallel_data *pd, unsigned int seq_nr)
@@ -66,7 +71,7 @@ static int padata_cpu_hash(struct parallel_data *pd, unsigned int seq_nr)
*/
int cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu);
- return padata_index_to_cpu(pd, cpu_index);
+ return cpumask_nth(cpu_index, pd->cpumask.pcpu);
}
static struct padata_work *padata_work_alloc(void)
@@ -83,8 +88,16 @@ static struct padata_work *padata_work_alloc(void)
return pw;
}
-static void padata_work_init(struct padata_work *pw, work_func_t work_fn,
- void *data, int flags)
+/*
+ * This function is marked __ref because this function may be optimized in such
+ * a way that it directly refers to work_fn's address, which causes modpost to
+ * complain when work_fn is marked __init. This scenario was observed with clang
+ * LTO, where padata_work_init() was optimized to refer directly to
+ * padata_mt_helper() because the calls to padata_work_init() with other work_fn
+ * values were eliminated or inlined.
+ */
+static void __ref padata_work_init(struct padata_work *pw, work_func_t work_fn,
+ void *data, int flags)
{
if (flags & PADATA_WORK_ONSTACK)
INIT_WORK_ONSTACK(&pw->pw_work, work_fn);
@@ -98,7 +111,7 @@ static int __init padata_work_alloc_mt(int nworks, void *data,
{
int i;
- spin_lock(&padata_works_lock);
+ spin_lock_bh(&padata_works_lock);
/* Start at 1 because the current task participates in the job. */
for (i = 1; i < nworks; ++i) {
struct padata_work *pw = padata_work_alloc();
@@ -108,7 +121,7 @@ static int __init padata_work_alloc_mt(int nworks, void *data,
padata_work_init(pw, padata_mt_helper, data, 0);
list_add(&pw->pw_list, head);
}
- spin_unlock(&padata_works_lock);
+ spin_unlock_bh(&padata_works_lock);
return i;
}
@@ -126,12 +139,12 @@ static void __init padata_works_free(struct list_head *works)
if (list_empty(works))
return;
- spin_lock(&padata_works_lock);
+ spin_lock_bh(&padata_works_lock);
list_for_each_entry_safe(cur, next, works, pw_list) {
list_del(&cur->pw_list);
padata_work_free(cur);
}
- spin_unlock(&padata_works_lock);
+ spin_unlock_bh(&padata_works_lock);
}
static void padata_parallel_worker(struct work_struct *parallel_work)
@@ -168,9 +181,9 @@ int padata_do_parallel(struct padata_shell *ps,
struct padata_priv *padata, int *cb_cpu)
{
struct padata_instance *pinst = ps->pinst;
- int i, cpu, cpu_index, err;
struct parallel_data *pd;
struct padata_work *pw;
+ int cpu_index, err;
rcu_read_lock_bh();
@@ -181,24 +194,19 @@ int padata_do_parallel(struct padata_shell *ps,
goto out;
if (!cpumask_test_cpu(*cb_cpu, pd->cpumask.cbcpu)) {
- if (!cpumask_weight(pd->cpumask.cbcpu))
+ if (cpumask_empty(pd->cpumask.cbcpu))
goto out;
/* Select an alternate fallback CPU and notify the caller. */
cpu_index = *cb_cpu % cpumask_weight(pd->cpumask.cbcpu);
-
- cpu = cpumask_first(pd->cpumask.cbcpu);
- for (i = 0; i < cpu_index; i++)
- cpu = cpumask_next(cpu, pd->cpumask.cbcpu);
-
- *cb_cpu = cpu;
+ *cb_cpu = cpumask_nth(cpu_index, pd->cpumask.cbcpu);
}
- err = -EBUSY;
+ err = -EBUSY;
if ((pinst->flags & PADATA_RESET))
goto out;
- refcount_inc(&pd->refcnt);
+ padata_get_pd(pd);
padata->pd = pd;
padata->cb_cpu = *cb_cpu;
@@ -207,14 +215,16 @@ int padata_do_parallel(struct padata_shell *ps,
pw = padata_work_alloc();
spin_unlock(&padata_works_lock);
+ if (!pw) {
+ /* Maximum works limit exceeded, run in the current task. */
+ padata->parallel(padata);
+ }
+
rcu_read_unlock_bh();
if (pw) {
padata_work_init(pw, padata_parallel_worker, padata, 0);
queue_work(pinst->parallel_wq, &pw->pw_work);
- } else {
- /* Maximum works limit exceeded, run in the current task. */
- padata->parallel(padata);
}
return 0;
@@ -235,20 +245,17 @@ EXPORT_SYMBOL(padata_do_parallel);
* be parallel processed by another cpu and is not yet present in
* the cpu's reorder queue.
*/
-static struct padata_priv *padata_find_next(struct parallel_data *pd,
- bool remove_object)
+static struct padata_priv *padata_find_next(struct parallel_data *pd, int cpu,
+ unsigned int processed)
{
struct padata_priv *padata;
struct padata_list *reorder;
- int cpu = pd->cpu;
reorder = per_cpu_ptr(pd->reorder_list, cpu);
spin_lock(&reorder->lock);
- if (list_empty(&reorder->list)) {
- spin_unlock(&reorder->lock);
- return NULL;
- }
+ if (list_empty(&reorder->list))
+ goto notfound;
padata = list_entry(reorder->list.next, struct padata_priv, list);
@@ -256,88 +263,56 @@ static struct padata_priv *padata_find_next(struct parallel_data *pd,
* Checks the rare case where two or more parallel jobs have hashed to
* the same CPU and one of the later ones finishes first.
*/
- if (padata->seq_nr != pd->processed) {
- spin_unlock(&reorder->lock);
- return NULL;
- }
-
- if (remove_object) {
- list_del_init(&padata->list);
- ++pd->processed;
- pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu, -1, false);
- }
+ if (padata->seq_nr != processed)
+ goto notfound;
+ list_del_init(&padata->list);
spin_unlock(&reorder->lock);
return padata;
+
+notfound:
+ pd->processed = processed;
+ pd->cpu = cpu;
+ spin_unlock(&reorder->lock);
+ return NULL;
}
-static void padata_reorder(struct parallel_data *pd)
+static void padata_reorder(struct padata_priv *padata)
{
+ struct parallel_data *pd = padata->pd;
struct padata_instance *pinst = pd->ps->pinst;
- int cb_cpu;
- struct padata_priv *padata;
- struct padata_serial_queue *squeue;
- struct padata_list *reorder;
+ unsigned int processed;
+ int cpu;
- /*
- * We need to ensure that only one cpu can work on dequeueing of
- * the reorder queue the time. Calculating in which percpu reorder
- * queue the next object will arrive takes some time. A spinlock
- * would be highly contended. Also it is not clear in which order
- * the objects arrive to the reorder queues. So a cpu could wait to
- * get the lock just to notice that there is nothing to do at the
- * moment. Therefore we use a trylock and let the holder of the lock
- * care for all the objects enqueued during the holdtime of the lock.
- */
- if (!spin_trylock_bh(&pd->lock))
- return;
+ processed = pd->processed;
+ cpu = pd->cpu;
- while (1) {
- padata = padata_find_next(pd, true);
+ do {
+ struct padata_serial_queue *squeue;
+ int cb_cpu;
- /*
- * If the next object that needs serialization is parallel
- * processed by another cpu and is still on it's way to the
- * cpu's reorder queue, nothing to do for now.
- */
- if (!padata)
- break;
+ processed++;
+ /* When sequence wraps around, reset to the first CPU. */
+ if (unlikely(processed == 0))
+ cpu = cpumask_first(pd->cpumask.pcpu);
+ else
+ cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu);
cb_cpu = padata->cb_cpu;
squeue = per_cpu_ptr(pd->squeue, cb_cpu);
spin_lock(&squeue->serial.lock);
list_add_tail(&padata->list, &squeue->serial.list);
- spin_unlock(&squeue->serial.lock);
-
queue_work_on(cb_cpu, pinst->serial_wq, &squeue->work);
- }
-
- spin_unlock_bh(&pd->lock);
-
- /*
- * The next object that needs serialization might have arrived to
- * the reorder queues in the meantime.
- *
- * Ensure reorder queue is read after pd->lock is dropped so we see
- * new objects from another task in padata_do_serial. Pairs with
- * smp_mb in padata_do_serial.
- */
- smp_mb();
-
- reorder = per_cpu_ptr(pd->reorder_list, pd->cpu);
- if (!list_empty(&reorder->list) && padata_find_next(pd, false))
- queue_work(pinst->serial_wq, &pd->reorder_work);
-}
-
-static void invoke_padata_reorder(struct work_struct *work)
-{
- struct parallel_data *pd;
- local_bh_disable();
- pd = container_of(work, struct parallel_data, reorder_work);
- padata_reorder(pd);
- local_bh_enable();
+ /*
+ * If the next object that needs serialization is parallel
+ * processed by another cpu and is still on it's way to the
+ * cpu's reorder queue, end the loop.
+ */
+ padata = padata_find_next(pd, cpu, processed);
+ spin_unlock(&squeue->serial.lock);
+ } while (padata);
}
static void padata_serial_worker(struct work_struct *serial_work)
@@ -370,8 +345,7 @@ static void padata_serial_worker(struct work_struct *serial_work)
}
local_bh_enable();
- if (refcount_sub_and_test(cnt, &pd->refcnt))
- padata_free_pd(pd);
+ padata_put_pd_cnt(pd, cnt);
}
/**
@@ -388,23 +362,25 @@ void padata_do_serial(struct padata_priv *padata)
int hashed_cpu = padata_cpu_hash(pd, padata->seq_nr);
struct padata_list *reorder = per_cpu_ptr(pd->reorder_list, hashed_cpu);
struct padata_priv *cur;
+ struct list_head *pos;
+ bool gotit = true;
spin_lock(&reorder->lock);
/* Sort in ascending order of sequence number. */
- list_for_each_entry_reverse(cur, &reorder->list, list)
- if (cur->seq_nr < padata->seq_nr)
+ list_for_each_prev(pos, &reorder->list) {
+ cur = list_entry(pos, struct padata_priv, list);
+ /* Compare by difference to consider integer wrap around */
+ if ((signed int)(cur->seq_nr - padata->seq_nr) < 0)
break;
- list_add(&padata->list, &cur->list);
+ }
+ if (padata->seq_nr != pd->processed) {
+ gotit = false;
+ list_add(&padata->list, pos);
+ }
spin_unlock(&reorder->lock);
- /*
- * Ensure the addition to the reorder list is ordered correctly
- * with the trylock of pd->lock in padata_reorder. Pairs with smp_mb
- * in padata_reorder.
- */
- smp_mb();
-
- padata_reorder(pd);
+ if (gotit)
+ padata_reorder(padata);
}
EXPORT_SYMBOL(padata_do_serial);
@@ -472,13 +448,14 @@ void __init padata_do_multithreaded(struct padata_mt_job *job)
struct padata_work my_work, *pw;
struct padata_mt_job_state ps;
LIST_HEAD(works);
- int nworks;
+ int nworks, nid;
+ static atomic_t last_used_nid __initdata;
if (job->size == 0)
return;
/* Ensure at least one thread when size < min_chunk. */
- nworks = max(job->size / job->min_chunk, 1ul);
+ nworks = max(job->size / max(job->min_chunk, job->align), 1ul);
nworks = min(nworks, job->max_threads);
if (nworks == 1) {
@@ -498,13 +475,25 @@ void __init padata_do_multithreaded(struct padata_mt_job *job)
* thread function. Load balance large jobs between threads by
* increasing the number of chunks, guarantee at least the minimum
* chunk size from the caller, and honor the caller's alignment.
+ * Ensure chunk_size is at least 1 to prevent divide-by-0
+ * panic in padata_mt_helper().
*/
ps.chunk_size = job->size / (ps.nworks * load_balance_factor);
ps.chunk_size = max(ps.chunk_size, job->min_chunk);
+ ps.chunk_size = max(ps.chunk_size, 1ul);
ps.chunk_size = roundup(ps.chunk_size, job->align);
list_for_each_entry(pw, &works, pw_list)
- queue_work(system_unbound_wq, &pw->pw_work);
+ if (job->numa_aware) {
+ int old_node = atomic_read(&last_used_nid);
+
+ do {
+ nid = next_node_in(old_node, node_states[N_CPU]);
+ } while (!atomic_try_cmpxchg(&last_used_nid, &old_node, nid));
+ queue_work_node(nid, system_dfl_wq, &pw->pw_work);
+ } else {
+ queue_work(system_dfl_wq, &pw->pw_work);
+ }
/* Use the current thread, which saves starting a workqueue worker. */
padata_work_init(&my_work, padata_mt_helper, &ps, PADATA_WORK_ONSTACK);
@@ -517,12 +506,6 @@ void __init padata_do_multithreaded(struct padata_mt_job *job)
padata_works_free(&works);
}
-static void __padata_list_init(struct padata_list *pd_list)
-{
- INIT_LIST_HEAD(&pd_list->list);
- spin_lock_init(&pd_list->lock);
-}
-
/* Initialize all percpu queues used by serial workers */
static void padata_init_squeues(struct parallel_data *pd)
{
@@ -532,7 +515,8 @@ static void padata_init_squeues(struct parallel_data *pd)
for_each_cpu(cpu, pd->cpumask.cbcpu) {
squeue = per_cpu_ptr(pd->squeue, cpu);
squeue->pd = pd;
- __padata_list_init(&squeue->serial);
+ INIT_LIST_HEAD(&squeue->serial.list);
+ spin_lock_init(&squeue->serial.lock);
INIT_WORK(&squeue->work, padata_serial_worker);
}
}
@@ -545,7 +529,8 @@ static void padata_init_reorder_list(struct parallel_data *pd)
for_each_cpu(cpu, pd->cpumask.pcpu) {
list = per_cpu_ptr(pd->reorder_list, cpu);
- __padata_list_init(list);
+ INIT_LIST_HEAD(&list->list);
+ spin_lock_init(&list->lock);
}
}
@@ -581,9 +566,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps)
padata_init_squeues(pd);
pd->seq_nr = -1;
refcount_set(&pd->refcnt, 1);
- spin_lock_init(&pd->lock);
pd->cpu = cpumask_first(pd->cpumask.pcpu);
- INIT_WORK(&pd->reorder_work, invoke_padata_reorder);
return pd;
@@ -654,8 +637,7 @@ static int padata_replace(struct padata_instance *pinst)
synchronize_rcu();
list_for_each_entry_continue_reverse(ps, &pinst->pslist, list)
- if (refcount_dec_and_test(&ps->opd->refcnt))
- padata_free_pd(ps->opd);
+ padata_put_pd(ps->opd);
pinst->flags &= ~PADATA_RESET;
@@ -943,7 +925,7 @@ static ssize_t padata_sysfs_store(struct kobject *kobj, struct attribute *attr,
pinst = kobj2pinst(kobj);
pentry = attr2pentry(attr);
- if (pentry->show)
+ if (pentry->store)
ret = pentry->store(pinst, attr, buf, count);
return ret;
@@ -954,7 +936,7 @@ static const struct sysfs_ops padata_sysfs_ops = {
.store = padata_sysfs_store,
};
-static struct kobj_type padata_attr_type = {
+static const struct kobj_type padata_attr_type = {
.sysfs_ops = &padata_sysfs_ops,
.default_groups = padata_default_groups,
.release = padata_sysfs_release,
@@ -981,8 +963,9 @@ struct padata_instance *padata_alloc(const char *name)
cpus_read_lock();
- pinst->serial_wq = alloc_workqueue("%s_serial", WQ_MEM_RECLAIM |
- WQ_CPU_INTENSIVE, 1, name);
+ pinst->serial_wq = alloc_workqueue("%s_serial",
+ WQ_MEM_RECLAIM | WQ_CPU_INTENSIVE | WQ_PERCPU,
+ 1, name);
if (!pinst->serial_wq)
goto err_put_cpus;
@@ -1089,12 +1072,15 @@ EXPORT_SYMBOL(padata_alloc_shell);
*/
void padata_free_shell(struct padata_shell *ps)
{
+ struct parallel_data *pd;
+
if (!ps)
return;
mutex_lock(&ps->pinst->lock);
list_del(&ps->list);
- padata_free_pd(rcu_dereference_protected(ps->pd, 1));
+ pd = rcu_dereference_protected(ps->pd, 1);
+ padata_put_pd(pd);
mutex_unlock(&ps->pinst->lock);
kfree(ps);
diff --git a/kernel/panic.c b/kernel/panic.c
index 55b50e052ec3..0d52210a9e2b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -25,6 +25,7 @@
#include <linux/kexec.h>
#include <linux/panic_notifier.h>
#include <linux/sched.h>
+#include <linux/string_helpers.h>
#include <linux/sysrq.h>
#include <linux/init.h>
#include <linux/nmi.h>
@@ -32,6 +33,10 @@
#include <linux/bug.h>
#include <linux/ratelimit.h>
#include <linux/debugfs.h>
+#include <linux/sysfs.h>
+#include <linux/context_tracking.h>
+#include <linux/seq_buf.h>
+#include <linux/sys_info.h>
#include <trace/events/error_report.h>
#include <asm/sections.h>
@@ -43,12 +48,14 @@
* Should we dump all CPUs backtraces in an oops event?
* Defaults to 0, can be changed via sysctl.
*/
-unsigned int __read_mostly sysctl_oops_all_cpu_backtrace;
+static unsigned int __read_mostly sysctl_oops_all_cpu_backtrace;
+#else
+#define sysctl_oops_all_cpu_backtrace 0
#endif /* CONFIG_SMP */
-int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE;
+int panic_on_oops = IS_ENABLED(CONFIG_PANIC_ON_OOPS);
static unsigned long tainted_mask =
- IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0;
+ IS_ENABLED(CONFIG_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0;
static int pause_on_oops;
static int pause_on_oops_flag;
static DEFINE_SPINLOCK(pause_on_oops_lock);
@@ -56,22 +63,188 @@ bool crash_kexec_post_notifiers;
int panic_on_warn __read_mostly;
unsigned long panic_on_taint;
bool panic_on_taint_nousertaint = false;
+static unsigned int warn_limit __read_mostly;
+static bool panic_console_replay;
+
+bool panic_triggering_all_cpu_backtrace;
+static bool panic_this_cpu_backtrace_printed;
int panic_timeout = CONFIG_PANIC_TIMEOUT;
EXPORT_SYMBOL_GPL(panic_timeout);
-#define PANIC_PRINT_TASK_INFO 0x00000001
-#define PANIC_PRINT_MEM_INFO 0x00000002
-#define PANIC_PRINT_TIMER_INFO 0x00000004
-#define PANIC_PRINT_LOCK_INFO 0x00000008
-#define PANIC_PRINT_FTRACE_INFO 0x00000010
-#define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020
unsigned long panic_print;
ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
EXPORT_SYMBOL(panic_notifier_list);
+static void panic_print_deprecated(void)
+{
+ pr_info_once("Kernel: The 'panic_print' parameter is now deprecated. Please use 'panic_sys_info' and 'panic_console_replay' instead.\n");
+}
+
+#ifdef CONFIG_SYSCTL
+
+/*
+ * Taint values can only be increased
+ * This means we can safely use a temporary.
+ */
+static int proc_taint(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ unsigned long tmptaint = get_taint();
+ int err;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &tmptaint;
+ err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+
+ if (write) {
+ int i;
+
+ /*
+ * If we are relying on panic_on_taint not producing
+ * false positives due to userspace input, bail out
+ * before setting the requested taint flags.
+ */
+ if (panic_on_taint_nousertaint && (tmptaint & panic_on_taint))
+ return -EINVAL;
+
+ /*
+ * Poor man's atomic or. Not worth adding a primitive
+ * to everyone's atomic.h for this
+ */
+ for (i = 0; i < TAINT_FLAGS_COUNT; i++)
+ if ((1UL << i) & tmptaint)
+ add_taint(i, LOCKDEP_STILL_OK);
+ }
+
+ return err;
+}
+
+static int sysctl_panic_print_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ panic_print_deprecated();
+ return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
+}
+
+static const struct ctl_table kern_panic_table[] = {
+#ifdef CONFIG_SMP
+ {
+ .procname = "oops_all_cpu_backtrace",
+ .data = &sysctl_oops_all_cpu_backtrace,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif
+ {
+ .procname = "tainted",
+ .maxlen = sizeof(long),
+ .mode = 0644,
+ .proc_handler = proc_taint,
+ },
+ {
+ .procname = "panic",
+ .data = &panic_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "panic_on_oops",
+ .data = &panic_on_oops,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "panic_print",
+ .data = &panic_print,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = sysctl_panic_print_handler,
+ },
+ {
+ .procname = "panic_on_warn",
+ .data = &panic_on_warn,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "warn_limit",
+ .data = &warn_limit,
+ .maxlen = sizeof(warn_limit),
+ .mode = 0644,
+ .proc_handler = proc_douintvec,
+ },
+#if (defined(CONFIG_X86_32) || defined(CONFIG_PARISC)) && \
+ defined(CONFIG_DEBUG_STACKOVERFLOW)
+ {
+ .procname = "panic_on_stackoverflow",
+ .data = &sysctl_panic_on_stackoverflow,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+ {
+ .procname = "panic_sys_info",
+ .data = &panic_print,
+ .maxlen = sizeof(panic_print),
+ .mode = 0644,
+ .proc_handler = sysctl_sys_info_handler,
+ },
+};
+
+static __init int kernel_panic_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_panic_table);
+ return 0;
+}
+late_initcall(kernel_panic_sysctls_init);
+#endif
+
+/* The format is "panic_sys_info=tasks,mem,locks,ftrace,..." */
+static int __init setup_panic_sys_info(char *buf)
+{
+ /* There is no risk of race in kernel boot phase */
+ panic_print = sys_info_parse_param(buf);
+ return 1;
+}
+__setup("panic_sys_info=", setup_panic_sys_info);
+
+static atomic_t warn_count = ATOMIC_INIT(0);
+
+#ifdef CONFIG_SYSFS
+static ssize_t warn_count_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *page)
+{
+ return sysfs_emit(page, "%d\n", atomic_read(&warn_count));
+}
+
+static struct kobj_attribute warn_count_attr = __ATTR_RO(warn_count);
+
+static __init int kernel_panic_sysfs_init(void)
+{
+ sysfs_add_file_to_group(kernel_kobj, &warn_count_attr.attr, NULL);
+ return 0;
+}
+late_initcall(kernel_panic_sysfs_init);
+#endif
+
static long no_blink(int state)
{
return 0;
@@ -84,7 +257,7 @@ EXPORT_SYMBOL(panic_blink);
/*
* Stop ourself in panic -- architecture code may override this
*/
-void __weak panic_smp_self_stop(void)
+void __weak __noreturn panic_smp_self_stop(void)
{
while (1)
cpu_relax();
@@ -94,7 +267,7 @@ void __weak panic_smp_self_stop(void)
* Stop ourselves in NMI context if another CPU has already panicked. Arch code
* may override this to prepare for crash dumping, e.g. save regs info.
*/
-void __weak nmi_panic_self_stop(struct pt_regs *regs)
+void __weak __noreturn nmi_panic_self_stop(struct pt_regs *regs)
{
panic_smp_self_stop();
}
@@ -127,6 +300,59 @@ void __weak crash_smp_send_stop(void)
atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);
+bool panic_try_start(void)
+{
+ int old_cpu, this_cpu;
+
+ /*
+ * Only one CPU is allowed to execute the crash_kexec() code as with
+ * panic(). Otherwise parallel calls of panic() and crash_kexec()
+ * may stop each other. To exclude them, we use panic_cpu here too.
+ */
+ old_cpu = PANIC_CPU_INVALID;
+ this_cpu = raw_smp_processor_id();
+
+ return atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu);
+}
+EXPORT_SYMBOL(panic_try_start);
+
+void panic_reset(void)
+{
+ atomic_set(&panic_cpu, PANIC_CPU_INVALID);
+}
+EXPORT_SYMBOL(panic_reset);
+
+bool panic_in_progress(void)
+{
+ return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID);
+}
+EXPORT_SYMBOL(panic_in_progress);
+
+/* Return true if a panic is in progress on the current CPU. */
+bool panic_on_this_cpu(void)
+{
+ /*
+ * We can use raw_smp_processor_id() here because it is impossible for
+ * the task to be migrated to the panic_cpu, or away from it. If
+ * panic_cpu has already been set, and we're not currently executing on
+ * that CPU, then we never will be.
+ */
+ return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id());
+}
+EXPORT_SYMBOL(panic_on_this_cpu);
+
+/*
+ * Return true if a panic is in progress on a remote CPU.
+ *
+ * On true, the local CPU should immediately release any printing resources
+ * that may be needed by the panic CPU.
+ */
+bool panic_on_other_cpu(void)
+{
+ return (panic_in_progress() && !panic_on_this_cpu());
+}
+EXPORT_SYMBOL(panic_on_other_cpu);
+
/*
* A variant of panic() called from NMI context. We return if we've already
* panicked on this CPU. If another CPU already panicked, loop in
@@ -135,56 +361,87 @@ atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);
*/
void nmi_panic(struct pt_regs *regs, const char *msg)
{
- int old_cpu, cpu;
-
- cpu = raw_smp_processor_id();
- old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu);
-
- if (old_cpu == PANIC_CPU_INVALID)
+ if (panic_try_start())
panic("%s", msg);
- else if (old_cpu != cpu)
+ else if (panic_on_other_cpu())
nmi_panic_self_stop(regs);
}
EXPORT_SYMBOL(nmi_panic);
-static void panic_print_sys_info(void)
+void check_panic_on_warn(const char *origin)
{
- if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG)
- console_flush_on_panic(CONSOLE_REPLAY_ALL);
+ unsigned int limit;
+
+ if (panic_on_warn)
+ panic("%s: panic_on_warn set ...\n", origin);
+
+ limit = READ_ONCE(warn_limit);
+ if (atomic_inc_return(&warn_count) >= limit && limit)
+ panic("%s: system warned too often (kernel.warn_limit is %d)",
+ origin, limit);
+}
- if (panic_print & PANIC_PRINT_TASK_INFO)
- show_state();
+static void panic_trigger_all_cpu_backtrace(void)
+{
+ /* Temporary allow non-panic CPUs to write their backtraces. */
+ panic_triggering_all_cpu_backtrace = true;
- if (panic_print & PANIC_PRINT_MEM_INFO)
- show_mem(0, NULL);
+ if (panic_this_cpu_backtrace_printed)
+ trigger_allbutcpu_cpu_backtrace(raw_smp_processor_id());
+ else
+ trigger_all_cpu_backtrace();
- if (panic_print & PANIC_PRINT_TIMER_INFO)
- sysrq_timer_list_show();
+ panic_triggering_all_cpu_backtrace = false;
+}
- if (panic_print & PANIC_PRINT_LOCK_INFO)
- debug_show_all_locks();
+/*
+ * Helper that triggers the NMI backtrace (if set in panic_print)
+ * and then performs the secondary CPUs shutdown - we cannot have
+ * the NMI backtrace after the CPUs are off!
+ */
+static void panic_other_cpus_shutdown(bool crash_kexec)
+{
+ if (panic_print & SYS_INFO_ALL_BT)
+ panic_trigger_all_cpu_backtrace();
- if (panic_print & PANIC_PRINT_FTRACE_INFO)
- ftrace_dump(DUMP_ALL);
+ /*
+ * Note that smp_send_stop() is the usual SMP shutdown function,
+ * which unfortunately may not be hardened to work in a panic
+ * situation. If we want to do crash dump after notifier calls
+ * and kmsg_dump, we will need architecture dependent extra
+ * bits in addition to stopping other CPUs, hence we rely on
+ * crash_smp_send_stop() for that.
+ */
+ if (!crash_kexec)
+ smp_send_stop();
+ else
+ crash_smp_send_stop();
}
/**
- * panic - halt the system
- * @fmt: The text string to print
+ * vpanic - halt the system
+ * @fmt: The text string to print
+ * @args: Arguments for the format string
*
- * Display a message, then perform cleanups.
- *
- * This function never returns.
+ * Display a message, then perform cleanups. This function never returns.
*/
-void panic(const char *fmt, ...)
+void vpanic(const char *fmt, va_list args)
{
static char buf[1024];
- va_list args;
long i, i_next = 0, len;
int state = 0;
- int old_cpu, this_cpu;
bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers;
+ if (panic_on_warn) {
+ /*
+ * This thread may hit another WARN() in the panic path.
+ * Resetting this prevents additional WARN() from panicking the
+ * system on this thread. Other threads are blocked by the
+ * panic_mutex in panic().
+ */
+ panic_on_warn = 0;
+ }
+
/*
* Disable local interrupts. This will prevent panic_smp_self_stop
* from deadlocking the first cpu that invokes the panic, since
@@ -204,34 +461,34 @@ void panic(const char *fmt, ...)
* stop themself or will wait until they are stopped by the 1st CPU
* with smp_send_stop().
*
- * `old_cpu == PANIC_CPU_INVALID' means this is the 1st CPU which
- * comes here, so go ahead.
+ * cmpxchg success means this is the 1st CPU which comes here,
+ * so go ahead.
* `old_cpu == this_cpu' means we came from nmi_panic() which sets
* panic_cpu to this CPU. In this case, this is also the 1st CPU.
*/
- this_cpu = raw_smp_processor_id();
- old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
-
- if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu)
+ /* atomic_try_cmpxchg updates old_cpu on failure */
+ if (panic_try_start()) {
+ /* go ahead */
+ } else if (panic_on_other_cpu())
panic_smp_self_stop();
console_verbose();
bust_spinlocks(1);
- va_start(args, fmt);
len = vscnprintf(buf, sizeof(buf), fmt, args);
- va_end(args);
if (len && buf[len - 1] == '\n')
buf[len - 1] = '\0';
pr_emerg("Kernel panic - not syncing: %s\n", buf);
-#ifdef CONFIG_DEBUG_BUGVERBOSE
/*
* Avoid nested stack-dumping if a panic occurs during oops processing
*/
- if (!test_taint(TAINT_DIE) && oops_in_progress <= 1)
+ if (test_taint(TAINT_DIE) || oops_in_progress > 1) {
+ panic_this_cpu_backtrace_printed = true;
+ } else if (IS_ENABLED(CONFIG_DEBUG_BUGVERBOSE)) {
dump_stack();
-#endif
+ panic_this_cpu_backtrace_printed = true;
+ }
/*
* If kgdb is enabled, give it a chance to run before we stop all
@@ -248,23 +505,12 @@ void panic(const char *fmt, ...)
*
* Bypass the panic_cpu check and call __crash_kexec directly.
*/
- if (!_crash_kexec_post_notifiers) {
+ if (!_crash_kexec_post_notifiers)
__crash_kexec(NULL);
- /*
- * Note smp_send_stop is the usual smp shutdown function, which
- * unfortunately means it may not be hardened to work in a
- * panic situation.
- */
- smp_send_stop();
- } else {
- /*
- * If we want to do crash dump after notifier calls and
- * kmsg_dump, we will need architecture dependent extra
- * works in addition to stopping other CPUs.
- */
- crash_smp_send_stop();
- }
+ panic_other_cpus_shutdown(_crash_kexec_post_notifiers);
+
+ printk_legacy_allow_panic_sync();
/*
* Run any panic handlers, including those that might need to
@@ -272,7 +518,9 @@ void panic(const char *fmt, ...)
*/
atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
- kmsg_dump(KMSG_DUMP_PANIC);
+ sys_info(panic_print);
+
+ kmsg_dump_desc(KMSG_DUMP_PANIC, buf);
/*
* If you doubt kdump always works fine in any situation,
@@ -286,9 +534,6 @@ void panic(const char *fmt, ...)
if (_crash_kexec_post_notifiers)
__crash_kexec(NULL);
-#ifdef CONFIG_VT
- unblank_screen();
-#endif
console_unblank();
/*
@@ -302,7 +547,9 @@ void panic(const char *fmt, ...)
debug_locks_off();
console_flush_on_panic(CONSOLE_FLUSH_PENDING);
- panic_print_sys_info();
+ if ((panic_print & SYS_INFO_PANIC_CONSOLE_REPLAY) ||
+ panic_console_replay)
+ console_flush_on_panic(CONSOLE_REPLAY_ALL);
if (!panic_blink)
panic_blink = no_blink;
@@ -349,6 +596,15 @@ void panic(const char *fmt, ...)
/* Do not scroll important messages printed above */
suppress_printk = 1;
+
+ /*
+ * The final messages may not have been printed if in a context that
+ * defers printing (such as NMI) and irq_work is not available.
+ * Explicitly flush the kernel log buffer one last time.
+ */
+ console_flush_on_panic(CONSOLE_FLUSH_PENDING);
+ nbcon_atomic_flush_unsafe();
+
local_irq_enable();
for (i = 0; ; i += PANIC_TIMER_STEP) {
touch_softlockup_watchdog();
@@ -359,34 +615,99 @@ void panic(const char *fmt, ...)
mdelay(PANIC_TIMER_STEP);
}
}
+EXPORT_SYMBOL(vpanic);
+/* Identical to vpanic(), except it takes variadic arguments instead of va_list */
+void panic(const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ vpanic(fmt, args);
+ va_end(args);
+}
EXPORT_SYMBOL(panic);
+#define TAINT_FLAG(taint, _c_true, _c_false) \
+ [ TAINT_##taint ] = { \
+ .c_true = _c_true, .c_false = _c_false, \
+ .desc = #taint, \
+ }
+
/*
- * TAINT_FORCED_RMMOD could be a per-module flag but the module
- * is being removed anyway.
+ * NOTE: if you modify the taint_flags or TAINT_FLAGS_COUNT,
+ * please also modify tools/debugging/kernel-chktaint and
+ * Documentation/admin-guide/tainted-kernels.rst, including its
+ * small shell script that prints the TAINT_FLAGS_COUNT bits of
+ * /proc/sys/kernel/tainted.
*/
const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
- [ TAINT_PROPRIETARY_MODULE ] = { 'P', 'G', true },
- [ TAINT_FORCED_MODULE ] = { 'F', ' ', true },
- [ TAINT_CPU_OUT_OF_SPEC ] = { 'S', ' ', false },
- [ TAINT_FORCED_RMMOD ] = { 'R', ' ', false },
- [ TAINT_MACHINE_CHECK ] = { 'M', ' ', false },
- [ TAINT_BAD_PAGE ] = { 'B', ' ', false },
- [ TAINT_USER ] = { 'U', ' ', false },
- [ TAINT_DIE ] = { 'D', ' ', false },
- [ TAINT_OVERRIDDEN_ACPI_TABLE ] = { 'A', ' ', false },
- [ TAINT_WARN ] = { 'W', ' ', false },
- [ TAINT_CRAP ] = { 'C', ' ', true },
- [ TAINT_FIRMWARE_WORKAROUND ] = { 'I', ' ', false },
- [ TAINT_OOT_MODULE ] = { 'O', ' ', true },
- [ TAINT_UNSIGNED_MODULE ] = { 'E', ' ', true },
- [ TAINT_SOFTLOCKUP ] = { 'L', ' ', false },
- [ TAINT_LIVEPATCH ] = { 'K', ' ', true },
- [ TAINT_AUX ] = { 'X', ' ', true },
- [ TAINT_RANDSTRUCT ] = { 'T', ' ', true },
+ TAINT_FLAG(PROPRIETARY_MODULE, 'P', 'G'),
+ TAINT_FLAG(FORCED_MODULE, 'F', ' '),
+ TAINT_FLAG(CPU_OUT_OF_SPEC, 'S', ' '),
+ TAINT_FLAG(FORCED_RMMOD, 'R', ' '),
+ TAINT_FLAG(MACHINE_CHECK, 'M', ' '),
+ TAINT_FLAG(BAD_PAGE, 'B', ' '),
+ TAINT_FLAG(USER, 'U', ' '),
+ TAINT_FLAG(DIE, 'D', ' '),
+ TAINT_FLAG(OVERRIDDEN_ACPI_TABLE, 'A', ' '),
+ TAINT_FLAG(WARN, 'W', ' '),
+ TAINT_FLAG(CRAP, 'C', ' '),
+ TAINT_FLAG(FIRMWARE_WORKAROUND, 'I', ' '),
+ TAINT_FLAG(OOT_MODULE, 'O', ' '),
+ TAINT_FLAG(UNSIGNED_MODULE, 'E', ' '),
+ TAINT_FLAG(SOFTLOCKUP, 'L', ' '),
+ TAINT_FLAG(LIVEPATCH, 'K', ' '),
+ TAINT_FLAG(AUX, 'X', ' '),
+ TAINT_FLAG(RANDSTRUCT, 'T', ' '),
+ TAINT_FLAG(TEST, 'N', ' '),
+ TAINT_FLAG(FWCTL, 'J', ' '),
};
+#undef TAINT_FLAG
+
+static void print_tainted_seq(struct seq_buf *s, bool verbose)
+{
+ const char *sep = "";
+ int i;
+
+ if (!tainted_mask) {
+ seq_buf_puts(s, "Not tainted");
+ return;
+ }
+
+ seq_buf_printf(s, "Tainted: ");
+ for (i = 0; i < TAINT_FLAGS_COUNT; i++) {
+ const struct taint_flag *t = &taint_flags[i];
+ bool is_set = test_bit(i, &tainted_mask);
+ char c = is_set ? t->c_true : t->c_false;
+
+ if (verbose) {
+ if (is_set) {
+ seq_buf_printf(s, "%s[%c]=%s", sep, c, t->desc);
+ sep = ", ";
+ }
+ } else {
+ seq_buf_putc(s, c);
+ }
+ }
+}
+
+static const char *_print_tainted(bool verbose)
+{
+ /* FIXME: what should the size be? */
+ static char buf[sizeof(taint_flags)];
+ struct seq_buf s;
+
+ BUILD_BUG_ON(ARRAY_SIZE(taint_flags) != TAINT_FLAGS_COUNT);
+
+ seq_buf_init(&s, buf, sizeof(buf));
+
+ print_tainted_seq(&s, verbose);
+
+ return seq_buf_str(&s);
+}
+
/**
* print_tainted - return a string to represent the kernel taint state.
*
@@ -397,25 +718,15 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
*/
const char *print_tainted(void)
{
- static char buf[TAINT_FLAGS_COUNT + sizeof("Tainted: ")];
-
- BUILD_BUG_ON(ARRAY_SIZE(taint_flags) != TAINT_FLAGS_COUNT);
-
- if (tainted_mask) {
- char *s;
- int i;
-
- s = buf + sprintf(buf, "Tainted: ");
- for (i = 0; i < TAINT_FLAGS_COUNT; i++) {
- const struct taint_flag *t = &taint_flags[i];
- *s++ = test_bit(i, &tainted_mask) ?
- t->c_true : t->c_false;
- }
- *s = 0;
- } else
- snprintf(buf, sizeof(buf), "Not tainted");
+ return _print_tainted(false);
+}
- return buf;
+/**
+ * print_tainted_verbose - A more verbose version of print_tainted()
+ */
+const char *print_tainted_verbose(void)
+{
+ return _print_tainted(true);
}
int test_taint(unsigned flag)
@@ -525,6 +836,7 @@ bool oops_may_print(void)
*/
void oops_enter(void)
{
+ nbcon_cpu_emergency_enter();
tracing_off();
/* can't trust the integrity of the kernel anymore: */
debug_locks_off();
@@ -547,6 +859,7 @@ void oops_exit(void)
{
do_oops_enter_exit();
print_oops_end_marker();
+ nbcon_cpu_emergency_exit();
kmsg_dump(KMSG_DUMP_OOPS);
}
@@ -558,34 +871,34 @@ struct warn_args {
void __warn(const char *file, int line, void *caller, unsigned taint,
struct pt_regs *regs, struct warn_args *args)
{
+ nbcon_cpu_emergency_enter();
+
disable_trace_on_warning();
- if (file)
- pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n",
- raw_smp_processor_id(), current->pid, file, line,
- caller);
- else
- pr_warn("WARNING: CPU: %d PID: %d at %pS\n",
- raw_smp_processor_id(), current->pid, caller);
+ if (file) {
+ pr_warn("WARNING: %s:%d at %pS, CPU#%d: %s/%d\n",
+ file, line, caller,
+ raw_smp_processor_id(), current->comm, current->pid);
+ } else {
+ pr_warn("WARNING: at %pS, CPU#%d: %s/%d\n",
+ caller,
+ raw_smp_processor_id(), current->comm, current->pid);
+ }
+#pragma GCC diagnostic push
+#ifndef __clang__
+#pragma GCC diagnostic ignored "-Wsuggest-attribute=format"
+#endif
if (args)
vprintk(args->fmt, args->args);
+#pragma GCC diagnostic pop
print_modules();
if (regs)
show_regs(regs);
- if (panic_on_warn) {
- /*
- * This thread may hit another WARN() in the panic path.
- * Resetting this prevents additional WARN() from panicking the
- * system on this thread. Other threads are blocked by the
- * panic_mutex in panic().
- */
- panic_on_warn = 0;
- panic("panic_on_warn set ...\n");
- }
+ check_panic_on_warn("kernel");
if (!regs)
dump_stack();
@@ -597,12 +910,16 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
/* Just a warning, don't kill lockdep. */
add_taint(taint, LOCKDEP_STILL_OK);
+
+ nbcon_cpu_emergency_exit();
}
+#ifdef CONFIG_BUG
#ifndef __WARN_FLAGS
void warn_slowpath_fmt(const char *file, int line, unsigned taint,
const char *fmt, ...)
{
+ bool rcu = warn_rcu_enter();
struct warn_args args;
pr_warn(CUT_HERE);
@@ -610,6 +927,7 @@ void warn_slowpath_fmt(const char *file, int line, unsigned taint,
if (!fmt) {
__warn(file, line, __builtin_return_address(0), taint,
NULL, NULL);
+ warn_rcu_exit(rcu);
return;
}
@@ -617,11 +935,13 @@ void warn_slowpath_fmt(const char *file, int line, unsigned taint,
va_start(args.args, fmt);
__warn(file, line, __builtin_return_address(0), taint, NULL, &args);
va_end(args.args);
+ warn_rcu_exit(rcu);
}
EXPORT_SYMBOL(warn_slowpath_fmt);
#else
void __warn_printk(const char *fmt, ...)
{
+ bool rcu = warn_rcu_enter();
va_list args;
pr_warn(CUT_HERE);
@@ -629,12 +949,11 @@ void __warn_printk(const char *fmt, ...)
va_start(args, fmt);
vprintk(fmt, args);
va_end(args);
+ warn_rcu_exit(rcu);
}
EXPORT_SYMBOL(__warn_printk);
#endif
-#ifdef CONFIG_BUG
-
/* Support resetting WARN*_ONCE state */
static int clear_warn_once_set(void *data, u64 val)
@@ -666,9 +985,15 @@ device_initcall(register_warn_debugfs);
*/
__visible noinstr void __stack_chk_fail(void)
{
+ unsigned long flags;
+
instrumentation_begin();
+ flags = user_access_save();
+
panic("stack-protector: Kernel stack is corrupted in: %pB",
__builtin_return_address(0));
+
+ user_access_restore(flags);
instrumentation_end();
}
EXPORT_SYMBOL(__stack_chk_fail);
@@ -676,10 +1001,28 @@ EXPORT_SYMBOL(__stack_chk_fail);
#endif
core_param(panic, panic_timeout, int, 0644);
-core_param(panic_print, panic_print, ulong, 0644);
core_param(pause_on_oops, pause_on_oops, int, 0644);
core_param(panic_on_warn, panic_on_warn, int, 0644);
core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644);
+core_param(panic_console_replay, panic_console_replay, bool, 0644);
+
+static int panic_print_set(const char *val, const struct kernel_param *kp)
+{
+ panic_print_deprecated();
+ return param_set_ulong(val, kp);
+}
+
+static int panic_print_get(char *val, const struct kernel_param *kp)
+{
+ panic_print_deprecated();
+ return param_get_ulong(val, kp);
+}
+
+static const struct kernel_param_ops panic_print_ops = {
+ .set = panic_print_set,
+ .get = panic_print_get,
+};
+__core_param_cb(panic_print, &panic_print_ops, &panic_print, 0644);
static int __init oops_setup(char *s)
{
@@ -711,8 +1054,8 @@ static int __init panic_on_taint_setup(char *s)
if (s && !strcmp(s, "nousertaint"))
panic_on_taint_nousertaint = true;
- pr_info("panic_on_taint: bitmask=0x%lx nousertaint_mode=%sabled\n",
- panic_on_taint, panic_on_taint_nousertaint ? "en" : "dis");
+ pr_info("panic_on_taint: bitmask=0x%lx nousertaint_mode=%s\n",
+ panic_on_taint, str_enabled_disabled(panic_on_taint_nousertaint));
return 0;
}
diff --git a/kernel/params.c b/kernel/params.c
index 5b92310425c5..b96cfd693c99 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -1,18 +1,20 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-/* Helpers for initial module or kernel cmdline parsing
- Copyright (C) 2001 Rusty Russell.
-
-*/
-#include <linux/kernel.h>
-#include <linux/string.h>
+/*
+ * Helpers for initial module or kernel cmdline parsing
+ * Copyright (C) 2001 Rusty Russell.
+ */
+#include <linux/ctype.h>
+#include <linux/device.h>
+#include <linux/err.h>
#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/kstrtox.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
-#include <linux/device.h>
-#include <linux/err.h>
-#include <linux/slab.h>
-#include <linux/ctype.h>
+#include <linux/overflow.h>
#include <linux/security.h>
+#include <linux/slab.h>
+#include <linux/string.h>
#ifdef CONFIG_SYSFS
/* Protects all built-in parameters, modules use their own param_lock */
@@ -47,7 +49,7 @@ static void *kmalloc_parameter(unsigned int size)
{
struct kmalloced_param *p;
- p = kmalloc(sizeof(*p) + size, GFP_KERNEL);
+ p = kmalloc(size_add(sizeof(*p), size), GFP_KERNEL);
if (!p)
return NULL;
@@ -119,9 +121,7 @@ static int parse_one(char *param,
unsigned num_params,
s16 min_level,
s16 max_level,
- void *arg,
- int (*handle_unknown)(char *param, char *val,
- const char *doing, void *arg))
+ void *arg, parse_unknown_fn handle_unknown)
{
unsigned int i;
int err;
@@ -164,9 +164,7 @@ char *parse_args(const char *doing,
unsigned num,
s16 min_level,
s16 max_level,
- void *arg,
- int (*unknown)(char *param, char *val,
- const char *doing, void *arg))
+ void *arg, parse_unknown_fn unknown)
{
char *param, *val, *err = NULL;
@@ -263,17 +261,22 @@ EXPORT_SYMBOL_GPL(param_set_uint_minmax);
int param_set_charp(const char *val, const struct kernel_param *kp)
{
- if (strlen(val) > 1024) {
+ size_t len, maxlen = 1024;
+
+ len = strnlen(val, maxlen + 1);
+ if (len == maxlen + 1) {
pr_err("%s: string parameter too long\n", kp->name);
return -ENOSPC;
}
maybe_kfree_parameter(*(char **)kp->arg);
- /* This is a hack. We can't kmalloc in early boot, and we
- * don't need to; this mangled commandline is preserved. */
+ /*
+ * This is a hack. We can't kmalloc() in early boot, and we
+ * don't need to; this mangled commandline is preserved.
+ */
if (slab_is_available()) {
- *(char **)kp->arg = kmalloc_parameter(strlen(val)+1);
+ *(char **)kp->arg = kmalloc_parameter(len + 1);
if (!*(char **)kp->arg)
return -ENOMEM;
strcpy(*(char **)kp->arg, val);
@@ -310,7 +313,7 @@ int param_set_bool(const char *val, const struct kernel_param *kp)
if (!val) val = "1";
/* One of =[yYnN01] */
- return strtobool(val, kp->arg);
+ return kstrtobool(val, kp->arg);
}
EXPORT_SYMBOL(param_set_bool);
@@ -330,7 +333,7 @@ EXPORT_SYMBOL(param_ops_bool);
int param_set_bool_enable_only(const char *val, const struct kernel_param *kp)
{
- int err = 0;
+ int err;
bool new_value;
bool orig_value = *(bool *)kp->arg;
struct kernel_param dummy_kp = *kp;
@@ -510,13 +513,14 @@ EXPORT_SYMBOL(param_array_ops);
int param_set_copystring(const char *val, const struct kernel_param *kp)
{
const struct kparam_string *kps = kp->str;
+ const size_t len = strnlen(val, kps->maxlen);
- if (strlen(val)+1 > kps->maxlen) {
+ if (len == kps->maxlen) {
pr_err("%s: string doesn't fit in %u chars.\n",
kp->name, kps->maxlen-1);
return -ENOSPC;
}
- strcpy(kps->string, val);
+ memcpy(kps->string, val, len + 1);
return 0;
}
EXPORT_SYMBOL(param_set_copystring);
@@ -535,7 +539,7 @@ const struct kernel_param_ops param_ops_string = {
EXPORT_SYMBOL(param_ops_string);
/* sysfs output in /sys/modules/XYZ/parameters/ */
-#define to_module_attr(n) container_of(n, struct module_attribute, attr)
+#define to_module_attr(n) container_of_const(n, struct module_attribute, attr)
#define to_module_kobject(n) container_of(n, struct module_kobject, kobj)
struct param_attribute
@@ -548,17 +552,17 @@ struct module_param_attrs
{
unsigned int num;
struct attribute_group grp;
- struct param_attribute attrs[];
+ struct param_attribute attrs[] __counted_by(num);
};
#ifdef CONFIG_SYSFS
-#define to_param_attr(n) container_of(n, struct param_attribute, mattr)
+#define to_param_attr(n) container_of_const(n, struct param_attribute, mattr)
-static ssize_t param_attr_show(struct module_attribute *mattr,
+static ssize_t param_attr_show(const struct module_attribute *mattr,
struct module_kobject *mk, char *buf)
{
int count;
- struct param_attribute *attribute = to_param_attr(mattr);
+ const struct param_attribute *attribute = to_param_attr(mattr);
if (!attribute->param->ops->get)
return -EPERM;
@@ -570,12 +574,12 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
}
/* sysfs always hands a nul-terminated string in buf. We rely on that. */
-static ssize_t param_attr_store(struct module_attribute *mattr,
+static ssize_t param_attr_store(const struct module_attribute *mattr,
struct module_kobject *mk,
const char *buf, size_t len)
{
int err;
- struct param_attribute *attribute = to_param_attr(mattr);
+ const struct param_attribute *attribute = to_param_attr(mattr);
if (!attribute->param->ops->set)
return -EPERM;
@@ -648,35 +652,32 @@ static __modinit int add_sysfs_param(struct module_kobject *mk,
}
/* Enlarge allocations. */
- new_mp = krealloc(mk->mp,
- sizeof(*mk->mp) +
- sizeof(mk->mp->attrs[0]) * (mk->mp->num + 1),
+ new_mp = krealloc(mk->mp, struct_size(mk->mp, attrs, mk->mp->num + 1),
GFP_KERNEL);
if (!new_mp)
return -ENOMEM;
mk->mp = new_mp;
+ mk->mp->num++;
/* Extra pointer for NULL terminator */
- new_attrs = krealloc(mk->mp->grp.attrs,
- sizeof(mk->mp->grp.attrs[0]) * (mk->mp->num + 2),
- GFP_KERNEL);
+ new_attrs = krealloc_array(mk->mp->grp.attrs, mk->mp->num + 1,
+ sizeof(mk->mp->grp.attrs[0]), GFP_KERNEL);
if (!new_attrs)
return -ENOMEM;
mk->mp->grp.attrs = new_attrs;
/* Tack new one on the end. */
- memset(&mk->mp->attrs[mk->mp->num], 0, sizeof(mk->mp->attrs[0]));
- sysfs_attr_init(&mk->mp->attrs[mk->mp->num].mattr.attr);
- mk->mp->attrs[mk->mp->num].param = kp;
- mk->mp->attrs[mk->mp->num].mattr.show = param_attr_show;
+ memset(&mk->mp->attrs[mk->mp->num - 1], 0, sizeof(mk->mp->attrs[0]));
+ sysfs_attr_init(&mk->mp->attrs[mk->mp->num - 1].mattr.attr);
+ mk->mp->attrs[mk->mp->num - 1].param = kp;
+ mk->mp->attrs[mk->mp->num - 1].mattr.show = param_attr_show;
/* Do not allow runtime DAC changes to make param writable. */
if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0)
- mk->mp->attrs[mk->mp->num].mattr.store = param_attr_store;
+ mk->mp->attrs[mk->mp->num - 1].mattr.store = param_attr_store;
else
- mk->mp->attrs[mk->mp->num].mattr.store = NULL;
- mk->mp->attrs[mk->mp->num].mattr.attr.name = (char *)name;
- mk->mp->attrs[mk->mp->num].mattr.attr.mode = kp->perm;
- mk->mp->num++;
+ mk->mp->attrs[mk->mp->num - 1].mattr.store = NULL;
+ mk->mp->attrs[mk->mp->num - 1].mattr.attr.name = (char *)name;
+ mk->mp->attrs[mk->mp->num - 1].mattr.attr.mode = kp->perm;
/* Fix up all the pointers, since krealloc can move us */
for (i = 0; i < mk->mp->num; i++)
@@ -742,8 +743,10 @@ void module_param_sysfs_remove(struct module *mod)
{
if (mod->mkobj.mp) {
sysfs_remove_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp);
- /* We are positive that no one is using any param
- * attrs at this point. Deallocate immediately. */
+ /*
+ * We are positive that no one is using any param
+ * attrs at this point. Deallocate immediately.
+ */
free_module_param_attrs(&mod->mkobj);
}
}
@@ -758,38 +761,35 @@ void destroy_params(const struct kernel_param *params, unsigned num)
params[i].ops->free(params[i].arg);
}
-static struct module_kobject * __init locate_module_kobject(const char *name)
+struct module_kobject __modinit * lookup_or_create_module_kobject(const char *name)
{
struct module_kobject *mk;
struct kobject *kobj;
int err;
kobj = kset_find_obj(module_kset, name);
- if (kobj) {
- mk = to_module_kobject(kobj);
- } else {
- mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
- BUG_ON(!mk);
-
- mk->mod = THIS_MODULE;
- mk->kobj.kset = module_kset;
- err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL,
- "%s", name);
-#ifdef CONFIG_MODULES
- if (!err)
- err = sysfs_create_file(&mk->kobj, &module_uevent.attr);
-#endif
- if (err) {
- kobject_put(&mk->kobj);
- pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n",
- name, err);
- return NULL;
- }
+ if (kobj)
+ return to_module_kobject(kobj);
+
+ mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
+ if (!mk)
+ return NULL;
- /* So that we hold reference in both cases. */
- kobject_get(&mk->kobj);
+ mk->mod = THIS_MODULE;
+ mk->kobj.kset = module_kset;
+ err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, "%s", name);
+ if (IS_ENABLED(CONFIG_MODULES) && !err)
+ err = sysfs_create_file(&mk->kobj, &module_uevent.attr);
+ if (err) {
+ kobject_put(&mk->kobj);
+ pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n",
+ name, err);
+ return NULL;
}
+ /* So that we hold reference in both cases. */
+ kobject_get(&mk->kobj);
+
return mk;
}
@@ -800,7 +800,7 @@ static void __init kernel_add_sysfs_param(const char *name,
struct module_kobject *mk;
int err;
- mk = locate_module_kobject(name);
+ mk = lookup_or_create_module_kobject(name);
if (!mk)
return;
@@ -842,21 +842,21 @@ static void __init param_sysfs_builtin(void)
dot = strchr(kp->name, '.');
if (!dot) {
/* This happens for core_param() */
- strcpy(modname, "kernel");
+ strscpy(modname, "kernel");
name_len = 0;
} else {
name_len = dot - kp->name + 1;
- strlcpy(modname, kp->name, name_len);
+ strscpy(modname, kp->name, name_len);
}
kernel_add_sysfs_param(modname, kp, name_len);
}
}
-ssize_t __modver_version_show(struct module_attribute *mattr,
+ssize_t __modver_version_show(const struct module_attribute *mattr,
struct module_kobject *mk, char *buf)
{
- struct module_version_attribute *vattr =
- container_of(mattr, struct module_version_attribute, mattr);
+ const struct module_version_attribute *vattr =
+ container_of_const(mattr, struct module_version_attribute, mattr);
return scnprintf(buf, PAGE_SIZE, "%s\n", vattr->version);
}
@@ -871,7 +871,7 @@ static void __init version_sysfs_builtin(void)
int err;
for (vattr = __start___modver; vattr < __stop___modver; vattr++) {
- mk = locate_module_kobject(vattr->module_name);
+ mk = lookup_or_create_module_kobject(vattr->module_name);
if (mk) {
err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr);
WARN_ON_ONCE(err);
@@ -887,7 +887,7 @@ static ssize_t module_attr_show(struct kobject *kobj,
struct attribute *attr,
char *buf)
{
- struct module_attribute *attribute;
+ const struct module_attribute *attribute;
struct module_kobject *mk;
int ret;
@@ -906,7 +906,7 @@ static ssize_t module_attr_store(struct kobject *kobj,
struct attribute *attr,
const char *buf, size_t len)
{
- struct module_attribute *attribute;
+ const struct module_attribute *attribute;
struct module_kobject *mk;
int ret;
@@ -926,7 +926,7 @@ static const struct sysfs_ops module_sysfs_ops = {
.store = module_attr_store,
};
-static int uevent_filter(struct kobject *kobj)
+static int uevent_filter(const struct kobject *kobj)
{
const struct kobj_type *ktype = get_ktype(kobj);
@@ -940,21 +940,26 @@ static const struct kset_uevent_ops module_uevent_ops = {
};
struct kset *module_kset;
-int module_sysfs_initialized;
static void module_kobj_release(struct kobject *kobj)
{
struct module_kobject *mk = to_module_kobject(kobj);
- complete(mk->kobj_completion);
+
+ if (mk->kobj_completion)
+ complete(mk->kobj_completion);
}
-struct kobj_type module_ktype = {
+const struct kobj_type module_ktype = {
.release = module_kobj_release,
.sysfs_ops = &module_sysfs_ops,
};
/*
- * param_sysfs_init - wrapper for built-in params support
+ * param_sysfs_init - create "module" kset
+ *
+ * This must be done before the initramfs is unpacked and
+ * request_module() thus becomes possible, because otherwise the
+ * module load would fail in mod_sysfs_init.
*/
static int __init param_sysfs_init(void)
{
@@ -964,13 +969,25 @@ static int __init param_sysfs_init(void)
__FILE__, __LINE__);
return -ENOMEM;
}
- module_sysfs_initialized = 1;
+
+ return 0;
+}
+subsys_initcall(param_sysfs_init);
+
+/*
+ * param_sysfs_builtin_init - add sysfs version and parameter
+ * attributes for built-in modules
+ */
+static int __init param_sysfs_builtin_init(void)
+{
+ if (!module_kset)
+ return -ENOMEM;
version_sysfs_builtin();
param_sysfs_builtin();
return 0;
}
-subsys_initcall(param_sysfs_init);
+late_initcall(param_sysfs_builtin_init);
#endif /* CONFIG_SYSFS */
diff --git a/kernel/pid.c b/kernel/pid.c
index 2fc0a16ec77b..a31771bc89c1 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -42,6 +42,8 @@
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/idr.h>
+#include <linux/pidfs.h>
+#include <linux/seqlock.h>
#include <net/sock.h>
#include <uapi/linux/pidfd.h>
@@ -59,12 +61,8 @@ struct pid init_struct_pid = {
}, }
};
-int pid_max = PID_MAX_DEFAULT;
-
-#define RESERVED_PIDS 300
-
-int pid_max_min = RESERVED_PIDS + 1;
-int pid_max_max = PID_MAX_LIMIT;
+static int pid_max_min = RESERVED_PIDS + 1;
+static int pid_max_max = PID_MAX_LIMIT;
/*
* PID-map pages start out as NULL, they get allocated upon
@@ -73,34 +71,21 @@ int pid_max_max = PID_MAX_LIMIT;
* the scheme scales to up to 4 million PIDs, runtime.
*/
struct pid_namespace init_pid_ns = {
- .ns.count = REFCOUNT_INIT(2),
+ .ns = NS_COMMON_INIT(init_pid_ns),
.idr = IDR_INIT(init_pid_ns.idr),
.pid_allocated = PIDNS_ADDING,
.level = 0,
.child_reaper = &init_task,
.user_ns = &init_user_ns,
- .ns.inum = PROC_PID_INIT_INO,
-#ifdef CONFIG_PID_NS
- .ns.ops = &pidns_operations,
+ .pid_max = PID_MAX_DEFAULT,
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
+ .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
#endif
};
EXPORT_SYMBOL_GPL(init_pid_ns);
-/*
- * Note: disable interrupts while the pidmap_lock is held as an
- * interrupt might come in and do read_lock(&tasklist_lock).
- *
- * If we don't disable interrupts there is a nasty deadlock between
- * detach_pid()->free_pid() and another cpu that does
- * spin_lock(&pidmap_lock) followed by an interrupt routine that does
- * read_lock(&tasklist_lock);
- *
- * After we clean up the tasklist_lock and know there are no
- * irq handlers that take it we can leave the interrupts enabled.
- * For now it is easier to be safe than to prove it can't happen.
- */
-
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
+seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock);
void put_pid(struct pid *pid)
{
@@ -111,6 +96,7 @@ void put_pid(struct pid *pid)
ns = pid->numbers[pid->level].ns;
if (refcount_dec_and_test(&pid->count)) {
+ pidfs_free_pid(pid);
kmem_cache_free(ns->pid_cachep, pid);
put_pid_ns(ns);
}
@@ -125,11 +111,15 @@ static void delayed_put_pid(struct rcu_head *rhp)
void free_pid(struct pid *pid)
{
- /* We can be called with write_lock_irq(&tasklist_lock) held */
int i;
- unsigned long flags;
+ struct pid_namespace *active_ns;
+
+ lockdep_assert_not_held(&tasklist_lock);
+
+ active_ns = pid->numbers[pid->level].ns;
+ ns_ref_active_put(active_ns);
- spin_lock_irqsave(&pidmap_lock, flags);
+ spin_lock(&pidmap_lock);
for (i = 0; i <= pid->level; i++) {
struct upid *upid = pid->numbers + i;
struct pid_namespace *ns = upid->ns;
@@ -151,11 +141,24 @@ void free_pid(struct pid *pid)
idr_remove(&ns->idr, upid->nr);
}
- spin_unlock_irqrestore(&pidmap_lock, flags);
+ pidfs_remove_pid(pid);
+ spin_unlock(&pidmap_lock);
call_rcu(&pid->rcu, delayed_put_pid);
}
+void free_pids(struct pid **pids)
+{
+ int tmp;
+
+ /*
+ * This can batch pidmap_lock.
+ */
+ for (tmp = PIDTYPE_MAX; --tmp >= 0; )
+ if (pids[tmp])
+ free_pid(pids[tmp]);
+}
+
struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
size_t set_tid_size)
{
@@ -186,6 +189,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
for (i = ns->level; i >= 0; i--) {
int tid = 0;
+ int pid_max = READ_ONCE(tmp->pid_max);
if (set_tid_size) {
tid = set_tid[ns->level - i];
@@ -206,7 +210,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
}
idr_preload(GFP_KERNEL);
- spin_lock_irq(&pidmap_lock);
+ spin_lock(&pidmap_lock);
if (tid) {
nr = idr_alloc(&tmp->idr, NULL, tid,
@@ -233,7 +237,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
pid_max, GFP_ATOMIC);
}
- spin_unlock_irq(&pidmap_lock);
+ spin_unlock(&pidmap_lock);
idr_preload_end();
if (nr < 0) {
@@ -266,24 +270,29 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
INIT_HLIST_HEAD(&pid->inodes);
upid = pid->numbers + ns->level;
- spin_lock_irq(&pidmap_lock);
+ idr_preload(GFP_KERNEL);
+ spin_lock(&pidmap_lock);
if (!(ns->pid_allocated & PIDNS_ADDING))
goto out_unlock;
+ pidfs_add_pid(pid);
for ( ; upid >= pid->numbers; --upid) {
/* Make the PID visible to find_pid_ns. */
idr_replace(&upid->ns->idr, pid, upid->nr);
upid->ns->pid_allocated++;
}
- spin_unlock_irq(&pidmap_lock);
+ spin_unlock(&pidmap_lock);
+ idr_preload_end();
+ ns_ref_active_get(ns);
return pid;
out_unlock:
- spin_unlock_irq(&pidmap_lock);
+ spin_unlock(&pidmap_lock);
+ idr_preload_end();
put_pid_ns(ns);
out_free:
- spin_lock_irq(&pidmap_lock);
+ spin_lock(&pidmap_lock);
while (++i <= ns->level) {
upid = pid->numbers + i;
idr_remove(&upid->ns->idr, upid->nr);
@@ -293,7 +302,7 @@ out_free:
if (ns->pid_allocated == PIDNS_ADDING)
idr_set_cursor(&ns->idr, 0);
- spin_unlock_irq(&pidmap_lock);
+ spin_unlock(&pidmap_lock);
kmem_cache_free(ns->pid_cachep, pid);
return ERR_PTR(retval);
@@ -301,9 +310,9 @@ out_free:
void disable_pid_allocation(struct pid_namespace *ns)
{
- spin_lock_irq(&pidmap_lock);
+ spin_lock(&pidmap_lock);
ns->pid_allocated &= ~PIDNS_ADDING;
- spin_unlock_irq(&pidmap_lock);
+ spin_unlock(&pidmap_lock);
}
struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
@@ -330,17 +339,23 @@ static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type)
*/
void attach_pid(struct task_struct *task, enum pid_type type)
{
- struct pid *pid = *task_pid_ptr(task, type);
+ struct pid *pid;
+
+ lockdep_assert_held_write(&tasklist_lock);
+
+ pid = *task_pid_ptr(task, type);
hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]);
}
-static void __change_pid(struct task_struct *task, enum pid_type type,
- struct pid *new)
+static void __change_pid(struct pid **pids, struct task_struct *task,
+ enum pid_type type, struct pid *new)
{
- struct pid **pid_ptr = task_pid_ptr(task, type);
- struct pid *pid;
+ struct pid **pid_ptr, *pid;
int tmp;
+ lockdep_assert_held_write(&tasklist_lock);
+
+ pid_ptr = task_pid_ptr(task, type);
pid = *pid_ptr;
hlist_del_rcu(&task->pid_links[type]);
@@ -350,18 +365,19 @@ static void __change_pid(struct task_struct *task, enum pid_type type,
if (pid_has_task(pid, tmp))
return;
- free_pid(pid);
+ WARN_ON(pids[type]);
+ pids[type] = pid;
}
-void detach_pid(struct task_struct *task, enum pid_type type)
+void detach_pid(struct pid **pids, struct task_struct *task, enum pid_type type)
{
- __change_pid(task, type, NULL);
+ __change_pid(pids, task, type, NULL);
}
-void change_pid(struct task_struct *task, enum pid_type type,
+void change_pid(struct pid **pids, struct task_struct *task, enum pid_type type,
struct pid *pid)
{
- __change_pid(task, type, pid);
+ __change_pid(pids, task, type, pid);
attach_pid(task, type);
}
@@ -372,6 +388,8 @@ void exchange_tids(struct task_struct *left, struct task_struct *right)
struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID];
struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID];
+ lockdep_assert_held_write(&tasklist_lock);
+
/* Swap the single entry tid lists */
hlists_swap_heads_rcu(head1, head2);
@@ -388,8 +406,8 @@ void exchange_tids(struct task_struct *left, struct task_struct *right)
void transfer_pid(struct task_struct *old, struct task_struct *new,
enum pid_type type)
{
- if (type == PIDTYPE_PID)
- new->thread_pid = old->thread_pid;
+ WARN_ON_ONCE(type == PIDTYPE_PID);
+ lockdep_assert_held_write(&tasklist_lock);
hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]);
}
@@ -474,7 +492,7 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
struct upid *upid;
pid_t nr = 0;
- if (pid && ns->level <= pid->level) {
+ if (pid && ns && ns->level <= pid->level) {
upid = &pid->numbers[ns->level];
if (upid->ns == ns)
nr = upid->nr;
@@ -497,7 +515,8 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
rcu_read_lock();
if (!ns)
ns = task_active_pid_ns(current);
- nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
+ if (ns)
+ nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
rcu_read_unlock();
return nr;
@@ -519,23 +538,21 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
{
return idr_get_next(&ns->idr, &nr);
}
+EXPORT_SYMBOL_GPL(find_ge_pid);
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
{
- struct fd f;
+ CLASS(fd, f)(fd);
struct pid *pid;
- f = fdget(fd);
- if (!f.file)
+ if (fd_empty(f))
return ERR_PTR(-EBADF);
- pid = pidfd_pid(f.file);
+ pid = pidfd_pid(fd_file(f));
if (!IS_ERR(pid)) {
get_pid(pid);
- *flags = f.file->f_flags;
+ *flags = fd_file(f)->f_flags;
}
-
- fdput(f);
return pid;
}
@@ -548,25 +565,34 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
* Return the task associated with @pidfd. The function takes a reference on
* the returned task. The caller is responsible for releasing that reference.
*
- * Currently, the process identified by @pidfd is always a thread-group leader.
- * This restriction currently exists for all aspects of pidfds including pidfd
- * creation (CLONE_PIDFD cannot be used with CLONE_THREAD) and pidfd polling
- * (only supports thread group leaders).
- *
* Return: On success, the task_struct associated with the pidfd.
* On error, a negative errno number will be returned.
*/
struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
{
- unsigned int f_flags;
+ unsigned int f_flags = 0;
struct pid *pid;
struct task_struct *task;
+ enum pid_type type;
- pid = pidfd_get_pid(pidfd, &f_flags);
- if (IS_ERR(pid))
- return ERR_CAST(pid);
+ switch (pidfd) {
+ case PIDFD_SELF_THREAD:
+ type = PIDTYPE_PID;
+ pid = get_task_pid(current, type);
+ break;
+ case PIDFD_SELF_THREAD_GROUP:
+ type = PIDTYPE_TGID;
+ pid = get_task_pid(current, type);
+ break;
+ default:
+ pid = pidfd_get_pid(pidfd, &f_flags);
+ if (IS_ERR(pid))
+ return ERR_CAST(pid);
+ type = PIDTYPE_TGID;
+ break;
+ }
- task = get_pid_task(pid, PIDTYPE_TGID);
+ task = get_pid_task(pid, type);
put_pid(pid);
if (!task)
return ERR_PTR(-ESRCH);
@@ -591,36 +617,28 @@ struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
* Return: On success, a cloexec pidfd is returned.
* On error, a negative errno number will be returned.
*/
-int pidfd_create(struct pid *pid, unsigned int flags)
+static int pidfd_create(struct pid *pid, unsigned int flags)
{
- int fd;
-
- if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
- return -EINVAL;
-
- if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
- return -EINVAL;
+ int pidfd;
+ struct file *pidfd_file;
- fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
- flags | O_RDWR | O_CLOEXEC);
- if (fd < 0)
- put_pid(pid);
+ pidfd = pidfd_prepare(pid, flags, &pidfd_file);
+ if (pidfd < 0)
+ return pidfd;
- return fd;
+ fd_install(pidfd, pidfd_file);
+ return pidfd;
}
/**
- * pidfd_open() - Open new pid file descriptor.
+ * sys_pidfd_open() - Open new pid file descriptor.
*
* @pid: pid for which to retrieve a pidfd
* @flags: flags to pass
*
* This creates a new pid file descriptor with the O_CLOEXEC flag set for
- * the process identified by @pid. Currently, the process identified by
- * @pid must be a thread-group leader. This restriction currently exists
- * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
- * be used with CLONE_THREAD) and pidfd polling (only supports thread group
- * leaders).
+ * the task identified by @pid. Without PIDFD_THREAD flag the target task
+ * must be a thread-group leader.
*
* Return: On success, a cloexec pidfd is returned.
* On error, a negative errno number will be returned.
@@ -630,7 +648,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
int fd;
struct pid *p;
- if (flags & ~PIDFD_NONBLOCK)
+ if (flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD))
return -EINVAL;
if (pid <= 0)
@@ -646,23 +664,168 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
return fd;
}
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_set *pid_table_root_lookup(struct ctl_table_root *root)
+{
+ return &task_active_pid_ns(current)->set;
+}
+
+static int set_is_seen(struct ctl_table_set *set)
+{
+ return &task_active_pid_ns(current)->set == set;
+}
+
+static int pid_table_root_permissions(struct ctl_table_header *head,
+ const struct ctl_table *table)
+{
+ struct pid_namespace *pidns =
+ container_of(head->set, struct pid_namespace, set);
+ int mode = table->mode;
+
+ if (ns_capable_noaudit(pidns->user_ns, CAP_SYS_ADMIN) ||
+ uid_eq(current_euid(), make_kuid(pidns->user_ns, 0)))
+ mode = (mode & S_IRWXU) >> 6;
+ else if (in_egroup_p(make_kgid(pidns->user_ns, 0)))
+ mode = (mode & S_IRWXG) >> 3;
+ else
+ mode = mode & S_IROTH;
+ return (mode << 6) | (mode << 3) | mode;
+}
+
+static void pid_table_root_set_ownership(struct ctl_table_header *head,
+ kuid_t *uid, kgid_t *gid)
+{
+ struct pid_namespace *pidns =
+ container_of(head->set, struct pid_namespace, set);
+ kuid_t ns_root_uid;
+ kgid_t ns_root_gid;
+
+ ns_root_uid = make_kuid(pidns->user_ns, 0);
+ if (uid_valid(ns_root_uid))
+ *uid = ns_root_uid;
+
+ ns_root_gid = make_kgid(pidns->user_ns, 0);
+ if (gid_valid(ns_root_gid))
+ *gid = ns_root_gid;
+}
+
+static struct ctl_table_root pid_table_root = {
+ .lookup = pid_table_root_lookup,
+ .permissions = pid_table_root_permissions,
+ .set_ownership = pid_table_root_set_ownership,
+};
+
+static int proc_do_cad_pid(const struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct pid *new_pid;
+ pid_t tmp_pid;
+ int r;
+ struct ctl_table tmp_table = *table;
+
+ tmp_pid = pid_vnr(cad_pid);
+ tmp_table.data = &tmp_pid;
+
+ r = proc_dointvec(&tmp_table, write, buffer, lenp, ppos);
+ if (r || !write)
+ return r;
+
+ new_pid = find_get_pid(tmp_pid);
+ if (!new_pid)
+ return -ESRCH;
+
+ put_pid(xchg(&cad_pid, new_pid));
+ return 0;
+}
+
+static const struct ctl_table pid_table[] = {
+ {
+ .procname = "pid_max",
+ .data = &init_pid_ns.pid_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &pid_max_min,
+ .extra2 = &pid_max_max,
+ },
+#ifdef CONFIG_PROC_SYSCTL
+ {
+ .procname = "cad_pid",
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = proc_do_cad_pid,
+ },
+#endif
+};
+#endif
+
+int register_pidns_sysctls(struct pid_namespace *pidns)
+{
+#ifdef CONFIG_SYSCTL
+ struct ctl_table *tbl;
+
+ setup_sysctl_set(&pidns->set, &pid_table_root, set_is_seen);
+
+ tbl = kmemdup(pid_table, sizeof(pid_table), GFP_KERNEL);
+ if (!tbl)
+ return -ENOMEM;
+ tbl->data = &pidns->pid_max;
+ pidns->pid_max = min(pid_max_max, max_t(int, pidns->pid_max,
+ PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
+
+ pidns->sysctls = __register_sysctl_table(&pidns->set, "kernel", tbl,
+ ARRAY_SIZE(pid_table));
+ if (!pidns->sysctls) {
+ kfree(tbl);
+ retire_sysctl_set(&pidns->set);
+ return -ENOMEM;
+ }
+#endif
+ return 0;
+}
+
+void unregister_pidns_sysctls(struct pid_namespace *pidns)
+{
+#ifdef CONFIG_SYSCTL
+ const struct ctl_table *tbl;
+
+ tbl = pidns->sysctls->ctl_table_arg;
+ unregister_sysctl_table(pidns->sysctls);
+ retire_sysctl_set(&pidns->set);
+ kfree(tbl);
+#endif
+}
+
void __init pid_idr_init(void)
{
/* Verify no one has done anything silly: */
BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
/* bump default and minimum pid_max based on number of cpus */
- pid_max = min(pid_max_max, max_t(int, pid_max,
- PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
+ init_pid_ns.pid_max = min(pid_max_max, max_t(int, init_pid_ns.pid_max,
+ PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
pid_max_min = max_t(int, pid_max_min,
PIDS_PER_CPU_MIN * num_possible_cpus());
- pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
+ pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max, pid_max_min);
idr_init(&init_pid_ns.idr);
- init_pid_ns.pid_cachep = KMEM_CACHE(pid,
- SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
+ init_pid_ns.pid_cachep = kmem_cache_create("pid",
+ struct_size_t(struct pid, numbers, 1),
+ __alignof__(struct pid),
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
+ NULL);
+}
+
+static __init int pid_namespace_sysctl_init(void)
+{
+#ifdef CONFIG_SYSCTL
+ /* "kernel" directory will have already been initialized. */
+ BUG_ON(register_pidns_sysctls(&init_pid_ns));
+#endif
+ return 0;
}
+subsys_initcall(pid_namespace_sysctl_init);
static struct file *__pidfd_fget(struct task_struct *task, int fd)
{
@@ -680,7 +843,26 @@ static struct file *__pidfd_fget(struct task_struct *task, int fd)
up_read(&task->signal->exec_update_lock);
- return file ?: ERR_PTR(-EBADF);
+ if (!file) {
+ /*
+ * It is possible that the target thread is exiting; it can be
+ * either:
+ * 1. before exit_signals(), which gives a real fd
+ * 2. before exit_files() takes the task_lock() gives a real fd
+ * 3. after exit_files() releases task_lock(), ->files is NULL;
+ * this has PF_EXITING, since it was set in exit_signals(),
+ * __pidfd_fget() returns EBADF.
+ * In case 3 we get EBADF, but that really means ESRCH, since
+ * the task is currently exiting and has freed its files
+ * struct, so we fix it up.
+ */
+ if (task->flags & PF_EXITING)
+ file = ERR_PTR(-ESRCH);
+ else
+ file = ERR_PTR(-EBADF);
+ }
+
+ return file;
}
static int pidfd_getfd(struct pid *pid, int fd)
@@ -698,7 +880,7 @@ static int pidfd_getfd(struct pid *pid, int fd)
if (IS_ERR(file))
return PTR_ERR(file);
- ret = receive_fd(file, O_CLOEXEC);
+ ret = receive_fd(file, NULL, O_CLOEXEC);
fput(file);
return ret;
@@ -724,23 +906,18 @@ SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd,
unsigned int, flags)
{
struct pid *pid;
- struct fd f;
- int ret;
/* flags is currently unused - make sure it's unset */
if (flags)
return -EINVAL;
- f = fdget(pidfd);
- if (!f.file)
+ CLASS(fd, f)(pidfd);
+ if (fd_empty(f))
return -EBADF;
- pid = pidfd_pid(f.file);
+ pid = pidfd_pid(fd_file(f));
if (IS_ERR(pid))
- ret = PTR_ERR(pid);
- else
- ret = pidfd_getfd(pid, fd);
+ return PTR_ERR(pid);
- fdput(f);
- return ret;
+ return pidfd_getfd(pid, fd);
}
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a46a3723bc66..e48f5de41361 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -23,6 +23,9 @@
#include <linux/sched/task.h>
#include <linux/sched/signal.h>
#include <linux/idr.h>
+#include <linux/nstree.h>
+#include <uapi/linux/wait.h>
+#include "pid_sysctl.h"
static DEFINE_MUTEX(pid_caches_mutex);
static struct kmem_cache *pid_ns_cachep;
@@ -47,12 +50,12 @@ static struct kmem_cache *create_pid_cachep(unsigned int level)
return kc;
snprintf(name, sizeof(name), "pid_%u", level + 1);
- len = sizeof(struct pid) + level * sizeof(struct upid);
+ len = struct_size_t(struct pid, numbers, level + 1);
mutex_lock(&pid_caches_mutex);
/* Name collision forces to do allocation under mutex. */
if (!*pkc)
*pkc = kmem_cache_create(name, len, 0,
- SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, 0);
+ SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
mutex_unlock(&pid_caches_mutex);
/* current can fail, but someone else can succeed. */
return READ_ONCE(*pkc);
@@ -68,6 +71,8 @@ static void dec_pid_namespaces(struct ucounts *ucounts)
dec_ucount(ucounts, UCOUNT_PID_NAMESPACES);
}
+static void destroy_pid_namespace_work(struct work_struct *work);
+
static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
struct pid_namespace *parent_pid_ns)
{
@@ -98,20 +103,31 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
if (ns->pid_cachep == NULL)
goto out_free_idr;
- err = ns_alloc_inum(&ns->ns);
+ err = ns_common_init(ns);
if (err)
goto out_free_idr;
- ns->ns.ops = &pidns_operations;
- refcount_set(&ns->ns.count, 1);
+ ns->pid_max = PID_MAX_LIMIT;
+ err = register_pidns_sysctls(ns);
+ if (err)
+ goto out_free_inum;
+
ns->level = level;
ns->parent = get_pid_ns(parent_pid_ns);
ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts;
ns->pid_allocated = PIDNS_ADDING;
+ INIT_WORK(&ns->work, destroy_pid_namespace_work);
+
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
+ ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
+#endif
+ ns_tree_add(ns);
return ns;
+out_free_inum:
+ ns_common_free(ns);
out_free_idr:
idr_destroy(&ns->idr);
kmem_cache_free(pid_ns_cachep, ns);
@@ -133,13 +149,30 @@ static void delayed_free_pidns(struct rcu_head *p)
static void destroy_pid_namespace(struct pid_namespace *ns)
{
- ns_free_inum(&ns->ns);
+ ns_tree_remove(ns);
+ unregister_pidns_sysctls(ns);
+
+ ns_common_free(ns);
idr_destroy(&ns->idr);
call_rcu(&ns->rcu, delayed_free_pidns);
}
-struct pid_namespace *copy_pid_ns(unsigned long flags,
+static void destroy_pid_namespace_work(struct work_struct *work)
+{
+ struct pid_namespace *ns =
+ container_of(work, struct pid_namespace, work);
+
+ do {
+ struct pid_namespace *parent;
+
+ parent = ns->parent;
+ destroy_pid_namespace(ns);
+ ns = parent;
+ } while (ns != &init_pid_ns && ns_ref_put(ns));
+}
+
+struct pid_namespace *copy_pid_ns(u64 flags,
struct user_namespace *user_ns, struct pid_namespace *old_ns)
{
if (!(flags & CLONE_NEWPID))
@@ -151,15 +184,8 @@ struct pid_namespace *copy_pid_ns(unsigned long flags,
void put_pid_ns(struct pid_namespace *ns)
{
- struct pid_namespace *parent;
-
- while (ns != &init_pid_ns) {
- parent = ns->parent;
- if (!refcount_dec_and_test(&ns->ns.count))
- break;
- destroy_pid_namespace(ns);
- ns = parent;
- }
+ if (ns && ns_ref_put(ns))
+ schedule_work(&ns->work);
}
EXPORT_SYMBOL_GPL(put_pid_ns);
@@ -214,6 +240,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
*/
do {
clear_thread_flag(TIF_SIGPENDING);
+ clear_thread_flag(TIF_NOTIFY_SIGNAL);
rc = kernel_wait4(-1, NULL, __WALL, NULL);
} while (rc != -ECHILD);
@@ -256,7 +283,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
}
#ifdef CONFIG_CHECKPOINT_RESTORE
-static int pid_ns_ctl_handler(struct ctl_table *table, int write,
+static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct pid_namespace *pid_ns = task_active_pid_ns(current);
@@ -266,15 +293,10 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns))
return -EPERM;
- /*
- * Writing directly to ns' last_pid field is OK, since this field
- * is volatile in a living namespace anyway and a code writing to
- * it should synchronize its usage with external means.
- */
-
next = idr_get_cursor(&pid_ns->idr) - 1;
tmp.data = &next;
+ tmp.extra2 = &pid_ns->pid_max;
ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
if (!ret && write)
idr_set_cursor(&pid_ns->idr, next + 1);
@@ -282,19 +304,16 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
return ret;
}
-extern int pid_max;
-static struct ctl_table pid_ns_ctl_table[] = {
+static const struct ctl_table pid_ns_ctl_table[] = {
{
.procname = "ns_last_pid",
.maxlen = sizeof(int),
.mode = 0666, /* permissions are checked in the handler */
.proc_handler = pid_ns_ctl_handler,
.extra1 = SYSCTL_ZERO,
- .extra2 = &pid_max,
+ .extra2 = &init_pid_ns.pid_max,
},
- { }
};
-static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
#endif /* CONFIG_CHECKPOINT_RESTORE */
int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
@@ -326,11 +345,6 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
return 0;
}
-static inline struct pid_namespace *to_pid_ns(struct ns_common *ns)
-{
- return container_of(ns, struct pid_namespace, ns);
-}
-
static struct ns_common *pidns_get(struct task_struct *task)
{
struct pid_namespace *ns;
@@ -372,11 +386,23 @@ static void pidns_put(struct ns_common *ns)
put_pid_ns(to_pid_ns(ns));
}
+bool pidns_is_ancestor(struct pid_namespace *child,
+ struct pid_namespace *ancestor)
+{
+ struct pid_namespace *ns;
+
+ if (child->level < ancestor->level)
+ return false;
+ for (ns = child; ns->level > ancestor->level; ns = ns->parent)
+ ;
+ return ns == ancestor;
+}
+
static int pidns_install(struct nsset *nsset, struct ns_common *ns)
{
struct nsproxy *nsproxy = nsset->nsproxy;
struct pid_namespace *active = task_active_pid_ns(current);
- struct pid_namespace *ancestor, *new = to_pid_ns(ns);
+ struct pid_namespace *new = to_pid_ns(ns);
if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
@@ -390,13 +416,7 @@ static int pidns_install(struct nsset *nsset, struct ns_common *ns)
* this maintains the property that processes and their
* children can not escape their current pid namespace.
*/
- if (new->level < active->level)
- return -EINVAL;
-
- ancestor = new;
- while (ancestor->level > active->level)
- ancestor = ancestor->parent;
- if (ancestor != active)
+ if (!pidns_is_ancestor(new, active))
return -EINVAL;
put_pid_ns(nsproxy->pid_ns_for_children);
@@ -429,7 +449,6 @@ static struct user_namespace *pidns_owner(struct ns_common *ns)
const struct proc_ns_operations pidns_operations = {
.name = "pid",
- .type = CLONE_NEWPID,
.get = pidns_get,
.put = pidns_put,
.install = pidns_install,
@@ -440,7 +459,6 @@ const struct proc_ns_operations pidns_operations = {
const struct proc_ns_operations pidns_for_children_operations = {
.name = "pid_for_children",
.real_ns_name = "pid",
- .type = CLONE_NEWPID,
.get = pidns_for_children_get,
.put = pidns_put,
.install = pidns_install,
@@ -453,8 +471,11 @@ static __init int pid_namespaces_init(void)
pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT);
#ifdef CONFIG_CHECKPOINT_RESTORE
- register_sysctl_paths(kern_path, pid_ns_ctl_table);
+ register_sysctl_init("kernel", pid_ns_ctl_table);
#endif
+
+ register_pid_ns_sysctl_table_vm();
+ ns_tree_add(&init_pid_ns);
return 0;
}
diff --git a/kernel/pid_sysctl.h b/kernel/pid_sysctl.h
new file mode 100644
index 000000000000..5d8f981de7c5
--- /dev/null
+++ b/kernel/pid_sysctl.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef LINUX_PID_SYSCTL_H
+#define LINUX_PID_SYSCTL_H
+
+#include <linux/pid_namespace.h>
+
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
+static int pid_mfd_noexec_dointvec_minmax(const struct ctl_table *table,
+ int write, void *buf, size_t *lenp, loff_t *ppos)
+{
+ struct pid_namespace *ns = task_active_pid_ns(current);
+ struct ctl_table table_copy;
+ int err, scope, parent_scope;
+
+ if (write && !ns_capable(ns->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ table_copy = *table;
+
+ /* You cannot set a lower enforcement value than your parent. */
+ parent_scope = pidns_memfd_noexec_scope(ns->parent);
+ /* Equivalent to pidns_memfd_noexec_scope(ns). */
+ scope = max(READ_ONCE(ns->memfd_noexec_scope), parent_scope);
+
+ table_copy.data = &scope;
+ table_copy.extra1 = &parent_scope;
+
+ err = proc_dointvec_minmax(&table_copy, write, buf, lenp, ppos);
+ if (!err && write)
+ WRITE_ONCE(ns->memfd_noexec_scope, scope);
+ return err;
+}
+
+static const struct ctl_table pid_ns_ctl_table_vm[] = {
+ {
+ .procname = "memfd_noexec",
+ .data = &init_pid_ns.memfd_noexec_scope,
+ .maxlen = sizeof(init_pid_ns.memfd_noexec_scope),
+ .mode = 0644,
+ .proc_handler = pid_mfd_noexec_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+};
+static inline void register_pid_ns_sysctl_table_vm(void)
+{
+ register_sysctl("vm", pid_ns_ctl_table_vm);
+}
+#else
+static inline void register_pid_ns_sysctl_table_vm(void) {}
+#endif
+
+#endif /* LINUX_PID_SYSCTL_H */
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index a12779650f15..05337f437cca 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -39,9 +39,9 @@ config HIBERNATION
bool "Hibernation (aka 'suspend to disk')"
depends on SWAP && ARCH_HIBERNATION_POSSIBLE
select HIBERNATE_CALLBACKS
- select LZO_COMPRESS
- select LZO_DECOMPRESS
select CRC32
+ select CRYPTO
+ select CRYPTO_LZO
help
Enable the suspend to disk (STD) functionality, which is usually
called "hibernation" in user interfaces. STD checkpoints the
@@ -92,6 +92,28 @@ config HIBERNATION_SNAPSHOT_DEV
If in doubt, say Y.
+choice
+ prompt "Default compressor"
+ default HIBERNATION_COMP_LZO
+ depends on HIBERNATION
+
+config HIBERNATION_COMP_LZO
+ bool "lzo"
+ depends on CRYPTO_LZO
+
+config HIBERNATION_COMP_LZ4
+ bool "lz4"
+ depends on CRYPTO_LZ4
+
+endchoice
+
+config HIBERNATION_DEF_COMP
+ string
+ default "lzo" if HIBERNATION_COMP_LZO
+ default "lz4" if HIBERNATION_COMP_LZ4
+ help
+ Default compressor to be used for hibernation.
+
config PM_STD_PARTITION
string "Default resume partition"
depends on HIBERNATION
@@ -118,7 +140,6 @@ config PM_SLEEP
def_bool y
depends on SUSPEND || HIBERNATE_CALLBACKS
select PM
- select SRCU
config PM_SLEEP_SMP
def_bool y
@@ -143,6 +164,26 @@ config PM_AUTOSLEEP
Allow the kernel to trigger a system transition into a global sleep
state automatically whenever there are no active wakeup sources.
+config PM_USERSPACE_AUTOSLEEP
+ bool "Userspace opportunistic sleep"
+ depends on PM_SLEEP
+ help
+ Notify kernel of aggressive userspace autosleep power management policy.
+
+ This option changes the behavior of various sleep-sensitive code to deal
+ with frequent userspace-initiated transitions into a global sleep state.
+
+ Saying Y here, disables code paths that most users really should keep
+ enabled. In particular, only enable this if it is very common to be
+ asleep/awake for very short periods of time (<= 2 seconds).
+
+ Only platforms, such as Android, that implement opportunistic sleep from
+ a userspace power manager service should enable this option; and not
+ other machines. Therefore, you should say N here, unless you are
+ extremely certain that this is what you want. The option otherwise has
+ bad, undesirable effects, and should not be enabled just for fun.
+
+
config PM_WAKELOCKS
bool "User space wakeup sources interface"
depends on PM_SLEEP
@@ -161,6 +202,17 @@ config PM_WAKELOCKS_GC
depends on PM_WAKELOCKS
default y
+config PM_QOS_CPU_SYSTEM_WAKEUP
+ bool "User space interface for CPU system wakeup QoS"
+ depends on CPU_IDLE
+ help
+ Enable this to allow user space via the cpu_wakeup_latency file to
+ specify a CPU system wakeup latency limit.
+
+ This may be particularly useful for platforms supporting multiple low
+ power states for CPUs during system-wide suspend and s2idle in
+ particular.
+
config PM
bool "Device power management core functionality"
help
@@ -216,11 +268,30 @@ config DPM_WATCHDOG
boot session.
config DPM_WATCHDOG_TIMEOUT
- int "Watchdog timeout in seconds"
+ int "Watchdog timeout to panic in seconds"
range 1 120
default 120
depends on DPM_WATCHDOG
+config DPM_WATCHDOG_WARNING_TIMEOUT
+ int "Watchdog timeout to warn in seconds"
+ range 1 DPM_WATCHDOG_TIMEOUT
+ default DPM_WATCHDOG_TIMEOUT
+ depends on DPM_WATCHDOG
+ help
+ If the DPM watchdog warning timeout and main timeout are
+ different then a non-fatal warning (with a stack trace of
+ the stuck suspend routine) will be printed when the warning
+ timeout expires. If the suspend routine gets un-stuck
+ before the main timeout expires then no other action is
+ taken. If the routine continues to be stuck and the main
+ timeout expires then an emergency-level message and stack
+ trace will be printed and the system will panic.
+
+ If the warning timeout is equal to the main timeout (the
+ default) then the warning will never happen and the system
+ will jump straight to panic when the main timeout expires.
+
config PM_TRACE
bool
help
@@ -320,8 +391,7 @@ config CPU_PM
config ENERGY_MODEL
bool "Energy Model for devices with DVFS (CPUs, GPUs, etc)"
- depends on SMP
- depends on CPU_FREQ
+ depends on CPU_FREQ || PM_DEVFREQ
help
Several subsystems (thermal and/or the task scheduler for example)
can leverage information about the energy consumed by devices to
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 5899260a8bef..773e2789412b 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,6 +1,10 @@
# SPDX-License-Identifier: GPL-2.0
-ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
+ifeq ($(CONFIG_DYNAMIC_DEBUG), y)
+CFLAGS_swap.o := -DDEBUG
+CFLAGS_snapshot.o := -DDEBUG
+CFLAGS_energy_model.o := -DDEBUG
+endif
KASAN_SANITIZE_snapshot.o := n
@@ -17,4 +21,6 @@ obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o
obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
-obj-$(CONFIG_ENERGY_MODEL) += energy_model.o
+obj-$(CONFIG_ENERGY_MODEL) += em.o
+em-y := energy_model.o
+em-$(CONFIG_NET) += em_netlink_autogen.o em_netlink.o
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
index b29c8aca7486..865df641b97c 100644
--- a/kernel/power/autosleep.c
+++ b/kernel/power/autosleep.c
@@ -9,7 +9,6 @@
#include <linux/device.h>
#include <linux/mutex.h>
-#include <linux/pm_wakeup.h>
#include "power.h"
diff --git a/kernel/power/console.c b/kernel/power/console.c
index fcdf0e14a47d..a906a0ac0f9b 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -16,6 +16,7 @@
#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
static int orig_fgconsole, orig_kmsg;
+static bool vt_switch_done;
static DEFINE_MUTEX(vt_switch_mutex);
@@ -43,9 +44,10 @@ static LIST_HEAD(pm_vt_switch_list);
* no_console_suspend argument has been passed on the command line, VT
* switches will occur.
*/
-void pm_vt_switch_required(struct device *dev, bool required)
+int pm_vt_switch_required(struct device *dev, bool required)
{
struct pm_vt_switch *entry, *tmp;
+ int ret = 0;
mutex_lock(&vt_switch_mutex);
list_for_each_entry(tmp, &pm_vt_switch_list, head) {
@@ -57,8 +59,10 @@ void pm_vt_switch_required(struct device *dev, bool required)
}
entry = kmalloc(sizeof(*entry), GFP_KERNEL);
- if (!entry)
+ if (!entry) {
+ ret = -ENOMEM;
goto out;
+ }
entry->required = required;
entry->dev = dev;
@@ -66,6 +70,7 @@ void pm_vt_switch_required(struct device *dev, bool required)
list_add(&entry->head, &pm_vt_switch_list);
out:
mutex_unlock(&vt_switch_mutex);
+ return ret;
}
EXPORT_SYMBOL(pm_vt_switch_required);
@@ -136,17 +141,21 @@ void pm_prepare_console(void)
if (orig_fgconsole < 0)
return;
+ vt_switch_done = true;
+
orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE);
return;
}
void pm_restore_console(void)
{
- if (!pm_vt_switch())
+ if (!pm_vt_switch() && !vt_switch_done)
return;
if (orig_fgconsole >= 0) {
vt_move_to_console(orig_fgconsole, 0);
vt_kmsg_redirect(orig_kmsg);
}
+
+ vt_switch_done = false;
}
diff --git a/kernel/power/em_netlink.c b/kernel/power/em_netlink.c
new file mode 100644
index 000000000000..4b85da138a06
--- /dev/null
+++ b/kernel/power/em_netlink.c
@@ -0,0 +1,308 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *
+ * Generic netlink for energy model.
+ *
+ * Copyright (c) 2025 Valve Corporation.
+ * Author: Changwoo Min <changwoo@igalia.com>
+ */
+
+#define pr_fmt(fmt) "energy_model: " fmt
+
+#include <linux/energy_model.h>
+#include <net/sock.h>
+#include <net/genetlink.h>
+#include <uapi/linux/energy_model.h>
+
+#include "em_netlink.h"
+#include "em_netlink_autogen.h"
+
+#define EM_A_PD_CPUS_LEN 256
+
+/*************************** Command encoding ********************************/
+static int __em_nl_get_pd_size(struct em_perf_domain *pd, void *data)
+{
+ char cpus_buf[EM_A_PD_CPUS_LEN];
+ int *tot_msg_sz = data;
+ int msg_sz, cpus_sz;
+
+ cpus_sz = snprintf(cpus_buf, sizeof(cpus_buf), "%*pb",
+ cpumask_pr_args(to_cpumask(pd->cpus)));
+
+ msg_sz = nla_total_size(0) + /* EM_A_PDS_PD */
+ nla_total_size(sizeof(u32)) + /* EM_A_PD_PD_ID */
+ nla_total_size_64bit(sizeof(u64)) + /* EM_A_PD_FLAGS */
+ nla_total_size(cpus_sz); /* EM_A_PD_CPUS */
+
+ *tot_msg_sz += nlmsg_total_size(genlmsg_msg_size(msg_sz));
+ return 0;
+}
+
+static int __em_nl_get_pd(struct em_perf_domain *pd, void *data)
+{
+ char cpus_buf[EM_A_PD_CPUS_LEN];
+ struct sk_buff *msg = data;
+ struct nlattr *entry;
+
+ entry = nla_nest_start(msg, EM_A_PDS_PD);
+ if (!entry)
+ goto out_cancel_nest;
+
+ if (nla_put_u32(msg, EM_A_PD_PD_ID, pd->id))
+ goto out_cancel_nest;
+
+ if (nla_put_u64_64bit(msg, EM_A_PD_FLAGS, pd->flags, EM_A_PD_PAD))
+ goto out_cancel_nest;
+
+ snprintf(cpus_buf, sizeof(cpus_buf), "%*pb",
+ cpumask_pr_args(to_cpumask(pd->cpus)));
+ if (nla_put_string(msg, EM_A_PD_CPUS, cpus_buf))
+ goto out_cancel_nest;
+
+ nla_nest_end(msg, entry);
+
+ return 0;
+
+out_cancel_nest:
+ nla_nest_cancel(msg, entry);
+
+ return -EMSGSIZE;
+}
+
+int em_nl_get_pds_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct sk_buff *msg;
+ void *hdr;
+ int cmd = info->genlhdr->cmd;
+ int ret = -EMSGSIZE, msg_sz = 0;
+
+ for_each_em_perf_domain(__em_nl_get_pd_size, &msg_sz);
+
+ msg = genlmsg_new(msg_sz, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put_reply(msg, info, &em_nl_family, 0, cmd);
+ if (!hdr)
+ goto out_free_msg;
+
+ ret = for_each_em_perf_domain(__em_nl_get_pd, msg);
+ if (ret)
+ goto out_cancel_msg;
+
+ genlmsg_end(msg, hdr);
+
+ return genlmsg_reply(msg, info);
+
+out_cancel_msg:
+ genlmsg_cancel(msg, hdr);
+out_free_msg:
+ nlmsg_free(msg);
+
+ return ret;
+}
+
+static struct em_perf_domain *__em_nl_get_pd_table_id(struct nlattr **attrs)
+{
+ struct em_perf_domain *pd;
+ int id;
+
+ if (!attrs[EM_A_PD_TABLE_PD_ID])
+ return NULL;
+
+ id = nla_get_u32(attrs[EM_A_PD_TABLE_PD_ID]);
+ pd = em_perf_domain_get_by_id(id);
+ return pd;
+}
+
+static int __em_nl_get_pd_table_size(const struct em_perf_domain *pd)
+{
+ int id_sz, ps_sz;
+
+ id_sz = nla_total_size(sizeof(u32)); /* EM_A_PD_TABLE_PD_ID */
+ ps_sz = nla_total_size(0) + /* EM_A_PD_TABLE_PS */
+ nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_PERFORMANCE */
+ nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_FREQUENCY */
+ nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_POWER */
+ nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_COST */
+ nla_total_size_64bit(sizeof(u64)); /* EM_A_PS_FLAGS */
+ ps_sz *= pd->nr_perf_states;
+
+ return nlmsg_total_size(genlmsg_msg_size(id_sz + ps_sz));
+}
+
+static int __em_nl_get_pd_table(struct sk_buff *msg, const struct em_perf_domain *pd)
+{
+ struct em_perf_state *table, *ps;
+ struct nlattr *entry;
+ int i;
+
+ if (nla_put_u32(msg, EM_A_PD_TABLE_PD_ID, pd->id))
+ goto out_err;
+
+ rcu_read_lock();
+ table = em_perf_state_from_pd((struct em_perf_domain *)pd);
+
+ for (i = 0; i < pd->nr_perf_states; i++) {
+ ps = &table[i];
+
+ entry = nla_nest_start(msg, EM_A_PD_TABLE_PS);
+ if (!entry)
+ goto out_unlock_ps;
+
+ if (nla_put_u64_64bit(msg, EM_A_PS_PERFORMANCE,
+ ps->performance, EM_A_PS_PAD))
+ goto out_cancel_ps_nest;
+ if (nla_put_u64_64bit(msg, EM_A_PS_FREQUENCY,
+ ps->frequency, EM_A_PS_PAD))
+ goto out_cancel_ps_nest;
+ if (nla_put_u64_64bit(msg, EM_A_PS_POWER,
+ ps->power, EM_A_PS_PAD))
+ goto out_cancel_ps_nest;
+ if (nla_put_u64_64bit(msg, EM_A_PS_COST,
+ ps->cost, EM_A_PS_PAD))
+ goto out_cancel_ps_nest;
+ if (nla_put_u64_64bit(msg, EM_A_PS_FLAGS,
+ ps->flags, EM_A_PS_PAD))
+ goto out_cancel_ps_nest;
+
+ nla_nest_end(msg, entry);
+ }
+ rcu_read_unlock();
+ return 0;
+
+out_cancel_ps_nest:
+ nla_nest_cancel(msg, entry);
+out_unlock_ps:
+ rcu_read_unlock();
+out_err:
+ return -EMSGSIZE;
+}
+
+int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ int cmd = info->genlhdr->cmd;
+ int msg_sz, ret = -EMSGSIZE;
+ struct em_perf_domain *pd;
+ struct sk_buff *msg;
+ void *hdr;
+
+ pd = __em_nl_get_pd_table_id(info->attrs);
+ if (!pd)
+ return -EINVAL;
+
+ msg_sz = __em_nl_get_pd_table_size(pd);
+
+ msg = genlmsg_new(msg_sz, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put_reply(msg, info, &em_nl_family, 0, cmd);
+ if (!hdr)
+ goto out_free_msg;
+
+ ret = __em_nl_get_pd_table(msg, pd);
+ if (ret)
+ goto out_free_msg;
+
+ genlmsg_end(msg, hdr);
+ return genlmsg_reply(msg, info);
+
+out_free_msg:
+ nlmsg_free(msg);
+ return ret;
+}
+
+
+/**************************** Event encoding *********************************/
+static void __em_notify_pd_table(const struct em_perf_domain *pd, int ntf_type)
+{
+ struct sk_buff *msg;
+ int msg_sz, ret = -EMSGSIZE;
+ void *hdr;
+
+ if (!genl_has_listeners(&em_nl_family, &init_net, EM_NLGRP_EVENT))
+ return;
+
+ msg_sz = __em_nl_get_pd_table_size(pd);
+
+ msg = genlmsg_new(msg_sz, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ hdr = genlmsg_put(msg, 0, 0, &em_nl_family, 0, ntf_type);
+ if (!hdr)
+ goto out_free_msg;
+
+ ret = __em_nl_get_pd_table(msg, pd);
+ if (ret)
+ goto out_free_msg;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast(&em_nl_family, msg, 0, EM_NLGRP_EVENT, GFP_KERNEL);
+
+ return;
+
+out_free_msg:
+ nlmsg_free(msg);
+ return;
+}
+
+void em_notify_pd_created(const struct em_perf_domain *pd)
+{
+ __em_notify_pd_table(pd, EM_CMD_PD_CREATED);
+}
+
+void em_notify_pd_updated(const struct em_perf_domain *pd)
+{
+ __em_notify_pd_table(pd, EM_CMD_PD_UPDATED);
+}
+
+static int __em_notify_pd_deleted_size(const struct em_perf_domain *pd)
+{
+ int id_sz = nla_total_size(sizeof(u32)); /* EM_A_PD_TABLE_PD_ID */
+
+ return nlmsg_total_size(genlmsg_msg_size(id_sz));
+}
+
+void em_notify_pd_deleted(const struct em_perf_domain *pd)
+{
+ struct sk_buff *msg;
+ void *hdr;
+ int msg_sz;
+
+ if (!genl_has_listeners(&em_nl_family, &init_net, EM_NLGRP_EVENT))
+ return;
+
+ msg_sz = __em_notify_pd_deleted_size(pd);
+
+ msg = genlmsg_new(msg_sz, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ hdr = genlmsg_put(msg, 0, 0, &em_nl_family, 0, EM_CMD_PD_DELETED);
+ if (!hdr)
+ goto out_free_msg;
+
+ if (nla_put_u32(msg, EM_A_PD_TABLE_PD_ID, pd->id)) {
+ goto out_free_msg;
+ }
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast(&em_nl_family, msg, 0, EM_NLGRP_EVENT, GFP_KERNEL);
+
+ return;
+
+out_free_msg:
+ nlmsg_free(msg);
+ return;
+}
+
+/**************************** Initialization *********************************/
+static int __init em_netlink_init(void)
+{
+ return genl_register_family(&em_nl_family);
+}
+postcore_initcall(em_netlink_init);
diff --git a/kernel/power/em_netlink.h b/kernel/power/em_netlink.h
new file mode 100644
index 000000000000..583d7f1c3939
--- /dev/null
+++ b/kernel/power/em_netlink.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ *
+ * Generic netlink for energy model.
+ *
+ * Copyright (c) 2025 Valve Corporation.
+ * Author: Changwoo Min <changwoo@igalia.com>
+ */
+#ifndef _EM_NETLINK_H
+#define _EM_NETLINK_H
+
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_NET)
+int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *),
+ void *data);
+struct em_perf_domain *em_perf_domain_get_by_id(int id);
+void em_notify_pd_created(const struct em_perf_domain *pd);
+void em_notify_pd_deleted(const struct em_perf_domain *pd);
+void em_notify_pd_updated(const struct em_perf_domain *pd);
+#else
+static inline
+int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *),
+ void *data)
+{
+ return -EINVAL;
+}
+static inline
+struct em_perf_domain *em_perf_domain_get_by_id(int id)
+{
+ return NULL;
+}
+
+static inline void em_notify_pd_created(const struct em_perf_domain *pd) {}
+
+static inline void em_notify_pd_deleted(const struct em_perf_domain *pd) {}
+
+static inline void em_notify_pd_updated(const struct em_perf_domain *pd) {}
+#endif
+
+#endif /* _EM_NETLINK_H */
diff --git a/kernel/power/em_netlink_autogen.c b/kernel/power/em_netlink_autogen.c
new file mode 100644
index 000000000000..a7a09ab1d1c2
--- /dev/null
+++ b/kernel/power/em_netlink_autogen.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/em.yaml */
+/* YNL-GEN kernel source */
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include "em_netlink_autogen.h"
+
+#include <uapi/linux/energy_model.h>
+
+/* EM_CMD_GET_PD_TABLE - do */
+static const struct nla_policy em_get_pd_table_nl_policy[EM_A_PD_TABLE_PD_ID + 1] = {
+ [EM_A_PD_TABLE_PD_ID] = { .type = NLA_U32, },
+};
+
+/* Ops table for em */
+static const struct genl_split_ops em_nl_ops[] = {
+ {
+ .cmd = EM_CMD_GET_PDS,
+ .doit = em_nl_get_pds_doit,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = EM_CMD_GET_PD_TABLE,
+ .doit = em_nl_get_pd_table_doit,
+ .policy = em_get_pd_table_nl_policy,
+ .maxattr = EM_A_PD_TABLE_PD_ID,
+ .flags = GENL_CMD_CAP_DO,
+ },
+};
+
+static const struct genl_multicast_group em_nl_mcgrps[] = {
+ [EM_NLGRP_EVENT] = { "event", },
+};
+
+struct genl_family em_nl_family __ro_after_init = {
+ .name = EM_FAMILY_NAME,
+ .version = EM_FAMILY_VERSION,
+ .netnsok = true,
+ .parallel_ops = true,
+ .module = THIS_MODULE,
+ .split_ops = em_nl_ops,
+ .n_split_ops = ARRAY_SIZE(em_nl_ops),
+ .mcgrps = em_nl_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(em_nl_mcgrps),
+};
diff --git a/kernel/power/em_netlink_autogen.h b/kernel/power/em_netlink_autogen.h
new file mode 100644
index 000000000000..78ce609641f1
--- /dev/null
+++ b/kernel/power/em_netlink_autogen.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/em.yaml */
+/* YNL-GEN kernel header */
+
+#ifndef _LINUX_EM_GEN_H
+#define _LINUX_EM_GEN_H
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include <uapi/linux/energy_model.h>
+
+int em_nl_get_pds_doit(struct sk_buff *skb, struct genl_info *info);
+int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info);
+
+enum {
+ EM_NLGRP_EVENT,
+};
+
+extern struct genl_family em_nl_family;
+
+#endif /* _LINUX_EM_GEN_H */
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 0153b0ca7b23..11af9f64aa82 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -17,12 +17,30 @@
#include <linux/sched/topology.h>
#include <linux/slab.h>
+#include "em_netlink.h"
+
/*
* Mutex serializing the registrations of performance domains and letting
* callbacks defined by drivers sleep.
*/
static DEFINE_MUTEX(em_pd_mutex);
+/*
+ * Manage performance domains with IDs. One can iterate the performance domains
+ * through the list and pick one with their associated ID. The mutex serializes
+ * the list access. When holding em_pd_list_mutex, em_pd_mutex should not be
+ * taken to avoid potential deadlock.
+ */
+static DEFINE_IDA(em_pd_ida);
+static LIST_HEAD(em_pd_list);
+static DEFINE_MUTEX(em_pd_list_mutex);
+
+static void em_cpufreq_update_efficiencies(struct device *dev,
+ struct em_perf_state *table);
+static void em_check_capacity_update(void);
+static void em_update_workfn(struct work_struct *work);
+static DECLARE_DELAYED_WORK(em_update_work, em_update_workfn);
+
static bool _is_cpu_device(struct device *dev)
{
return (dev->bus == &cpu_subsys);
@@ -31,19 +49,65 @@ static bool _is_cpu_device(struct device *dev)
#ifdef CONFIG_DEBUG_FS
static struct dentry *rootdir;
-static void em_debug_create_ps(struct em_perf_state *ps, struct dentry *pd)
+struct em_dbg_info {
+ struct em_perf_domain *pd;
+ int ps_id;
+};
+
+#define DEFINE_EM_DBG_SHOW(name, fname) \
+static int em_debug_##fname##_show(struct seq_file *s, void *unused) \
+{ \
+ struct em_dbg_info *em_dbg = s->private; \
+ struct em_perf_state *table; \
+ unsigned long val; \
+ \
+ rcu_read_lock(); \
+ table = em_perf_state_from_pd(em_dbg->pd); \
+ val = table[em_dbg->ps_id].name; \
+ rcu_read_unlock(); \
+ \
+ seq_printf(s, "%lu\n", val); \
+ return 0; \
+} \
+DEFINE_SHOW_ATTRIBUTE(em_debug_##fname)
+
+DEFINE_EM_DBG_SHOW(frequency, frequency);
+DEFINE_EM_DBG_SHOW(power, power);
+DEFINE_EM_DBG_SHOW(cost, cost);
+DEFINE_EM_DBG_SHOW(performance, performance);
+DEFINE_EM_DBG_SHOW(flags, inefficiency);
+
+static void em_debug_create_ps(struct em_perf_domain *em_pd,
+ struct em_dbg_info *em_dbg, int i,
+ struct dentry *pd)
{
+ struct em_perf_state *table;
+ unsigned long freq;
struct dentry *d;
char name[24];
- snprintf(name, sizeof(name), "ps:%lu", ps->frequency);
+ em_dbg[i].pd = em_pd;
+ em_dbg[i].ps_id = i;
+
+ rcu_read_lock();
+ table = em_perf_state_from_pd(em_pd);
+ freq = table[i].frequency;
+ rcu_read_unlock();
+
+ snprintf(name, sizeof(name), "ps:%lu", freq);
/* Create per-ps directory */
d = debugfs_create_dir(name, pd);
- debugfs_create_ulong("frequency", 0444, d, &ps->frequency);
- debugfs_create_ulong("power", 0444, d, &ps->power);
- debugfs_create_ulong("cost", 0444, d, &ps->cost);
- debugfs_create_ulong("inefficient", 0444, d, &ps->flags);
+ debugfs_create_file("frequency", 0444, d, &em_dbg[i],
+ &em_debug_frequency_fops);
+ debugfs_create_file("power", 0444, d, &em_dbg[i],
+ &em_debug_power_fops);
+ debugfs_create_file("cost", 0444, d, &em_dbg[i],
+ &em_debug_cost_fops);
+ debugfs_create_file("performance", 0444, d, &em_dbg[i],
+ &em_debug_performance_fops);
+ debugfs_create_file("inefficient", 0444, d, &em_dbg[i],
+ &em_debug_inefficiency_fops);
}
static int em_debug_cpus_show(struct seq_file *s, void *unused)
@@ -54,31 +118,29 @@ static int em_debug_cpus_show(struct seq_file *s, void *unused)
}
DEFINE_SHOW_ATTRIBUTE(em_debug_cpus);
-static int em_debug_units_show(struct seq_file *s, void *unused)
+static int em_debug_flags_show(struct seq_file *s, void *unused)
{
struct em_perf_domain *pd = s->private;
- char *units = (pd->flags & EM_PERF_DOMAIN_MILLIWATTS) ?
- "milliWatts" : "bogoWatts";
- seq_printf(s, "%s\n", units);
+ seq_printf(s, "%#lx\n", pd->flags);
return 0;
}
-DEFINE_SHOW_ATTRIBUTE(em_debug_units);
+DEFINE_SHOW_ATTRIBUTE(em_debug_flags);
-static int em_debug_skip_inefficiencies_show(struct seq_file *s, void *unused)
+static int em_debug_id_show(struct seq_file *s, void *unused)
{
struct em_perf_domain *pd = s->private;
- int enabled = (pd->flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES) ? 1 : 0;
- seq_printf(s, "%d\n", enabled);
+ seq_printf(s, "%d\n", pd->id);
return 0;
}
-DEFINE_SHOW_ATTRIBUTE(em_debug_skip_inefficiencies);
+DEFINE_SHOW_ATTRIBUTE(em_debug_id);
static void em_debug_create_pd(struct device *dev)
{
+ struct em_dbg_info *em_dbg;
struct dentry *d;
int i;
@@ -89,22 +151,25 @@ static void em_debug_create_pd(struct device *dev)
debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus,
&em_debug_cpus_fops);
- debugfs_create_file("units", 0444, d, dev->em_pd, &em_debug_units_fops);
- debugfs_create_file("skip-inefficiencies", 0444, d, dev->em_pd,
- &em_debug_skip_inefficiencies_fops);
+ debugfs_create_file("flags", 0444, d, dev->em_pd,
+ &em_debug_flags_fops);
+
+ debugfs_create_file("id", 0444, d, dev->em_pd, &em_debug_id_fops);
+
+ em_dbg = devm_kcalloc(dev, dev->em_pd->nr_perf_states,
+ sizeof(*em_dbg), GFP_KERNEL);
+ if (!em_dbg)
+ return;
/* Create a sub-directory for each performance state */
for (i = 0; i < dev->em_pd->nr_perf_states; i++)
- em_debug_create_ps(&dev->em_pd->table[i], d);
+ em_debug_create_ps(dev->em_pd, em_dbg, i, d);
}
static void em_debug_remove_pd(struct device *dev)
{
- struct dentry *debug_dir;
-
- debug_dir = debugfs_lookup(dev_name(dev), rootdir);
- debugfs_remove_recursive(debug_dir);
+ debugfs_lookup_and_remove(dev_name(dev), rootdir);
}
static int __init em_debug_init(void)
@@ -120,17 +185,187 @@ static void em_debug_create_pd(struct device *dev) {}
static void em_debug_remove_pd(struct device *dev) {}
#endif
-static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
- int nr_states, struct em_data_callback *cb)
+static void em_release_table_kref(struct kref *kref)
{
- unsigned long power, freq, prev_freq = 0, prev_cost = ULONG_MAX;
- struct em_perf_state *table;
- int i, ret;
- u64 fmax;
+ /* It was the last owner of this table so we can free */
+ kfree_rcu(container_of(kref, struct em_perf_table, kref), rcu);
+}
- table = kcalloc(nr_states, sizeof(*table), GFP_KERNEL);
+/**
+ * em_table_free() - Handles safe free of the EM table when needed
+ * @table : EM table which is going to be freed
+ *
+ * No return values.
+ */
+void em_table_free(struct em_perf_table *table)
+{
+ kref_put(&table->kref, em_release_table_kref);
+}
+
+/**
+ * em_table_alloc() - Allocate a new EM table
+ * @pd : EM performance domain for which this must be done
+ *
+ * Allocate a new EM table and initialize its kref to indicate that it
+ * has a user.
+ * Returns allocated table or NULL.
+ */
+struct em_perf_table *em_table_alloc(struct em_perf_domain *pd)
+{
+ struct em_perf_table *table;
+ int table_size;
+
+ table_size = sizeof(struct em_perf_state) * pd->nr_perf_states;
+
+ table = kzalloc(sizeof(*table) + table_size, GFP_KERNEL);
if (!table)
- return -ENOMEM;
+ return NULL;
+
+ kref_init(&table->kref);
+
+ return table;
+}
+
+static void em_init_performance(struct device *dev, struct em_perf_domain *pd,
+ struct em_perf_state *table, int nr_states)
+{
+ u64 fmax, max_cap;
+ int i, cpu;
+
+ /* This is needed only for CPUs and EAS skip other devices */
+ if (!_is_cpu_device(dev))
+ return;
+
+ cpu = cpumask_first(em_span_cpus(pd));
+
+ /*
+ * Calculate the performance value for each frequency with
+ * linear relationship. The final CPU capacity might not be ready at
+ * boot time, but the EM will be updated a bit later with correct one.
+ */
+ fmax = (u64) table[nr_states - 1].frequency;
+ max_cap = (u64) arch_scale_cpu_capacity(cpu);
+ for (i = 0; i < nr_states; i++)
+ table[i].performance = div64_u64(max_cap * table[i].frequency,
+ fmax);
+}
+
+static int em_compute_costs(struct device *dev, struct em_perf_state *table,
+ const struct em_data_callback *cb, int nr_states,
+ unsigned long flags)
+{
+ unsigned long prev_cost = ULONG_MAX;
+ int i, ret;
+
+ /* This is needed only for CPUs and EAS skip other devices */
+ if (!_is_cpu_device(dev))
+ return 0;
+
+ /* Compute the cost of each performance state. */
+ for (i = nr_states - 1; i >= 0; i--) {
+ unsigned long power_res, cost;
+
+ if ((flags & EM_PERF_DOMAIN_ARTIFICIAL) && cb->get_cost) {
+ ret = cb->get_cost(dev, table[i].frequency, &cost);
+ if (ret || !cost || cost > EM_MAX_POWER) {
+ dev_err(dev, "EM: invalid cost %lu %d\n",
+ cost, ret);
+ return -EINVAL;
+ }
+ } else {
+ /* increase resolution of 'cost' precision */
+ power_res = table[i].power * 10;
+ cost = power_res / table[i].performance;
+ }
+
+ table[i].cost = cost;
+
+ if (table[i].cost >= prev_cost) {
+ table[i].flags = EM_PERF_STATE_INEFFICIENT;
+ dev_dbg(dev, "EM: OPP:%lu is inefficient\n",
+ table[i].frequency);
+ } else {
+ prev_cost = table[i].cost;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * em_dev_compute_costs() - Calculate cost values for new runtime EM table
+ * @dev : Device for which the EM table is to be updated
+ * @table : The new EM table that is going to get the costs calculated
+ * @nr_states : Number of performance states
+ *
+ * Calculate the em_perf_state::cost values for new runtime EM table. The
+ * values are used for EAS during task placement. It also calculates and sets
+ * the efficiency flag for each performance state. When the function finish
+ * successfully the EM table is ready to be updated and used by EAS.
+ *
+ * Return 0 on success or a proper error in case of failure.
+ */
+int em_dev_compute_costs(struct device *dev, struct em_perf_state *table,
+ int nr_states)
+{
+ return em_compute_costs(dev, table, NULL, nr_states, 0);
+}
+
+/**
+ * em_dev_update_perf_domain() - Update runtime EM table for a device
+ * @dev : Device for which the EM is to be updated
+ * @new_table : The new EM table that is going to be used from now
+ *
+ * Update EM runtime modifiable table for the @dev using the provided @table.
+ *
+ * This function uses a mutex to serialize writers, so it must not be called
+ * from a non-sleeping context.
+ *
+ * Return 0 on success or an error code on failure.
+ */
+int em_dev_update_perf_domain(struct device *dev,
+ struct em_perf_table *new_table)
+{
+ struct em_perf_table *old_table;
+ struct em_perf_domain *pd;
+
+ if (!dev)
+ return -EINVAL;
+
+ /* Serialize update/unregister or concurrent updates */
+ mutex_lock(&em_pd_mutex);
+
+ if (!dev->em_pd) {
+ mutex_unlock(&em_pd_mutex);
+ return -EINVAL;
+ }
+ pd = dev->em_pd;
+
+ kref_get(&new_table->kref);
+
+ old_table = rcu_dereference_protected(pd->em_table,
+ lockdep_is_held(&em_pd_mutex));
+ rcu_assign_pointer(pd->em_table, new_table);
+
+ em_cpufreq_update_efficiencies(dev, new_table->state);
+
+ em_table_free(old_table);
+
+ mutex_unlock(&em_pd_mutex);
+
+ em_notify_pd_updated(pd);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(em_dev_update_perf_domain);
+
+static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
+ struct em_perf_state *table,
+ const struct em_data_callback *cb,
+ unsigned long flags)
+{
+ unsigned long power, freq, prev_freq = 0;
+ int nr_states = pd->nr_perf_states;
+ int i, ret;
/* Build the list of performance states for this performance domain */
for (i = 0, freq = 0; i < nr_states; i++, freq++) {
@@ -139,11 +374,11 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
* lowest performance state of 'dev' above 'freq' and updates
* 'power' and 'freq' accordingly.
*/
- ret = cb->active_power(&power, &freq, dev);
+ ret = cb->active_power(dev, &power, &freq);
if (ret) {
dev_err(dev, "EM: invalid perf. state: %d\n",
ret);
- goto free_ps_table;
+ return -EINVAL;
}
/*
@@ -153,57 +388,51 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
if (freq <= prev_freq) {
dev_err(dev, "EM: non-increasing freq: %lu\n",
freq);
- goto free_ps_table;
+ return -EINVAL;
}
/*
* The power returned by active_state() is expected to be
- * positive and to fit into 16 bits.
+ * positive and be in range.
*/
if (!power || power > EM_MAX_POWER) {
dev_err(dev, "EM: invalid power: %lu\n",
power);
- goto free_ps_table;
+ return -EINVAL;
}
table[i].power = power;
table[i].frequency = prev_freq = freq;
}
- /* Compute the cost of each performance state. */
- fmax = (u64) table[nr_states - 1].frequency;
- for (i = nr_states - 1; i >= 0; i--) {
- unsigned long power_res = em_scale_power(table[i].power);
+ em_init_performance(dev, pd, table, nr_states);
- table[i].cost = div64_u64(fmax * power_res,
- table[i].frequency);
- if (table[i].cost >= prev_cost) {
- table[i].flags = EM_PERF_STATE_INEFFICIENT;
- dev_dbg(dev, "EM: OPP:%lu is inefficient\n",
- table[i].frequency);
- } else {
- prev_cost = table[i].cost;
- }
- }
-
- pd->table = table;
- pd->nr_perf_states = nr_states;
+ ret = em_compute_costs(dev, table, cb, nr_states, flags);
+ if (ret)
+ return -EINVAL;
return 0;
-
-free_ps_table:
- kfree(table);
- return -EINVAL;
}
static int em_create_pd(struct device *dev, int nr_states,
- struct em_data_callback *cb, cpumask_t *cpus)
+ const struct em_data_callback *cb,
+ const cpumask_t *cpus,
+ unsigned long flags)
{
+ struct em_perf_table *em_table;
struct em_perf_domain *pd;
struct device *cpu_dev;
- int cpu, ret;
+ int cpu, ret, num_cpus, id;
if (_is_cpu_device(dev)) {
+ num_cpus = cpumask_weight(cpus);
+
+ /* Prevent max possible energy calculation to not overflow */
+ if (num_cpus > EM_MAX_NUM_CPUS) {
+ dev_err(dev, "EM: too many CPUs, overflow possible\n");
+ return -EINVAL;
+ }
+
pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL);
if (!pd)
return -ENOMEM;
@@ -215,11 +444,24 @@ static int em_create_pd(struct device *dev, int nr_states,
return -ENOMEM;
}
- ret = em_create_perf_table(dev, pd, nr_states, cb);
- if (ret) {
- kfree(pd);
- return ret;
- }
+ pd->nr_perf_states = nr_states;
+
+ INIT_LIST_HEAD(&pd->node);
+
+ id = ida_alloc(&em_pd_ida, GFP_KERNEL);
+ if (id < 0)
+ return -ENOMEM;
+ pd->id = id;
+
+ em_table = em_table_alloc(pd);
+ if (!em_table)
+ goto free_pd;
+
+ ret = em_create_perf_table(dev, pd, em_table->state, cb, flags);
+ if (ret)
+ goto free_pd_table;
+
+ rcu_assign_pointer(pd->em_table, em_table);
if (_is_cpu_device(dev))
for_each_cpu(cpu, cpus) {
@@ -230,26 +472,38 @@ static int em_create_pd(struct device *dev, int nr_states,
dev->em_pd = pd;
return 0;
+
+free_pd_table:
+ kfree(em_table);
+free_pd:
+ kfree(pd);
+ ida_free(&em_pd_ida, id);
+ return -EINVAL;
}
-static void em_cpufreq_update_efficiencies(struct device *dev)
+static void
+em_cpufreq_update_efficiencies(struct device *dev, struct em_perf_state *table)
{
struct em_perf_domain *pd = dev->em_pd;
- struct em_perf_state *table;
struct cpufreq_policy *policy;
int found = 0;
- int i;
+ int i, cpu;
- if (!_is_cpu_device(dev) || !pd)
+ if (!_is_cpu_device(dev))
return;
- policy = cpufreq_cpu_get(cpumask_first(em_span_cpus(pd)));
- if (!policy) {
- dev_warn(dev, "EM: Access to CPUFreq policy failed");
+ /* Try to get a CPU which is active and in this PD */
+ cpu = cpumask_first_and(em_span_cpus(pd), cpu_active_mask);
+ if (cpu >= nr_cpu_ids) {
+ dev_warn(dev, "EM: No online CPU for CPUFreq policy\n");
return;
}
- table = pd->table;
+ policy = cpufreq_cpu_get(cpu);
+ if (!policy) {
+ dev_warn(dev, "EM: Access to CPUFreq policy failed\n");
+ return;
+ }
for (i = 0; i < pd->nr_perf_states; i++) {
if (!(table[i].flags & EM_PERF_STATE_INEFFICIENT))
@@ -259,6 +513,8 @@ static void em_cpufreq_update_efficiencies(struct device *dev)
found++;
}
+ cpufreq_cpu_put(policy);
+
if (!found)
return;
@@ -312,13 +568,13 @@ EXPORT_SYMBOL_GPL(em_cpu_get);
* @cpus : Pointer to cpumask_t, which in case of a CPU device is
* obligatory. It can be taken from i.e. 'policy->cpus'. For other
* type of devices this should be set to NULL.
- * @milliwatts : Flag indicating that the power values are in milliWatts or
+ * @microwatts : Flag indicating that the power values are in micro-Watts or
* in some other scale. It must be set properly.
*
* Create Energy Model tables for a performance domain using the callbacks
* defined in cb.
*
- * The @milliwatts is important to set with correct value. Some kernel
+ * The @microwatts is important to set with correct value. Some kernel
* sub-systems might rely on this flag and check if all devices in the EM are
* using the same scale.
*
@@ -328,10 +584,36 @@ EXPORT_SYMBOL_GPL(em_cpu_get);
* Return 0 on success
*/
int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
- struct em_data_callback *cb, cpumask_t *cpus,
- bool milliwatts)
+ const struct em_data_callback *cb,
+ const cpumask_t *cpus, bool microwatts)
{
+ int ret = em_dev_register_pd_no_update(dev, nr_states, cb, cpus, microwatts);
+
+ if (_is_cpu_device(dev))
+ em_check_capacity_update();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(em_dev_register_perf_domain);
+
+/**
+ * em_dev_register_pd_no_update() - Register a perf domain for a device
+ * @dev : Device to register the PD for
+ * @nr_states : Number of performance states in the new PD
+ * @cb : Callback functions for populating the energy model
+ * @cpus : CPUs to include in the new PD (mandatory if @dev is a CPU device)
+ * @microwatts : Whether or not the power values in the EM will be in uW
+ *
+ * Like em_dev_register_perf_domain(), but does not trigger a CPU capacity
+ * update after registering the PD, even if @dev is a CPU device.
+ */
+int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states,
+ const struct em_data_callback *cb,
+ const cpumask_t *cpus, bool microwatts)
+{
+ struct em_perf_table *em_table;
unsigned long cap, prev_cap = 0;
+ unsigned long flags = 0;
int cpu, ret;
if (!dev || !nr_states || !cb)
@@ -378,23 +660,51 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
}
}
- ret = em_create_pd(dev, nr_states, cb, cpus);
+ if (microwatts)
+ flags |= EM_PERF_DOMAIN_MICROWATTS;
+ else if (cb->get_cost)
+ flags |= EM_PERF_DOMAIN_ARTIFICIAL;
+
+ /*
+ * EM only supports uW (exception is artificial EM).
+ * Therefore, check and force the drivers to provide
+ * power in uW.
+ */
+ if (!microwatts && !(flags & EM_PERF_DOMAIN_ARTIFICIAL)) {
+ dev_err(dev, "EM: only supports uW power values\n");
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ ret = em_create_pd(dev, nr_states, cb, cpus, flags);
if (ret)
goto unlock;
- if (milliwatts)
- dev->em_pd->flags |= EM_PERF_DOMAIN_MILLIWATTS;
+ dev->em_pd->flags |= flags;
+ dev->em_pd->min_perf_state = 0;
+ dev->em_pd->max_perf_state = nr_states - 1;
- em_cpufreq_update_efficiencies(dev);
+ em_table = rcu_dereference_protected(dev->em_pd->em_table,
+ lockdep_is_held(&em_pd_mutex));
+ em_cpufreq_update_efficiencies(dev, em_table->state);
em_debug_create_pd(dev);
dev_info(dev, "EM: created perf domain\n");
unlock:
mutex_unlock(&em_pd_mutex);
- return ret;
+ if (ret)
+ return ret;
+
+ mutex_lock(&em_pd_list_mutex);
+ list_add_tail(&dev->em_pd->node, &em_pd_list);
+ mutex_unlock(&em_pd_list_mutex);
+
+ em_notify_pd_created(dev->em_pd);
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(em_dev_register_perf_domain);
+EXPORT_SYMBOL_GPL(em_dev_register_pd_no_update);
/**
* em_dev_unregister_perf_domain() - Unregister Energy Model (EM) for a device
@@ -410,6 +720,12 @@ void em_dev_unregister_perf_domain(struct device *dev)
if (_is_cpu_device(dev))
return;
+ mutex_lock(&em_pd_list_mutex);
+ list_del_init(&dev->em_pd->node);
+ mutex_unlock(&em_pd_list_mutex);
+
+ em_notify_pd_deleted(dev->em_pd);
+
/*
* The mutex separates all register/unregister requests and protects
* from potential clean-up/setup issues in the debugfs directories.
@@ -418,9 +734,313 @@ void em_dev_unregister_perf_domain(struct device *dev)
mutex_lock(&em_pd_mutex);
em_debug_remove_pd(dev);
- kfree(dev->em_pd->table);
+ em_table_free(rcu_dereference_protected(dev->em_pd->em_table,
+ lockdep_is_held(&em_pd_mutex)));
+
+ ida_free(&em_pd_ida, dev->em_pd->id);
+
kfree(dev->em_pd);
dev->em_pd = NULL;
mutex_unlock(&em_pd_mutex);
}
EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain);
+
+static struct em_perf_table *em_table_dup(struct em_perf_domain *pd)
+{
+ struct em_perf_table *em_table;
+ struct em_perf_state *ps, *new_ps;
+ int ps_size;
+
+ em_table = em_table_alloc(pd);
+ if (!em_table)
+ return NULL;
+
+ new_ps = em_table->state;
+
+ rcu_read_lock();
+ ps = em_perf_state_from_pd(pd);
+ /* Initialize data based on old table */
+ ps_size = sizeof(struct em_perf_state) * pd->nr_perf_states;
+ memcpy(new_ps, ps, ps_size);
+
+ rcu_read_unlock();
+
+ return em_table;
+}
+
+static int em_recalc_and_update(struct device *dev, struct em_perf_domain *pd,
+ struct em_perf_table *em_table)
+{
+ int ret;
+
+ if (!em_is_artificial(pd)) {
+ ret = em_compute_costs(dev, em_table->state, NULL,
+ pd->nr_perf_states, pd->flags);
+ if (ret)
+ goto free_em_table;
+ }
+
+ ret = em_dev_update_perf_domain(dev, em_table);
+ if (ret)
+ goto free_em_table;
+
+ /*
+ * This is one-time-update, so give up the ownership in this updater.
+ * The EM framework has incremented the usage counter and from now
+ * will keep the reference (then free the memory when needed).
+ */
+free_em_table:
+ em_table_free(em_table);
+ return ret;
+}
+
+/*
+ * Adjustment of CPU performance values after boot, when all CPUs capacites
+ * are correctly calculated.
+ */
+static void em_adjust_new_capacity(unsigned int cpu, struct device *dev,
+ struct em_perf_domain *pd)
+{
+ unsigned long cpu_capacity = arch_scale_cpu_capacity(cpu);
+ struct em_perf_table *em_table;
+ struct em_perf_state *table;
+ unsigned long em_max_perf;
+
+ rcu_read_lock();
+ table = em_perf_state_from_pd(pd);
+ em_max_perf = table[pd->nr_perf_states - 1].performance;
+ rcu_read_unlock();
+
+ if (em_max_perf == cpu_capacity)
+ return;
+
+ pr_debug("updating cpu%d cpu_cap=%lu old capacity=%lu\n", cpu,
+ cpu_capacity, em_max_perf);
+
+ em_table = em_table_dup(pd);
+ if (!em_table) {
+ dev_warn(dev, "EM: allocation failed\n");
+ return;
+ }
+
+ em_init_performance(dev, pd, em_table->state, pd->nr_perf_states);
+
+ em_recalc_and_update(dev, pd, em_table);
+}
+
+/**
+ * em_adjust_cpu_capacity() - Adjust the EM for a CPU after a capacity update.
+ * @cpu: Target CPU.
+ *
+ * Adjust the existing EM for @cpu after a capacity update under the assumption
+ * that the capacity has been updated in the same way for all of the CPUs in
+ * the same perf domain.
+ */
+void em_adjust_cpu_capacity(unsigned int cpu)
+{
+ struct device *dev = get_cpu_device(cpu);
+ struct em_perf_domain *pd;
+
+ pd = em_pd_get(dev);
+ if (pd)
+ em_adjust_new_capacity(cpu, dev, pd);
+}
+
+static void em_check_capacity_update(void)
+{
+ cpumask_var_t cpu_done_mask;
+ int cpu, failed_cpus = 0;
+
+ if (!zalloc_cpumask_var(&cpu_done_mask, GFP_KERNEL)) {
+ pr_warn("no free memory\n");
+ return;
+ }
+
+ /* Check if CPUs capacity has changed than update EM */
+ for_each_possible_cpu(cpu) {
+ struct cpufreq_policy *policy;
+ struct em_perf_domain *pd;
+ struct device *dev;
+
+ if (cpumask_test_cpu(cpu, cpu_done_mask))
+ continue;
+
+ policy = cpufreq_cpu_get(cpu);
+ if (!policy) {
+ failed_cpus++;
+ continue;
+ }
+ cpufreq_cpu_put(policy);
+
+ dev = get_cpu_device(cpu);
+ pd = em_pd_get(dev);
+ if (!pd || em_is_artificial(pd))
+ continue;
+
+ cpumask_or(cpu_done_mask, cpu_done_mask,
+ em_span_cpus(pd));
+
+ em_adjust_new_capacity(cpu, dev, pd);
+ }
+
+ if (failed_cpus)
+ schedule_delayed_work(&em_update_work, msecs_to_jiffies(1000));
+
+ free_cpumask_var(cpu_done_mask);
+}
+
+static void em_update_workfn(struct work_struct *work)
+{
+ em_check_capacity_update();
+}
+
+/**
+ * em_dev_update_chip_binning() - Update Energy Model after the new voltage
+ * information is present in the OPPs.
+ * @dev : Device for which the Energy Model has to be updated.
+ *
+ * This function allows to update easily the EM with new values available in
+ * the OPP framework and DT. It can be used after the chip has been properly
+ * verified by device drivers and the voltages adjusted for the 'chip binning'.
+ */
+int em_dev_update_chip_binning(struct device *dev)
+{
+ struct em_perf_table *em_table;
+ struct em_perf_domain *pd;
+ int i, ret;
+
+ if (IS_ERR_OR_NULL(dev))
+ return -EINVAL;
+
+ pd = em_pd_get(dev);
+ if (!pd) {
+ dev_warn(dev, "Couldn't find Energy Model\n");
+ return -EINVAL;
+ }
+
+ em_table = em_table_dup(pd);
+ if (!em_table) {
+ dev_warn(dev, "EM: allocation failed\n");
+ return -ENOMEM;
+ }
+
+ /* Update power values which might change due to new voltage in OPPs */
+ for (i = 0; i < pd->nr_perf_states; i++) {
+ unsigned long freq = em_table->state[i].frequency;
+ unsigned long power;
+
+ ret = dev_pm_opp_calc_power(dev, &power, &freq);
+ if (ret) {
+ em_table_free(em_table);
+ return ret;
+ }
+
+ em_table->state[i].power = power;
+ }
+
+ return em_recalc_and_update(dev, pd, em_table);
+}
+EXPORT_SYMBOL_GPL(em_dev_update_chip_binning);
+
+
+/**
+ * em_update_performance_limits() - Update Energy Model with performance
+ * limits information.
+ * @pd : Performance Domain with EM that has to be updated.
+ * @freq_min_khz : New minimum allowed frequency for this device.
+ * @freq_max_khz : New maximum allowed frequency for this device.
+ *
+ * This function allows to update the EM with information about available
+ * performance levels. It takes the minimum and maximum frequency in kHz
+ * and does internal translation to performance levels.
+ * Returns 0 on success or -EINVAL when failed.
+ */
+int em_update_performance_limits(struct em_perf_domain *pd,
+ unsigned long freq_min_khz, unsigned long freq_max_khz)
+{
+ struct em_perf_state *table;
+ int min_ps = -1;
+ int max_ps = -1;
+ int i;
+
+ if (!pd)
+ return -EINVAL;
+
+ rcu_read_lock();
+ table = em_perf_state_from_pd(pd);
+
+ for (i = 0; i < pd->nr_perf_states; i++) {
+ if (freq_min_khz == table[i].frequency)
+ min_ps = i;
+ if (freq_max_khz == table[i].frequency)
+ max_ps = i;
+ }
+ rcu_read_unlock();
+
+ /* Only update when both are found and sane */
+ if (min_ps < 0 || max_ps < 0 || max_ps < min_ps)
+ return -EINVAL;
+
+
+ /* Guard simultaneous updates and make them atomic */
+ mutex_lock(&em_pd_mutex);
+ pd->min_perf_state = min_ps;
+ pd->max_perf_state = max_ps;
+ mutex_unlock(&em_pd_mutex);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(em_update_performance_limits);
+
+static void rebuild_sd_workfn(struct work_struct *work)
+{
+ rebuild_sched_domains_energy();
+}
+
+void em_rebuild_sched_domains(void)
+{
+ static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);
+
+ /*
+ * When called from the cpufreq_register_driver() path, the
+ * cpu_hotplug_lock is already held, so use a work item to
+ * avoid nested locking in rebuild_sched_domains().
+ */
+ schedule_work(&rebuild_sd_work);
+}
+
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_NET)
+int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *),
+ void *data)
+{
+ struct em_perf_domain *pd;
+
+ lockdep_assert_not_held(&em_pd_mutex);
+ guard(mutex)(&em_pd_list_mutex);
+
+ list_for_each_entry(pd, &em_pd_list, node) {
+ int ret;
+
+ ret = cb(pd, data);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+struct em_perf_domain *em_perf_domain_get_by_id(int id)
+{
+ struct em_perf_domain *pd;
+
+ lockdep_assert_not_held(&em_pd_mutex);
+ guard(mutex)(&em_pd_list_mutex);
+
+ list_for_each_entry(pd, &em_pd_list, node) {
+ if (pd->id == id)
+ return pd;
+ }
+
+ return NULL;
+}
+#endif
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index e6af502c2fd7..af8d07bafe02 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -11,6 +11,8 @@
#define pr_fmt(fmt) "PM: hibernation: " fmt
+#include <crypto/acompress.h>
+#include <linux/blkdev.h>
#include <linux/export.h>
#include <linux/suspend.h>
#include <linux/reboot.h>
@@ -28,7 +30,6 @@
#include <linux/gfp.h>
#include <linux/syscore_ops.h>
#include <linux/ctype.h>
-#include <linux/genhd.h>
#include <linux/ktime.h>
#include <linux/security.h>
#include <linux/secretmem.h>
@@ -47,6 +48,15 @@ dev_t swsusp_resume_device;
sector_t swsusp_resume_block;
__visible int in_suspend __nosavedata;
+static char hibernate_compressor[CRYPTO_MAX_ALG_NAME] = CONFIG_HIBERNATION_DEF_COMP;
+
+/*
+ * Compression/decompression algorithm to be used while saving/loading
+ * image to/from disk. This would later be used in 'kernel/power/swap.c'
+ * to allocate comp streams.
+ */
+char hib_comp_algo[CRYPTO_MAX_ALG_NAME];
+
enum {
HIBERNATION_INVALID,
HIBERNATION_PLATFORM,
@@ -70,6 +80,17 @@ static const struct platform_hibernation_ops *hibernation_ops;
static atomic_t hibernate_atomic = ATOMIC_INIT(1);
+#ifdef CONFIG_SUSPEND
+/**
+ * pm_hibernation_mode_is_suspend - Check if hibernation has been set to suspend
+ */
+bool pm_hibernation_mode_is_suspend(void)
+{
+ return hibernation_mode == HIBERNATION_SUSPEND;
+}
+EXPORT_SYMBOL_GPL(pm_hibernation_mode_is_suspend);
+#endif
+
bool hibernate_acquire(void)
{
return atomic_add_unless(&hibernate_atomic, -1, 0);
@@ -80,11 +101,16 @@ void hibernate_release(void)
atomic_inc(&hibernate_atomic);
}
+bool hibernation_in_progress(void)
+{
+ return !atomic_read(&hibernate_atomic);
+}
+
bool hibernation_available(void)
{
return nohibernate == 0 &&
!security_locked_down(LOCKDOWN_HIBERNATION) &&
- !secretmem_active();
+ !secretmem_active() && !cxl_mem_active();
}
/**
@@ -93,20 +119,24 @@ bool hibernation_available(void)
*/
void hibernation_set_ops(const struct platform_hibernation_ops *ops)
{
+ unsigned int sleep_flags;
+
if (ops && !(ops->begin && ops->end && ops->pre_snapshot
&& ops->prepare && ops->finish && ops->enter && ops->pre_restore
&& ops->restore_cleanup && ops->leave)) {
WARN_ON(1);
return;
}
- lock_system_sleep();
+
+ sleep_flags = lock_system_sleep();
+
hibernation_ops = ops;
if (ops)
hibernation_mode = HIBERNATION_PLATFORM;
else if (hibernation_mode == HIBERNATION_PLATFORM)
hibernation_mode = HIBERNATION_SHUTDOWN;
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
}
EXPORT_SYMBOL_GPL(hibernation_set_ops);
@@ -119,10 +149,15 @@ bool system_entering_hibernation(void)
EXPORT_SYMBOL(system_entering_hibernation);
#ifdef CONFIG_PM_DEBUG
+static unsigned int pm_test_delay = 5;
+module_param(pm_test_delay, uint, 0644);
+MODULE_PARM_DESC(pm_test_delay,
+ "Number of seconds to wait before resuming from hibernation test");
static void hibernation_debug_sleep(void)
{
- pr_info("debug: Waiting for 5 seconds.\n");
- mdelay(5000);
+ pr_info("hibernation debug: Waiting for %d second(s).\n",
+ pm_test_delay);
+ mdelay(pm_test_delay * 1000);
}
static int hibernation_test(int level)
@@ -357,6 +392,23 @@ static int create_image(int platform_mode)
return error;
}
+static void shrink_shmem_memory(void)
+{
+ struct sysinfo info;
+ unsigned long nr_shmem_pages, nr_freed_pages;
+
+ si_meminfo(&info);
+ nr_shmem_pages = info.sharedram; /* current page count used for shmem */
+ /*
+ * The intent is to reclaim all shmem pages. Though shrink_all_memory() can
+ * only reclaim about half of them, it's enough for creating the hibernation
+ * image.
+ */
+ nr_freed_pages = shrink_all_memory(nr_shmem_pages);
+ pr_debug("requested to reclaim %lu shmem pages, actually freed %lu pages\n",
+ nr_shmem_pages, nr_freed_pages);
+}
+
/**
* hibernation_snapshot - Quiesce devices and create a hibernation image.
* @platform_mode: If set, use platform driver to prepare for the transition.
@@ -398,7 +450,16 @@ int hibernation_snapshot(int platform_mode)
goto Thaw;
}
- suspend_console();
+ /*
+ * Device drivers may move lots of data to shmem in dpm_prepare(). The shmem
+ * pages will use lots of system memory, causing hibernation image creation
+ * fail due to insufficient free memory.
+ * This call is to force flush the shmem pages to swap disk and reclaim
+ * the system memory so that image creation can succeed.
+ */
+ shrink_shmem_memory();
+
+ console_suspend_all();
pm_restrict_gfp_mask();
error = dpm_suspend(PMSG_FREEZE);
@@ -424,7 +485,7 @@ int hibernation_snapshot(int platform_mode)
if (error || !in_suspend)
pm_restore_gfp_mask();
- resume_console();
+ console_resume_all();
dpm_complete(msg);
Close:
@@ -534,8 +595,7 @@ int hibernation_restore(int platform_mode)
int error;
pm_prepare_console();
- suspend_console();
- pm_restrict_gfp_mask();
+ console_suspend_all();
error = dpm_suspend_start(PMSG_QUIESCE);
if (!error) {
error = resume_target_kernel(platform_mode);
@@ -547,8 +607,7 @@ int hibernation_restore(int platform_mode)
BUG_ON(!error);
}
dpm_resume_end(PMSG_RECOVER);
- pm_restore_gfp_mask();
- resume_console();
+ console_resume_all();
pm_restore_console();
return error;
}
@@ -573,7 +632,7 @@ int hibernation_platform_enter(void)
goto Close;
entering_platform_hibernation = true;
- suspend_console();
+ console_suspend_all();
error = dpm_suspend_start(PMSG_HIBERNATE);
if (error) {
if (hibernation_ops->recover)
@@ -595,7 +654,11 @@ int hibernation_platform_enter(void)
local_irq_disable();
system_state = SYSTEM_SUSPEND;
- syscore_suspend();
+
+ error = syscore_suspend();
+ if (error)
+ goto Enable_irqs;
+
if (pm_wakeup_pending()) {
error = -EAGAIN;
goto Power_up;
@@ -607,6 +670,7 @@ int hibernation_platform_enter(void)
Power_up:
syscore_resume();
+ Enable_irqs:
system_state = SYSTEM_RUNNING;
local_irq_enable();
@@ -621,7 +685,7 @@ int hibernation_platform_enter(void)
Resume_devices:
entering_platform_hibernation = false;
dpm_resume_end(PMSG_RESTORE);
- resume_console();
+ console_resume_all();
Close:
hibernation_ops->end();
@@ -638,23 +702,16 @@ int hibernation_platform_enter(void)
*/
static void power_down(void)
{
-#ifdef CONFIG_SUSPEND
int error;
+#ifdef CONFIG_SUSPEND
if (hibernation_mode == HIBERNATION_SUSPEND) {
- error = suspend_devices_and_enter(PM_SUSPEND_MEM);
- if (error) {
- hibernation_mode = hibernation_ops ?
- HIBERNATION_PLATFORM :
- HIBERNATION_SHUTDOWN;
- } else {
- /* Restore swap signature. */
- error = swsusp_unmark();
- if (error)
- pr_err("Swap will be unusable! Try swapon -a.\n");
+ error = suspend_devices_and_enter(mem_sleep_current);
+ if (!error)
+ goto exit;
- return;
- }
+ hibernation_mode = hibernation_ops ? HIBERNATION_PLATFORM :
+ HIBERNATION_SHUTDOWN;
}
#endif
@@ -663,11 +720,19 @@ static void power_down(void)
kernel_restart(NULL);
break;
case HIBERNATION_PLATFORM:
- hibernation_platform_enter();
+ error = hibernation_platform_enter();
+ if (error == -EAGAIN || error == -EBUSY) {
+ events_check_enabled = false;
+ pr_info("Wakeup event detected during hibernation, rolling back.\n");
+ goto exit;
+ }
fallthrough;
case HIBERNATION_SHUTDOWN:
- if (pm_power_off)
+ if (kernel_can_power_off()) {
+ entering_platform_hibernation = true;
kernel_power_off();
+ entering_platform_hibernation = false;
+ }
break;
}
kernel_halt();
@@ -678,6 +743,12 @@ static void power_down(void)
pr_crit("Power down manually\n");
while (1)
cpu_relax();
+
+exit:
+ /* Restore swap signature. */
+ error = swsusp_unmark();
+ if (error)
+ pr_err("Swap will be unusable! Try swapon -a.\n");
}
static int load_image_and_restore(void)
@@ -689,11 +760,13 @@ static int load_image_and_restore(void)
lock_device_hotplug();
error = create_basic_memory_bitmaps();
- if (error)
+ if (error) {
+ swsusp_close();
goto Unlock;
+ }
error = swsusp_read(&flags);
- swsusp_close(FMODE_READ | FMODE_EXCL);
+ swsusp_close();
if (!error)
error = hibernation_restore(flags & SF_PLATFORM_MODE);
@@ -706,12 +779,16 @@ static int load_image_and_restore(void)
return error;
}
+#define COMPRESSION_ALGO_LZO "lzo"
+#define COMPRESSION_ALGO_LZ4 "lz4"
+
/**
* hibernate - Carry out system hibernation, including saving the image.
*/
int hibernate(void)
{
bool snapshot_test = false;
+ unsigned int sleep_flags;
int error;
if (!hibernation_available()) {
@@ -719,7 +796,18 @@ int hibernate(void)
return -EPERM;
}
- lock_system_sleep();
+ /*
+ * Query for the compression algorithm support if compression is enabled.
+ */
+ if (!nocompress) {
+ strscpy(hib_comp_algo, hibernate_compressor);
+ if (!crypto_has_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC)) {
+ pr_err("%s compression is not available\n", hib_comp_algo);
+ return -EOPNOTSUPP;
+ }
+ }
+
+ sleep_flags = lock_system_sleep();
/* The snapshot device should not be opened while we're running */
if (!hibernate_acquire()) {
error = -EBUSY;
@@ -732,7 +820,11 @@ int hibernate(void)
if (error)
goto Restore;
- ksys_sync_helper();
+ error = pm_sleep_fs_sync();
+ if (error)
+ goto Notify;
+
+ filesystems_freeze(filesystem_freeze_enabled);
error = freeze_processes();
if (error)
@@ -753,11 +845,24 @@ int hibernate(void)
if (hibernation_mode == HIBERNATION_PLATFORM)
flags |= SF_PLATFORM_MODE;
- if (nocompress)
+ if (nocompress) {
flags |= SF_NOCOMPRESS_MODE;
- else
+ } else {
flags |= SF_CRC32_MODE;
+ /*
+ * By default, LZO compression is enabled. Use SF_COMPRESSION_ALG_LZ4
+ * to override this behaviour and use LZ4.
+ *
+ * Refer kernel/power/power.h for more details
+ */
+
+ if (!strcmp(hib_comp_algo, COMPRESSION_ALGO_LZ4))
+ flags |= SF_COMPRESSION_ALG_LZ4;
+ else
+ flags |= SF_COMPRESSION_ALG_LZO;
+ }
+
pm_pr_dbg("Writing hibernation image.\n");
error = swsusp_write(flags);
swsusp_free();
@@ -779,7 +884,7 @@ int hibernate(void)
unlock_device_hotplug();
if (snapshot_test) {
pm_pr_dbg("Checking hibernation image\n");
- error = swsusp_check();
+ error = swsusp_check(false);
if (!error)
error = load_image_and_restore();
}
@@ -788,12 +893,14 @@ int hibernate(void)
/* Don't bother checking whether freezer_test_done is true */
freezer_test_done = false;
Exit:
+ filesystems_thaw();
+ Notify:
pm_notifier_call_chain(PM_POST_HIBERNATION);
Restore:
pm_restore_console();
hibernate_release();
Unlock:
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
pr_info("hibernation exit\n");
return error;
@@ -808,9 +915,10 @@ int hibernate(void)
*/
int hibernate_quiet_exec(int (*func)(void *data), void *data)
{
+ unsigned int sleep_flags;
int error;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
if (!hibernate_acquire()) {
error = -EBUSY;
@@ -823,6 +931,8 @@ int hibernate_quiet_exec(int (*func)(void *data), void *data)
if (error)
goto restore;
+ filesystems_freeze(filesystem_freeze_enabled);
+
error = freeze_processes();
if (error)
goto exit;
@@ -843,7 +953,7 @@ int hibernate_quiet_exec(int (*func)(void *data), void *data)
if (error)
goto dpm_complete;
- suspend_console();
+ console_suspend_all();
error = dpm_suspend(PMSG_FREEZE);
if (error)
@@ -867,7 +977,7 @@ skip:
dpm_resume:
dpm_resume(PMSG_THAW);
- resume_console();
+ console_resume_all();
dpm_complete:
dpm_complete(PMSG_THAW);
@@ -882,6 +992,7 @@ thaw:
thaw_processes();
exit:
+ filesystems_thaw();
pm_notifier_call_chain(PM_POST_HIBERNATION);
restore:
@@ -890,56 +1001,16 @@ restore:
hibernate_release();
unlock:
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return error;
}
EXPORT_SYMBOL_GPL(hibernate_quiet_exec);
-/**
- * software_resume - Resume from a saved hibernation image.
- *
- * This routine is called as a late initcall, when all devices have been
- * discovered and initialized already.
- *
- * The image reading code is called to see if there is a hibernation image
- * available for reading. If that is the case, devices are quiesced and the
- * contents of memory is restored from the saved image.
- *
- * If this is successful, control reappears in the restored target kernel in
- * hibernation_snapshot() which returns to hibernate(). Otherwise, the routine
- * attempts to recover gracefully and make the kernel return to the normal mode
- * of operation.
- */
-static int software_resume(void)
+static int __init find_resume_device(void)
{
- int error;
-
- /*
- * If the user said "noresume".. bail out early.
- */
- if (noresume || !hibernation_available())
- return 0;
-
- /*
- * name_to_dev_t() below takes a sysfs buffer mutex when sysfs
- * is configured into the kernel. Since the regular hibernate
- * trigger path is via sysfs which takes a buffer mutex before
- * calling hibernate functions (which take system_transition_mutex)
- * this can cause lockdep to complain about a possible ABBA deadlock
- * which cannot happen since we're in the boot code here and
- * sysfs can't be invoked yet. Therefore, we use a subclass
- * here to avoid lockdep complaining.
- */
- mutex_lock_nested(&system_transition_mutex, SINGLE_DEPTH_NESTING);
-
- if (swsusp_resume_device)
- goto Check_image;
-
- if (!strlen(resume_file)) {
- error = -ENOENT;
- goto Unlock;
- }
+ if (!strlen(resume_file))
+ return -ENOENT;
pm_pr_dbg("Checking hibernation image partition %s\n", resume_file);
@@ -950,40 +1021,57 @@ static int software_resume(void)
}
/* Check if the device is there */
- swsusp_resume_device = name_to_dev_t(resume_file);
- if (!swsusp_resume_device) {
- /*
- * Some device discovery might still be in progress; we need
- * to wait for this to finish.
- */
- wait_for_device_probe();
-
- if (resume_wait) {
- while ((swsusp_resume_device = name_to_dev_t(resume_file)) == 0)
- msleep(10);
- async_synchronize_full();
- }
+ if (!early_lookup_bdev(resume_file, &swsusp_resume_device))
+ return 0;
- swsusp_resume_device = name_to_dev_t(resume_file);
- if (!swsusp_resume_device) {
- error = -ENODEV;
- goto Unlock;
- }
+ /*
+ * Some device discovery might still be in progress; we need to wait for
+ * this to finish.
+ */
+ wait_for_device_probe();
+ if (resume_wait) {
+ while (early_lookup_bdev(resume_file, &swsusp_resume_device))
+ msleep(10);
+ async_synchronize_full();
}
- Check_image:
+ return early_lookup_bdev(resume_file, &swsusp_resume_device);
+}
+
+static int software_resume(void)
+{
+ int error;
+
pm_pr_dbg("Hibernation image partition %d:%d present\n",
MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device));
pm_pr_dbg("Looking for hibernation image.\n");
- error = swsusp_check();
+
+ mutex_lock(&system_transition_mutex);
+ error = swsusp_check(true);
if (error)
goto Unlock;
+ /*
+ * Check if the hibernation image is compressed. If so, query for
+ * the algorithm support.
+ */
+ if (!(swsusp_header_flags & SF_NOCOMPRESS_MODE)) {
+ if (swsusp_header_flags & SF_COMPRESSION_ALG_LZ4)
+ strscpy(hib_comp_algo, COMPRESSION_ALGO_LZ4);
+ else
+ strscpy(hib_comp_algo, COMPRESSION_ALGO_LZO);
+ if (!crypto_has_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC)) {
+ pr_err("%s compression is not available\n", hib_comp_algo);
+ error = -EOPNOTSUPP;
+ goto Unlock;
+ }
+ }
+
/* The snapshot device should not be opened while we're running */
if (!hibernate_acquire()) {
error = -EBUSY;
- swsusp_close(FMODE_READ | FMODE_EXCL);
+ swsusp_close();
goto Unlock;
}
@@ -993,19 +1081,25 @@ static int software_resume(void)
if (error)
goto Restore;
+ filesystems_freeze(filesystem_freeze_enabled);
+
pm_pr_dbg("Preparing processes for hibernation restore.\n");
error = freeze_processes();
- if (error)
+ if (error) {
+ filesystems_thaw();
goto Close_Finish;
+ }
error = freeze_kernel_threads();
if (error) {
thaw_processes();
+ filesystems_thaw();
goto Close_Finish;
}
error = load_image_and_restore();
thaw_processes();
+ filesystems_thaw();
Finish:
pm_notifier_call_chain(PM_POST_RESTORE);
Restore:
@@ -1018,11 +1112,43 @@ static int software_resume(void)
pm_pr_dbg("Hibernation image not present or could not be loaded.\n");
return error;
Close_Finish:
- swsusp_close(FMODE_READ | FMODE_EXCL);
+ swsusp_close();
goto Finish;
}
-late_initcall_sync(software_resume);
+/**
+ * software_resume_initcall - Resume from a saved hibernation image.
+ *
+ * This routine is called as a late initcall, when all devices have been
+ * discovered and initialized already.
+ *
+ * The image reading code is called to see if there is a hibernation image
+ * available for reading. If that is the case, devices are quiesced and the
+ * contents of memory is restored from the saved image.
+ *
+ * If this is successful, control reappears in the restored target kernel in
+ * hibernation_snapshot() which returns to hibernate(). Otherwise, the routine
+ * attempts to recover gracefully and make the kernel return to the normal mode
+ * of operation.
+ */
+static int __init software_resume_initcall(void)
+{
+ /*
+ * If the user said "noresume".. bail out early.
+ */
+ if (noresume || !hibernation_available())
+ return 0;
+
+ if (!swsusp_resume_device) {
+ int error = find_resume_device();
+
+ if (error)
+ return error;
+ }
+
+ return software_resume();
+}
+late_initcall_sync(software_resume_initcall);
static const char * const hibernation_modes[] = {
@@ -1064,11 +1190,11 @@ static const char * const hibernation_modes[] = {
static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
+ ssize_t count = 0;
int i;
- char *start = buf;
if (!hibernation_available())
- return sprintf(buf, "[disabled]\n");
+ return sysfs_emit(buf, "[disabled]\n");
for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
if (!hibernation_modes[i])
@@ -1088,22 +1214,27 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
continue;
}
if (i == hibernation_mode)
- buf += sprintf(buf, "[%s] ", hibernation_modes[i]);
+ count += sysfs_emit_at(buf, count, "[%s] ", hibernation_modes[i]);
else
- buf += sprintf(buf, "%s ", hibernation_modes[i]);
+ count += sysfs_emit_at(buf, count, "%s ", hibernation_modes[i]);
}
- buf += sprintf(buf, "\n");
- return buf-start;
+
+ /* Convert the last space to a newline if needed. */
+ if (count > 0)
+ buf[count - 1] = '\n';
+
+ return count;
}
static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
+ int mode = HIBERNATION_INVALID;
+ unsigned int sleep_flags;
int error = 0;
- int i;
int len;
char *p;
- int mode = HIBERNATION_INVALID;
+ int i;
if (!hibernation_available())
return -EPERM;
@@ -1111,7 +1242,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
if (len == strlen(hibernation_modes[i])
&& !strncmp(buf, hibernation_modes[i], len)) {
@@ -1141,7 +1272,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
if (!error)
pm_pr_dbg("Hibernation mode set to '%s'\n",
hibernation_modes[mode]);
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return error ? error : n;
}
@@ -1150,16 +1281,21 @@ power_attr(disk);
static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf, "%d:%d\n", MAJOR(swsusp_resume_device),
- MINOR(swsusp_resume_device));
+ return sysfs_emit(buf, "%d:%d\n", MAJOR(swsusp_resume_device),
+ MINOR(swsusp_resume_device));
}
static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
- dev_t res;
+ unsigned int sleep_flags;
int len = n;
char *name;
+ dev_t dev;
+ int error;
+
+ if (!hibernation_available())
+ return n;
if (len && buf[len-1] == '\n')
len--;
@@ -1167,14 +1303,32 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
if (!name)
return -ENOMEM;
- res = name_to_dev_t(name);
+ error = lookup_bdev(name, &dev);
+ if (error) {
+ unsigned maj, min, offset;
+ char *p, dummy;
+
+ error = 0;
+ if (sscanf(name, "%u:%u%c", &maj, &min, &dummy) == 2 ||
+ sscanf(name, "%u:%u:%u:%c", &maj, &min, &offset,
+ &dummy) == 3) {
+ dev = MKDEV(maj, min);
+ if (maj != MAJOR(dev) || min != MINOR(dev))
+ error = -EINVAL;
+ } else {
+ dev = new_decode_dev(simple_strtoul(name, &p, 16));
+ if (*p)
+ error = -EINVAL;
+ }
+ }
kfree(name);
- if (!res)
- return -EINVAL;
+ if (error)
+ return error;
+
+ sleep_flags = lock_system_sleep();
+ swsusp_resume_device = dev;
+ unlock_system_sleep(sleep_flags);
- lock_system_sleep();
- swsusp_resume_device = res;
- unlock_system_sleep();
pm_pr_dbg("Configured hibernation resume from disk to %u\n",
swsusp_resume_device);
noresume = 0;
@@ -1187,7 +1341,7 @@ power_attr(resume);
static ssize_t resume_offset_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%llu\n", (unsigned long long)swsusp_resume_block);
+ return sysfs_emit(buf, "%llu\n", (unsigned long long)swsusp_resume_block);
}
static ssize_t resume_offset_store(struct kobject *kobj,
@@ -1210,7 +1364,7 @@ power_attr(resume_offset);
static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf, "%lu\n", image_size);
+ return sysfs_emit(buf, "%lu\n", image_size);
}
static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -1231,7 +1385,7 @@ power_attr(image_size);
static ssize_t reserved_size_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%lu\n", reserved_size);
+ return sysfs_emit(buf, "%lu\n", reserved_size);
}
static ssize_t reserved_size_store(struct kobject *kobj,
@@ -1278,7 +1432,7 @@ static int __init resume_setup(char *str)
if (noresume)
return 1;
- strncpy(resume_file, str, 255);
+ strscpy(resume_file, str);
return 1;
}
@@ -1328,7 +1482,7 @@ static int __init resumedelay_setup(char *str)
int rc = kstrtouint(str, 0, &resume_delay);
if (rc)
- return rc;
+ pr_warn("resumedelay: bad option string '%s'\n", str);
return 1;
}
@@ -1339,6 +1493,56 @@ static int __init nohibernate_setup(char *str)
return 1;
}
+static const char * const comp_alg_enabled[] = {
+#if IS_ENABLED(CONFIG_CRYPTO_LZO)
+ COMPRESSION_ALGO_LZO,
+#endif
+#if IS_ENABLED(CONFIG_CRYPTO_LZ4)
+ COMPRESSION_ALGO_LZ4,
+#endif
+};
+
+static int hibernate_compressor_param_set(const char *compressor,
+ const struct kernel_param *kp)
+{
+ int index, ret;
+
+ if (!mutex_trylock(&system_transition_mutex))
+ return -EBUSY;
+
+ index = sysfs_match_string(comp_alg_enabled, compressor);
+ if (index >= 0) {
+ ret = param_set_copystring(comp_alg_enabled[index], kp);
+ if (!ret)
+ strscpy(hib_comp_algo, comp_alg_enabled[index]);
+ } else {
+ ret = index;
+ }
+
+ mutex_unlock(&system_transition_mutex);
+
+ if (ret)
+ pr_debug("Cannot set specified compressor %s\n",
+ compressor);
+
+ return ret;
+}
+
+static const struct kernel_param_ops hibernate_compressor_param_ops = {
+ .set = hibernate_compressor_param_set,
+ .get = param_get_string,
+};
+
+static struct kparam_string hibernate_compressor_param_string = {
+ .maxlen = sizeof(hibernate_compressor),
+ .string = hibernate_compressor,
+};
+
+module_param_cb(compressor, &hibernate_compressor_param_ops,
+ &hibernate_compressor_param_string, 0644);
+MODULE_PARM_DESC(compressor,
+ "Compression algorithm to be used with hibernation");
+
__setup("noresume", noresume_setup);
__setup("resume_offset=", resume_offset_setup);
__setup("resume=", resume_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 44169f3081fd..03b2c5495c77 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -6,7 +6,9 @@
* Copyright (c) 2003 Open Source Development Lab
*/
+#include <linux/acpi.h>
#include <linux/export.h>
+#include <linux/init.h>
#include <linux/kobject.h>
#include <linux/string.h>
#include <linux/pm-trace.h>
@@ -16,37 +18,65 @@
#include <linux/suspend.h>
#include <linux/syscalls.h>
#include <linux/pm_runtime.h>
+#include <linux/atomic.h>
+#include <linux/wait.h>
#include "power.h"
#ifdef CONFIG_PM_SLEEP
+/*
+ * The following functions are used by the suspend/hibernate code to temporarily
+ * change gfp_allowed_mask in order to avoid using I/O during memory allocations
+ * while devices are suspended. To avoid races with the suspend/hibernate code,
+ * they should always be called with system_transition_mutex held
+ * (gfp_allowed_mask also should only be modified with system_transition_mutex
+ * held, unless the suspend/hibernate code is guaranteed not to run in parallel
+ * with that modification).
+ */
+static unsigned int saved_gfp_count;
+static gfp_t saved_gfp_mask;
+
+void pm_restore_gfp_mask(void)
+{
+ WARN_ON(!mutex_is_locked(&system_transition_mutex));
+
+ if (WARN_ON(!saved_gfp_count) || --saved_gfp_count)
+ return;
+
+ gfp_allowed_mask = saved_gfp_mask;
+ saved_gfp_mask = 0;
+
+ pm_pr_dbg("GFP mask restored\n");
+}
-void lock_system_sleep(void)
+void pm_restrict_gfp_mask(void)
{
- current->flags |= PF_FREEZER_SKIP;
+ WARN_ON(!mutex_is_locked(&system_transition_mutex));
+
+ if (saved_gfp_count++) {
+ WARN_ON((saved_gfp_mask & ~(__GFP_IO | __GFP_FS)) != gfp_allowed_mask);
+ return;
+ }
+
+ saved_gfp_mask = gfp_allowed_mask;
+ gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
+
+ pm_pr_dbg("GFP mask restricted\n");
+}
+
+unsigned int lock_system_sleep(void)
+{
+ unsigned int flags = current->flags;
+ current->flags |= PF_NOFREEZE;
mutex_lock(&system_transition_mutex);
+ return flags;
}
EXPORT_SYMBOL_GPL(lock_system_sleep);
-void unlock_system_sleep(void)
+void unlock_system_sleep(unsigned int flags)
{
- /*
- * Don't use freezer_count() because we don't want the call to
- * try_to_freeze() here.
- *
- * Reason:
- * Fundamentally, we just don't need it, because freezing condition
- * doesn't come into effect until we release the
- * system_transition_mutex lock, since the freezer always works with
- * system_transition_mutex held.
- *
- * More importantly, in the case of hibernation,
- * unlock_system_sleep() gets called in snapshot_read() and
- * snapshot_write() when the freezing condition is still in effect.
- * Which means, if we use try_to_freeze() here, it would make them
- * enter the refrigerator, thus causing hibernation to lockup.
- */
- current->flags &= ~PF_FREEZER_SKIP;
+ if (!(flags & PF_NOFREEZE))
+ current->flags &= ~PF_NOFREEZE;
mutex_unlock(&system_transition_mutex);
}
EXPORT_SYMBOL_GPL(unlock_system_sleep);
@@ -64,6 +94,61 @@ void ksys_sync_helper(void)
}
EXPORT_SYMBOL_GPL(ksys_sync_helper);
+#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
+/* Wakeup events handling resolution while syncing file systems in jiffies */
+#define PM_FS_SYNC_WAKEUP_RESOLUTION 5
+
+static atomic_t pm_fs_sync_count = ATOMIC_INIT(0);
+static struct workqueue_struct *pm_fs_sync_wq;
+static DECLARE_WAIT_QUEUE_HEAD(pm_fs_sync_wait);
+
+static bool pm_fs_sync_completed(void)
+{
+ return atomic_read(&pm_fs_sync_count) == 0;
+}
+
+static void pm_fs_sync_work_fn(struct work_struct *work)
+{
+ ksys_sync_helper();
+
+ if (atomic_dec_and_test(&pm_fs_sync_count))
+ wake_up(&pm_fs_sync_wait);
+}
+static DECLARE_WORK(pm_fs_sync_work, pm_fs_sync_work_fn);
+
+/**
+ * pm_sleep_fs_sync() - Sync file systems in an interruptible way
+ *
+ * Return: 0 on successful file system sync, or -EBUSY if the file system sync
+ * was aborted.
+ */
+int pm_sleep_fs_sync(void)
+{
+ pm_wakeup_clear(0);
+
+ /*
+ * Take back-to-back sleeps into account by queuing a subsequent fs sync
+ * only if the previous fs sync is running or is not queued. Multiple fs
+ * syncs increase the likelihood of saving the latest files immediately
+ * before sleep.
+ */
+ if (!work_pending(&pm_fs_sync_work)) {
+ atomic_inc(&pm_fs_sync_count);
+ queue_work(pm_fs_sync_wq, &pm_fs_sync_work);
+ }
+
+ while (!pm_fs_sync_completed()) {
+ if (pm_wakeup_pending())
+ return -EBUSY;
+
+ wait_event_timeout(pm_fs_sync_wait, pm_fs_sync_completed(),
+ PM_FS_SYNC_WAKEUP_RESOLUTION);
+ }
+
+ return 0;
+}
+#endif /* CONFIG_SUSPEND || CONFIG_HIBERNATION */
+
/* Routines for PM-transition notifications */
static BLOCKING_NOTIFIER_HEAD(pm_chain_head);
@@ -97,10 +182,18 @@ int pm_notifier_call_chain(unsigned long val)
/* If set, devices may be suspended and resumed asynchronously. */
int pm_async_enabled = 1;
+static int __init pm_async_setup(char *str)
+{
+ if (!strcmp(str, "off"))
+ pm_async_enabled = 0;
+ return 1;
+}
+__setup("pm_async=", pm_async_setup);
+
static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf, "%d\n", pm_async_enabled);
+ return sysfs_emit(buf, "%d\n", pm_async_enabled);
}
static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -124,24 +217,27 @@ power_attr(pm_async);
static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- char *s = buf;
+ ssize_t count = 0;
suspend_state_t i;
- for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
+ for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) {
+ if (i >= PM_SUSPEND_MEM && cxl_mem_active())
+ continue;
if (mem_sleep_states[i]) {
const char *label = mem_sleep_states[i];
if (mem_sleep_current == i)
- s += sprintf(s, "[%s] ", label);
+ count += sysfs_emit_at(buf, count, "[%s] ", label);
else
- s += sprintf(s, "%s ", label);
+ count += sysfs_emit_at(buf, count, "%s ", label);
}
+ }
/* Convert the last space to a newline if needed. */
- if (s != buf)
- *(s-1) = '\n';
+ if (count > 0)
+ buf[count - 1] = '\n';
- return (s - buf);
+ return count;
}
static suspend_state_t decode_suspend_state(const char *buf, size_t n)
@@ -192,17 +288,17 @@ static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr
power_attr(mem_sleep);
/*
- * sync_on_suspend: invoke ksys_sync_helper() before suspend.
+ * sync_on_suspend: Sync file systems before suspend.
*
- * show() returns whether ksys_sync_helper() is invoked before suspend.
- * store() accepts 0 or 1. 0 disables ksys_sync_helper() and 1 enables it.
+ * show() returns whether file systems sync before suspend is enabled.
+ * store() accepts 0 or 1. 0 disables file systems sync and 1 enables it.
*/
bool sync_on_suspend_enabled = !IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC);
static ssize_t sync_on_suspend_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", sync_on_suspend_enabled);
+ return sysfs_emit(buf, "%d\n", sync_on_suspend_enabled);
}
static ssize_t sync_on_suspend_store(struct kobject *kobj,
@@ -239,37 +335,38 @@ static const char * const pm_tests[__TEST_AFTER_LAST] = {
static ssize_t pm_test_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- char *s = buf;
+ ssize_t count = 0;
int level;
for (level = TEST_FIRST; level <= TEST_MAX; level++)
if (pm_tests[level]) {
if (level == pm_test_level)
- s += sprintf(s, "[%s] ", pm_tests[level]);
+ count += sysfs_emit_at(buf, count, "[%s] ", pm_tests[level]);
else
- s += sprintf(s, "%s ", pm_tests[level]);
+ count += sysfs_emit_at(buf, count, "%s ", pm_tests[level]);
}
- if (s != buf)
- /* convert the last space to a newline */
- *(s-1) = '\n';
+ /* Convert the last space to a newline if needed. */
+ if (count > 0)
+ buf[count - 1] = '\n';
- return (s - buf);
+ return count;
}
static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
+ unsigned int sleep_flags;
const char * const *s;
+ int error = -EINVAL;
int level;
char *p;
int len;
- int error = -EINVAL;
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
level = TEST_FIRST;
for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++)
@@ -279,7 +376,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
break;
}
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return error ? error : n;
}
@@ -287,44 +384,117 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
power_attr(pm_test);
#endif /* CONFIG_PM_SLEEP_DEBUG */
-static char *suspend_step_name(enum suspend_stat_step step)
-{
- switch (step) {
- case SUSPEND_FREEZE:
- return "freeze";
- case SUSPEND_PREPARE:
- return "prepare";
- case SUSPEND_SUSPEND:
- return "suspend";
- case SUSPEND_SUSPEND_NOIRQ:
- return "suspend_noirq";
- case SUSPEND_RESUME_NOIRQ:
- return "resume_noirq";
- case SUSPEND_RESUME:
- return "resume";
- default:
- return "";
+#define SUSPEND_NR_STEPS SUSPEND_RESUME
+#define REC_FAILED_NUM 2
+
+struct suspend_stats {
+ unsigned int step_failures[SUSPEND_NR_STEPS];
+ unsigned int success;
+ unsigned int fail;
+ int last_failed_dev;
+ char failed_devs[REC_FAILED_NUM][40];
+ int last_failed_errno;
+ int errno[REC_FAILED_NUM];
+ int last_failed_step;
+ u64 last_hw_sleep;
+ u64 total_hw_sleep;
+ u64 max_hw_sleep;
+ enum suspend_stat_step failed_steps[REC_FAILED_NUM];
+};
+
+static struct suspend_stats suspend_stats;
+static DEFINE_MUTEX(suspend_stats_lock);
+
+void dpm_save_failed_dev(const char *name)
+{
+ mutex_lock(&suspend_stats_lock);
+
+ strscpy(suspend_stats.failed_devs[suspend_stats.last_failed_dev],
+ name, sizeof(suspend_stats.failed_devs[0]));
+ suspend_stats.last_failed_dev++;
+ suspend_stats.last_failed_dev %= REC_FAILED_NUM;
+
+ mutex_unlock(&suspend_stats_lock);
+}
+
+void dpm_save_failed_step(enum suspend_stat_step step)
+{
+ suspend_stats.step_failures[step-1]++;
+ suspend_stats.failed_steps[suspend_stats.last_failed_step] = step;
+ suspend_stats.last_failed_step++;
+ suspend_stats.last_failed_step %= REC_FAILED_NUM;
+}
+
+void dpm_save_errno(int err)
+{
+ if (!err) {
+ suspend_stats.success++;
+ return;
}
+
+ suspend_stats.fail++;
+
+ suspend_stats.errno[suspend_stats.last_failed_errno] = err;
+ suspend_stats.last_failed_errno++;
+ suspend_stats.last_failed_errno %= REC_FAILED_NUM;
+}
+
+void pm_report_hw_sleep_time(u64 t)
+{
+ suspend_stats.last_hw_sleep = t;
+ suspend_stats.total_hw_sleep += t;
}
+EXPORT_SYMBOL_GPL(pm_report_hw_sleep_time);
+
+void pm_report_max_hw_sleep(u64 t)
+{
+ suspend_stats.max_hw_sleep = t;
+}
+EXPORT_SYMBOL_GPL(pm_report_max_hw_sleep);
+
+static const char * const suspend_step_names[] = {
+ [SUSPEND_WORKING] = "",
+ [SUSPEND_FREEZE] = "freeze",
+ [SUSPEND_PREPARE] = "prepare",
+ [SUSPEND_SUSPEND] = "suspend",
+ [SUSPEND_SUSPEND_LATE] = "suspend_late",
+ [SUSPEND_SUSPEND_NOIRQ] = "suspend_noirq",
+ [SUSPEND_RESUME_NOIRQ] = "resume_noirq",
+ [SUSPEND_RESUME_EARLY] = "resume_early",
+ [SUSPEND_RESUME] = "resume",
+};
+
+#define suspend_attr(_name, format_str) \
+static ssize_t _name##_show(struct kobject *kobj, \
+ struct kobj_attribute *attr, char *buf) \
+{ \
+ return sysfs_emit(buf, format_str, suspend_stats._name);\
+} \
+static struct kobj_attribute _name = __ATTR_RO(_name)
+
+suspend_attr(success, "%u\n");
+suspend_attr(fail, "%u\n");
+suspend_attr(last_hw_sleep, "%llu\n");
+suspend_attr(total_hw_sleep, "%llu\n");
+suspend_attr(max_hw_sleep, "%llu\n");
-#define suspend_attr(_name) \
+#define suspend_step_attr(_name, step) \
static ssize_t _name##_show(struct kobject *kobj, \
struct kobj_attribute *attr, char *buf) \
{ \
- return sprintf(buf, "%d\n", suspend_stats._name); \
+ return sysfs_emit(buf, "%u\n", \
+ suspend_stats.step_failures[step-1]); \
} \
static struct kobj_attribute _name = __ATTR_RO(_name)
-suspend_attr(success);
-suspend_attr(fail);
-suspend_attr(failed_freeze);
-suspend_attr(failed_prepare);
-suspend_attr(failed_suspend);
-suspend_attr(failed_suspend_late);
-suspend_attr(failed_suspend_noirq);
-suspend_attr(failed_resume);
-suspend_attr(failed_resume_early);
-suspend_attr(failed_resume_noirq);
+suspend_step_attr(failed_freeze, SUSPEND_FREEZE);
+suspend_step_attr(failed_prepare, SUSPEND_PREPARE);
+suspend_step_attr(failed_suspend, SUSPEND_SUSPEND);
+suspend_step_attr(failed_suspend_late, SUSPEND_SUSPEND_LATE);
+suspend_step_attr(failed_suspend_noirq, SUSPEND_SUSPEND_NOIRQ);
+suspend_step_attr(failed_resume, SUSPEND_RESUME);
+suspend_step_attr(failed_resume_early, SUSPEND_RESUME_EARLY);
+suspend_step_attr(failed_resume_noirq, SUSPEND_RESUME_NOIRQ);
static ssize_t last_failed_dev_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
@@ -336,7 +506,7 @@ static ssize_t last_failed_dev_show(struct kobject *kobj,
index %= REC_FAILED_NUM;
last_failed_dev = suspend_stats.failed_devs[index];
- return sprintf(buf, "%s\n", last_failed_dev);
+ return sysfs_emit(buf, "%s\n", last_failed_dev);
}
static struct kobj_attribute last_failed_dev = __ATTR_RO(last_failed_dev);
@@ -350,23 +520,21 @@ static ssize_t last_failed_errno_show(struct kobject *kobj,
index %= REC_FAILED_NUM;
last_failed_errno = suspend_stats.errno[index];
- return sprintf(buf, "%d\n", last_failed_errno);
+ return sysfs_emit(buf, "%d\n", last_failed_errno);
}
static struct kobj_attribute last_failed_errno = __ATTR_RO(last_failed_errno);
static ssize_t last_failed_step_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- int index;
enum suspend_stat_step step;
- char *last_failed_step = NULL;
+ int index;
index = suspend_stats.last_failed_step + REC_FAILED_NUM - 1;
index %= REC_FAILED_NUM;
step = suspend_stats.failed_steps[index];
- last_failed_step = suspend_step_name(step);
- return sprintf(buf, "%s\n", last_failed_step);
+ return sysfs_emit(buf, "%s\n", suspend_step_names[step]);
}
static struct kobj_attribute last_failed_step = __ATTR_RO(last_failed_step);
@@ -384,18 +552,37 @@ static struct attribute *suspend_attrs[] = {
&last_failed_dev.attr,
&last_failed_errno.attr,
&last_failed_step.attr,
+ &last_hw_sleep.attr,
+ &total_hw_sleep.attr,
+ &max_hw_sleep.attr,
NULL,
};
+static umode_t suspend_attr_is_visible(struct kobject *kobj, struct attribute *attr, int idx)
+{
+ if (attr != &last_hw_sleep.attr &&
+ attr != &total_hw_sleep.attr &&
+ attr != &max_hw_sleep.attr)
+ return 0444;
+
+#ifdef CONFIG_ACPI
+ if (acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0)
+ return 0444;
+#endif
+ return 0;
+}
+
static const struct attribute_group suspend_attr_group = {
.name = "suspend_stats",
.attrs = suspend_attrs,
+ .is_visible = suspend_attr_is_visible,
};
#ifdef CONFIG_DEBUG_FS
static int suspend_stats_show(struct seq_file *s, void *unused)
{
int i, index, last_dev, last_errno, last_step;
+ enum suspend_stat_step step;
last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
last_dev %= REC_FAILED_NUM;
@@ -403,47 +590,35 @@ static int suspend_stats_show(struct seq_file *s, void *unused)
last_errno %= REC_FAILED_NUM;
last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1;
last_step %= REC_FAILED_NUM;
- seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n"
- "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n",
- "success", suspend_stats.success,
- "fail", suspend_stats.fail,
- "failed_freeze", suspend_stats.failed_freeze,
- "failed_prepare", suspend_stats.failed_prepare,
- "failed_suspend", suspend_stats.failed_suspend,
- "failed_suspend_late",
- suspend_stats.failed_suspend_late,
- "failed_suspend_noirq",
- suspend_stats.failed_suspend_noirq,
- "failed_resume", suspend_stats.failed_resume,
- "failed_resume_early",
- suspend_stats.failed_resume_early,
- "failed_resume_noirq",
- suspend_stats.failed_resume_noirq);
+
+ seq_printf(s, "success: %u\nfail: %u\n",
+ suspend_stats.success, suspend_stats.fail);
+
+ for (step = SUSPEND_FREEZE; step <= SUSPEND_NR_STEPS; step++)
+ seq_printf(s, "failed_%s: %u\n", suspend_step_names[step],
+ suspend_stats.step_failures[step-1]);
+
seq_printf(s, "failures:\n last_failed_dev:\t%-s\n",
- suspend_stats.failed_devs[last_dev]);
+ suspend_stats.failed_devs[last_dev]);
for (i = 1; i < REC_FAILED_NUM; i++) {
index = last_dev + REC_FAILED_NUM - i;
index %= REC_FAILED_NUM;
- seq_printf(s, "\t\t\t%-s\n",
- suspend_stats.failed_devs[index]);
+ seq_printf(s, "\t\t\t%-s\n", suspend_stats.failed_devs[index]);
}
seq_printf(s, " last_failed_errno:\t%-d\n",
suspend_stats.errno[last_errno]);
for (i = 1; i < REC_FAILED_NUM; i++) {
index = last_errno + REC_FAILED_NUM - i;
index %= REC_FAILED_NUM;
- seq_printf(s, "\t\t\t%-d\n",
- suspend_stats.errno[index]);
+ seq_printf(s, "\t\t\t%-d\n", suspend_stats.errno[index]);
}
seq_printf(s, " last_failed_step:\t%-s\n",
- suspend_step_name(
- suspend_stats.failed_steps[last_step]));
+ suspend_step_names[suspend_stats.failed_steps[last_step]]);
for (i = 1; i < REC_FAILED_NUM; i++) {
index = last_step + REC_FAILED_NUM - i;
index %= REC_FAILED_NUM;
seq_printf(s, "\t\t\t%-s\n",
- suspend_step_name(
- suspend_stats.failed_steps[index]));
+ suspend_step_names[suspend_stats.failed_steps[index]]);
}
return 0;
@@ -460,6 +635,10 @@ static int __init pm_debugfs_init(void)
late_initcall(pm_debugfs_init);
#endif /* CONFIG_DEBUG_FS */
+bool pm_sleep_transition_in_progress(void)
+{
+ return pm_suspend_in_progress() || hibernation_in_progress();
+}
#endif /* CONFIG_PM_SLEEP */
#ifdef CONFIG_PM_SLEEP_DEBUG
@@ -474,7 +653,7 @@ bool pm_print_times_enabled;
static ssize_t pm_print_times_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", pm_print_times_enabled);
+ return sysfs_emit(buf, "%d\n", pm_print_times_enabled);
}
static ssize_t pm_print_times_store(struct kobject *kobj,
@@ -497,24 +676,33 @@ power_attr(pm_print_times);
static inline void pm_print_times_init(void)
{
- pm_print_times_enabled = !!initcall_debug;
+ pm_print_times_enabled = initcall_debug;
}
static ssize_t pm_wakeup_irq_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
- return pm_wakeup_irq ? sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA;
+ if (!pm_wakeup_irq())
+ return -ENODATA;
+
+ return sysfs_emit(buf, "%u\n", pm_wakeup_irq());
}
power_attr_ro(pm_wakeup_irq);
bool pm_debug_messages_on __read_mostly;
+bool pm_debug_messages_should_print(void)
+{
+ return pm_debug_messages_on && pm_sleep_transition_in_progress();
+}
+EXPORT_SYMBOL_GPL(pm_debug_messages_should_print);
+
static ssize_t pm_debug_messages_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", pm_debug_messages_on);
+ return sysfs_emit(buf, "%d\n", pm_debug_messages_on);
}
static ssize_t pm_debug_messages_store(struct kobject *kobj,
@@ -542,35 +730,6 @@ static int __init pm_debug_messages_setup(char *str)
}
__setup("pm_debug_messages", pm_debug_messages_setup);
-/**
- * __pm_pr_dbg - Print a suspend debug message to the kernel log.
- * @defer: Whether or not to use printk_deferred() to print the message.
- * @fmt: Message format.
- *
- * The message will be emitted if enabled through the pm_debug_messages
- * sysfs attribute.
- */
-void __pm_pr_dbg(bool defer, const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
-
- if (!pm_debug_messages_on)
- return;
-
- va_start(args, fmt);
-
- vaf.fmt = fmt;
- vaf.va = &args;
-
- if (defer)
- printk_deferred(KERN_DEBUG "PM: %pV", &vaf);
- else
- printk(KERN_DEBUG "PM: %pV", &vaf);
-
- va_end(args);
-}
-
#else /* !CONFIG_PM_SLEEP_DEBUG */
static inline void pm_print_times_init(void) {}
#endif /* CONFIG_PM_SLEEP_DEBUG */
@@ -591,21 +750,23 @@ struct kobject *power_kobj;
static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- char *s = buf;
+ ssize_t count = 0;
#ifdef CONFIG_SUSPEND
suspend_state_t i;
for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
if (pm_states[i])
- s += sprintf(s,"%s ", pm_states[i]);
+ count += sysfs_emit_at(buf, count, "%s ", pm_states[i]);
#endif
if (hibernation_available())
- s += sprintf(s, "disk ");
- if (s != buf)
- /* convert the last space to a newline */
- *(s-1) = '\n';
- return (s - buf);
+ count += sysfs_emit_at(buf, count, "disk ");
+
+ /* Convert the last space to a newline if needed. */
+ if (count > 0)
+ buf[count - 1] = '\n';
+
+ return count;
}
static suspend_state_t decode_state(const char *buf, size_t n)
@@ -705,7 +866,7 @@ static ssize_t wakeup_count_show(struct kobject *kobj,
unsigned int val;
return pm_get_wakeup_count(&val, true) ?
- sprintf(buf, "%u\n", val) : -EINTR;
+ sysfs_emit(buf, "%u\n", val) : -EINTR;
}
static ssize_t wakeup_count_store(struct kobject *kobj,
@@ -747,17 +908,17 @@ static ssize_t autosleep_show(struct kobject *kobj,
suspend_state_t state = pm_autosleep_state();
if (state == PM_SUSPEND_ON)
- return sprintf(buf, "off\n");
+ return sysfs_emit(buf, "off\n");
#ifdef CONFIG_SUSPEND
if (state < PM_SUSPEND_MAX)
- return sprintf(buf, "%s\n", pm_states[state] ?
+ return sysfs_emit(buf, "%s\n", pm_states[state] ?
pm_states[state] : "error");
#endif
#ifdef CONFIG_HIBERNATION
- return sprintf(buf, "disk\n");
+ return sysfs_emit(buf, "disk\n");
#else
- return sprintf(buf, "error");
+ return sysfs_emit(buf, "error\n");
#endif
}
@@ -826,7 +987,7 @@ int pm_trace_enabled;
static ssize_t pm_trace_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf, "%d\n", pm_trace_enabled);
+ return sysfs_emit(buf, "%d\n", pm_trace_enabled);
}
static ssize_t
@@ -863,7 +1024,7 @@ power_attr_ro(pm_trace_dev_match);
static ssize_t pm_freeze_timeout_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%u\n", freeze_timeout_msecs);
+ return sysfs_emit(buf, "%u\n", freeze_timeout_msecs);
}
static ssize_t pm_freeze_timeout_store(struct kobject *kobj,
@@ -883,6 +1044,34 @@ power_attr(pm_freeze_timeout);
#endif /* CONFIG_FREEZER*/
+#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
+bool filesystem_freeze_enabled = false;
+
+static ssize_t freeze_filesystems_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", filesystem_freeze_enabled);
+}
+
+static ssize_t freeze_filesystems_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned long val;
+
+ if (kstrtoul(buf, 10, &val))
+ return -EINVAL;
+
+ if (val > 1)
+ return -EINVAL;
+
+ filesystem_freeze_enabled = !!val;
+ return n;
+}
+
+power_attr(freeze_filesystems);
+#endif /* CONFIG_SUSPEND || CONFIG_HIBERNATION */
+
static struct attribute * g[] = {
&state_attr.attr,
#ifdef CONFIG_PM_TRACE
@@ -913,6 +1102,9 @@ static struct attribute * g[] = {
#ifdef CONFIG_FREEZER
&pm_freeze_timeout_attr.attr,
#endif
+#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
+ &freeze_filesystems_attr.attr,
+#endif
NULL,
};
@@ -931,16 +1123,26 @@ static const struct attribute_group *attr_groups[] = {
struct workqueue_struct *pm_wq;
EXPORT_SYMBOL_GPL(pm_wq);
-static int __init pm_start_workqueue(void)
+static int __init pm_start_workqueues(void)
{
- pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0);
+ pm_wq = alloc_workqueue("pm", WQ_FREEZABLE | WQ_UNBOUND, 0);
+ if (!pm_wq)
+ return -ENOMEM;
+
+#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
+ pm_fs_sync_wq = alloc_ordered_workqueue("pm_fs_sync", 0);
+ if (!pm_fs_sync_wq) {
+ destroy_workqueue(pm_wq);
+ return -ENOMEM;
+ }
+#endif
- return pm_wq ? 0 : -ENOMEM;
+ return 0;
}
static int __init pm_init(void)
{
- int error = pm_start_workqueue();
+ int error = pm_start_workqueues();
if (error)
return error;
hibernate_image_size_init();
diff --git a/kernel/power/power.h b/kernel/power/power.h
index b4f433943209..75b63843886e 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -6,6 +6,7 @@
#include <linux/compiler.h>
#include <linux/cpu.h>
#include <linux/cpuidle.h>
+#include <linux/crypto.h>
struct swsusp_info {
struct new_utsname uts;
@@ -17,6 +18,11 @@ struct swsusp_info {
unsigned long size;
} __aligned(PAGE_SIZE);
+#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
+extern int pm_sleep_fs_sync(void);
+extern bool filesystem_freeze_enabled;
+#endif
+
#ifdef CONFIG_HIBERNATION
/* kernel/power/snapshot.c */
extern void __init hibernate_reserved_size_init(void);
@@ -26,9 +32,6 @@ extern void __init hibernate_image_size_init(void);
/* Maximum size of architecture specific data in a hibernation header */
#define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4)
-extern int arch_hibernation_header_save(void *addr, unsigned int max_size);
-extern int arch_hibernation_header_restore(void *addr);
-
static inline int init_header_complete(struct swsusp_info *info)
{
return arch_hibernation_header_save(info, MAX_ARCH_HEADER_SIZE);
@@ -41,8 +44,6 @@ static inline const char *check_image_kernel(struct swsusp_info *info)
}
#endif /* CONFIG_ARCH_HIBERNATION_HEADER */
-extern int hibernate_resume_nonboot_cpu_disable(void);
-
/*
* Keep some memory free so that I/O operations can succeed without paging
* [Might this be more than 4 MB?]
@@ -59,6 +60,10 @@ asmlinkage int swsusp_save(void);
/* kernel/power/hibernate.c */
extern bool freezer_test_done;
+extern char hib_comp_algo[CRYPTO_MAX_ALG_NAME];
+
+/* kernel/power/swap.c */
+extern unsigned int swsusp_header_flags;
extern int hibernation_snapshot(int platform_mode);
extern int hibernation_restore(int platform_mode);
@@ -71,10 +76,14 @@ extern void enable_restore_image_protection(void);
static inline void enable_restore_image_protection(void) {}
#endif /* CONFIG_STRICT_KERNEL_RWX */
+extern bool hibernation_in_progress(void);
+
#else /* !CONFIG_HIBERNATION */
static inline void hibernate_reserved_size_init(void) {}
static inline void hibernate_image_size_init(void) {}
+
+static inline bool hibernation_in_progress(void) { return false; }
#endif /* !CONFIG_HIBERNATION */
#define power_attr(_name) \
@@ -110,7 +119,7 @@ extern int hibernate_preallocate_memory(void);
extern void clear_or_poison_free_pages(void);
-/**
+/*
* Auxiliary structure used for reading the snapshot image data and
* metadata from and writing them to the list of page backup entries
* (PBEs) which is the main data structure of swsusp.
@@ -153,7 +162,7 @@ extern unsigned int snapshot_additional_pages(struct zone *zone);
extern unsigned long snapshot_get_image_size(void);
extern int snapshot_read_next(struct snapshot_handle *handle);
extern int snapshot_write_next(struct snapshot_handle *handle);
-extern void snapshot_write_finalize(struct snapshot_handle *handle);
+int snapshot_write_finalize(struct snapshot_handle *handle);
extern int snapshot_image_loaded(struct snapshot_handle *handle);
extern bool hibernate_acquire(void);
@@ -167,19 +176,35 @@ extern int swsusp_swap_in_use(void);
* Flags that can be passed from the hibernatig hernel to the "boot" kernel in
* the image header.
*/
+#define SF_COMPRESSION_ALG_LZO 0 /* dummy, details given below */
#define SF_PLATFORM_MODE 1
#define SF_NOCOMPRESS_MODE 2
#define SF_CRC32_MODE 4
#define SF_HW_SIG 8
+/*
+ * Bit to indicate the compression algorithm to be used(for LZ4). The same
+ * could be checked while saving/loading image to/from disk to use the
+ * corresponding algorithms.
+ *
+ * By default, LZO compression is enabled if SF_CRC32_MODE is set. Use
+ * SF_COMPRESSION_ALG_LZ4 to override this behaviour and use LZ4.
+ *
+ * SF_CRC32_MODE, SF_COMPRESSION_ALG_LZO(dummy) -> Compression, LZO
+ * SF_CRC32_MODE, SF_COMPRESSION_ALG_LZ4 -> Compression, LZ4
+ */
+#define SF_COMPRESSION_ALG_LZ4 16
+
/* kernel/power/hibernate.c */
-extern int swsusp_check(void);
+int swsusp_check(bool exclusive);
extern void swsusp_free(void);
extern int swsusp_read(unsigned int *flags_p);
extern int swsusp_write(unsigned int flags);
-extern void swsusp_close(fmode_t);
+void swsusp_close(void);
#ifdef CONFIG_SUSPEND
extern int swsusp_unmark(void);
+#else
+static inline int swsusp_unmark(void) { return 0; }
#endif
struct __kernel_old_timeval;
@@ -325,3 +350,5 @@ static inline void pm_sleep_enable_secondary_cpus(void)
suspend_enable_secondary_cpus();
cpuidle_resume();
}
+
+void dpm_save_errno(int err);
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 562aa0e450ed..1f306f158696 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -23,7 +23,7 @@ static void do_poweroff(struct work_struct *dummy)
static DECLARE_WORK(poweroff_work, do_poweroff);
-static void handle_poweroff(int key)
+static void handle_poweroff(u8 key)
{
/* run sysrq poweroff on boot cpu */
schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index b7e7798637b8..dc0dfc349f22 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -6,9 +6,6 @@
* Originally from swsusp.
*/
-
-#undef DEBUG
-
#include <linux/interrupt.h>
#include <linux/oom.h>
#include <linux/suspend.h>
@@ -30,6 +27,8 @@ unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC;
static int try_to_freeze_tasks(bool user_only)
{
+ const char *what = user_only ? "user space processes" :
+ "remaining freezable tasks";
struct task_struct *g, *p;
unsigned long end_time;
unsigned int todo;
@@ -39,6 +38,8 @@ static int try_to_freeze_tasks(bool user_only)
bool wakeup = false;
int sleep_usecs = USEC_PER_MSEC;
+ pr_info("Freezing %s\n", what);
+
start = ktime_get_boottime();
end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs);
@@ -53,8 +54,7 @@ static int try_to_freeze_tasks(bool user_only)
if (p == current || !freeze_task(p))
continue;
- if (!freezer_should_skip(p))
- todo++;
+ todo++;
}
read_unlock(&tasklist_lock);
@@ -86,28 +86,26 @@ static int try_to_freeze_tasks(bool user_only)
elapsed_msecs = ktime_to_ms(elapsed);
if (todo) {
- pr_cont("\n");
- pr_err("Freezing of tasks %s after %d.%03d seconds "
- "(%d tasks refusing to freeze, wq_busy=%d):\n",
+ pr_err("Freezing %s %s after %d.%03d seconds "
+ "(%d tasks refusing to freeze, wq_busy=%d):\n", what,
wakeup ? "aborted" : "failed",
elapsed_msecs / 1000, elapsed_msecs % 1000,
todo - wq_busy, wq_busy);
if (wq_busy)
- show_all_workqueues();
+ show_freezable_workqueues();
if (!wakeup || pm_debug_messages_on) {
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
- if (p != current && !freezer_should_skip(p)
- && freezing(p) && !frozen(p))
+ if (p != current && freezing(p) && !frozen(p))
sched_show_task(p);
}
read_unlock(&tasklist_lock);
}
} else {
- pr_cont("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000,
- elapsed_msecs % 1000);
+ pr_info("Freezing %s completed (elapsed %d.%03d seconds)\n",
+ what, elapsed_msecs / 1000, elapsed_msecs % 1000);
}
return todo ? -EBUSY : 0;
@@ -132,17 +130,14 @@ int freeze_processes(void)
current->flags |= PF_SUSPEND_TASK;
if (!pm_freezing)
- atomic_inc(&system_freezing_cnt);
+ static_branch_inc(&freezer_active);
- pm_wakeup_clear(true);
- pr_info("Freezing user space processes ... ");
+ pm_wakeup_clear(0);
pm_freezing = true;
error = try_to_freeze_tasks(true);
- if (!error) {
+ if (!error)
__usermodehelper_set_disable_depth(UMH_DISABLED);
- pr_cont("done.");
- }
- pr_cont("\n");
+
BUG_ON(in_atomic());
/*
@@ -171,14 +166,9 @@ int freeze_kernel_threads(void)
{
int error;
- pr_info("Freezing remaining freezable tasks ... ");
-
pm_nosig_freezing = true;
error = try_to_freeze_tasks(false);
- if (!error)
- pr_cont("done.");
- pr_cont("\n");
BUG_ON(in_atomic());
if (error)
@@ -193,19 +183,17 @@ void thaw_processes(void)
trace_suspend_resume(TPS("thaw_processes"), 0, true);
if (pm_freezing)
- atomic_dec(&system_freezing_cnt);
+ static_branch_dec(&freezer_active);
pm_freezing = false;
pm_nosig_freezing = false;
oom_killer_enable();
- pr_info("Restarting tasks ... ");
+ pr_info("Restarting tasks: Starting\n");
__usermodehelper_set_disable_depth(UMH_FREEZING);
thaw_workqueues();
- cpuset_wait_for_hotplug();
-
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
/* No other threads should have PF_SUSPEND_TASK set */
@@ -220,7 +208,7 @@ void thaw_processes(void)
usermodehelper_enable();
schedule();
- pr_cont("done.\n");
+ pr_info("Restarting tasks: Done\n");
trace_suspend_resume(TPS("thaw_processes"), 0, false);
}
@@ -229,7 +217,7 @@ void thaw_kernel_threads(void)
struct task_struct *g, *p;
pm_nosig_freezing = false;
- pr_info("Restarting kernel threads ... ");
+ pr_info("Restarting kernel threads ...\n");
thaw_workqueues();
@@ -241,5 +229,5 @@ void thaw_kernel_threads(void)
read_unlock(&tasklist_lock);
schedule();
- pr_cont("done.\n");
+ pr_info("Done restarting kernel threads.\n");
}
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index ec7e1e85923e..f7d8064e9adc 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c