summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.kexec24
-rw-r--r--kernel/Makefile3
-rw-r--r--kernel/bounds.c1
-rw-r--r--kernel/bpf/Makefile2
-rw-r--r--kernel/bpf/arena.c2
-rw-r--r--kernel/bpf/arraymap.c38
-rw-r--r--kernel/bpf/bpf_insn_array.c304
-rw-r--r--kernel/bpf/bpf_local_storage.c235
-rw-r--r--kernel/bpf/bpf_lsm.c1
-rw-r--r--kernel/bpf/bpf_struct_ops.c2
-rw-r--r--kernel/bpf/cgroup.c8
-rw-r--r--kernel/bpf/core.c26
-rw-r--r--kernel/bpf/disasm.c3
-rw-r--r--kernel/bpf/hashtab.c67
-rw-r--r--kernel/bpf/helpers.c302
-rw-r--r--kernel/bpf/inode.c15
-rw-r--r--kernel/bpf/liveness.c42
-rw-r--r--kernel/bpf/log.c3
-rw-r--r--kernel/bpf/range_tree.c21
-rw-r--r--kernel/bpf/ringbuf.c114
-rw-r--r--kernel/bpf/rqspinlock.c90
-rw-r--r--kernel/bpf/stackmap.c62
-rw-r--r--kernel/bpf/stream.c159
-rw-r--r--kernel/bpf/syscall.c89
-rw-r--r--kernel/bpf/trampoline.c83
-rw-r--r--kernel/bpf/verifier.c983
-rw-r--r--kernel/cgroup/cgroup.c92
-rw-r--r--kernel/cgroup/cpuset-internal.h13
-rw-r--r--kernel/cgroup/cpuset.c357
-rw-r--r--kernel/configs/debug.config2
-rw-r--r--kernel/cpu_pm.c12
-rw-r--r--kernel/crash_reserve.c3
-rw-r--r--kernel/debug/kdb/kdb_io.c47
-rw-r--r--kernel/dma/contiguous.c11
-rw-r--r--kernel/dma/direct.c4
-rw-r--r--kernel/dma/dummy.c13
-rw-r--r--kernel/dma/map_benchmark.c2
-rw-r--r--kernel/dma/mapping.c26
-rw-r--r--kernel/dma/ops_helpers.c12
-rw-r--r--kernel/dma/pool.c2
-rw-r--r--kernel/dma/swiotlb.c2
-rw-r--r--kernel/exit.c8
-rw-r--r--kernel/fork.c74
-rw-r--r--kernel/futex/waitwake.c9
-rw-r--r--kernel/hung_task.c56
-rw-r--r--kernel/irq/generic-chip.c14
-rw-r--r--kernel/irq/irqdesc.c6
-rw-r--r--kernel/irq/pm.c11
-rw-r--r--kernel/kallsyms.c5
-rw-r--r--kernel/kexec_core.c161
-rw-r--r--kernel/kexec_handover_internal.h20
-rw-r--r--kernel/kstack_erase.c2
-rw-r--r--kernel/ksysfs.c68
-rw-r--r--kernel/livepatch/core.c8
-rw-r--r--kernel/liveupdate/Kconfig75
-rw-r--r--kernel/liveupdate/Makefile12
-rw-r--r--kernel/liveupdate/kexec_handover.c (renamed from kernel/kexec_handover.c)716
-rw-r--r--kernel/liveupdate/kexec_handover_debug.c (renamed from kernel/kexec_handover_debug.c)0
-rw-r--r--kernel/liveupdate/kexec_handover_debugfs.c221
-rw-r--r--kernel/liveupdate/kexec_handover_internal.h55
-rw-r--r--kernel/liveupdate/luo_core.c450
-rw-r--r--kernel/liveupdate/luo_file.c889
-rw-r--r--kernel/liveupdate/luo_internal.h110
-rw-r--r--kernel/liveupdate/luo_session.c646
-rw-r--r--kernel/locking/locktorture.c8
-rw-r--r--kernel/module/main.c2
-rw-r--r--kernel/panic.c52
-rw-r--r--kernel/printk/internal.h53
-rw-r--r--kernel/printk/nbcon.c174
-rw-r--r--kernel/printk/printk.c307
-rw-r--r--kernel/printk/printk_ringbuffer.c67
-rw-r--r--kernel/rcu/Kconfig.debug15
-rw-r--r--kernel/rcu/rcutorture.c76
-rw-r--r--kernel/rcu/refscale.c379
-rw-r--r--kernel/rcu/srcutiny.c13
-rw-r--r--kernel/rcu/srcutree.c130
-rw-r--r--kernel/rcu/tree.c2
-rw-r--r--kernel/rcu/update.c8
-rw-r--r--kernel/relay.c33
-rw-r--r--kernel/resource.c10
-rw-r--r--kernel/sched/autogroup.c4
-rw-r--r--kernel/sched/core.c15
-rw-r--r--kernel/sched/deadline.c54
-rw-r--r--kernel/sched/ext.c1067
-rw-r--r--kernel/sched/ext_idle.c43
-rw-r--r--kernel/sched/ext_internal.h29
-rw-r--r--kernel/sched/fair.c3
-rw-r--r--kernel/sched/sched.h6
-rw-r--r--kernel/sched/stats.h7
-rw-r--r--kernel/scs.c2
-rw-r--r--kernel/smp.c22
-rw-r--r--kernel/sysctl.c649
-rw-r--r--kernel/time/jiffies.c125
-rw-r--r--kernel/time/sched_clock.c22
-rw-r--r--kernel/time/timekeeping.c22
-rw-r--r--kernel/trace/Kconfig40
-rw-r--r--kernel/trace/Makefile17
-rw-r--r--kernel/trace/blktrace.c539
-rw-r--r--kernel/trace/bpf_trace.c48
-rw-r--r--kernel/trace/fgraph.c22
-rw-r--r--kernel/trace/fprobe.c303
-rw-r--r--kernel/trace/ftrace.c49
-rw-r--r--kernel/trace/pid_list.c30
-rw-r--r--kernel/trace/pid_list.h1
-rw-r--r--kernel/trace/ring_buffer.c103
-rw-r--r--kernel/trace/ring_buffer_benchmark.c2
-rw-r--r--kernel/trace/rv/reactor_panic.c6
-rw-r--r--kernel/trace/rv/reactor_printk.c6
-rw-r--r--kernel/trace/rv/rv.c102
-rw-r--r--kernel/trace/rv/rv.h6
-rw-r--r--kernel/trace/rv/rv_reactors.c78
-rw-r--r--kernel/trace/trace.c910
-rw-r--r--kernel/trace/trace.h239
-rw-r--r--kernel/trace/trace_dynevent.c11
-rw-r--r--kernel/trace/trace_entries.h15
-rw-r--r--kernel/trace/trace_eprobe.c127
-rw-r--r--kernel/trace/trace_events.c12
-rw-r--r--kernel/trace/trace_events_filter.c2
-rw-r--r--kernel/trace/trace_events_hist.c145
-rw-r--r--kernel/trace/trace_events_synth.c3
-rw-r--r--kernel/trace/trace_events_trigger.c410
-rw-r--r--kernel/trace/trace_events_user.c6
-rw-r--r--kernel/trace/trace_fprobe.c6
-rw-r--r--kernel/trace/trace_functions.c10
-rw-r--r--kernel/trace/trace_functions_graph.c223
-rw-r--r--kernel/trace/trace_irqsoff.c30
-rw-r--r--kernel/trace/trace_kdb.c2
-rw-r--r--kernel/trace/trace_kprobe.c6
-rw-r--r--kernel/trace/trace_osnoise.c12
-rw-r--r--kernel/trace/trace_output.c51
-rw-r--r--kernel/trace/trace_output.h11
-rw-r--r--kernel/trace/trace_probe.c7
-rw-r--r--kernel/trace/trace_probe.h4
-rw-r--r--kernel/trace/trace_sched_wakeup.c24
-rw-r--r--kernel/trace/trace_seq.c2
-rw-r--r--kernel/trace/trace_syscalls.c935
-rw-r--r--kernel/trace/trace_uprobe.c82
-rw-r--r--kernel/vmcore_info.c17
-rw-r--r--kernel/watchdog.c62
-rw-r--r--kernel/workqueue.c86
140 files changed, 10942 insertions, 4152 deletions
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
index 54e581072617..15632358bcf7 100644
--- a/kernel/Kconfig.kexec
+++ b/kernel/Kconfig.kexec
@@ -94,30 +94,6 @@ config KEXEC_JUMP
Jump between original kernel and kexeced kernel and invoke
code in physical address mode via KEXEC
-config KEXEC_HANDOVER
- bool "kexec handover"
- depends on ARCH_SUPPORTS_KEXEC_HANDOVER && ARCH_SUPPORTS_KEXEC_FILE
- depends on !DEFERRED_STRUCT_PAGE_INIT
- select MEMBLOCK_KHO_SCRATCH
- select KEXEC_FILE
- select DEBUG_FS
- select LIBFDT
- select CMA
- help
- Allow kexec to hand over state across kernels by generating and
- passing additional metadata to the target kernel. This is useful
- to keep data or state alive across the kexec. For this to work,
- both source and target kernels need to have this option enabled.
-
-config KEXEC_HANDOVER_DEBUG
- bool "Enable Kexec Handover debug checks"
- depends on KEXEC_HANDOVER
- help
- This option enables extra sanity checks for the Kexec Handover
- subsystem. Since, KHO performance is crucial in live update
- scenarios and the extra code might be adding overhead it is
- only optionally enabled.
-
config CRASH_DUMP
bool "kernel crash dumps"
default ARCH_DEFAULT_CRASH_DUMP
diff --git a/kernel/Makefile b/kernel/Makefile
index 9fe722305c9b..e83669841b8c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -52,6 +52,7 @@ obj-y += printk/
obj-y += irq/
obj-y += rcu/
obj-y += livepatch/
+obj-y += liveupdate/
obj-y += dma/
obj-y += entry/
obj-y += unwind/
@@ -82,8 +83,6 @@ obj-$(CONFIG_CRASH_DUMP_KUNIT_TEST) += crash_core_test.o
obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o
-obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o
-obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup/
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 29b2cd00df2c..02b619eb6106 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -6,6 +6,7 @@
*/
#define __GENERATING_BOUNDS_H
+#define COMPILE_OFFSETS
/* Include headers that define the enum constants of interest */
#include <linux/page-flags.h>
#include <linux/mmzone.h>
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 7fd0badfacb1..232cbc97434d 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -9,7 +9,7 @@ CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy)
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
-obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
+obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o bpf_insn_array.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 1074ac4459f2..872dc0e41c65 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -334,7 +334,7 @@ static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long ad
return -EINVAL;
}
- ret = mm_get_unmapped_area(current->mm, filp, addr, len * 2, 0, flags);
+ ret = mm_get_unmapped_area(filp, addr, len * 2, 0, flags);
if (IS_ERR_VALUE(ret))
return ret;
if ((ret >> 32) == ((ret + len - 1) >> 32))
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 80b1765a3159..1eeb31c5b317 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -335,18 +335,17 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
}
/* Called from syscall */
-static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+int bpf_array_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
- struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = key ? *(u32 *)key : U32_MAX;
u32 *next = (u32 *)next_key;
- if (index >= array->map.max_entries) {
+ if (index >= map->max_entries) {
*next = 0;
return 0;
}
- if (index == array->map.max_entries - 1)
+ if (index == map->max_entries - 1)
return -ENOENT;
*next = index + 1;
@@ -448,19 +447,12 @@ static void array_map_free_internal_structs(struct bpf_map *map)
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
- /* We don't reset or free fields other than timer and workqueue
- * on uref dropping to zero.
- */
- if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) {
- for (i = 0; i < array->map.max_entries; i++) {
- if (btf_record_has_field(map->record, BPF_TIMER))
- bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i));
- if (btf_record_has_field(map->record, BPF_WORKQUEUE))
- bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i));
- if (btf_record_has_field(map->record, BPF_TASK_WORK))
- bpf_obj_free_task_work(map->record, array_map_elem_ptr(array, i));
- }
- }
+ /* We only free internal structs on uref dropping to zero */
+ if (!bpf_map_has_internal_structs(map))
+ return;
+
+ for (i = 0; i < array->map.max_entries; i++)
+ bpf_map_free_internal_structs(map, array_map_elem_ptr(array, i));
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
@@ -796,7 +788,7 @@ const struct bpf_map_ops array_map_ops = {
.map_alloc_check = array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = array_map_free,
- .map_get_next_key = array_map_get_next_key,
+ .map_get_next_key = bpf_array_get_next_key,
.map_release_uref = array_map_free_internal_structs,
.map_lookup_elem = array_map_lookup_elem,
.map_update_elem = array_map_update_elem,
@@ -822,7 +814,7 @@ const struct bpf_map_ops percpu_array_map_ops = {
.map_alloc_check = array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = array_map_free,
- .map_get_next_key = array_map_get_next_key,
+ .map_get_next_key = bpf_array_get_next_key,
.map_lookup_elem = percpu_array_map_lookup_elem,
.map_gen_lookup = percpu_array_map_gen_lookup,
.map_update_elem = array_map_update_elem,
@@ -1211,7 +1203,7 @@ const struct bpf_map_ops prog_array_map_ops = {
.map_poke_track = prog_array_map_poke_track,
.map_poke_untrack = prog_array_map_poke_untrack,
.map_poke_run = prog_array_map_poke_run,
- .map_get_next_key = array_map_get_next_key,
+ .map_get_next_key = bpf_array_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = prog_fd_array_get_ptr,
@@ -1315,7 +1307,7 @@ const struct bpf_map_ops perf_event_array_map_ops = {
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = perf_event_fd_array_map_free,
- .map_get_next_key = array_map_get_next_key,
+ .map_get_next_key = bpf_array_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = perf_event_fd_array_get_ptr,
@@ -1351,7 +1343,7 @@ const struct bpf_map_ops cgroup_array_map_ops = {
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = cgroup_fd_array_free,
- .map_get_next_key = array_map_get_next_key,
+ .map_get_next_key = bpf_array_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = cgroup_fd_array_get_ptr,
@@ -1436,7 +1428,7 @@ const struct bpf_map_ops array_of_maps_map_ops = {
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = array_of_map_alloc,
.map_free = array_of_map_free,
- .map_get_next_key = array_map_get_next_key,
+ .map_get_next_key = bpf_array_get_next_key,
.map_lookup_elem = array_of_map_lookup_elem,
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = bpf_map_fd_get_ptr,
diff --git a/kernel/bpf/bpf_insn_array.c b/kernel/bpf/bpf_insn_array.c
new file mode 100644
index 000000000000..c96630cb75bf
--- /dev/null
+++ b/kernel/bpf/bpf_insn_array.c
@@ -0,0 +1,304 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025 Isovalent */
+
+#include <linux/bpf.h>
+
+struct bpf_insn_array {
+ struct bpf_map map;
+ atomic_t used;
+ long *ips;
+ DECLARE_FLEX_ARRAY(struct bpf_insn_array_value, values);
+};
+
+#define cast_insn_array(MAP_PTR) \
+ container_of((MAP_PTR), struct bpf_insn_array, map)
+
+#define INSN_DELETED ((u32)-1)
+
+static inline u64 insn_array_alloc_size(u32 max_entries)
+{
+ const u64 base_size = sizeof(struct bpf_insn_array);
+ const u64 entry_size = sizeof(struct bpf_insn_array_value);
+
+ return base_size + max_entries * (entry_size + sizeof(long));
+}
+
+static int insn_array_alloc_check(union bpf_attr *attr)
+{
+ u32 value_size = sizeof(struct bpf_insn_array_value);
+
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ attr->value_size != value_size || attr->map_flags != 0)
+ return -EINVAL;
+
+ return 0;
+}
+
+static void insn_array_free(struct bpf_map *map)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+
+ bpf_map_area_free(insn_array);
+}
+
+static struct bpf_map *insn_array_alloc(union bpf_attr *attr)
+{
+ u64 size = insn_array_alloc_size(attr->max_entries);
+ struct bpf_insn_array *insn_array;
+
+ insn_array = bpf_map_area_alloc(size, NUMA_NO_NODE);
+ if (!insn_array)
+ return ERR_PTR(-ENOMEM);
+
+ /* ips are allocated right after the insn_array->values[] array */
+ insn_array->ips = (void *)&insn_array->values[attr->max_entries];
+
+ bpf_map_init_from_attr(&insn_array->map, attr);
+
+ /* BPF programs aren't allowed to write to the map */
+ insn_array->map.map_flags |= BPF_F_RDONLY_PROG;
+
+ return &insn_array->map;
+}
+
+static void *insn_array_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+ u32 index = *(u32 *)key;
+
+ if (unlikely(index >= insn_array->map.max_entries))
+ return NULL;
+
+ return &insn_array->values[index];
+}
+
+static long insn_array_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+ u32 index = *(u32 *)key;
+ struct bpf_insn_array_value val = {};
+
+ if (unlikely(index >= insn_array->map.max_entries))
+ return -E2BIG;
+
+ if (unlikely(map_flags & BPF_NOEXIST))
+ return -EEXIST;
+
+ copy_map_value(map, &val, value);
+ if (val.jitted_off || val.xlated_off)
+ return -EINVAL;
+
+ insn_array->values[index].orig_off = val.orig_off;
+
+ return 0;
+}
+
+static long insn_array_delete_elem(struct bpf_map *map, void *key)
+{
+ return -EINVAL;
+}
+
+static int insn_array_check_btf(const struct bpf_map *map,
+ const struct btf *btf,
+ const struct btf_type *key_type,
+ const struct btf_type *value_type)
+{
+ if (!btf_type_is_i32(key_type))
+ return -EINVAL;
+
+ if (!btf_type_is_i64(value_type))
+ return -EINVAL;
+
+ return 0;
+}
+
+static u64 insn_array_mem_usage(const struct bpf_map *map)
+{
+ return insn_array_alloc_size(map->max_entries);
+}
+
+static int insn_array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+
+ if ((off % sizeof(long)) != 0 ||
+ (off / sizeof(long)) >= map->max_entries)
+ return -EINVAL;
+
+ /* from BPF's point of view, this map is a jump table */
+ *imm = (unsigned long)insn_array->ips + off;
+
+ return 0;
+}
+
+BTF_ID_LIST_SINGLE(insn_array_btf_ids, struct, bpf_insn_array)
+
+const struct bpf_map_ops insn_array_map_ops = {
+ .map_alloc_check = insn_array_alloc_check,
+ .map_alloc = insn_array_alloc,
+ .map_free = insn_array_free,
+ .map_get_next_key = bpf_array_get_next_key,
+ .map_lookup_elem = insn_array_lookup_elem,
+ .map_update_elem = insn_array_update_elem,
+ .map_delete_elem = insn_array_delete_elem,
+ .map_check_btf = insn_array_check_btf,
+ .map_mem_usage = insn_array_mem_usage,
+ .map_direct_value_addr = insn_array_map_direct_value_addr,
+ .map_btf_id = &insn_array_btf_ids[0],
+};
+
+static inline bool is_frozen(struct bpf_map *map)
+{
+ guard(mutex)(&map->freeze_mutex);
+
+ return map->frozen;
+}
+
+static bool is_insn_array(const struct bpf_map *map)
+{
+ return map->map_type == BPF_MAP_TYPE_INSN_ARRAY;
+}
+
+static inline bool valid_offsets(const struct bpf_insn_array *insn_array,
+ const struct bpf_prog *prog)
+{
+ u32 off;
+ int i;
+
+ for (i = 0; i < insn_array->map.max_entries; i++) {
+ off = insn_array->values[i].orig_off;
+
+ if (off >= prog->len)
+ return false;
+
+ if (off > 0) {
+ if (prog->insnsi[off-1].code == (BPF_LD | BPF_DW | BPF_IMM))
+ return false;
+ }
+ }
+
+ return true;
+}
+
+int bpf_insn_array_init(struct bpf_map *map, const struct bpf_prog *prog)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+ struct bpf_insn_array_value *values = insn_array->values;
+ int i;
+
+ if (!is_frozen(map))
+ return -EINVAL;
+
+ if (!valid_offsets(insn_array, prog))
+ return -EINVAL;
+
+ /*
+ * There can be only one program using the map
+ */
+ if (atomic_xchg(&insn_array->used, 1))
+ return -EBUSY;
+
+ /*
+ * Reset all the map indexes to the original values. This is needed,
+ * e.g., when a replay of verification with different log level should
+ * be performed.
+ */
+ for (i = 0; i < map->max_entries; i++)
+ values[i].xlated_off = values[i].orig_off;
+
+ return 0;
+}
+
+int bpf_insn_array_ready(struct bpf_map *map)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+ int i;
+
+ for (i = 0; i < map->max_entries; i++) {
+ if (insn_array->values[i].xlated_off == INSN_DELETED)
+ continue;
+ if (!insn_array->ips[i])
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+void bpf_insn_array_release(struct bpf_map *map)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+
+ atomic_set(&insn_array->used, 0);
+}
+
+void bpf_insn_array_adjust(struct bpf_map *map, u32 off, u32 len)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+ int i;
+
+ if (len <= 1)
+ return;
+
+ for (i = 0; i < map->max_entries; i++) {
+ if (insn_array->values[i].xlated_off <= off)
+ continue;
+ if (insn_array->values[i].xlated_off == INSN_DELETED)
+ continue;
+ insn_array->values[i].xlated_off += len - 1;
+ }
+}
+
+void bpf_insn_array_adjust_after_remove(struct bpf_map *map, u32 off, u32 len)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+ int i;
+
+ for (i = 0; i < map->max_entries; i++) {
+ if (insn_array->values[i].xlated_off < off)
+ continue;
+ if (insn_array->values[i].xlated_off == INSN_DELETED)
+ continue;
+ if (insn_array->values[i].xlated_off < off + len)
+ insn_array->values[i].xlated_off = INSN_DELETED;
+ else
+ insn_array->values[i].xlated_off -= len;
+ }
+}
+
+/*
+ * This function is called by JITs. The image is the real program
+ * image, the offsets array set up the xlated -> jitted mapping.
+ * The offsets[xlated] offset should point to the beginning of
+ * the jitted instruction.
+ */
+void bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image)
+{
+ struct bpf_insn_array *insn_array;
+ struct bpf_map *map;
+ u32 xlated_off;
+ int i, j;
+
+ if (!offsets || !image)
+ return;
+
+ for (i = 0; i < prog->aux->used_map_cnt; i++) {
+ map = prog->aux->used_maps[i];
+ if (!is_insn_array(map))
+ continue;
+
+ insn_array = cast_insn_array(map);
+ for (j = 0; j < map->max_entries; j++) {
+ xlated_off = insn_array->values[j].xlated_off;
+ if (xlated_off == INSN_DELETED)
+ continue;
+ if (xlated_off < prog->aux->subprog_start)
+ continue;
+ xlated_off -= prog->aux->subprog_start;
+ if (xlated_off >= prog->len)
+ continue;
+
+ insn_array->values[j].jitted_off = offsets[xlated_off];
+ insn_array->ips[j] = (long)(image + offsets[xlated_off]);
+ }
+ }
+}
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index b931fbceb54d..e2fe6c32822b 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -73,30 +73,24 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
struct bpf_local_storage_elem *
bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
- void *value, bool charge_mem, bool swap_uptrs, gfp_t gfp_flags)
+ void *value, bool swap_uptrs, gfp_t gfp_flags)
{
struct bpf_local_storage_elem *selem;
- if (charge_mem && mem_charge(smap, owner, smap->elem_size))
+ if (mem_charge(smap, owner, smap->elem_size))
return NULL;
- if (smap->bpf_ma) {
- selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags);
- if (selem)
- /* Keep the original bpf_map_kzalloc behavior
- * before started using the bpf_mem_cache_alloc.
- *
- * No need to use zero_map_value. The bpf_selem_free()
- * only does bpf_mem_cache_free when there is
- * no other bpf prog is using the selem.
- */
- memset(SDATA(selem)->data, 0, smap->map.value_size);
+ if (smap->use_kmalloc_nolock) {
+ selem = bpf_map_kmalloc_nolock(&smap->map, smap->elem_size,
+ __GFP_ZERO, NUMA_NO_NODE);
} else {
selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
gfp_flags | __GFP_NOWARN);
}
if (selem) {
+ RCU_INIT_POINTER(SDATA(selem)->smap, smap);
+
if (value) {
/* No need to call check_and_init_map_value as memory is zero init */
copy_map_value(&smap->map, SDATA(selem)->data, value);
@@ -106,13 +100,12 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
return selem;
}
- if (charge_mem)
- mem_uncharge(smap, owner, smap->elem_size);
+ mem_uncharge(smap, owner, smap->elem_size);
return NULL;
}
-/* rcu tasks trace callback for bpf_ma == false */
+/* rcu tasks trace callback for use_kmalloc_nolock == false */
static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage *local_storage;
@@ -127,12 +120,23 @@ static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
kfree_rcu(local_storage, rcu);
}
+/* Handle use_kmalloc_nolock == false */
+static void __bpf_local_storage_free(struct bpf_local_storage *local_storage,
+ bool vanilla_rcu)
+{
+ if (vanilla_rcu)
+ kfree_rcu(local_storage, rcu);
+ else
+ call_rcu_tasks_trace(&local_storage->rcu,
+ __bpf_local_storage_free_trace_rcu);
+}
+
static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage *local_storage;
local_storage = container_of(rcu, struct bpf_local_storage, rcu);
- bpf_mem_cache_raw_free(local_storage);
+ kfree_nolock(local_storage);
}
static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
@@ -143,46 +147,27 @@ static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
call_rcu(rcu, bpf_local_storage_free_rcu);
}
-/* Handle bpf_ma == false */
-static void __bpf_local_storage_free(struct bpf_local_storage *local_storage,
- bool vanilla_rcu)
-{
- if (vanilla_rcu)
- kfree_rcu(local_storage, rcu);
- else
- call_rcu_tasks_trace(&local_storage->rcu,
- __bpf_local_storage_free_trace_rcu);
-}
-
static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
- struct bpf_local_storage_map *smap,
- bool bpf_ma, bool reuse_now)
+ bool reuse_now)
{
if (!local_storage)
return;
- if (!bpf_ma) {
+ if (!local_storage->use_kmalloc_nolock) {
__bpf_local_storage_free(local_storage, reuse_now);
return;
}
- if (!reuse_now) {
- call_rcu_tasks_trace(&local_storage->rcu,
- bpf_local_storage_free_trace_rcu);
+ if (reuse_now) {
+ call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
return;
}
- if (smap)
- bpf_mem_cache_free(&smap->storage_ma, local_storage);
- else
- /* smap could be NULL if the selem that triggered
- * this 'local_storage' creation had been long gone.
- * In this case, directly do call_rcu().
- */
- call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
+ call_rcu_tasks_trace(&local_storage->rcu,
+ bpf_local_storage_free_trace_rcu);
}
-/* rcu tasks trace callback for bpf_ma == false */
+/* rcu tasks trace callback for use_kmalloc_nolock == false */
static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage_elem *selem;
@@ -194,7 +179,7 @@ static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
kfree_rcu(selem, rcu);
}
-/* Handle bpf_ma == false */
+/* Handle use_kmalloc_nolock == false */
static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
bool vanilla_rcu)
{
@@ -216,7 +201,7 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
migrate_disable();
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
migrate_enable();
- bpf_mem_cache_raw_free(selem);
+ kfree_nolock(selem);
}
static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
@@ -228,14 +213,17 @@ static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
}
void bpf_selem_free(struct bpf_local_storage_elem *selem,
- struct bpf_local_storage_map *smap,
bool reuse_now)
{
- if (!smap->bpf_ma) {
- /* Only task storage has uptrs and task storage
- * has moved to bpf_mem_alloc. Meaning smap->bpf_ma == true
- * for task storage, so this bpf_obj_free_fields() won't unpin
- * any uptr.
+ struct bpf_local_storage_map *smap;
+
+ smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
+
+ if (!smap->use_kmalloc_nolock) {
+ /*
+ * No uptr will be unpin even when reuse_now == false since uptr
+ * is only supported in task local storage, where
+ * smap->use_kmalloc_nolock == true.
*/
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
__bpf_selem_free(selem, reuse_now);
@@ -243,18 +231,11 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
}
if (reuse_now) {
- /* reuse_now == true only happens when the storage owner
- * (e.g. task_struct) is being destructed or the map itself
- * is being destructed (ie map_free). In both cases,
- * no bpf prog can have a hold on the selem. It is
- * safe to unpin the uptrs and free the selem now.
- */
- bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
- /* Instead of using the vanilla call_rcu(),
- * bpf_mem_cache_free will be able to reuse selem
- * immediately.
+ /*
+ * While it is okay to call bpf_obj_free_fields() that unpins uptr when
+ * reuse_now == true, keep it in bpf_selem_free_rcu() for simplicity.
*/
- bpf_mem_cache_free(&smap->selem_ma, selem);
+ call_rcu(&selem->rcu, bpf_selem_free_rcu);
return;
}
@@ -264,7 +245,6 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now)
{
struct bpf_local_storage_elem *selem;
- struct bpf_local_storage_map *smap;
struct hlist_node *n;
/* The "_safe" iteration is needed.
@@ -272,10 +252,8 @@ static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now)
* but bpf_selem_free will use the selem->rcu_head
* which is union-ized with the selem->free_node.
*/
- hlist_for_each_entry_safe(selem, n, list, free_node) {
- smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
- bpf_selem_free(selem, smap, reuse_now);
- }
+ hlist_for_each_entry_safe(selem, n, list, free_node)
+ bpf_selem_free(selem, reuse_now);
}
/* local_storage->lock must be held and selem->local_storage == local_storage.
@@ -284,7 +262,7 @@ static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now)
*/
static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
struct bpf_local_storage_elem *selem,
- bool uncharge_mem, struct hlist_head *free_selem_list)
+ struct hlist_head *free_selem_list)
{
struct bpf_local_storage_map *smap;
bool free_local_storage;
@@ -297,8 +275,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
* The owner may be freed once the last selem is unlinked
* from local_storage.
*/
- if (uncharge_mem)
- mem_uncharge(smap, owner, smap->elem_size);
+ mem_uncharge(smap, owner, smap->elem_size);
free_local_storage = hlist_is_singular_node(&selem->snode,
&local_storage->list);
@@ -336,47 +313,11 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
return free_local_storage;
}
-static bool check_storage_bpf_ma(struct bpf_local_storage *local_storage,
- struct bpf_local_storage_map *storage_smap,
- struct bpf_local_storage_elem *selem)
-{
-
- struct bpf_local_storage_map *selem_smap;
-
- /* local_storage->smap may be NULL. If it is, get the bpf_ma
- * from any selem in the local_storage->list. The bpf_ma of all
- * local_storage and selem should have the same value
- * for the same map type.
- *
- * If the local_storage->list is already empty, the caller will not
- * care about the bpf_ma value also because the caller is not
- * responsible to free the local_storage.
- */
-
- if (storage_smap)
- return storage_smap->bpf_ma;
-
- if (!selem) {
- struct hlist_node *n;
-
- n = rcu_dereference_check(hlist_first_rcu(&local_storage->list),
- bpf_rcu_lock_held());
- if (!n)
- return false;
-
- selem = hlist_entry(n, struct bpf_local_storage_elem, snode);
- }
- selem_smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
-
- return selem_smap->bpf_ma;
-}
-
static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
bool reuse_now)
{
- struct bpf_local_storage_map *storage_smap;
struct bpf_local_storage *local_storage;
- bool bpf_ma, free_local_storage = false;
+ bool free_local_storage = false;
HLIST_HEAD(selem_free_list);
unsigned long flags;
@@ -386,20 +327,17 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
local_storage = rcu_dereference_check(selem->local_storage,
bpf_rcu_lock_held());
- storage_smap = rcu_dereference_check(local_storage->smap,
- bpf_rcu_lock_held());
- bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, selem);
raw_spin_lock_irqsave(&local_storage->lock, flags);
if (likely(selem_linked_to_storage(selem)))
free_local_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, true, &selem_free_list);
+ local_storage, selem, &selem_free_list);
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
bpf_selem_free_list(&selem_free_list, reuse_now);
if (free_local_storage)
- bpf_local_storage_free(local_storage, storage_smap, bpf_ma, reuse_now);
+ bpf_local_storage_free(local_storage, reuse_now);
}
void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
@@ -434,7 +372,6 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap,
unsigned long flags;
raw_spin_lock_irqsave(&b->lock, flags);
- RCU_INIT_POINTER(SDATA(selem)->smap, smap);
hlist_add_head_rcu(&selem->map_node, &b->list);
raw_spin_unlock_irqrestore(&b->lock, flags);
}
@@ -493,8 +430,9 @@ int bpf_local_storage_alloc(void *owner,
if (err)
return err;
- if (smap->bpf_ma)
- storage = bpf_mem_cache_alloc_flags(&smap->storage_ma, gfp_flags);
+ if (smap->use_kmalloc_nolock)
+ storage = bpf_map_kmalloc_nolock(&smap->map, sizeof(*storage),
+ __GFP_ZERO, NUMA_NO_NODE);
else
storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
gfp_flags | __GFP_NOWARN);
@@ -507,6 +445,7 @@ int bpf_local_storage_alloc(void *owner,
INIT_HLIST_HEAD(&storage->list);
raw_spin_lock_init(&storage->lock);
storage->owner = owner;
+ storage->use_kmalloc_nolock = smap->use_kmalloc_nolock;
bpf_selem_link_storage_nolock(storage, first_selem);
bpf_selem_link_map(smap, first_selem);
@@ -528,22 +467,12 @@ int bpf_local_storage_alloc(void *owner,
bpf_selem_unlink_map(first_selem);
err = -EAGAIN;
goto uncharge;
-
- /* Note that even first_selem was linked to smap's
- * bucket->list, first_selem can be freed immediately
- * (instead of kfree_rcu) because
- * bpf_local_storage_map_free() does a
- * synchronize_rcu_mult (waiting for both sleepable and
- * normal programs) before walking the bucket->list.
- * Hence, no one is accessing selem from the
- * bucket->list under rcu_read_lock().
- */
}
return 0;
uncharge:
- bpf_local_storage_free(storage, smap, smap->bpf_ma, true);
+ bpf_local_storage_free(storage, true);
mem_uncharge(smap, owner, sizeof(*storage));
return err;
}
@@ -582,13 +511,13 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (err)
return ERR_PTR(err);
- selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags);
+ selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags);
if (!selem)
return ERR_PTR(-ENOMEM);
err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags);
if (err) {
- bpf_selem_free(selem, smap, true);
+ bpf_selem_free(selem, true);
mem_uncharge(smap, owner, smap->elem_size);
return ERR_PTR(err);
}
@@ -616,7 +545,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
/* A lookup has just been done before and concluded a new selem is
* needed. The chance of an unnecessary alloc is unlikely.
*/
- alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags);
+ alloc_selem = selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags);
if (!alloc_selem)
return ERR_PTR(-ENOMEM);
@@ -656,7 +585,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (old_sdata) {
bpf_selem_unlink_map(SELEM(old_sdata));
bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
- true, &old_selem_free_list);
+ &old_selem_free_list);
}
unlock:
@@ -664,7 +593,7 @@ unlock:
bpf_selem_free_list(&old_selem_free_list, false);
if (alloc_selem) {
mem_uncharge(smap, owner, smap->elem_size);
- bpf_selem_free(alloc_selem, smap, true);
+ bpf_selem_free(alloc_selem, true);
}
return err ? ERR_PTR(err) : SDATA(selem);
}
@@ -730,16 +659,12 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map,
void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
{
- struct bpf_local_storage_map *storage_smap;
struct bpf_local_storage_elem *selem;
- bool bpf_ma, free_storage = false;
+ bool free_storage = false;
HLIST_HEAD(free_selem_list);
struct hlist_node *n;
unsigned long flags;
- storage_smap = rcu_dereference_check(local_storage->smap, bpf_rcu_lock_held());
- bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, NULL);
-
/* Neither the bpf_prog nor the bpf_map's syscall
* could be modifying the local_storage->list now.
* Thus, no elem can be added to or deleted from the
@@ -762,14 +687,14 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
* of the loop will set the free_cgroup_storage to true.
*/
free_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, true, &free_selem_list);
+ local_storage, selem, &free_selem_list);
}
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
bpf_selem_free_list(&free_selem_list, true);
if (free_storage)
- bpf_local_storage_free(local_storage, storage_smap, bpf_ma, true);
+ bpf_local_storage_free(local_storage, true);
}
u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
@@ -782,20 +707,10 @@ u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
return usage;
}
-/* When bpf_ma == true, the bpf_mem_alloc is used to allocate and free memory.
- * A deadlock free allocator is useful for storage that the bpf prog can easily
- * get a hold of the owner PTR_TO_BTF_ID in any context. eg. bpf_get_current_task_btf.
- * The task and cgroup storage fall into this case. The bpf_mem_alloc reuses
- * memory immediately. To be reuse-immediate safe, the owner destruction
- * code path needs to go through a rcu grace period before calling
- * bpf_local_storage_destroy().
- *
- * When bpf_ma == false, the kmalloc and kfree are used.
- */
struct bpf_map *
bpf_local_storage_map_alloc(union bpf_attr *attr,
struct bpf_local_storage_cache *cache,
- bool bpf_ma)
+ bool use_kmalloc_nolock)
{
struct bpf_local_storage_map *smap;
unsigned int i;
@@ -829,20 +744,9 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
/* In PREEMPT_RT, kmalloc(GFP_ATOMIC) is still not safe in non
* preemptible context. Thus, enforce all storages to use
- * bpf_mem_alloc when CONFIG_PREEMPT_RT is enabled.
+ * kmalloc_nolock() when CONFIG_PREEMPT_RT is enabled.
*/
- smap->bpf_ma = IS_ENABLED(CONFIG_PREEMPT_RT) ? true : bpf_ma;
- if (smap->bpf_ma) {
- err = bpf_mem_alloc_init(&smap->selem_ma, smap->elem_size, false);
- if (err)
- goto free_smap;
-
- err = bpf_mem_alloc_init(&smap->storage_ma, sizeof(struct bpf_local_storage), false);
- if (err) {
- bpf_mem_alloc_destroy(&smap->selem_ma);
- goto free_smap;
- }
- }
+ smap->use_kmalloc_nolock = IS_ENABLED(CONFIG_PREEMPT_RT) ? true : use_kmalloc_nolock;
smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
return &smap->map;
@@ -912,12 +816,9 @@ void bpf_local_storage_map_free(struct bpf_map *map,
*/
synchronize_rcu();
- if (smap->bpf_ma) {
+ if (smap->use_kmalloc_nolock) {
rcu_barrier_tasks_trace();
- if (!rcu_trace_implies_rcu_gp())
- rcu_barrier();
- bpf_mem_alloc_destroy(&smap->selem_ma);
- bpf_mem_alloc_destroy(&smap->storage_ma);
+ rcu_barrier();
}
kvfree(smap->buckets);
bpf_map_area_free(smap);
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 0a59df1c550a..7cb6e8d4282c 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -51,6 +51,7 @@ BTF_ID(func, bpf_lsm_key_getsecurity)
BTF_ID(func, bpf_lsm_audit_rule_match)
#endif
BTF_ID(func, bpf_lsm_ismaclabel)
+BTF_ID(func, bpf_lsm_file_alloc_security)
BTF_SET_END(bpf_lsm_disabled_hooks)
/* List of LSM hooks that should operate on 'current' cgroup regardless
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index a41e6730edcf..278490683d28 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -1162,6 +1162,7 @@ bool bpf_struct_ops_get(const void *kdata)
map = __bpf_map_inc_not_zero(&st_map->map, false);
return !IS_ERR(map);
}
+EXPORT_SYMBOL_GPL(bpf_struct_ops_get);
void bpf_struct_ops_put(const void *kdata)
{
@@ -1173,6 +1174,7 @@ void bpf_struct_ops_put(const void *kdata)
bpf_map_put(&st_map->map);
}
+EXPORT_SYMBOL_GPL(bpf_struct_ops_put);
u32 bpf_struct_ops_id(const void *kdata)
{
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 248f517d66d0..69988af44b37 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1665,7 +1665,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
* returned value != 1 during execution. In all other cases, 0 is returned.
*/
int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
- struct sockaddr *uaddr,
+ struct sockaddr_unsized *uaddr,
int *uaddrlen,
enum cgroup_bpf_attach_type atype,
void *t_ctx,
@@ -1676,7 +1676,7 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
.uaddr = uaddr,
.t_ctx = t_ctx,
};
- struct sockaddr_storage unspec;
+ struct sockaddr_storage storage;
struct cgroup *cgrp;
int ret;
@@ -1688,8 +1688,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
return 0;
if (!ctx.uaddr) {
- memset(&unspec, 0, sizeof(unspec));
- ctx.uaddr = (struct sockaddr *)&unspec;
+ memset(&storage, 0, sizeof(storage));
+ ctx.uaddr = (struct sockaddr_unsized *)&storage;
ctx.uaddrlen = 0;
} else {
ctx.uaddrlen = *uaddrlen;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index d595fe512498..c8ae6ab31651 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1450,6 +1450,23 @@ void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other)
bpf_prog_clone_free(fp_other);
}
+static void adjust_insn_arrays(struct bpf_prog *prog, u32 off, u32 len)
+{
+#ifdef CONFIG_BPF_SYSCALL
+ struct bpf_map *map;
+ int i;
+
+ if (len <= 1)
+ return;
+
+ for (i = 0; i < prog->aux->used_map_cnt; i++) {
+ map = prog->aux->used_maps[i];
+ if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY)
+ bpf_insn_array_adjust(map, off, len);
+ }
+#endif
+}
+
struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
{
struct bpf_insn insn_buff[16], aux[2];
@@ -1505,6 +1522,9 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
clone = tmp;
insn_delta = rewritten - 1;
+ /* Instructions arrays must be updated using absolute xlated offsets */
+ adjust_insn_arrays(clone, prog->aux->subprog_start + i, rewritten);
+
/* Walk new program and skip insns we just inserted. */
insn = clone->insnsi + i + insn_delta;
insn_cnt += insn_delta;
@@ -1688,6 +1708,7 @@ bool bpf_opcode_in_insntable(u8 code)
[BPF_LD | BPF_IND | BPF_B] = true,
[BPF_LD | BPF_IND | BPF_H] = true,
[BPF_LD | BPF_IND | BPF_W] = true,
+ [BPF_JMP | BPF_JA | BPF_X] = true,
[BPF_JMP | BPF_JCOND] = true,
};
#undef BPF_INSN_3_TBL
@@ -3129,8 +3150,9 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
return -EFAULT;
}
-int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
- void *addr1, void *addr2)
+int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t,
+ enum bpf_text_poke_type new_t, void *old_addr,
+ void *new_addr)
{
return -ENOTSUPP;
}
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index 20883c6b1546..f8a3c7eb451e 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -358,6 +358,9 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
} else if (insn->code == (BPF_JMP | BPF_JA)) {
verbose(cbs->private_data, "(%02x) goto pc%+d\n",
insn->code, insn->off);
+ } else if (insn->code == (BPF_JMP | BPF_JA | BPF_X)) {
+ verbose(cbs->private_data, "(%02x) gotox r%d\n",
+ insn->code, insn->dst_reg);
} else if (insn->code == (BPF_JMP | BPF_JCOND) &&
insn->src_reg == BPF_MAY_GOTO) {
verbose(cbs->private_data, "(%02x) may_goto pc%+d\n",
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index c2fcd0cd51e5..c8a9b27f8663 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -215,19 +215,6 @@ static bool htab_has_extra_elems(struct bpf_htab *htab)
return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab);
}
-static void htab_free_internal_structs(struct bpf_htab *htab, struct htab_elem *elem)
-{
- if (btf_record_has_field(htab->map.record, BPF_TIMER))
- bpf_obj_free_timer(htab->map.record,
- htab_elem_value(elem, htab->map.key_size));
- if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
- bpf_obj_free_workqueue(htab->map.record,
- htab_elem_value(elem, htab->map.key_size));
- if (btf_record_has_field(htab->map.record, BPF_TASK_WORK))
- bpf_obj_free_task_work(htab->map.record,
- htab_elem_value(elem, htab->map.key_size));
-}
-
static void htab_free_prealloced_internal_structs(struct bpf_htab *htab)
{
u32 num_entries = htab->map.max_entries;
@@ -240,7 +227,8 @@ static void htab_free_prealloced_internal_structs(struct bpf_htab *htab)
struct htab_elem *elem;
elem = get_htab_elem(htab, i);
- htab_free_internal_structs(htab, elem);
+ bpf_map_free_internal_structs(&htab->map,
+ htab_elem_value(elem, htab->map.key_size));
cond_resched();
}
}
@@ -669,8 +657,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
struct htab_elem *l;
u32 hash, key_size;
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
- !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
key_size = map->key_size;
@@ -947,15 +934,21 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
void *value, bool onallcpus)
{
+ void *ptr;
+
if (!onallcpus) {
/* copy true value_size bytes */
- copy_map_value(&htab->map, this_cpu_ptr(pptr), value);
+ ptr = this_cpu_ptr(pptr);
+ copy_map_value(&htab->map, ptr, value);
+ bpf_obj_free_fields(htab->map.record, ptr);
} else {
u32 size = round_up(htab->map.value_size, 8);
int off = 0, cpu;
for_each_possible_cpu(cpu) {
- copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value + off);
+ ptr = per_cpu_ptr(pptr, cpu);
+ copy_map_value_long(&htab->map, ptr, value + off);
+ bpf_obj_free_fields(htab->map.record, ptr);
off += size;
}
}
@@ -1098,8 +1091,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
/* unknown flags */
return -EINVAL;
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
- !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
key_size = map->key_size;
@@ -1206,8 +1198,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value
/* unknown flags */
return -EINVAL;
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
- !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
key_size = map->key_size;
@@ -1275,8 +1266,7 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key,
/* unknown flags */
return -EINVAL;
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
- !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
key_size = map->key_size;
@@ -1338,8 +1328,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
/* unknown flags */
return -EINVAL;
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
- !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
key_size = map->key_size;
@@ -1416,8 +1405,7 @@ static long htab_map_delete_elem(struct bpf_map *map, void *key)
u32 hash, key_size;
int ret;
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
- !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
key_size = map->key_size;
@@ -1452,8 +1440,7 @@ static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
u32 hash, key_size;
int ret;
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
- !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
key_size = map->key_size;
@@ -1509,8 +1496,9 @@ static void htab_free_malloced_internal_structs(struct bpf_htab *htab)
struct htab_elem *l;
hlist_nulls_for_each_entry(l, n, head, hash_node) {
- /* We only free timer on uref dropping to zero */
- htab_free_internal_structs(htab, l);
+ /* We only free internal structs on uref dropping to zero */
+ bpf_map_free_internal_structs(&htab->map,
+ htab_elem_value(l, htab->map.key_size));
}
cond_resched_rcu();
}
@@ -1521,13 +1509,14 @@ static void htab_map_free_internal_structs(struct bpf_map *map)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- /* We only free timer and workqueue on uref dropping to zero */
- if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) {
- if (!htab_is_prealloc(htab))
- htab_free_malloced_internal_structs(htab);
- else
- htab_free_prealloced_internal_structs(htab);
- }
+ /* We only free internal structs on uref dropping to zero */
+ if (!bpf_map_has_internal_structs(map))
+ return;
+
+ if (htab_is_prealloc(htab))
+ htab_free_prealloced_internal_structs(htab);
+ else
+ htab_free_malloced_internal_structs(htab);
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index e4007fea4909..db72b96f9c8c 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -28,6 +28,7 @@
#include <linux/verification.h>
#include <linux/task_work.h>
#include <linux/irq_work.h>
+#include <linux/buildid.h>
#include "../../lib/kstrtox.h"
@@ -42,8 +43,7 @@
*/
BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key)
{
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
- !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
return (unsigned long) map->ops->map_lookup_elem(map, key);
}
@@ -59,8 +59,7 @@ const struct bpf_func_proto bpf_map_lookup_elem_proto = {
BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key,
void *, value, u64, flags)
{
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
- !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
return map->ops->map_update_elem(map, key, value, flags);
}
@@ -77,8 +76,7 @@ const struct bpf_func_proto bpf_map_update_elem_proto = {
BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key)
{
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
- !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
return map->ops->map_delete_elem(map, key);
}
@@ -134,8 +132,7 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = {
BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu)
{
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
- !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu);
}
@@ -777,9 +774,11 @@ int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs)
{
int nest_level;
+ preempt_disable();
nest_level = this_cpu_inc_return(bpf_bprintf_nest_level);
if (WARN_ON_ONCE(nest_level > MAX_BPRINTF_NEST_LEVEL)) {
this_cpu_dec(bpf_bprintf_nest_level);
+ preempt_enable();
return -EBUSY;
}
*bufs = this_cpu_ptr(&bpf_bprintf_bufs[nest_level - 1]);
@@ -792,6 +791,7 @@ void bpf_put_buffers(void)
if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0))
return;
this_cpu_dec(bpf_bprintf_nest_level);
+ preempt_enable();
}
void bpf_bprintf_cleanup(struct bpf_bprintf_data *data)
@@ -1660,6 +1660,13 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = {
.arg2_btf_id = BPF_PTR_POISON,
};
+struct bpf_dynptr_file_impl {
+ struct freader freader;
+ /* 64 bit offset and size overriding 32 bit ones in bpf_dynptr_kern */
+ u64 offset;
+ u64 size;
+};
+
/* Since the upper 8 bits of dynptr->size is reserved, the
* maximum supported size is 2^24 - 1.
*/
@@ -1688,23 +1695,65 @@ static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *pt
return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT;
}
-u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr)
+u64 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr)
{
+ if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) {
+ struct bpf_dynptr_file_impl *df = ptr->data;
+
+ return df->size;
+ }
+
return ptr->size & DYNPTR_SIZE_MASK;
}
-static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u32 new_size)
+static void bpf_dynptr_advance_offset(struct bpf_dynptr_kern *ptr, u64 off)
+{
+ if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) {
+ struct bpf_dynptr_file_impl *df = ptr->data;
+
+ df->offset += off;
+ return;
+ }
+ ptr->offset += off;
+}
+
+static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u64 new_size)
{
u32 metadata = ptr->size & ~DYNPTR_SIZE_MASK;
- ptr->size = new_size | metadata;
+ if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) {
+ struct bpf_dynptr_file_impl *df = ptr->data;
+
+ df->size = new_size;
+ return;
+ }
+ ptr->size = (u32)new_size | metadata;
}
-int bpf_dynptr_check_size(u32 size)
+int bpf_dynptr_check_size(u64 size)
{
return size > DYNPTR_MAX_SIZE ? -E2BIG : 0;
}
+static int bpf_file_fetch_bytes(struct bpf_dynptr_file_impl *df, u64 offset, void *buf, u64 len)
+{
+ const void *ptr;
+
+ if (!buf)
+ return -EINVAL;
+
+ df->freader.buf = buf;
+ df->freader.buf_sz = len;
+ ptr = freader_fetch(&df->freader, offset + df->offset, len);
+ if (!ptr)
+ return df->freader.err;
+
+ if (ptr != buf) /* Force copying into the buffer */
+ memcpy(buf, ptr, len);
+
+ return 0;
+}
+
void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
enum bpf_dynptr_type type, u32 offset, u32 size)
{
@@ -1719,7 +1768,7 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
memset(ptr, 0, sizeof(*ptr));
}
-BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr)
+BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u64, size, u64, flags, struct bpf_dynptr_kern *, ptr)
{
int err;
@@ -1754,8 +1803,8 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
.arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE,
};
-static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *src,
- u32 offset, u64 flags)
+static int __bpf_dynptr_read(void *dst, u64 len, const struct bpf_dynptr_kern *src,
+ u64 offset, u64 flags)
{
enum bpf_dynptr_type type;
int err;
@@ -1785,14 +1834,16 @@ static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *s
case BPF_DYNPTR_TYPE_SKB_META:
memmove(dst, bpf_skb_meta_pointer(src->data, src->offset + offset), len);
return 0;
+ case BPF_DYNPTR_TYPE_FILE:
+ return bpf_file_fetch_bytes(src->data, offset, dst, len);
default:
WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type);
return -EFAULT;
}
}
-BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src,
- u32, offset, u64, flags)
+BPF_CALL_5(bpf_dynptr_read, void *, dst, u64, len, const struct bpf_dynptr_kern *, src,
+ u64, offset, u64, flags)
{
return __bpf_dynptr_read(dst, len, src, offset, flags);
}
@@ -1808,8 +1859,8 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = {
.arg5_type = ARG_ANYTHING,
};
-int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src,
- u32 len, u64 flags)
+int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u64 offset, void *src,
+ u64 len, u64 flags)
{
enum bpf_dynptr_type type;
int err;
@@ -1842,18 +1893,16 @@ int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src,
return -EINVAL;
return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len);
case BPF_DYNPTR_TYPE_SKB_META:
- if (flags)
- return -EINVAL;
- memmove(bpf_skb_meta_pointer(dst->data, dst->offset + offset), src, len);
- return 0;
+ return __bpf_skb_meta_store_bytes(dst->data, dst->offset + offset, src,
+ len, flags);
default:
WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type);
return -EFAULT;
}
}
-BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src,
- u32, len, u64, flags)
+BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u64, offset, void *, src,
+ u64, len, u64, flags)
{
return __bpf_dynptr_write(dst, offset, src, len, flags);
}
@@ -1869,7 +1918,7 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = {
.arg5_type = ARG_ANYTHING,
};
-BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len)
+BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u64, offset, u64, len)
{
enum bpf_dynptr_type type;
int err;
@@ -2684,12 +2733,12 @@ __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid)
* provided buffer, with its contents containing the data, if unable to obtain
* direct pointer)
*/
-__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset,
- void *buffer__opt, u32 buffer__szk)
+__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset,
+ void *buffer__opt, u64 buffer__szk)
{
const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
enum bpf_dynptr_type type;
- u32 len = buffer__szk;
+ u64 len = buffer__szk;
int err;
if (!ptr->data)
@@ -2723,6 +2772,9 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset,
}
case BPF_DYNPTR_TYPE_SKB_META:
return bpf_skb_meta_pointer(ptr->data, ptr->offset + offset);
+ case BPF_DYNPTR_TYPE_FILE:
+ err = bpf_file_fetch_bytes(ptr->data, offset, buffer__opt, buffer__szk);
+ return err ? NULL : buffer__opt;
default:
WARN_ONCE(true, "unknown dynptr type %d\n", type);
return NULL;
@@ -2771,8 +2823,8 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset,
* provided buffer, with its contents containing the data, if unable to obtain
* direct pointer)
*/
-__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset,
- void *buffer__opt, u32 buffer__szk)
+__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset,
+ void *buffer__opt, u64 buffer__szk)
{
const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
@@ -2804,10 +2856,10 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset,
return bpf_dynptr_slice(p, offset, buffer__opt, buffer__szk);
}
-__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u32 start, u32 end)
+__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u64 start, u64 end)
{
struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
- u32 size;
+ u64 size;
if (!ptr->data || start > end)
return -EINVAL;
@@ -2817,7 +2869,7 @@ __bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u32 start, u32 end
if (start > size || end > size)
return -ERANGE;
- ptr->offset += start;
+ bpf_dynptr_advance_offset(ptr, start);
bpf_dynptr_set_size(ptr, end - start);
return 0;
@@ -2840,7 +2892,7 @@ __bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p)
return __bpf_dynptr_is_rdonly(ptr);
}
-__bpf_kfunc __u32 bpf_dynptr_size(const struct bpf_dynptr *p)
+__bpf_kfunc u64 bpf_dynptr_size(const struct bpf_dynptr *p)
{
struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
@@ -2877,14 +2929,14 @@ __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p,
* Copies data from source dynptr to destination dynptr.
* Returns 0 on success; negative error, otherwise.
*/
-__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off,
- struct bpf_dynptr *src_ptr, u32 src_off, u32 size)
+__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u64 dst_off,
+ struct bpf_dynptr *src_ptr, u64 src_off, u64 size)
{
struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr;
struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr;
void *src_slice, *dst_slice;
char buf[256];
- u32 off;
+ u64 off;
src_slice = bpf_dynptr_slice(src_ptr, src_off, NULL, size);
dst_slice = bpf_dynptr_slice_rdwr(dst_ptr, dst_off, NULL, size);
@@ -2906,7 +2958,7 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off,
off = 0;
while (off < size) {
- u32 chunk_sz = min_t(u32, sizeof(buf), size - off);
+ u64 chunk_sz = min_t(u64, sizeof(buf), size - off);
int err;
err = __bpf_dynptr_read(buf, chunk_sz, src, src_off + off, 0);
@@ -2932,10 +2984,10 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off,
* at @offset with the constant byte @val.
* Returns 0 on success; negative error, otherwise.
*/
- __bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u32 offset, u32 size, u8 val)
- {
+__bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u64 offset, u64 size, u8 val)
+{
struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
- u32 chunk_sz, write_off;
+ u64 chunk_sz, write_off;
char buf[256];
void* slice;
int err;
@@ -2954,11 +3006,11 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off,
return err;
/* Non-linear data under the dynptr, write from a local buffer */
- chunk_sz = min_t(u32, sizeof(buf), size);
+ chunk_sz = min_t(u64, sizeof(buf), size);
memset(buf, val, chunk_sz);
for (write_off = 0; write_off < size; write_off += chunk_sz) {
- chunk_sz = min_t(u32, sizeof(buf), size - write_off);
+ chunk_sz = min_t(u64, sizeof(buf), size - write_off);
err = __bpf_dynptr_write(ptr, offset + write_off, buf, chunk_sz, 0);
if (err)
return err;
@@ -3678,34 +3730,21 @@ err_out:
return -EFAULT;
}
-/**
- * bpf_strnstr - Find the first substring in a length-limited string
- * @s1__ign: The string to be searched
- * @s2__ign: The string to search for
- * @len: the maximum number of characters to search
- *
- * Return:
- * * >=0 - Index of the first character of the first occurrence of @s2__ign
- * within the first @len characters of @s1__ign
- * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign
- * * %-EFAULT - Cannot read one of the strings
- * * %-E2BIG - One of the strings is too large
- * * %-ERANGE - One of the strings is outside of kernel address space
- */
-__bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len)
+static int __bpf_strnstr(const char *s1, const char *s2, size_t len,
+ bool ignore_case)
{
char c1, c2;
int i, j;
- if (!copy_from_kernel_nofault_allowed(s1__ign, 1) ||
- !copy_from_kernel_nofault_allowed(s2__ign, 1)) {
+ if (!copy_from_kernel_nofault_allowed(s1, 1) ||
+ !copy_from_kernel_nofault_allowed(s2, 1)) {
return -ERANGE;
}
guard(pagefault)();
for (i = 0; i < XATTR_SIZE_MAX; i++) {
for (j = 0; i + j <= len && j < XATTR_SIZE_MAX; j++) {
- __get_kernel_nofault(&c2, s2__ign + j, char, err_out);
+ __get_kernel_nofault(&c2, s2 + j, char, err_out);
if (c2 == '\0')
return i;
/*
@@ -3715,7 +3754,13 @@ __bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len
*/
if (i + j == len)
break;
- __get_kernel_nofault(&c1, s1__ign + j, char, err_out);
+ __get_kernel_nofault(&c1, s1 + j, char, err_out);
+
+ if (ignore_case) {
+ c1 = tolower(c1);
+ c2 = tolower(c2);
+ }
+
if (c1 == '\0')
return -ENOENT;
if (c1 != c2)
@@ -3725,7 +3770,7 @@ __bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len
return -E2BIG;
if (i + j == len)
return -ENOENT;
- s1__ign++;
+ s1++;
}
return -E2BIG;
err_out:
@@ -3747,8 +3792,69 @@ err_out:
*/
__bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign)
{
- return bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX);
+ return __bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX, false);
+}
+
+/**
+ * bpf_strcasestr - Find the first substring in a string, ignoring the case of
+ * the characters
+ * @s1__ign: The string to be searched
+ * @s2__ign: The string to search for
+ *
+ * Return:
+ * * >=0 - Index of the first character of the first occurrence of @s2__ign
+ * within @s1__ign
+ * * %-ENOENT - @s2__ign is not a substring of @s1__ign
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of the strings is too large
+ * * %-ERANGE - One of the strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strcasestr(const char *s1__ign, const char *s2__ign)
+{
+ return __bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX, true);
}
+
+/**
+ * bpf_strnstr - Find the first substring in a length-limited string
+ * @s1__ign: The string to be searched
+ * @s2__ign: The string to search for
+ * @len: the maximum number of characters to search
+ *
+ * Return:
+ * * >=0 - Index of the first character of the first occurrence of @s2__ign
+ * within the first @len characters of @s1__ign
+ * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of the strings is too large
+ * * %-ERANGE - One of the strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign,
+ size_t len)
+{
+ return __bpf_strnstr(s1__ign, s2__ign, len, false);
+}
+
+/**
+ * bpf_strncasestr - Find the first substring in a length-limited string,
+ * ignoring the case of the characters
+ * @s1__ign: The string to be searched
+ * @s2__ign: The string to search for
+ * @len: the maximum number of characters to search
+ *
+ * Return:
+ * * >=0 - Index of the first character of the first occurrence of @s2__ign
+ * within the first @len characters of @s1__ign
+ * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of the strings is too large
+ * * %-ERANGE - One of the strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strncasestr(const char *s1__ign, const char *s2__ign,
+ size_t len)
+{
+ return __bpf_strnstr(s1__ign, s2__ign, len, true);
+}
+
#ifdef CONFIG_KEYS
/**
* bpf_lookup_user_key - lookup a key by its serial
@@ -4206,6 +4312,54 @@ __bpf_kfunc int bpf_task_work_schedule_resume_impl(struct task_struct *task,
return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME);
}
+static int make_file_dynptr(struct file *file, u32 flags, bool may_sleep,
+ struct bpf_dynptr_kern *ptr)
+{
+ struct bpf_dynptr_file_impl *state;
+
+ /* flags is currently unsupported */
+ if (flags) {
+ bpf_dynptr_set_null(ptr);
+ return -EINVAL;
+ }
+
+ state = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_dynptr_file_impl));
+ if (!state) {
+ bpf_dynptr_set_null(ptr);
+ return -ENOMEM;
+ }
+ state->offset = 0;
+ state->size = U64_MAX; /* Don't restrict size, as file may change anyways */
+ freader_init_from_file(&state->freader, NULL, 0, file, may_sleep);
+ bpf_dynptr_init(ptr, state, BPF_DYNPTR_TYPE_FILE, 0, 0);
+ bpf_dynptr_set_rdonly(ptr);
+ return 0;
+}
+
+__bpf_kfunc int bpf_dynptr_from_file(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit)
+{
+ return make_file_dynptr(file, flags, false, (struct bpf_dynptr_kern *)ptr__uninit);
+}
+
+int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit)
+{
+ return make_file_dynptr(file, flags, true, (struct bpf_dynptr_kern *)ptr__uninit);
+}
+
+__bpf_kfunc int bpf_dynptr_file_discard(struct bpf_dynptr *dynptr)
+{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)dynptr;
+ struct bpf_dynptr_file_impl *df = ptr->data;
+
+ if (!df)
+ return 0;
+
+ freader_cleanup(&df->freader);
+ bpf_mem_free(&bpf_global_ma, df);
+ bpf_dynptr_set_null(ptr);
+ return 0;
+}
+
__bpf_kfunc_end_defs();
static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work)
@@ -4376,13 +4530,17 @@ BTF_ID_FLAGS(func, bpf_strnlen);
BTF_ID_FLAGS(func, bpf_strspn);
BTF_ID_FLAGS(func, bpf_strcspn);
BTF_ID_FLAGS(func, bpf_strstr);
+BTF_ID_FLAGS(func, bpf_strcasestr);
BTF_ID_FLAGS(func, bpf_strnstr);
+BTF_ID_FLAGS(func, bpf_strncasestr);
#if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS)
BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU)
#endif
BTF_ID_FLAGS(func, bpf_stream_vprintk_impl, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_dynptr_from_file, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_dynptr_file_discard)
BTF_KFUNCS_END(common_btf_ids)
static const struct btf_kfunc_id_set common_kfunc_set = {
@@ -4423,7 +4581,7 @@ late_initcall(kfunc_init);
/* Get a pointer to dynptr data up to len bytes for read only access. If
* the dynptr doesn't have continuous data up to len bytes, return NULL.
*/
-const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len)
+const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u64 len)
{
const struct bpf_dynptr *p = (struct bpf_dynptr *)ptr;
@@ -4434,9 +4592,19 @@ const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len)
* the dynptr doesn't have continuous data up to len bytes, or the dynptr
* is read only, return NULL.
*/
-void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len)
+void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u64 len)
{
if (__bpf_dynptr_is_rdonly(ptr))
return NULL;
return (void *)__bpf_dynptr_data(ptr, len);
}
+
+void bpf_map_free_internal_structs(struct bpf_map *map, void *val)
+{
+ if (btf_record_has_field(map->record, BPF_TIMER))
+ bpf_obj_free_timer(map->record, val);
+ if (btf_record_has_field(map->record, BPF_WORKQUEUE))
+ bpf_obj_free_workqueue(map->record, val);
+ if (btf_record_has_field(map->record, BPF_TASK_WORK))
+ bpf_obj_free_task_work(map->record, val);
+}
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 81780bcf8d25..9f866a010dad 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -144,8 +144,7 @@ static int bpf_inode_type(const struct inode *inode, enum bpf_type *type)
static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode,
struct inode *dir)
{
- d_instantiate(dentry, inode);
- dget(dentry);
+ d_make_persistent(dentry, inode);
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
}
@@ -420,16 +419,12 @@ static int bpf_iter_link_pin_kernel(struct dentry *parent,
struct dentry *dentry;
int ret;
- inode_lock(parent->d_inode);
- dentry = lookup_noperm(&QSTR(name), parent);
- if (IS_ERR(dentry)) {
- inode_unlock(parent->d_inode);
+ dentry = simple_start_creating(parent, name);
+ if (IS_ERR(dentry))
return PTR_ERR(dentry);
- }
ret = bpf_mkobj_ops(dentry, mode, link, &bpf_link_iops,
&bpf_iter_fops);
- dput(dentry);
- inode_unlock(parent->d_inode);
+ simple_done_creating(dentry);
return ret;
}
@@ -1080,7 +1075,7 @@ static void bpf_kill_super(struct super_block *sb)
{
struct bpf_mount_opts *opts = sb->s_fs_info;
- kill_litter_super(sb);
+ kill_anon_super(sb);
kfree(opts);
}
diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c
index 1e6538f59a78..60db5d655495 100644
--- a/kernel/bpf/liveness.c
+++ b/kernel/bpf/liveness.c
@@ -34,7 +34,7 @@
* - read and write marks propagation.
* - The propagation phase is a textbook live variable data flow analysis:
*
- * state[cc, i].live_after = U [state[cc, s].live_before for s in insn_successors(i)]
+ * state[cc, i].live_after = U [state[cc, s].live_before for s in bpf_insn_successors(i)]
* state[cc, i].live_before =
* (state[cc, i].live_after / state[cc, i].must_write) U state[i].may_read
*
@@ -54,7 +54,7 @@
* The equation for "must_write_acc" propagation looks as follows:
*
* state[cc, i].must_write_acc =
- * ∩ [state[cc, s].must_write_acc for s in insn_successors(i)]
+ * ∩ [state[cc, s].must_write_acc for s in bpf_insn_successors(i)]
* U state[cc, i].must_write
*
* (An intersection of all "must_write_acc" for instruction successors
@@ -447,7 +447,12 @@ int bpf_jmp_offset(struct bpf_insn *insn)
__diag_push();
__diag_ignore_all("-Woverride-init", "Allow field initialization overrides for opcode_info_tbl");
-inline int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2])
+/*
+ * Returns an array of instructions succ, with succ->items[0], ...,
+ * succ->items[n-1] with successor instructions, where n=succ->cnt
+ */
+inline struct bpf_iarray *
+bpf_insn_successors(struct bpf_verifier_env *env, u32 idx)
{
static const struct opcode_info {
bool can_jump;
@@ -474,19 +479,29 @@ inline int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2])
_J(BPF_JSET, {.can_jump = true, .can_fallthrough = true}),
#undef _J
};
+ struct bpf_prog *prog = env->prog;
struct bpf_insn *insn = &prog->insnsi[idx];
const struct opcode_info *opcode_info;
- int i = 0, insn_sz;
+ struct bpf_iarray *succ, *jt;
+ int insn_sz;
+
+ jt = env->insn_aux_data[idx].jt;
+ if (unlikely(jt))
+ return jt;
+
+ /* pre-allocated array of size up to 2; reset cnt, as it may have been used already */
+ succ = env->succ;
+ succ->cnt = 0;
opcode_info = &opcode_info_tbl[BPF_CLASS(insn->code) | BPF_OP(insn->code)];
insn_sz = bpf_is_ldimm64(insn) ? 2 : 1;
if (opcode_info->can_fallthrough)
- succ[i++] = idx + insn_sz;
+ succ->items[succ->cnt++] = idx + insn_sz;
if (opcode_info->can_jump)
- succ[i++] = idx + bpf_jmp_offset(insn) + 1;
+ succ->items[succ->cnt++] = idx + bpf_jmp_offset(insn) + 1;
- return i;
+ return succ;
}
__diag_pop();
@@ -524,6 +539,8 @@ static int propagate_to_outer_instance(struct bpf_verifier_env *env,
this_subprog_start = callchain_subprog_start(callchain);
outer_instance = get_outer_instance(env, instance);
+ if (IS_ERR(outer_instance))
+ return PTR_ERR(outer_instance);
callsite = callchain->callsites[callchain->curframe - 1];
reset_stack_write_marks(env, outer_instance, callsite);
@@ -546,11 +563,12 @@ static inline bool update_insn(struct bpf_verifier_env *env,
struct bpf_insn_aux_data *aux = env->insn_aux_data;
u64 new_before, new_after, must_write_acc;
struct per_frame_masks *insn, *succ_insn;
- u32 succ_num, s, succ[2];
+ struct bpf_iarray *succ;
+ u32 s;
bool changed;
- succ_num = bpf_insn_successors(env->prog, insn_idx, succ);
- if (unlikely(succ_num == 0))
+ succ = bpf_insn_successors(env, insn_idx);
+ if (succ->cnt == 0)
return false;
changed = false;
@@ -562,8 +580,8 @@ static inline bool update_insn(struct bpf_verifier_env *env,
* of successors plus all "must_write" slots of instruction itself.
*/
must_write_acc = U64_MAX;
- for (s = 0; s < succ_num; ++s) {
- succ_insn = get_frame_masks(instance, frame, succ[s]);
+ for (s = 0; s < succ->cnt; ++s) {
+ succ_insn = get_frame_masks(instance, frame, succ->items[s]);
new_after |= succ_insn->live_before;
must_write_acc &= succ_insn->must_write_acc;
}
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index f50533169cc3..a0c3b35de2ce 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -461,6 +461,7 @@ const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type)
[PTR_TO_ARENA] = "arena",
[PTR_TO_BUF] = "buf",
[PTR_TO_FUNC] = "func",
+ [PTR_TO_INSN] = "insn",
[PTR_TO_MAP_KEY] = "map_key",
[CONST_PTR_TO_DYNPTR] = "dynptr_ptr",
};
@@ -500,6 +501,8 @@ const char *dynptr_type_str(enum bpf_dynptr_type type)
return "xdp";
case BPF_DYNPTR_TYPE_SKB_META:
return "skb_meta";
+ case BPF_DYNPTR_TYPE_FILE:
+ return "file";
case BPF_DYNPTR_TYPE_INVALID:
return "<invalid>";
default:
diff --git a/kernel/bpf/range_tree.c b/kernel/bpf/range_tree.c
index 37b80a23ae1a..99c63d982c5d 100644
--- a/kernel/bpf/range_tree.c
+++ b/kernel/bpf/range_tree.c
@@ -2,7 +2,6 @@
/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
#include <linux/interval_tree_generic.h>
#include <linux/slab.h>
-#include <linux/bpf_mem_alloc.h>
#include <linux/bpf.h>
#include "range_tree.h"
@@ -21,7 +20,7 @@
* in commit 6772fcc8890a ("xfs: convert xbitmap to interval tree").
*
* The implementation relies on external lock to protect rbtree-s.
- * The alloc/free of range_node-s is done via bpf_mem_alloc.
+ * The alloc/free of range_node-s is done via kmalloc_nolock().
*
* bpf arena is using range_tree to represent unallocated slots.
* At init time:
@@ -150,9 +149,7 @@ int range_tree_clear(struct range_tree *rt, u32 start, u32 len)
range_it_insert(rn, rt);
/* Add a range */
- migrate_disable();
- new_rn = bpf_mem_alloc(&bpf_global_ma, sizeof(struct range_node));
- migrate_enable();
+ new_rn = kmalloc_nolock(sizeof(struct range_node), 0, NUMA_NO_NODE);
if (!new_rn)
return -ENOMEM;
new_rn->rn_start = last + 1;
@@ -172,9 +169,7 @@ int range_tree_clear(struct range_tree *rt, u32 start, u32 len)
} else {
/* in the middle of the clearing range */
range_it_remove(rn, rt);
- migrate_disable();
- bpf_mem_free(&bpf_global_ma, rn);
- migrate_enable();
+ kfree_nolock(rn);
}
}
return 0;
@@ -227,9 +222,7 @@ int range_tree_set(struct range_tree *rt, u32 start, u32 len)
range_it_remove(right, rt);
left->rn_last = right->rn_last;
range_it_insert(left, rt);
- migrate_disable();
- bpf_mem_free(&bpf_global_ma, right);
- migrate_enable();
+ kfree_nolock(right);
} else if (left) {
/* Combine with the left range */
range_it_remove(left, rt);
@@ -241,9 +234,7 @@ int range_tree_set(struct range_tree *rt, u32 start, u32 len)
right->rn_start = start;
range_it_insert(right, rt);
} else {
- migrate_disable();
- left = bpf_mem_alloc(&bpf_global_ma, sizeof(struct range_node));
- migrate_enable();
+ left = kmalloc_nolock(sizeof(struct range_node), 0, NUMA_NO_NODE);
if (!left)
return -ENOMEM;
left->rn_start = start;
@@ -259,7 +250,7 @@ void range_tree_destroy(struct range_tree *rt)
while ((rn = range_it_iter_first(rt, 0, -1U))) {
range_it_remove(rn, rt);
- bpf_mem_free(&bpf_global_ma, rn);
+ kfree_nolock(rn);
}
}
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index d706c4b7f532..f6a075ffac63 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -13,7 +13,7 @@
#include <linux/btf_ids.h>
#include <asm/rqspinlock.h>
-#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE)
+#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE | BPF_F_RB_OVERWRITE)
/* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */
#define RINGBUF_PGOFF \
@@ -30,6 +30,7 @@ struct bpf_ringbuf {
u64 mask;
struct page **pages;
int nr_pages;
+ bool overwrite_mode;
rqspinlock_t spinlock ____cacheline_aligned_in_smp;
/* For user-space producer ring buffers, an atomic_t busy bit is used
* to synchronize access to the ring buffers in the kernel, rather than
@@ -73,6 +74,7 @@ struct bpf_ringbuf {
unsigned long consumer_pos __aligned(PAGE_SIZE);
unsigned long producer_pos __aligned(PAGE_SIZE);
unsigned long pending_pos;
+ unsigned long overwrite_pos; /* position after the last overwritten record */
char data[] __aligned(PAGE_SIZE);
};
@@ -166,7 +168,7 @@ static void bpf_ringbuf_notify(struct irq_work *work)
* considering that the maximum value of data_sz is (4GB - 1), there
* will be no overflow, so just note the size limit in the comments.
*/
-static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
+static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node, bool overwrite_mode)
{
struct bpf_ringbuf *rb;
@@ -183,17 +185,25 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
rb->consumer_pos = 0;
rb->producer_pos = 0;
rb->pending_pos = 0;
+ rb->overwrite_mode = overwrite_mode;
return rb;
}
static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
{
+ bool overwrite_mode = false;
struct bpf_ringbuf_map *rb_map;
if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
+ if (attr->map_flags & BPF_F_RB_OVERWRITE) {
+ if (attr->map_type != BPF_MAP_TYPE_RINGBUF)
+ return ERR_PTR(-EINVAL);
+ overwrite_mode = true;
+ }
+
if (attr->key_size || attr->value_size ||
!is_power_of_2(attr->max_entries) ||
!PAGE_ALIGNED(attr->max_entries))
@@ -205,7 +215,7 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
bpf_map_init_from_attr(&rb_map->map, attr);
- rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node);
+ rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node, overwrite_mode);
if (!rb_map->rb) {
bpf_map_area_free(rb_map);
return ERR_PTR(-ENOMEM);
@@ -295,13 +305,26 @@ static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma
return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF);
}
+/*
+ * Return an estimate of the available data in the ring buffer.
+ * Note: the returned value can exceed the actual ring buffer size because the
+ * function is not synchronized with the producer. The producer acquires the
+ * ring buffer's spinlock, but this function does not.
+ */
static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
{
- unsigned long cons_pos, prod_pos;
+ unsigned long cons_pos, prod_pos, over_pos;
cons_pos = smp_load_acquire(&rb->consumer_pos);
- prod_pos = smp_load_acquire(&rb->producer_pos);
- return prod_pos - cons_pos;
+
+ if (unlikely(rb->overwrite_mode)) {
+ over_pos = smp_load_acquire(&rb->overwrite_pos);
+ prod_pos = smp_load_acquire(&rb->producer_pos);
+ return prod_pos - max(cons_pos, over_pos);
+ } else {
+ prod_pos = smp_load_acquire(&rb->producer_pos);
+ return prod_pos - cons_pos;
+ }
}
static u32 ringbuf_total_data_sz(const struct bpf_ringbuf *rb)
@@ -404,11 +427,43 @@ bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr)
return (void*)((addr & PAGE_MASK) - off);
}
+static bool bpf_ringbuf_has_space(const struct bpf_ringbuf *rb,
+ unsigned long new_prod_pos,
+ unsigned long cons_pos,
+ unsigned long pend_pos)
+{
+ /*
+ * No space if oldest not yet committed record until the newest
+ * record span more than (ringbuf_size - 1).
+ */
+ if (new_prod_pos - pend_pos > rb->mask)
+ return false;
+
+ /* Ok, we have space in overwrite mode */
+ if (unlikely(rb->overwrite_mode))
+ return true;
+
+ /*
+ * No space if producer position advances more than (ringbuf_size - 1)
+ * ahead of consumer position when not in overwrite mode.
+ */
+ if (new_prod_pos - cons_pos > rb->mask)
+ return false;
+
+ return true;
+}
+
+static u32 bpf_ringbuf_round_up_hdr_len(u32 hdr_len)
+{
+ hdr_len &= ~BPF_RINGBUF_DISCARD_BIT;
+ return round_up(hdr_len + BPF_RINGBUF_HDR_SZ, 8);
+}
+
static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
{
- unsigned long cons_pos, prod_pos, new_prod_pos, pend_pos, flags;
+ unsigned long cons_pos, prod_pos, new_prod_pos, pend_pos, over_pos, flags;
struct bpf_ringbuf_hdr *hdr;
- u32 len, pg_off, tmp_size, hdr_len;
+ u32 len, pg_off, hdr_len;
if (unlikely(size > RINGBUF_MAX_RECORD_SZ))
return NULL;
@@ -431,24 +486,43 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
hdr_len = READ_ONCE(hdr->len);
if (hdr_len & BPF_RINGBUF_BUSY_BIT)
break;
- tmp_size = hdr_len & ~BPF_RINGBUF_DISCARD_BIT;
- tmp_size = round_up(tmp_size + BPF_RINGBUF_HDR_SZ, 8);
- pend_pos += tmp_size;
+ pend_pos += bpf_ringbuf_round_up_hdr_len(hdr_len);
}
rb->pending_pos = pend_pos;
- /* check for out of ringbuf space:
- * - by ensuring producer position doesn't advance more than
- * (ringbuf_size - 1) ahead
- * - by ensuring oldest not yet committed record until newest
- * record does not span more than (ringbuf_size - 1)
- */
- if (new_prod_pos - cons_pos > rb->mask ||
- new_prod_pos - pend_pos > rb->mask) {
+ if (!bpf_ringbuf_has_space(rb, new_prod_pos, cons_pos, pend_pos)) {
raw_res_spin_unlock_irqrestore(&rb->spinlock, flags);
return NULL;
}
+ /*
+ * In overwrite mode, advance overwrite_pos when the ring buffer is full.
+ * The key points are to stay on record boundaries and consume enough records
+ * to fit the new one.
+ */
+ if (unlikely(rb->overwrite_mode)) {
+ over_pos = rb->overwrite_pos;
+ while (new_prod_pos - over_pos > rb->mask) {
+ hdr = (void *)rb->data + (over_pos & rb->mask);
+ hdr_len = READ_ONCE(hdr->len);
+ /*
+ * The bpf_ringbuf_has_space() check above ensures we won’t
+ * step over a record currently being worked on by another
+ * producer.
+ */
+ over_pos += bpf_ringbuf_round_up_hdr_len(hdr_len);
+ }
+ /*
+ * smp_store_release(&rb->producer_pos, new_prod_pos) at
+ * the end of the function ensures that when consumer sees
+ * the updated rb->producer_pos, it always sees the updated
+ * rb->overwrite_pos, so when consumer reads overwrite_pos
+ * after smp_load_acquire(r->producer_pos), the overwrite_pos
+ * will always be valid.
+ */
+ WRITE_ONCE(rb->overwrite_pos, over_pos);
+ }
+
hdr = (void *)rb->data + (prod_pos & rb->mask);
pg_off = bpf_ringbuf_rec_pg_off(rb, hdr);
hdr->len = size | BPF_RINGBUF_BUSY_BIT;
@@ -578,6 +652,8 @@ BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags)
return smp_load_acquire(&rb->consumer_pos);
case BPF_RB_PROD_POS:
return smp_load_acquire(&rb->producer_pos);
+ case BPF_RB_OVERWRITE_POS:
+ return smp_load_acquire(&rb->overwrite_pos);
default:
return 0;
}
diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
index a00561b1d3e5..f7d0c8d4644e 100644
--- a/kernel/bpf/rqspinlock.c
+++ b/kernel/bpf/rqspinlock.c
@@ -89,15 +89,14 @@ struct rqspinlock_timeout {
DEFINE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks);
EXPORT_SYMBOL_GPL(rqspinlock_held_locks);
-static bool is_lock_released(rqspinlock_t *lock, u32 mask, struct rqspinlock_timeout *ts)
+static bool is_lock_released(rqspinlock_t *lock, u32 mask)
{
if (!(atomic_read_acquire(&lock->val) & (mask)))
return true;
return false;
}
-static noinline int check_deadlock_AA(rqspinlock_t *lock, u32 mask,
- struct rqspinlock_timeout *ts)
+static noinline int check_deadlock_AA(rqspinlock_t *lock)
{
struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
int cnt = min(RES_NR_HELD, rqh->cnt);
@@ -118,8 +117,7 @@ static noinline int check_deadlock_AA(rqspinlock_t *lock, u32 mask,
* more locks, which reduce to ABBA). This is not exhaustive, and we rely on
* timeouts as the final line of defense.
*/
-static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask,
- struct rqspinlock_timeout *ts)
+static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask)
{
struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
int rqh_cnt = min(RES_NR_HELD, rqh->cnt);
@@ -142,7 +140,7 @@ static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask,
* Let's ensure to break out of this loop if the lock is available for
* us to potentially acquire.
*/
- if (is_lock_released(lock, mask, ts))
+ if (is_lock_released(lock, mask))
return 0;
/*
@@ -198,33 +196,21 @@ static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask,
return 0;
}
-static noinline int check_deadlock(rqspinlock_t *lock, u32 mask,
- struct rqspinlock_timeout *ts)
-{
- int ret;
-
- ret = check_deadlock_AA(lock, mask, ts);
- if (ret)
- return ret;
- ret = check_deadlock_ABBA(lock, mask, ts);
- if (ret)
- return ret;
-
- return 0;
-}
-
static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
struct rqspinlock_timeout *ts)
{
- u64 time = ktime_get_mono_fast_ns();
u64 prev = ts->cur;
+ u64 time;
if (!ts->timeout_end) {
- ts->cur = time;
- ts->timeout_end = time + ts->duration;
+ if (check_deadlock_AA(lock))
+ return -EDEADLK;
+ ts->cur = ktime_get_mono_fast_ns();
+ ts->timeout_end = ts->cur + ts->duration;
return 0;
}
+ time = ktime_get_mono_fast_ns();
if (time > ts->timeout_end)
return -ETIMEDOUT;
@@ -234,7 +220,7 @@ static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
*/
if (prev + NSEC_PER_MSEC < time) {
ts->cur = time;
- return check_deadlock(lock, mask, ts);
+ return check_deadlock_ABBA(lock, mask);
}
return 0;
@@ -278,6 +264,10 @@ int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock)
int val, ret = 0;
RES_INIT_TIMEOUT(ts);
+ /*
+ * The fast path is not invoked for the TAS fallback, so we must grab
+ * the deadlock detection entry here.
+ */
grab_held_lock_entry(lock);
/*
@@ -400,10 +390,7 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
goto queue;
}
- /*
- * Grab an entry in the held locks array, to enable deadlock detection.
- */
- grab_held_lock_entry(lock);
+ /* Deadlock detection entry already held after failing fast path. */
/*
* We're pending, wait for the owner to go away.
@@ -450,12 +437,21 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
* queuing.
*/
queue:
- lockevent_inc(lock_slowpath);
/*
- * Grab deadlock detection entry for the queue path.
+ * Do not queue if we're a waiter and someone is attempting this lock on
+ * the same CPU. In case of NMIs, this prevents long timeouts where we
+ * interrupt the pending waiter, and the owner, that will eventually
+ * signal the head of our queue, both of which are logically but not
+ * physically part of the queue, hence outside the scope of the idx > 0
+ * check above for the trylock fallback.
*/
- grab_held_lock_entry(lock);
+ if (check_deadlock_AA(lock)) {
+ ret = -EDEADLK;
+ goto err_release_entry;
+ }
+ lockevent_inc(lock_slowpath);
+ /* Deadlock detection entry already held after failing fast path. */
node = this_cpu_ptr(&rqnodes[0].mcs);
idx = node->count++;
tail = encode_tail(smp_processor_id(), idx);
@@ -467,19 +463,17 @@ queue:
* not be nested NMIs taking spinlocks. That may not be true in
* some architectures even though the chance of needing more than
* 4 nodes will still be extremely unlikely. When that happens,
- * we fall back to spinning on the lock directly without using
- * any MCS node. This is not the most elegant solution, but is
- * simple enough.
+ * we fall back to attempting a trylock operation without using
+ * any MCS node. Unlike qspinlock which cannot fail, we have the
+ * option of failing the slow path, and under contention, such a
+ * trylock spinning will likely be treated unfairly due to lack of
+ * queueing, hence do not spin.
*/
- if (unlikely(idx >= _Q_MAX_NODES || in_nmi())) {
+ if (unlikely(idx >= _Q_MAX_NODES || (in_nmi() && idx > 0))) {
lockevent_inc(lock_no_node);
- RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
- while (!queued_spin_trylock(lock)) {
- if (RES_CHECK_TIMEOUT(ts, ret, ~0u)) {
- lockevent_inc(rqspinlock_lock_timeout);
- goto err_release_node;
- }
- cpu_relax();
+ if (!queued_spin_trylock(lock)) {
+ ret = -EDEADLK;
+ goto err_release_node;
}
goto release;
}
@@ -540,7 +534,7 @@ queue:
val = arch_mcs_spin_lock_contended(&node->locked);
if (val == RES_TIMEOUT_VAL) {
- ret = -EDEADLK;
+ ret = -ETIMEDOUT;
goto waitq_timeout;
}
@@ -575,6 +569,14 @@ queue:
val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) ||
RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK));
+ /* Disable queue destruction when we detect deadlocks. */
+ if (ret == -EDEADLK) {
+ if (!next)
+ next = smp_cond_load_relaxed(&node->next, (VAL));
+ arch_mcs_spin_unlock_contended(&next->locked);
+ goto err_release_node;
+ }
+
waitq_timeout:
if (ret) {
/*
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 8f1dacaf01fe..da3d328f5c15 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -42,6 +42,28 @@ static inline int stack_map_data_size(struct bpf_map *map)
sizeof(struct bpf_stack_build_id) : sizeof(u64);
}
+/**
+ * stack_map_calculate_max_depth - Calculate maximum allowed stack trace depth
+ * @size: Size of the buffer/map value in bytes
+ * @elem_size: Size of each stack trace element
+ * @flags: BPF stack trace flags (BPF_F_USER_STACK, BPF_F_USER_BUILD_ID, ...)
+ *
+ * Return: Maximum number of stack trace entries that can be safely stored
+ */
+static u32 stack_map_calculate_max_depth(u32 size, u32 elem_size, u64 flags)
+{
+ u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
+ u32 max_depth;
+ u32 curr_sysctl_max_stack = READ_ONCE(sysctl_perf_event_max_stack);
+
+ max_depth = size / elem_size;
+ max_depth += skip;
+ if (max_depth > curr_sysctl_max_stack)
+ return curr_sysctl_max_stack;
+
+ return max_depth;
+}
+
static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
{
u64 elem_size = sizeof(struct stack_map_bucket) +
@@ -229,8 +251,8 @@ static long __bpf_get_stackid(struct bpf_map *map,
{
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
+ u32 hash, id, trace_nr, trace_len, i, max_depth;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
- u32 hash, id, trace_nr, trace_len, i;
bool user = flags & BPF_F_USER_STACK;
u64 *ips;
bool hash_matches;
@@ -239,7 +261,8 @@ static long __bpf_get_stackid(struct bpf_map *map,
/* skipping more than usable stack trace */
return -EFAULT;
- trace_nr = trace->nr - skip;
+ max_depth = stack_map_calculate_max_depth(map->value_size, stack_map_data_size(map), flags);
+ trace_nr = min_t(u32, trace->nr - skip, max_depth - skip);
trace_len = trace_nr * sizeof(u64);
ips = trace->ip + skip;
hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
@@ -300,20 +323,17 @@ static long __bpf_get_stackid(struct bpf_map *map,
BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
u64, flags)
{
- u32 max_depth = map->value_size / stack_map_data_size(map);
- u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
+ u32 elem_size = stack_map_data_size(map);
bool user = flags & BPF_F_USER_STACK;
struct perf_callchain_entry *trace;
bool kernel = !user;
+ u32 max_depth;
if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
return -EINVAL;
- max_depth += skip;
- if (max_depth > sysctl_perf_event_max_stack)
- max_depth = sysctl_perf_event_max_stack;
-
+ max_depth = stack_map_calculate_max_depth(map->value_size, elem_size, flags);
trace = get_perf_callchain(regs, kernel, user, max_depth,
false, false, 0);
@@ -371,15 +391,11 @@ BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx,
return -EFAULT;
nr_kernel = count_kernel_ip(trace);
+ __u64 nr = trace->nr; /* save original */
if (kernel) {
- __u64 nr = trace->nr;
-
trace->nr = nr_kernel;
ret = __bpf_get_stackid(map, trace, flags);
-
- /* restore nr */
- trace->nr = nr;
} else { /* user */
u64 skip = flags & BPF_F_SKIP_FIELD_MASK;
@@ -390,6 +406,10 @@ BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx,
flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
ret = __bpf_get_stackid(map, trace, flags);
}
+
+ /* restore nr */
+ trace->nr = nr;
+
return ret;
}
@@ -406,7 +426,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
struct perf_callchain_entry *trace_in,
void *buf, u32 size, u64 flags, bool may_fault)
{
- u32 trace_nr, copy_len, elem_size, num_elem, max_depth;
+ u32 trace_nr, copy_len, elem_size, max_depth;
bool user_build_id = flags & BPF_F_USER_BUILD_ID;
bool crosstask = task && task != current;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
@@ -438,21 +458,20 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
goto clear;
}
- num_elem = size / elem_size;
- max_depth = num_elem + skip;
- if (sysctl_perf_event_max_stack < max_depth)
- max_depth = sysctl_perf_event_max_stack;
+ max_depth = stack_map_calculate_max_depth(size, elem_size, flags);
if (may_fault)
rcu_read_lock(); /* need RCU for perf's callchain below */
- if (trace_in)
+ if (trace_in) {
trace = trace_in;
- else if (kernel && task)
+ trace->nr = min_t(u32, trace->nr, max_depth);
+ } else if (kernel && task) {
trace = get_callchain_entry_for_task(task, max_depth);
- else
+ } else {
trace = get_perf_callchain(regs, kernel, user, max_depth,
crosstask, false, 0);
+ }
if (unlikely(!trace) || trace->nr < skip) {
if (may_fault)
@@ -461,7 +480,6 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
}
trace_nr = trace->nr - skip;
- trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
copy_len = trace_nr * elem_size;
ips = trace->ip + skip;
diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c
index ff16c631951b..0b6bc3f30335 100644
--- a/kernel/bpf/stream.c
+++ b/kernel/bpf/stream.c
@@ -4,111 +4,10 @@
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/bpf_mem_alloc.h>
-#include <linux/percpu.h>
-#include <linux/refcount.h>
#include <linux/gfp.h>
#include <linux/memory.h>
-#include <linux/local_lock.h>
#include <linux/mutex.h>
-/*
- * Simple per-CPU NMI-safe bump allocation mechanism, backed by the NMI-safe
- * try_alloc_pages()/free_pages_nolock() primitives. We allocate a page and
- * stash it in a local per-CPU variable, and bump allocate from the page
- * whenever items need to be printed to a stream. Each page holds a global
- * atomic refcount in its first 4 bytes, and then records of variable length
- * that describe the printed messages. Once the global refcount has dropped to
- * zero, it is a signal to free the page back to the kernel's page allocator,
- * given all the individual records in it have been consumed.
- *
- * It is possible the same page is used to serve allocations across different
- * programs, which may be consumed at different times individually, hence
- * maintaining a reference count per-page is critical for correct lifetime
- * tracking.
- *
- * The bpf_stream_page code will be replaced to use kmalloc_nolock() once it
- * lands.
- */
-struct bpf_stream_page {
- refcount_t ref;
- u32 consumed;
- char buf[];
-};
-
-/* Available room to add data to a refcounted page. */
-#define BPF_STREAM_PAGE_SZ (PAGE_SIZE - offsetofend(struct bpf_stream_page, consumed))
-
-static DEFINE_PER_CPU(local_trylock_t, stream_local_lock) = INIT_LOCAL_TRYLOCK(stream_local_lock);
-static DEFINE_PER_CPU(struct bpf_stream_page *, stream_pcpu_page);
-
-static bool bpf_stream_page_local_lock(unsigned long *flags)
-{
- return local_trylock_irqsave(&stream_local_lock, *flags);
-}
-
-static void bpf_stream_page_local_unlock(unsigned long *flags)
-{
- local_unlock_irqrestore(&stream_local_lock, *flags);
-}
-
-static void bpf_stream_page_free(struct bpf_stream_page *stream_page)
-{
- struct page *p;
-
- if (!stream_page)
- return;
- p = virt_to_page(stream_page);
- free_pages_nolock(p, 0);
-}
-
-static void bpf_stream_page_get(struct bpf_stream_page *stream_page)
-{
- refcount_inc(&stream_page->ref);
-}
-
-static void bpf_stream_page_put(struct bpf_stream_page *stream_page)
-{
- if (refcount_dec_and_test(&stream_page->ref))
- bpf_stream_page_free(stream_page);
-}
-
-static void bpf_stream_page_init(struct bpf_stream_page *stream_page)
-{
- refcount_set(&stream_page->ref, 1);
- stream_page->consumed = 0;
-}
-
-static struct bpf_stream_page *bpf_stream_page_replace(void)
-{
- struct bpf_stream_page *stream_page, *old_stream_page;
- struct page *page;
-
- page = alloc_pages_nolock(/* Don't account */ 0, NUMA_NO_NODE, 0);
- if (!page)
- return NULL;
- stream_page = page_address(page);
- bpf_stream_page_init(stream_page);
-
- old_stream_page = this_cpu_read(stream_pcpu_page);
- if (old_stream_page)
- bpf_stream_page_put(old_stream_page);
- this_cpu_write(stream_pcpu_page, stream_page);
- return stream_page;
-}
-
-static int bpf_stream_page_check_room(struct bpf_stream_page *stream_page, int len)
-{
- int min = offsetof(struct bpf_stream_elem, str[0]);
- int consumed = stream_page->consumed;
- int total = BPF_STREAM_PAGE_SZ;
- int rem = max(0, total - consumed - min);
-
- /* Let's give room of at least 8 bytes. */
- WARN_ON_ONCE(rem % 8 != 0);
- rem = rem < 8 ? 0 : rem;
- return min(len, rem);
-}
-
static void bpf_stream_elem_init(struct bpf_stream_elem *elem, int len)
{
init_llist_node(&elem->node);
@@ -116,54 +15,12 @@ static void bpf_stream_elem_init(struct bpf_stream_elem *elem, int len)
elem->consumed_len = 0;
}
-static struct bpf_stream_page *bpf_stream_page_from_elem(struct bpf_stream_elem *elem)
-{
- unsigned long addr = (unsigned long)elem;
-
- return (struct bpf_stream_page *)PAGE_ALIGN_DOWN(addr);
-}
-
-static struct bpf_stream_elem *bpf_stream_page_push_elem(struct bpf_stream_page *stream_page, int len)
-{
- u32 consumed = stream_page->consumed;
-
- stream_page->consumed += round_up(offsetof(struct bpf_stream_elem, str[len]), 8);
- return (struct bpf_stream_elem *)&stream_page->buf[consumed];
-}
-
-static struct bpf_stream_elem *bpf_stream_page_reserve_elem(int len)
-{
- struct bpf_stream_elem *elem = NULL;
- struct bpf_stream_page *page;
- int room = 0;
-
- page = this_cpu_read(stream_pcpu_page);
- if (!page)
- page = bpf_stream_page_replace();
- if (!page)
- return NULL;
-
- room = bpf_stream_page_check_room(page, len);
- if (room != len)
- page = bpf_stream_page_replace();
- if (!page)
- return NULL;
- bpf_stream_page_get(page);
- room = bpf_stream_page_check_room(page, len);
- WARN_ON_ONCE(room != len);
-
- elem = bpf_stream_page_push_elem(page, room);
- bpf_stream_elem_init(elem, room);
- return elem;
-}
-
static struct bpf_stream_elem *bpf_stream_elem_alloc(int len)
{
const int max_len = ARRAY_SIZE((struct bpf_bprintf_buffers){}.buf);
struct bpf_stream_elem *elem;
- unsigned long flags;
+ size_t alloc_size;
- BUILD_BUG_ON(max_len > BPF_STREAM_PAGE_SZ);
/*
* Length denotes the amount of data to be written as part of stream element,
* thus includes '\0' byte. We're capped by how much bpf_bprintf_buffers can
@@ -172,10 +29,13 @@ static struct bpf_stream_elem *bpf_stream_elem_alloc(int len)
if (len < 0 || len > max_len)
return NULL;
- if (!bpf_stream_page_local_lock(&flags))
+ alloc_size = offsetof(struct bpf_stream_elem, str[len]);
+ elem = kmalloc_nolock(alloc_size, __GFP_ZERO, -1);
+ if (!elem)
return NULL;
- elem = bpf_stream_page_reserve_elem(len);
- bpf_stream_page_local_unlock(&flags);
+
+ bpf_stream_elem_init(elem, len);
+
return elem;
}
@@ -231,10 +91,7 @@ static struct bpf_stream *bpf_stream_get(enum bpf_stream_id stream_id, struct bp
static void bpf_stream_free_elem(struct bpf_stream_elem *elem)
{
- struct bpf_stream_page *p;
-
- p = bpf_stream_page_from_elem(elem);
- bpf_stream_page_put(p);
+ kfree_nolock(elem);
}
static void bpf_stream_free_list(struct llist_node *list)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 6cde6a46babf..4ff82144f885 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -158,7 +158,7 @@ static void maybe_wait_bpf_programs(struct bpf_map *map)
*/
if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
- synchronize_rcu();
+ synchronize_rcu_expedited();
}
static void unpin_uptr_kaddr(void *kaddr)
@@ -1162,7 +1162,7 @@ static unsigned long bpf_get_unmapped_area(struct file *filp, unsigned long addr
if (map->ops->map_get_unmapped_area)
return map->ops->map_get_unmapped_area(filp, addr, len, pgoff, flags);
#ifdef CONFIG_MMU
- return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags);
+ return mm_get_unmapped_area(filp, addr, len, pgoff, flags);
#else
return addr;
#endif
@@ -1234,6 +1234,7 @@ int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size)
return src - orig_src;
}
+EXPORT_SYMBOL_GPL(bpf_obj_name_cpy);
int map_check_no_btf(const struct bpf_map *map,
const struct btf *btf,
@@ -1493,6 +1494,7 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
case BPF_MAP_TYPE_STRUCT_OPS:
case BPF_MAP_TYPE_CPUMAP:
case BPF_MAP_TYPE_ARENA:
+ case BPF_MAP_TYPE_INSN_ARRAY:
if (!bpf_token_capable(token, CAP_BPF))
goto put_token;
break;
@@ -1585,7 +1587,8 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
goto free_map;
}
} else if (attr->excl_prog_hash_size) {
- return -EINVAL;
+ err = -EINVAL;
+ goto free_map;
}
err = security_bpf_map_create(map, attr, token, uattr.is_kernel);
@@ -1724,9 +1727,6 @@ static int map_lookup_elem(union bpf_attr *attr)
if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
return -EINVAL;
- if (attr->flags & ~BPF_F_LOCK)
- return -EINVAL;
-
CLASS(fd, f)(attr->map_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
@@ -1734,9 +1734,9 @@ static int map_lookup_elem(union bpf_attr *attr)
if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ))
return -EPERM;
- if ((attr->flags & BPF_F_LOCK) &&
- !btf_record_has_field(map->record, BPF_SPIN_LOCK))
- return -EINVAL;
+ err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK);
+ if (err)
+ return err;
key = __bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key))
@@ -1799,11 +1799,9 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
goto err_put;
}
- if ((attr->flags & BPF_F_LOCK) &&
- !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
- err = -EINVAL;
+ err = bpf_map_check_op_flags(map, attr->flags, ~0);
+ if (err)
goto err_put;
- }
key = ___bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
@@ -2007,13 +2005,9 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
void *key, *value;
int err = 0;
- if (attr->batch.elem_flags & ~BPF_F_LOCK)
- return -EINVAL;
-
- if ((attr->batch.elem_flags & BPF_F_LOCK) &&
- !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
- return -EINVAL;
- }
+ err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK);
+ if (err)
+ return err;
value_size = bpf_map_value_size(map);
@@ -2070,12 +2064,9 @@ int generic_map_lookup_batch(struct bpf_map *map,
u32 value_size, cp, max_count;
int err;
- if (attr->batch.elem_flags & ~BPF_F_LOCK)
- return -EINVAL;
-
- if ((attr->batch.elem_flags & BPF_F_LOCK) &&
- !btf_record_has_field(map->record, BPF_SPIN_LOCK))
- return -EINVAL;
+ err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK);
+ if (err)
+ return err;
value_size = bpf_map_value_size(map);
@@ -2462,6 +2453,9 @@ void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog)
struct bpf_prog_stats *stats;
unsigned int flags;
+ if (unlikely(!prog->stats))
+ return;
+
stats = this_cpu_ptr(prog->stats);
flags = u64_stats_update_begin_irqsave(&stats->syncp);
u64_stats_inc(&stats->misses);
@@ -2853,6 +2847,23 @@ static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr
return err;
}
+static int bpf_prog_mark_insn_arrays_ready(struct bpf_prog *prog)
+{
+ int err;
+ int i;
+
+ for (i = 0; i < prog->aux->used_map_cnt; i++) {
+ if (prog->aux->used_maps[i]->map_type != BPF_MAP_TYPE_INSN_ARRAY)
+ continue;
+
+ err = bpf_insn_array_ready(prog->aux->used_maps[i]);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
/* last field in 'union bpf_attr' used by this command */
#define BPF_PROG_LOAD_LAST_FIELD keyring_id
@@ -3082,6 +3093,10 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
if (err < 0)
goto free_used_maps;
+ err = bpf_prog_mark_insn_arrays_ready(prog);
+ if (err < 0)
+ goto free_used_maps;
+
err = bpf_prog_alloc_id(prog);
if (err)
goto free_used_maps;
@@ -5034,19 +5049,19 @@ static int bpf_prog_get_info_by_fd(struct file *file,
struct bpf_insn *insns_sanitized;
bool fault;
- if (prog->blinded && !bpf_dump_raw_ok(file->f_cred)) {
+ if (!prog->blinded || bpf_dump_raw_ok(file->f_cred)) {
+ insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred);
+ if (!insns_sanitized)
+ return -ENOMEM;
+ uinsns = u64_to_user_ptr(info.xlated_prog_insns);
+ ulen = min_t(u32, info.xlated_prog_len, ulen);
+ fault = copy_to_user(uinsns, insns_sanitized, ulen);
+ kfree(insns_sanitized);
+ if (fault)
+ return -EFAULT;
+ } else {
info.xlated_prog_insns = 0;
- goto done;
}
- insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred);
- if (!insns_sanitized)
- return -ENOMEM;
- uinsns = u64_to_user_ptr(info.xlated_prog_insns);
- ulen = min_t(u32, info.xlated_prog_len, ulen);
- fault = copy_to_user(uinsns, insns_sanitized, ulen);
- kfree(insns_sanitized);
- if (fault)
- return -EFAULT;
}
if (bpf_prog_is_offloaded(prog->aux)) {
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index f2cb0b097093..976d89011b15 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -175,23 +175,42 @@ out:
return tr;
}
-static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
+static int bpf_trampoline_update_fentry(struct bpf_trampoline *tr, u32 orig_flags,
+ void *old_addr, void *new_addr)
{
+ enum bpf_text_poke_type new_t = BPF_MOD_CALL, old_t = BPF_MOD_CALL;
void *ip = tr->func.addr;
+
+ if (!new_addr)
+ new_t = BPF_MOD_NOP;
+ else if (bpf_trampoline_use_jmp(tr->flags))
+ new_t = BPF_MOD_JUMP;
+
+ if (!old_addr)
+ old_t = BPF_MOD_NOP;
+ else if (bpf_trampoline_use_jmp(orig_flags))
+ old_t = BPF_MOD_JUMP;
+
+ return bpf_arch_text_poke(ip, old_t, new_t, old_addr, new_addr);
+}
+
+static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags,
+ void *old_addr)
+{
int ret;
if (tr->func.ftrace_managed)
ret = unregister_ftrace_direct(tr->fops, (long)old_addr, false);
else
- ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
+ ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, NULL);
return ret;
}
-static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr,
+static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags,
+ void *old_addr, void *new_addr,
bool lock_direct_mutex)
{
- void *ip = tr->func.addr;
int ret;
if (tr->func.ftrace_managed) {
@@ -200,7 +219,8 @@ static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_ad
else
ret = modify_ftrace_direct_nolock(tr->fops, (long)new_addr);
} else {
- ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr);
+ ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr,
+ new_addr);
}
return ret;
}
@@ -220,10 +240,12 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
}
if (tr->func.ftrace_managed) {
- ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1);
+ ret = ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1);
+ if (ret)
+ return ret;
ret = register_ftrace_direct(tr->fops, (long)new_addr);
} else {
- ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
+ ret = bpf_trampoline_update_fentry(tr, 0, NULL, new_addr);
}
return ret;
@@ -334,8 +356,9 @@ static void bpf_tramp_image_put(struct bpf_tramp_image *im)
* call_rcu_tasks() is not necessary.
*/
if (im->ip_after_call) {
- int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP,
- NULL, im->ip_epilogue);
+ int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_NOP,
+ BPF_MOD_JUMP, NULL,
+ im->ip_epilogue);
WARN_ON(err);
if (IS_ENABLED(CONFIG_TASKS_RCU))
call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
@@ -408,7 +431,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut
return PTR_ERR(tlinks);
if (total == 0) {
- err = unregister_fentry(tr, tr->cur_image->image);
+ err = unregister_fentry(tr, orig_flags, tr->cur_image->image);
bpf_tramp_image_put(tr->cur_image);
tr->cur_image = NULL;
goto out;
@@ -432,9 +455,20 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
again:
- if ((tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY) &&
- (tr->flags & BPF_TRAMP_F_CALL_ORIG))
- tr->flags |= BPF_TRAMP_F_ORIG_STACK;
+ if (tr->flags & BPF_TRAMP_F_CALL_ORIG) {
+ if (tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY) {
+ /* The BPF_TRAMP_F_SKIP_FRAME can be cleared in the
+ * first try, reset it in the second try.
+ */
+ tr->flags |= BPF_TRAMP_F_ORIG_STACK | BPF_TRAMP_F_SKIP_FRAME;
+ } else if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_JMP)) {
+ /* Use "jmp" instead of "call" for the trampoline
+ * in the origin call case, and we don't need to
+ * skip the frame.
+ */
+ tr->flags &= ~BPF_TRAMP_F_SKIP_FRAME;
+ }
+ }
#endif
size = arch_bpf_trampoline_size(&tr->func.model, tr->flags,
@@ -465,10 +499,18 @@ again:
if (err)
goto out_free;
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP
+ if (bpf_trampoline_use_jmp(tr->flags))
+ tr->fops->flags |= FTRACE_OPS_FL_JMP;
+ else
+ tr->fops->flags &= ~FTRACE_OPS_FL_JMP;
+#endif
+
WARN_ON(tr->cur_image && total == 0);
if (tr->cur_image)
/* progs already running at this address */
- err = modify_fentry(tr, tr->cur_image->image, im->image, lock_direct_mutex);
+ err = modify_fentry(tr, orig_flags, tr->cur_image->image,
+ im->image, lock_direct_mutex);
else
/* first time registering */
err = register_fentry(tr, im->image);
@@ -491,8 +533,15 @@ again:
tr->cur_image = im;
out:
/* If any error happens, restore previous flags */
- if (err)
+ if (err) {
tr->flags = orig_flags;
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP
+ if (bpf_trampoline_use_jmp(tr->flags))
+ tr->fops->flags |= FTRACE_OPS_FL_JMP;
+ else
+ tr->fops->flags &= ~FTRACE_OPS_FL_JMP;
+#endif
+ }
kfree(tlinks);
return err;
@@ -568,7 +617,8 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link,
if (err)
return err;
tr->extension_prog = link->link.prog;
- return bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
+ return bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP,
+ BPF_MOD_JUMP, NULL,
link->link.prog->bpf_func);
}
if (cnt >= BPF_MAX_TRAMP_LINKS)
@@ -616,6 +666,7 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
if (kind == BPF_TRAMP_REPLACE) {
WARN_ON_ONCE(!tr->extension_prog);
err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP,
+ BPF_MOD_NOP,
tr->extension_prog->bpf_func, NULL);
tr->extension_prog = NULL;
guard(mutex)(&tgt_prog->aux->ext_mutex);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index fbe4bb91c564..f0ca69f888fa 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -209,8 +209,6 @@ static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env);
static int ref_set_non_owning(struct bpf_verifier_env *env,
struct bpf_reg_state *reg);
-static void specialize_kfunc(struct bpf_verifier_env *env,
- u32 func_id, u16 offset, unsigned long *addr);
static bool is_trusted_reg(const struct bpf_reg_state *reg);
static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
@@ -515,6 +513,7 @@ static bool is_callback_calling_kfunc(u32 btf_id);
static bool is_bpf_throw_kfunc(struct bpf_insn *insn);
static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id);
+static bool is_task_work_add_kfunc(u32 func_id);
static bool is_sync_callback_calling_function(enum bpf_func_id func_id)
{
@@ -547,6 +546,21 @@ static bool is_async_callback_calling_insn(struct bpf_insn *insn)
(bpf_pseudo_kfunc_call(insn) && is_async_callback_calling_kfunc(insn->imm));
}
+static bool is_async_cb_sleepable(struct bpf_verifier_env *env, struct bpf_insn *insn)
+{
+ /* bpf_timer callbacks are never sleepable. */
+ if (bpf_helper_call(insn) && insn->imm == BPF_FUNC_timer_set_callback)
+ return false;
+
+ /* bpf_wq and bpf_task_work callbacks are always sleepable. */
+ if (bpf_pseudo_kfunc_call(insn) && insn->off == 0 &&
+ (is_bpf_wq_set_callback_impl_kfunc(insn->imm) || is_task_work_add_kfunc(insn->imm)))
+ return true;
+
+ verifier_bug(env, "unhandled async callback in is_async_cb_sleepable");
+ return false;
+}
+
static bool is_may_goto_insn(struct bpf_insn *insn)
{
return insn->code == (BPF_JMP | BPF_JCOND) && insn->src_reg == BPF_MAY_GOTO;
@@ -676,6 +690,8 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
return BPF_DYNPTR_TYPE_XDP;
case DYNPTR_TYPE_SKB_META:
return BPF_DYNPTR_TYPE_SKB_META;
+ case DYNPTR_TYPE_FILE:
+ return BPF_DYNPTR_TYPE_FILE;
default:
return BPF_DYNPTR_TYPE_INVALID;
}
@@ -694,6 +710,8 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type)
return DYNPTR_TYPE_XDP;
case BPF_DYNPTR_TYPE_SKB_META:
return DYNPTR_TYPE_SKB_META;
+ case BPF_DYNPTR_TYPE_FILE:
+ return DYNPTR_TYPE_FILE;
default:
return 0;
}
@@ -701,7 +719,7 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type)
static bool dynptr_type_refcounted(enum bpf_dynptr_type type)
{
- return type == BPF_DYNPTR_TYPE_RINGBUF;
+ return type == BPF_DYNPTR_TYPE_RINGBUF || type == BPF_DYNPTR_TYPE_FILE;
}
static void __mark_dynptr_reg(struct bpf_reg_state *reg,
@@ -812,6 +830,15 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re
struct bpf_func_state *state = func(env, reg);
int spi, ref_obj_id, i;
+ /*
+ * This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot
+ * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr
+ * is safe to do directly.
+ */
+ if (reg->type == CONST_PTR_TO_DYNPTR) {
+ verifier_bug(env, "CONST_PTR_TO_DYNPTR cannot be released");
+ return -EFAULT;
+ }
spi = dynptr_get_spi(env, reg);
if (spi < 0)
return spi;
@@ -1410,7 +1437,7 @@ static int copy_reference_state(struct bpf_verifier_state *dst, const struct bpf
dst->acquired_refs = src->acquired_refs;
dst->active_locks = src->active_locks;
dst->active_preempt_locks = src->active_preempt_locks;
- dst->active_rcu_lock = src->active_rcu_lock;
+ dst->active_rcu_locks = src->active_rcu_locks;
dst->active_irq_id = src->active_irq_id;
dst->active_lock_id = src->active_lock_id;
dst->active_lock_ptr = src->active_lock_ptr;
@@ -2093,7 +2120,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL_ACCOUNT);
if (!elem)
- return NULL;
+ return ERR_PTR(-ENOMEM);
elem->insn_idx = insn_idx;
elem->prev_insn_idx = prev_insn_idx;
@@ -2103,12 +2130,12 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
env->stack_size++;
err = copy_verifier_state(&elem->st, cur);
if (err)
- return NULL;
+ return ERR_PTR(-ENOMEM);
elem->st.speculative |= speculative;
if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) {
verbose(env, "The sequence of %d jumps is too complex.\n",
env->stack_size);
- return NULL;
+ return ERR_PTR(-E2BIG);
}
if (elem->st.parent) {
++elem->st.parent->branches;
@@ -2903,7 +2930,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL_ACCOUNT);
if (!elem)
- return NULL;
+ return ERR_PTR(-ENOMEM);
elem->insn_idx = insn_idx;
elem->prev_insn_idx = prev_insn_idx;
@@ -2915,7 +2942,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
verbose(env,
"The sequence of %d jumps is too complex for async cb.\n",
env->stack_size);
- return NULL;
+ return ERR_PTR(-E2BIG);
}
/* Unlike push_stack() do not copy_verifier_state().
* The caller state doesn't matter.
@@ -2926,7 +2953,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
elem->st.in_sleepable = is_sleepable;
frame = kzalloc(sizeof(*frame), GFP_KERNEL_ACCOUNT);
if (!frame)
- return NULL;
+ return ERR_PTR(-ENOMEM);
init_func_state(env, frame,
BPF_MAIN_FUNC /* callsite */,
0 /* frameno within this callchain */,
@@ -3097,6 +3124,9 @@ struct bpf_kfunc_btf_tab {
u32 nr_descs;
};
+static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc,
+ int insn_idx);
+
static int kfunc_desc_cmp_by_id_off(const void *a, const void *b)
{
const struct bpf_kfunc_desc *d0 = a;
@@ -3114,7 +3144,7 @@ static int kfunc_btf_cmp_by_off(const void *a, const void *b)
return d0->offset - d1->offset;
}
-static const struct bpf_kfunc_desc *
+static struct bpf_kfunc_desc *
find_kfunc_desc(const struct bpf_prog *prog, u32 func_id, u16 offset)
{
struct bpf_kfunc_desc desc = {
@@ -3237,12 +3267,12 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
{
const struct btf_type *func, *func_proto;
struct bpf_kfunc_btf_tab *btf_tab;
+ struct btf_func_model func_model;
struct bpf_kfunc_desc_tab *tab;
struct bpf_prog_aux *prog_aux;
struct bpf_kfunc_desc *desc;
const char *func_name;
struct btf *desc_btf;
- unsigned long call_imm;
unsigned long addr;
int err;
@@ -3326,19 +3356,6 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
func_name);
return -EINVAL;
}
- specialize_kfunc(env, func_id, offset, &addr);
-
- if (bpf_jit_supports_far_kfunc_call()) {
- call_imm = func_id;
- } else {
- call_imm = BPF_CALL_IMM(addr);
- /* Check whether the relative offset overflows desc->imm */
- if ((unsigned long)(s32)call_imm != call_imm) {
- verbose(env, "address of kernel function %s is out of range\n",
- func_name);
- return -EINVAL;
- }
- }
if (bpf_dev_bound_kfunc_id(func_id)) {
err = bpf_dev_bound_kfunc_check(&env->log, prog_aux);
@@ -3346,18 +3363,20 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
return err;
}
+ err = btf_distill_func_proto(&env->log, desc_btf,
+ func_proto, func_name,
+ &func_model);
+ if (err)
+ return err;
+
desc = &tab->descs[tab->nr_descs++];
desc->func_id = func_id;
- desc->imm = call_imm;
desc->offset = offset;
desc->addr = addr;
- err = btf_distill_func_proto(&env->log, desc_btf,
- func_proto, func_name,
- &desc->func_model);
- if (!err)
- sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
- kfunc_desc_cmp_by_id_off, NULL);
- return err;
+ desc->func_model = func_model;
+ sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
+ kfunc_desc_cmp_by_id_off, NULL);
+ return 0;
}
static int kfunc_desc_cmp_by_imm_off(const void *a, const void *b)
@@ -3372,16 +3391,43 @@ static int kfunc_desc_cmp_by_imm_off(const void *a, const void *b)
return 0;
}
-static void sort_kfunc_descs_by_imm_off(struct bpf_prog *prog)
+static int set_kfunc_desc_imm(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc)
+{
+ unsigned long call_imm;
+
+ if (bpf_jit_supports_far_kfunc_call()) {
+ call_imm = desc->func_id;
+ } else {
+ call_imm = BPF_CALL_IMM(desc->addr);
+ /* Check whether the relative offset overflows desc->imm */
+ if ((unsigned long)(s32)call_imm != call_imm) {
+ verbose(env, "address of kernel func_id %u is out of range\n",
+ desc->func_id);
+ return -EINVAL;
+ }
+ }
+ desc->imm = call_imm;
+ return 0;
+}
+
+static int sort_kfunc_descs_by_imm_off(struct bpf_verifier_env *env)
{
struct bpf_kfunc_desc_tab *tab;
+ int i, err;
- tab = prog->aux->kfunc_tab;
+ tab = env->prog->aux->kfunc_tab;
if (!tab)
- return;
+ return 0;
+
+ for (i = 0; i < tab->nr_descs; i++) {
+ err = set_kfunc_desc_imm(env, &tab->descs[i]);
+ if (err)
+ return err;
+ }
sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
kfunc_desc_cmp_by_imm_off, NULL);
+ return 0;
}
bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog)
@@ -3509,8 +3555,12 @@ static int check_subprogs(struct bpf_verifier_env *env)
subprog[cur_subprog].has_ld_abs = true;
if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32)
goto next;
- if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL)
+ if (BPF_OP(code) == BPF_CALL)
goto next;
+ if (BPF_OP(code) == BPF_EXIT) {
+ subprog[cur_subprog].exit_idx = i;
+ goto next;
+ }
off = i + bpf_jmp_offset(&insn[i]) + 1;
if (off < subprog_start || off >= subprog_end) {
verbose(env, "jump out of range from insn %d to %d\n", i, off);
@@ -4392,6 +4442,11 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
bt_reg_mask(bt));
return -EFAULT;
}
+ if (insn->src_reg == BPF_REG_0 && insn->imm == BPF_FUNC_tail_call
+ && subseq_idx - idx != 1) {
+ if (bt_subprog_enter(bt))
+ return -EFAULT;
+ }
} else if (opcode == BPF_EXIT) {
bool r0_precise;
@@ -5826,8 +5881,7 @@ bad_type:
static bool in_sleepable(struct bpf_verifier_env *env)
{
- return env->prog->sleepable ||
- (env->cur_state && env->cur_state->in_sleepable);
+ return env->cur_state->in_sleepable;
}
/* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock()
@@ -5835,7 +5889,7 @@ static bool in_sleepable(struct bpf_verifier_env *env)
*/
static bool in_rcu_cs(struct bpf_verifier_env *env)
{
- return env->cur_state->active_rcu_lock ||
+ return env->cur_state->active_rcu_locks ||
env->cur_state->active_locks ||
!in_sleepable(env);
}
@@ -5988,6 +6042,18 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
return 0;
}
+/*
+ * Return the size of the memory region accessible from a pointer to map value.
+ * For INSN_ARRAY maps whole bpf_insn_array->ips array is accessible.
+ */
+static u32 map_mem_size(const struct bpf_map *map)
+{
+ if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY)
+ return map->max_entries * sizeof(long);
+
+ return map->value_size;
+}
+
/* check read/write into a map element with possible variable offset */
static int check_map_access(struct bpf_verifier_env *env, u32 regno,
int off, int size, bool zero_size_allowed,
@@ -5997,11 +6063,11 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
struct bpf_func_state *state = vstate->frame[vstate->curframe];
struct bpf_reg_state *reg = &state->regs[regno];
struct bpf_map *map = reg->map_ptr;
+ u32 mem_size = map_mem_size(map);
struct btf_record *rec;
int err, i;
- err = check_mem_region_access(env, regno, off, size, map->value_size,
- zero_size_allowed);
+ err = check_mem_region_access(env, regno, off, size, mem_size, zero_size_allowed);
if (err)
return err;
@@ -6416,6 +6482,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
break;
case PTR_TO_MAP_VALUE:
pointer_desc = "value ";
+ if (reg->map_ptr->map_type == BPF_MAP_TYPE_INSN_ARRAY)
+ strict = true;
break;
case PTR_TO_CTX:
pointer_desc = "context ";
@@ -7039,6 +7107,9 @@ BTF_TYPE_SAFE_RCU(struct cgroup_subsys_state) {
/* RCU trusted: these fields are trusted in RCU CS and can be NULL */
BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct) {
struct file __rcu *exe_file;
+#ifdef CONFIG_MEMCG
+ struct task_struct __rcu *owner;
+#endif
};
/* skb->sk, req->sk are not RCU protected, but we mark them as such
@@ -7078,6 +7149,11 @@ BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket) {
struct sock *sk;
};
+BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct) {
+ struct mm_struct *vm_mm;
+ struct file *vm_file;
+};
+
static bool type_is_rcu(struct bpf_verifier_env *env,
struct bpf_reg_state *reg,
const char *field_name, u32 btf_id)
@@ -7119,6 +7195,7 @@ static bool type_is_trusted_or_null(struct bpf_verifier_env *env,
{
BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket));
BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct));
return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id,
"__safe_trusted_or_null");
@@ -7502,10 +7579,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
} else if (t == BPF_READ && value_regno >= 0) {
struct bpf_map *map = reg->map_ptr;
- /* if map is read-only, track its contents as scalars */
+ /*
+ * If map is read-only, track its contents as scalars,
+ * unless it is an insn array (see the special case below)
+ */
if (tnum_is_const(reg->var_off) &&
bpf_map_is_rdonly(map) &&
- map->ops->map_direct_value_addr) {
+ map->ops->map_direct_value_addr &&
+ map->map_type != BPF_MAP_TYPE_INSN_ARRAY) {
int map_off = off + reg->var_off.value;
u64 val = 0;
@@ -7516,6 +7597,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
regs[value_regno].type = SCALAR_VALUE;
__mark_reg_known(&regs[value_regno], val);
+ } else if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) {
+ if (bpf_size != BPF_DW) {
+ verbose(env, "Invalid read of %d bytes from insn_array\n",
+ size);
+ return -EACCES;
+ }
+ copy_register_state(&regs[value_regno], reg);
+ regs[value_regno].type = PTR_TO_INSN;
} else {
mark_reg_unknown(env, regs, value_regno);
}
@@ -8464,6 +8553,9 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno,
case BPF_TASK_WORK:
field_off = map->record->task_work_off;
break;
+ case BPF_WORKQUEUE:
+ field_off = map->record->wq_off;
+ break;
default:
verifier_bug(env, "unsupported BTF field type: %s\n", struct_name);
return -EINVAL;
@@ -8505,13 +8597,17 @@ static int process_wq_func(struct bpf_verifier_env *env, int regno,
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
struct bpf_map *map = reg->map_ptr;
- u64 val = reg->var_off.value;
+ int err;
- if (map->record->wq_off != val + reg->off) {
- verbose(env, "off %lld doesn't point to 'struct bpf_wq' that is at %d\n",
- val + reg->off, map->record->wq_off);
- return -EINVAL;
+ err = check_map_field_pointer(env, regno, BPF_WORKQUEUE);
+ if (err)
+ return err;
+
+ if (meta->map.ptr) {
+ verifier_bug(env, "Two map pointers in a bpf_wq helper");
+ return -EFAULT;
}
+
meta->map.uid = reg->map_uid;
meta->map.ptr = map;
return 0;
@@ -9016,8 +9112,8 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
prev_st = find_prev_entry(env, cur_st->parent, insn_idx);
/* branch out active iter state */
queued_st = push_stack(env, insn_idx + 1, insn_idx, false);
- if (!queued_st)
- return -ENOMEM;
+ if (IS_ERR(queued_st))
+ return PTR_ERR(queued_st);
queued_iter = get_iter_from_state(queued_st, meta);
queued_iter->iter.state = BPF_ITER_STATE_ACTIVE;
@@ -10054,6 +10150,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_map_push_elem)
goto error;
break;
+ case BPF_MAP_TYPE_INSN_ARRAY:
+ goto error;
default:
break;
}
@@ -10368,8 +10466,6 @@ typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env,
struct bpf_func_state *callee,
int insn_idx);
-static bool is_task_work_add_kfunc(u32 func_id);
-
static int set_callee_state(struct bpf_verifier_env *env,
struct bpf_func_state *caller,
struct bpf_func_state *callee, int insn_idx);
@@ -10588,10 +10684,9 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins
env->subprog_info[subprog].is_async_cb = true;
async_cb = push_async_cb(env, env->subprog_info[subprog].start,
insn_idx, subprog,
- is_bpf_wq_set_callback_impl_kfunc(insn->imm) ||
- is_task_work_add_kfunc(insn->imm));
- if (!async_cb)
- return -EFAULT;
+ is_async_cb_sleepable(env, insn));
+ if (IS_ERR(async_cb))
+ return PTR_ERR(async_cb);
callee = async_cb->frame[0];
callee->async_entry_cnt = caller->async_entry_cnt + 1;
@@ -10607,8 +10702,8 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins
* proceed with next instruction within current frame.
*/
callback_state = push_stack(env, env->subprog_info[subprog].start, insn_idx, false);
- if (!callback_state)
- return -ENOMEM;
+ if (IS_ERR(callback_state))
+ return PTR_ERR(callback_state);
err = setup_func_entry(env, subprog, insn_idx, set_callee_state_cb,
callback_state);
@@ -10648,7 +10743,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
if (env->subprog_info[subprog].might_sleep &&
- (env->cur_state->active_rcu_lock || env->cur_state->active_preempt_locks ||
+ (env->cur_state->active_rcu_locks || env->cur_state->active_preempt_locks ||
env->cur_state->active_irq_id || !in_sleepable(env))) {
verbose(env, "global functions that may sleep are not allowed in non-sleepable context,\n"
"i.e., in a RCU/IRQ/preempt-disabled section, or in\n"
@@ -10662,8 +10757,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return err;
}
- verbose(env, "Func#%d ('%s') is global and assumed valid.\n",
- subprog, sub_name);
+ if (env->log.level & BPF_LOG_LEVEL)
+ verbose(env, "Func#%d ('%s') is global and assumed valid.\n",
+ subprog, sub_name);
if (env->subprog_info[subprog].changes_pkt_data)
clear_all_pkt_pointers(env);
/* mark global subprog for verifying after main prog */
@@ -10976,6 +11072,10 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
bool in_callback_fn;
int err;
+ err = bpf_update_live_stack(env);
+ if (err)
+ return err;
+
callee = state->frame[state->curframe];
r0 = &callee->regs[BPF_REG_0];
if (r0->type == PTR_TO_STACK) {
@@ -11226,7 +11326,7 @@ static int check_resource_leak(struct bpf_verifier_env *env, bool exception_exit
return -EINVAL;
}
- if (check_lock && env->cur_state->active_rcu_lock) {
+ if (check_lock && env->cur_state->active_rcu_locks) {
verbose(env, "%s cannot be used inside bpf_rcu_read_lock-ed region\n", prefix);
return -EINVAL;
}
@@ -11361,6 +11461,15 @@ static int get_helper_proto(struct bpf_verifier_env *env, int func_id,
return *ptr && (*ptr)->func ? 0 : -EINVAL;
}
+/* Check if we're in a sleepable context. */
+static inline bool in_sleepable_context(struct bpf_verifier_env *env)
+{
+ return !env->cur_state->active_rcu_locks &&
+ !env->cur_state->active_preempt_locks &&
+ !env->cur_state->active_irq_id &&
+ in_sleepable(env);
+}
+
static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx_p)
{
@@ -11421,15 +11530,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return err;
}
- if (env->cur_state->active_rcu_lock) {
+ if (env->cur_state->active_rcu_locks) {
if (fn->might_sleep) {
verbose(env, "sleepable helper %s#%d in rcu_read_lock region\n",
func_id_name(func_id), func_id);
return -EINVAL;
}
-
- if (in_sleepable(env) && is_storage_get_function(func_id))
- env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
}
if (env->cur_state->active_preempt_locks) {
@@ -11438,9 +11544,6 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
func_id_name(func_id), func_id);
return -EINVAL;
}
-
- if (in_sleepable(env) && is_storage_get_function(func_id))
- env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
}
if (env->cur_state->active_irq_id) {
@@ -11449,11 +11552,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
func_id_name(func_id), func_id);
return -EINVAL;
}
-
- if (in_sleepable(env) && is_storage_get_function(func_id))
- env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
}
+ /* Track non-sleepable context for helpers. */
+ if (!in_sleepable_context(env))
+ env->insn_aux_data[insn_idx].non_sleepable = true;
+
meta.func_id = func_id;
/* check args */
for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
@@ -11484,15 +11588,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (meta.release_regno) {
err = -EINVAL;
- /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot
- * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr
- * is safe to do directly.
- */
if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) {
- if (regs[meta.release_regno].type == CONST_PTR_TO_DYNPTR) {
- verifier_bug(env, "CONST_PTR_TO_DYNPTR cannot be released");
- return -EFAULT;
- }
err = unmark_stack_slots_dynptr(env, &regs[meta.release_regno]);
} else if (func_id == BPF_FUNC_kptr_xchg && meta.ref_obj_id) {
u32 ref_obj_id = meta.ref_obj_id;
@@ -11886,6 +11982,25 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
env->prog->call_get_func_ip = true;
}
+ if (func_id == BPF_FUNC_tail_call) {
+ if (env->cur_state->curframe) {
+ struct bpf_verifier_state *branch;
+
+ mark_reg_scratched(env, BPF_REG_0);
+ branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false);
+ if (IS_ERR(branch))
+ return PTR_ERR(branch);
+ clear_all_pkt_pointers(env);
+ mark_reg_unknown(env, regs, BPF_REG_0);
+ err = prepare_func_exit(env, &env->insn_idx);
+ if (err)
+ return err;
+ env->insn_idx--;
+ } else {
+ changes_data = false;
+ }
+ }
+
if (changes_data)
clear_all_pkt_pointers(env);
return 0;
@@ -12260,6 +12375,8 @@ enum special_kfunc_type {
KF_bpf_res_spin_unlock,
KF_bpf_res_spin_lock_irqsave,
KF_bpf_res_spin_unlock_irqrestore,
+ KF_bpf_dynptr_from_file,
+ KF_bpf_dynptr_file_discard,
KF___bpf_trap,
KF_bpf_task_work_schedule_signal_impl,
KF_bpf_task_work_schedule_resume_impl,
@@ -12332,6 +12449,8 @@ BTF_ID(func, bpf_res_spin_lock)
BTF_ID(func, bpf_res_spin_unlock)
BTF_ID(func, bpf_res_spin_lock_irqsave)
BTF_ID(func, bpf_res_spin_unlock_irqrestore)
+BTF_ID(func, bpf_dynptr_from_file)
+BTF_ID(func, bpf_dynptr_file_discard)
BTF_ID(func, __bpf_trap)
BTF_ID(func, bpf_task_work_schedule_signal_impl)
BTF_ID(func, bpf_task_work_schedule_resume_impl)
@@ -13295,6 +13414,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
dynptr_arg_type |= DYNPTR_TYPE_XDP;
} else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb_meta]) {
dynptr_arg_type |= DYNPTR_TYPE_SKB_META;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) {
+ dynptr_arg_type |= DYNPTR_TYPE_FILE;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_file_discard]) {
+ dynptr_arg_type |= DYNPTR_TYPE_FILE;
+ meta->release_regno = regno;
} else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] &&
(dynptr_arg_type & MEM_UNINIT)) {
enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type;
@@ -13829,9 +13953,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
struct bpf_reg_state *regs;
branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false);
- if (!branch) {
+ if (IS_ERR(branch)) {
verbose(env, "failed to push state for failed lock acquisition\n");
- return -ENOMEM;
+ return PTR_ERR(branch);
}
regs = branch->frame[branch->curframe]->regs;
@@ -13863,6 +13987,10 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -EACCES;
}
+ /* Track non-sleepable context for kfuncs, same as for helpers. */
+ if (!in_sleepable_context(env))
+ insn_aux->non_sleepable = true;
+
/* Check the arguments */
err = check_kfunc_args(env, &meta, insn_idx);
if (err < 0)
@@ -13909,36 +14037,33 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
preempt_disable = is_kfunc_bpf_preempt_disable(&meta);
preempt_enable = is_kfunc_bpf_preempt_enable(&meta);
- if (env->cur_state->active_rcu_lock) {
+ if (rcu_lock) {
+ env->cur_state->active_rcu_locks++;
+ } else if (rcu_unlock) {
struct bpf_func_state *state;
struct bpf_reg_state *reg;
u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER);
- if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) {
- verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n");
- return -EACCES;
- }
-
- if (rcu_lock) {
- verbose(env, "nested rcu read lock (kernel function %s)\n", func_name);
+ if (env->cur_state->active_rcu_locks == 0) {
+ verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name);
return -EINVAL;
- } else if (rcu_unlock) {
+ }
+ if (--env->cur_state->active_rcu_locks == 0) {
bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, clear_mask, ({
if (reg->type & MEM_RCU) {
reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL);
reg->type |= PTR_UNTRUSTED;
}
}));
- env->cur_state->active_rcu_lock = false;
- } else if (sleepable) {
- verbose(env, "kernel func %s is sleepable within rcu_read_lock region\n", func_name);
- return -EACCES;
}
- } else if (rcu_lock) {
- env->cur_state->active_rcu_lock = true;
- } else if (rcu_unlock) {
- verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name);
- return -EINVAL;
+ } else if (sleepable && env->cur_state->active_rcu_locks) {
+ verbose(env, "kernel func %s is sleepable within rcu_read_lock region\n", func_name);
+ return -EACCES;
+ }
+
+ if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) {
+ verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n");
+ return -EACCES;
}
if (env->cur_state->active_preempt_locks) {
@@ -13971,12 +14096,18 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
* PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now.
*/
if (meta.release_regno) {
- err = release_reference(env, regs[meta.release_regno].ref_obj_id);
- if (err) {
- verbose(env, "kfunc %s#%d reference has not been acquired before\n",
- func_name, meta.func_id);
- return err;
+ struct bpf_reg_state *reg = &regs[meta.release_regno];
+
+ if (meta.initialized_dynptr.ref_obj_id) {
+ err = unmark_stack_slots_dynptr(env, reg);
+ } else {
+ err = release_reference(env, reg->ref_obj_id);
+ if (err)
+ verbose(env, "kfunc %s#%d reference has not been acquired before\n",
+ func_name, meta.func_id);
}
+ if (err)
+ return err;
}
if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
@@ -14282,16 +14413,15 @@ struct bpf_sanitize_info {
bool mask_to_left;
};
-static struct bpf_verifier_state *
-sanitize_speculative_path(struct bpf_verifier_env *env,
- const struct bpf_insn *insn,
- u32 next_idx, u32 curr_idx)
+static int sanitize_speculative_path(struct bpf_verifier_env *env,
+ const struct bpf_insn *insn,
+ u32 next_idx, u32 curr_idx)
{
struct bpf_verifier_state *branch;
struct bpf_reg_state *regs;
branch = push_stack(env, next_idx, curr_idx, true);
- if (branch && insn) {
+ if (!IS_ERR(branch) && insn) {
regs = branch->frame[branch->curframe]->regs;
if (BPF_SRC(insn->code) == BPF_K) {
mark_reg_unknown(env, regs, insn->dst_reg);
@@ -14300,7 +14430,7 @@ sanitize_speculative_path(struct bpf_verifier_env *env,
mark_reg_unknown(env, regs, insn->src_reg);
}
}
- return branch;
+ return PTR_ERR_OR_ZERO(branch);
}
static int sanitize_ptr_alu(struct bpf_verifier_env *env,
@@ -14319,7 +14449,6 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
u8 opcode = BPF_OP(insn->code);
u32 alu_state, alu_limit;
struct bpf_reg_state tmp;
- bool ret;
int err;
if (can_skip_alu_sanitation(env, insn))
@@ -14392,11 +14521,12 @@ do_sim:
tmp = *dst_reg;
copy_register_state(dst_reg, ptr_reg);
}
- ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1,
- env->insn_idx);
- if (!ptr_is_dst_reg && ret)
+ err = sanitize_speculative_path(env, NULL, env->insn_idx + 1, env->insn_idx);
+ if (err < 0)
+ return REASON_STACK;
+ if (!ptr_is_dst_reg)
*dst_reg = tmp;
- return !ret ? REASON_STACK : 0;
+ return 0;
}
static void sanitize_mark_insn_seen(struct bpf_verifier_env *env)
@@ -15950,6 +16080,30 @@ static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_sta
s64 smin2 = is_jmp32 ? (s64)reg2->s32_min_value : reg2->smin_value;
s64 smax2 = is_jmp32 ? (s64)reg2->s32_max_value : reg2->smax_value;
+ if (reg1 == reg2) {
+ switch (opcode) {
+ case BPF_JGE:
+ case BPF_JLE:
+ case BPF_JSGE:
+ case BPF_JSLE:
+ case BPF_JEQ:
+ return 1;
+ case BPF_JGT:
+ case BPF_JLT:
+ case BPF_JSGT:
+ case BPF_JSLT:
+ case BPF_JNE:
+ return 0;
+ case BPF_JSET:
+ if (tnum_is_const(t1))
+ return t1.value != 0;
+ else
+ return (smin1 <= 0 && smax1 >= 0) ? -1 : 1;
+ default:
+ return -1;
+ }
+ }
+
switch (opcode) {
case BPF_JEQ:
/* constants, umin/umax and smin/smax checks would be
@@ -16396,6 +16550,13 @@ static int reg_set_min_max(struct bpf_verifier_env *env,
if (false_reg1->type != SCALAR_VALUE || false_reg2->type != SCALAR_VALUE)
return 0;
+ /* We compute branch direction for same SCALAR_VALUE registers in
+ * is_scalar_branch_taken(). For unknown branch directions (e.g., BPF_JSET)
+ * on the same registers, we don't need to adjust the min/max values.
+ */
+ if (false_reg1 == false_reg2)
+ return 0;
+
/* fallthrough (FALSE) branch */
regs_refine_cond_op(false_reg1, false_reg2, rev_opcode(opcode), is_jmp32);
reg_bounds_sync(false_reg1);
@@ -16716,8 +16877,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
/* branch out 'fallthrough' insn as a new state to explore */
queued_st = push_stack(env, idx + 1, idx, false);
- if (!queued_st)
- return -ENOMEM;
+ if (IS_ERR(queued_st))
+ return PTR_ERR(queued_st);
queued_st->may_goto_depth++;
if (prev_st)
@@ -16795,10 +16956,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
* the fall-through branch for simulation under speculative
* execution.
*/
- if (!env->bypass_spec_v1 &&
- !sanitize_speculative_path(env, insn, *insn_idx + 1,
- *insn_idx))
- return -EFAULT;
+ if (!env->bypass_spec_v1) {
+ err = sanitize_speculative_path(env, insn, *insn_idx + 1, *insn_idx);
+ if (err < 0)
+ return err;
+ }
if (env->log.level & BPF_LOG_LEVEL)
print_insn_state(env, this_branch, this_branch->curframe);
*insn_idx += insn->off;
@@ -16808,11 +16970,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
* program will go. If needed, push the goto branch for
* simulation under speculative execution.
*/
- if (!env->bypass_spec_v1 &&
- !sanitize_speculative_path(env, insn,
- *insn_idx + insn->off + 1,
- *insn_idx))
- return -EFAULT;
+ if (!env->bypass_spec_v1) {
+ err = sanitize_speculative_path(env, insn, *insn_idx + insn->off + 1,
+ *insn_idx);
+ if (err < 0)
+ return err;
+ }
if (env->log.level & BPF_LOG_LEVEL)
print_insn_state(env, this_branch, this_branch->curframe);
return 0;
@@ -16833,10 +16996,9 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
return err;
}
- other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx,
- false);
- if (!other_branch)
- return -EFAULT;
+ other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, false);
+ if (IS_ERR(other_branch))
+ return PTR_ERR(other_branch);
other_branch_regs = other_branch->frame[other_branch->curframe]->regs;
if (BPF_SRC(insn->code) == BPF_X) {
@@ -17019,7 +17181,8 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
dst_reg->type = PTR_TO_MAP_VALUE;
dst_reg->off = aux->map_off;
- WARN_ON_ONCE(map->max_entries != 1);
+ WARN_ON_ONCE(map->map_type != BPF_MAP_TYPE_INSN_ARRAY &&
+ map->max_entries != 1);
/* We want reg->id to be same (0) as map_value is not distinct */
} else if (insn->src_reg == BPF_PSEUDO_MAP_FD ||
insn->src_reg == BPF_PSEUDO_MAP_IDX) {
@@ -17771,6 +17934,247 @@ static int mark_fastcall_patterns(struct bpf_verifier_env *env)
return 0;
}
+static struct bpf_iarray *iarray_realloc(struct bpf_iarray *old, size_t n_elem)
+{
+ size_t new_size = sizeof(struct bpf_iarray) + n_elem * sizeof(old->items[0]);
+ struct bpf_iarray *new;
+
+ new = kvrealloc(old, new_size, GFP_KERNEL_ACCOUNT);
+ if (!new) {
+ /* this is what callers always want, so simplify the call site */
+ kvfree(old);
+ return NULL;
+ }
+
+ new->cnt = n_elem;
+ return new;
+}
+
+static int copy_insn_array(struct bpf_map *map, u32 start, u32 end, u32 *items)
+{
+ struct bpf_insn_array_value *value;
+ u32 i;
+
+ for (i = start; i <= end; i++) {
+ value = map->ops->map_lookup_elem(map, &i);
+ /*
+ * map_lookup_elem of an array map will never return an error,
+ * but not checking it makes some static analysers to worry
+ */
+ if (IS_ERR(value))
+ return PTR_ERR(value);
+ else if (!value)
+ return -EINVAL;
+ items[i - start] = value->xlated_off;
+ }
+ return 0;
+}
+
+static int cmp_ptr_to_u32(const void *a, const void *b)
+{
+ return *(u32 *)a - *(u32 *)b;
+}
+
+static int sort_insn_array_uniq(u32 *items, int cnt)
+{
+ int unique = 1;
+ int i;
+
+ sort(items, cnt, sizeof(items[0]), cmp_ptr_to_u32, NULL);
+
+ for (i = 1; i < cnt; i++)
+ if (items[i] != items[unique - 1])
+ items[unique++] = items[i];
+
+ return unique;
+}
+
+/*
+ * sort_unique({map[start], ..., map[end]}) into off
+ */
+static int copy_insn_array_uniq(struct bpf_map *map, u32 start, u32 end, u32 *off)
+{
+ u32 n = end - start + 1;
+ int err;
+
+ err = copy_insn_array(map, start, end, off);
+ if (err)
+ return err;
+
+ return sort_insn_array_uniq(off, n);
+}
+
+/*
+ * Copy all unique offsets from the map
+ */
+static struct bpf_iarray *jt_from_map(struct bpf_map *map)
+{
+ struct bpf_iarray *jt;
+ int err;
+ int n;
+
+ jt = iarray_realloc(NULL, map->max_entries);
+ if (!jt)
+ return ERR_PTR(-ENOMEM);
+
+ n = copy_insn_array_uniq(map, 0, map->max_entries - 1, jt->items);
+ if (n < 0) {
+ err = n;
+ goto err_free;
+ }
+ if (n == 0) {
+ err = -EINVAL;
+ goto err_free;
+ }
+ jt->cnt = n;
+ return jt;
+
+err_free:
+ kvfree(jt);
+ return ERR_PTR(err);
+}
+
+/*
+ * Find and collect all maps which fit in the subprog. Return the result as one
+ * combined jump table in jt->items (allocated with kvcalloc)
+ */
+static struct bpf_iarray *jt_from_subprog(struct bpf_verifier_env *env,
+ int subprog_start, int subprog_end)
+{
+ struct bpf_iarray *jt = NULL;
+ struct bpf_map *map;
+ struct bpf_iarray *jt_cur;
+ int i;
+
+ for (i = 0; i < env->insn_array_map_cnt; i++) {
+ /*
+ * TODO (when needed): collect only jump tables, not static keys
+ * or maps for indirect calls
+ */
+ map = env->insn_array_maps[i];
+
+ jt_cur = jt_from_map(map);
+ if (IS_ERR(jt_cur)) {
+ kvfree(jt);
+ return jt_cur;
+ }
+
+ /*
+ * This is enough to check one element. The full table is
+ * checked to fit inside the subprog later in create_jt()
+ */
+ if (jt_cur->items[0] >= subprog_start && jt_cur->items[0] < subprog_end) {
+ u32 old_cnt = jt ? jt->cnt : 0;
+ jt = iarray_realloc(jt, old_cnt + jt_cur->cnt);
+ if (!jt) {
+ kvfree(jt_cur);
+ return ERR_PTR(-ENOMEM);
+ }
+ memcpy(jt->items + old_cnt, jt_cur->items, jt_cur->cnt << 2);
+ }
+
+ kvfree(jt_cur);
+ }
+
+ if (!jt) {
+ verbose(env, "no jump tables found for subprog starting at %u\n", subprog_start);
+ return ERR_PTR(-EINVAL);
+ }
+
+ jt->cnt = sort_insn_array_uniq(jt->items, jt->cnt);
+ return jt;
+}
+
+static struct bpf_iarray *
+create_jt(int t, struct bpf_verifier_env *env)
+{
+ static struct bpf_subprog_info *subprog;
+ int subprog_start, subprog_end;
+ struct bpf_iarray *jt;
+ int i;
+
+ subprog = bpf_find_containing_subprog(env, t);
+ subprog_start = subprog->start;
+ subprog_end = (subprog + 1)->start;
+ jt = jt_from_subprog(env, subprog_start, subprog_end);
+ if (IS_ERR(jt))
+ return jt;
+
+ /* Check that the every element of the jump table fits within the given subprogram */
+ for (i = 0; i < jt->cnt; i++) {
+ if (jt->items[i] < subprog_start || jt->items[i] >= subprog_end) {
+ verbose(env, "jump table for insn %d points outside of the subprog [%u,%u]\n",
+ t, subprog_start, subprog_end);
+ kvfree(jt);
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
+ return jt;
+}
+
+/* "conditional jump with N edges" */
+static int visit_gotox_insn(int t, struct bpf_verifier_env *env)
+{
+ int *insn_stack = env->cfg.insn_stack;
+ int *insn_state = env->cfg.insn_state;
+ bool keep_exploring = false;
+ struct bpf_iarray *jt;
+ int i, w;
+
+ jt = env->insn_aux_data[t].jt;
+ if (!jt) {
+ jt = create_jt(t, env);
+ if (IS_ERR(jt))
+ return PTR_ERR(jt);
+
+ env->insn_aux_data[t].jt = jt;
+ }
+
+ mark_prune_point(env, t);
+ for (i = 0; i < jt->cnt; i++) {
+ w = jt->items[i];
+ if (w < 0 || w >= env->prog->len) {
+ verbose(env, "indirect jump out of range from insn %d to %d\n", t, w);
+ return -EINVAL;
+ }
+
+ mark_jmp_point(env, w);
+
+ /* EXPLORED || DISCOVERED */
+ if (insn_state[w])
+ continue;
+
+ if (env->cfg.cur_stack >= env->prog->len)
+ return -E2BIG;
+
+ insn_stack[env->cfg.cur_stack++] = w;
+ insn_state[w] |= DISCOVERED;
+ keep_exploring = true;
+ }
+
+ return keep_exploring ? KEEP_EXPLORING : DONE_EXPLORING;
+}
+
+static int visit_tailcall_insn(struct bpf_verifier_env *env, int t)
+{
+ static struct bpf_subprog_info *subprog;
+ struct bpf_iarray *jt;
+
+ if (env->insn_aux_data[t].jt)
+ return 0;
+
+ jt = iarray_realloc(NULL, 2);
+ if (!jt)
+ return -ENOMEM;
+
+ subprog = bpf_find_containing_subprog(env, t);
+ jt->items[0] = t + 1;
+ jt->items[1] = subprog->exit_idx;
+ env->insn_aux_data[t].jt = jt;
+ return 0;
+}
+
/* Visits the instruction at index t and returns one of the following:
* < 0 - an error occurred
* DONE_EXPLORING - the instruction was fully explored
@@ -17831,6 +18235,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
mark_subprog_might_sleep(env, t);
if (bpf_helper_changes_pkt_data(insn->imm))
mark_subprog_changes_pkt_data(env, t);
+ if (insn->imm == BPF_FUNC_tail_call)
+ visit_tailcall_insn(env, t);
} else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
struct bpf_kfunc_call_arg_meta meta;
@@ -17863,8 +18269,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL);
case BPF_JA:
- if (BPF_SRC(insn->code) != BPF_K)
- return -EINVAL;
+ if (BPF_SRC(insn->code) == BPF_X)
+ return visit_gotox_insn(t, env);
if (BPF_CLASS(insn->code) == BPF_JMP)
off = insn->off;
@@ -17991,8 +18397,9 @@ err_free:
*/
static int compute_postorder(struct bpf_verifier_env *env)
{
- u32 cur_postorder, i, top, stack_sz, s, succ_cnt, succ[2];
+ u32 cur_postorder, i, top, stack_sz, s;
int *stack = NULL, *postorder = NULL, *state = NULL;
+ struct bpf_iarray *succ;
postorder = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT);
state = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT);
@@ -18016,11 +18423,11 @@ static int compute_postorder(struct bpf_verifier_env *env)
stack_sz--;
continue;
}
- succ_cnt = bpf_insn_successors(env->prog, top, succ);
- for (s = 0; s < succ_cnt; ++s) {
- if (!state[succ[s]]) {
- stack[stack_sz++] = succ[s];
- state[succ[s]] |= DISCOVERED;
+ succ = bpf_insn_successors(env, top);
+ for (s = 0; s < succ->cnt; ++s) {
+ if (!state[succ->items[s]]) {
+ stack[stack_sz++] = succ->items[s];
+ state[succ->items[s]] |= DISCOVERED;
}
}
state[top] |= EXPLORED;
@@ -18792,6 +19199,10 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno;
case PTR_TO_ARENA:
return true;
+ case PTR_TO_INSN:
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 &&
+ rold->off == rcur->off && range_within(rold, rcur) &&
+ tnum_in(rold->var_off, rcur->var_off);
default:
return regs_exact(rold, rcur, idmap);
}
@@ -18972,7 +19383,7 @@ static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c
if (old->active_preempt_locks != cur->active_preempt_locks)
return false;
- if (old->active_rcu_lock != cur->active_rcu_lock)
+ if (old->active_rcu_locks != cur->active_rcu_locks)
return false;
if (!check_ids(old->active_irq_id, cur->active_irq_id, idmap))
@@ -19144,7 +19555,7 @@ static int propagate_precision(struct bpf_verifier_env *env,
bt_set_frame_slot(&env->bt, fr, i);
first = false;
}
- if (!first)
+ if (!first && (env->log.level & BPF_LOG_LEVEL2))
verbose(env, "\n");
}
@@ -19784,9 +20195,6 @@ static int process_bpf_exit_full(struct bpf_verifier_env *env,
return PROCESS_BPF_EXIT;
if (env->cur_state->curframe) {
- err = bpf_update_live_stack(env);
- if (err)
- return err;
/* exit from nested function */
err = prepare_func_exit(env, &env->insn_idx);
if (err)
@@ -19801,6 +20209,99 @@ static int process_bpf_exit_full(struct bpf_verifier_env *env,
return PROCESS_BPF_EXIT;
}
+static int indirect_jump_min_max_index(struct bpf_verifier_env *env,
+ int regno,
+ struct bpf_map *map,
+ u32 *pmin_index, u32 *pmax_index)
+{
+ struct bpf_reg_state *reg = reg_state(env, regno);
+ u64 min_index, max_index;
+ const u32 size = 8;
+
+ if (check_add_overflow(reg->umin_value, reg->off, &min_index) ||
+ (min_index > (u64) U32_MAX * size)) {
+ verbose(env, "the sum of R%u umin_value %llu and off %u is too big\n",
+ regno, reg->umin_value, reg->off);
+ return -ERANGE;
+ }
+ if (check_add_overflow(reg->umax_value, reg->off, &max_index) ||
+ (max_index > (u64) U32_MAX * size)) {
+ verbose(env, "the sum of R%u umax_value %llu and off %u is too big\n",
+ regno, reg->umax_value, reg->off);
+ return -ERANGE;
+ }
+
+ min_index /= size;
+ max_index /= size;
+
+ if (max_index >= map->max_entries) {
+ verbose(env, "R%u points to outside of jump table: [%llu,%llu] max_entries %u\n",
+ regno, min_index, max_index, map->max_entries);
+ return -EINVAL;
+ }
+
+ *pmin_index = min_index;
+ *pmax_index = max_index;
+ return 0;
+}
+
+/* gotox *dst_reg */
+static int check_indirect_jump(struct bpf_verifier_env *env, struct bpf_insn *insn)
+{
+ struct bpf_verifier_state *other_branch;
+ struct bpf_reg_state *dst_reg;
+ struct bpf_map *map;
+ u32 min_index, max_index;
+ int err = 0;
+ int n;
+ int i;
+
+ dst_reg = reg_state(env, insn->dst_reg);
+ if (dst_reg->type != PTR_TO_INSN) {
+ verbose(env, "R%d has type %s, expected PTR_TO_INSN\n",
+ insn->dst_reg, reg_type_str(env, dst_reg->type));
+ return -EINVAL;
+ }
+
+ map = dst_reg->map_ptr;
+ if (verifier_bug_if(!map, env, "R%d has an empty map pointer", insn->dst_reg))
+ return -EFAULT;
+
+ if (verifier_bug_if(map->map_type != BPF_MAP_TYPE_INSN_ARRAY, env,
+ "R%d has incorrect map type %d", insn->dst_reg, map->map_type))
+ return -EFAULT;
+
+ err = indirect_jump_min_max_index(env, insn->dst_reg, map, &min_index, &max_index);
+ if (err)
+ return err;
+
+ /* Ensure that the buffer is large enough */
+ if (!env->gotox_tmp_buf || env->gotox_tmp_buf->cnt < max_index - min_index + 1) {
+ env->gotox_tmp_buf = iarray_realloc(env->gotox_tmp_buf,
+ max_index - min_index + 1);
+ if (!env->gotox_tmp_buf)
+ return -ENOMEM;
+ }
+
+ n = copy_insn_array_uniq(map, min_index, max_index, env->gotox_tmp_buf->items);
+ if (n < 0)
+ return n;
+ if (n == 0) {
+ verbose(env, "register R%d doesn't point to any offset in map id=%d\n",
+ insn->dst_reg, map->id);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < n - 1; i++) {
+ other_branch = push_stack(env, env->gotox_tmp_buf->items[i],
+ env->insn_idx, env->cur_state->speculative);
+ if (IS_ERR(other_branch))
+ return PTR_ERR(other_branch);
+ }
+ env->insn_idx = env->gotox_tmp_buf->items[n-1];
+ return 0;
+}
+
static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state)
{
int err;
@@ -19903,6 +20404,15 @@ static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state)
mark_reg_scratched(env, BPF_REG_0);
} else if (opcode == BPF_JA) {
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (insn->src_reg != BPF_REG_0 ||
+ insn->imm != 0 || insn->off != 0) {
+ verbose(env, "BPF_JA|BPF_X uses reserved fields\n");
+ return -EINVAL;
+ }
+ return check_indirect_jump(env, insn);
+ }
+
if (BPF_SRC(insn->code) != BPF_K ||
insn->src_reg != BPF_REG_0 ||
insn->dst_reg != BPF_REG_0 ||
@@ -20419,6 +20929,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_QUEUE:
case BPF_MAP_TYPE_STACK:
case BPF_MAP_TYPE_ARENA:
+ case BPF_MAP_TYPE_INSN_ARRAY:
break;
default:
verbose(env,
@@ -20490,6 +21001,15 @@ static int __add_used_map(struct bpf_verifier_env *env, struct bpf_map *map)
env->used_maps[env->used_map_cnt++] = map;
+ if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) {
+ err = bpf_insn_array_init(map, env->prog);
+ if (err) {
+ verbose(env, "Failed to properly initialize insn array\n");
+ return err;
+ }
+ env->insn_array_maps[env->insn_array_map_cnt++] = map;
+ }
+
return env->used_map_cnt - 1;
}
@@ -20736,6 +21256,33 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len
}
}
+static void release_insn_arrays(struct bpf_verifier_env *env)
+{
+ int i;
+
+ for (i = 0; i < env->insn_array_map_cnt; i++)
+ bpf_insn_array_release(env->insn_array_maps[i]);
+}
+
+static void adjust_insn_arrays(struct bpf_verifier_env *env, u32 off, u32 len)
+{
+ int i;
+
+ if (len == 1)
+ return;
+
+ for (i = 0; i < env->insn_array_map_cnt; i++)
+ bpf_insn_array_adjust(env->insn_array_maps[i], off, len);
+}
+
+static void adjust_insn_arrays_after_remove(struct bpf_verifier_env *env, u32 off, u32 len)
+{
+ int i;
+
+ for (i = 0; i < env->insn_array_map_cnt; i++)
+ bpf_insn_array_adjust_after_remove(env->insn_array_maps[i], off, len);
+}
+
static void adjust_poke_descs(struct bpf_prog *prog, u32 off, u32 len)
{
struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab;
@@ -20777,6 +21324,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
}
adjust_insn_aux_data(env, new_prog, off, len);
adjust_subprog_starts(env, off, len);
+ adjust_insn_arrays(env, off, len);
adjust_poke_descs(new_prog, off, len);
return new_prog;
}
@@ -20939,6 +21487,27 @@ static int bpf_adj_linfo_after_remove(struct bpf_verifier_env *env, u32 off,
return 0;
}
+/*
+ * Clean up dynamically allocated fields of aux data for instructions [start, ...]
+ */
+static void clear_insn_aux_data(struct bpf_verifier_env *env, int start, int len)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ struct bpf_insn *insns = env->prog->insnsi;
+ int end = start + len;
+ int i;
+
+ for (i = start; i < end; i++) {
+ if (aux_data[i].jt) {
+ kvfree(aux_data[i].jt);
+ aux_data[i].jt = NULL;
+ }
+
+ if (bpf_is_ldimm64(&insns[i]))
+ i++;
+ }
+}
+
static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
{
struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
@@ -20948,6 +21517,9 @@ static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
if (bpf_prog_is_offloaded(env->prog->aux))
bpf_prog_offload_remove_insns(env, off, cnt);
+ /* Should be called before bpf_remove_insns, as it uses prog->insnsi */
+ clear_insn_aux_data(env, off, cnt);
+
err = bpf_remove_insns(env->prog, off, cnt);
if (err)
return err;
@@ -20960,6 +21532,8 @@ static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
if (err)
return err;
+ adjust_insn_arrays_after_remove(env, off, cnt);
+
memmove(aux_data + off, aux_data + off + cnt,
sizeof(*aux_data) * (orig_prog_len - off - cnt));
@@ -21499,6 +22073,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
struct bpf_insn *insn;
void *old_bpf_func;
int err, num_exentries;
+ int old_len, subprog_start_adjustment = 0;
if (env->subprog_cnt <= 1)
return 0;
@@ -21573,6 +22148,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
func[i]->aux->func_idx = i;
/* Below members will be freed only at prog->aux */
func[i]->aux->btf = prog->aux->btf;
+ func[i]->aux->subprog_start = subprog_start + subprog_start_adjustment;
func[i]->aux->func_info = prog->aux->func_info;
func[i]->aux->func_info_cnt = prog->aux->func_info_cnt;
func[i]->aux->poke_tab = prog->aux->poke_tab;
@@ -21602,6 +22178,8 @@ static int jit_subprogs(struct bpf_verifier_env *env)
func[i]->aux->jited_linfo = prog->aux->jited_linfo;
func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx;
func[i]->aux->arena = prog->aux->arena;
+ func[i]->aux->used_maps = env->used_maps;
+ func[i]->aux->used_map_cnt = env->used_map_cnt;
num_exentries = 0;
insn = func[i]->insnsi;
for (j = 0; j < func[i]->len; j++, insn++) {
@@ -21626,7 +22204,15 @@ static int jit_subprogs(struct bpf_verifier_env *env)
func[i]->aux->might_sleep = env->subprog_info[i].might_sleep;
if (!i)
func[i]->aux->exception_boundary = env->seen_exception;
+
+ /*
+ * To properly pass the absolute subprog start to jit
+ * all instruction adjustments should be accumulated
+ */
+ old_len = func[i]->len;
func[i] = bpf_int_jit_compile(func[i]);
+ subprog_start_adjustment += func[i]->len - old_len;
+
if (!func[i]->jited) {
err = -ENOTSUPP;
goto out_free;
@@ -21679,6 +22265,15 @@ static int jit_subprogs(struct bpf_verifier_env *env)
cond_resched();
}
+ /*
+ * Cleanup func[i]->aux fields which aren't required
+ * or can become invalid in future
+ */
+ for (i = 0; i < env->subprog_cnt; i++) {
+ func[i]->aux->used_maps = NULL;
+ func[i]->aux->used_map_cnt = 0;
+ }
+
/* finally lock prog and jit images for all functions and
* populate kallsysm. Begin at the first subprogram, since
* bpf_prog_load will add the kallsyms for the main program.
@@ -21808,46 +22403,47 @@ static int fixup_call_args(struct bpf_verifier_env *env)
}
/* replace a generic kfunc with a specialized version if necessary */
-static void specialize_kfunc(struct bpf_verifier_env *env,
- u32 func_id, u16 offset, unsigned long *addr)
+static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc, int insn_idx)
{
struct bpf_prog *prog = env->prog;
bool seen_direct_write;
void *xdp_kfunc;
bool is_rdonly;
+ u32 func_id = desc->func_id;
+ u16 offset = desc->offset;
+ unsigned long addr = desc->addr;
+
+ if (offset) /* return if module BTF is used */
+ return 0;
if (bpf_dev_bound_kfunc_id(func_id)) {
xdp_kfunc = bpf_dev_bound_resolve_kfunc(prog, func_id);
- if (xdp_kfunc) {
- *addr = (unsigned long)xdp_kfunc;
- return;
- }
+ if (xdp_kfunc)
+ addr = (unsigned long)xdp_kfunc;
/* fallback to default kfunc when not supported by netdev */
- }
-
- if (offset)
- return;
-
- if (func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) {
+ } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) {
seen_direct_write = env->seen_direct_write;
is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE);
if (is_rdonly)
- *addr = (unsigned long)bpf_dynptr_from_skb_rdonly;
+ addr = (unsigned long)bpf_dynptr_from_skb_rdonly;
/* restore env->seen_direct_write to its original value, since
* may_access_direct_pkt_data mutates it
*/
env->seen_direct_write = seen_direct_write;
+ } else if (func_id == special_kfunc_list[KF_bpf_set_dentry_xattr]) {
+ if (bpf_lsm_has_d_inode_locked(prog))
+ addr = (unsigned long)bpf_set_dentry_xattr_locked;
+ } else if (func_id == special_kfunc_list[KF_bpf_remove_dentry_xattr]) {
+ if (bpf_lsm_has_d_inode_locked(prog))
+ addr = (unsigned long)bpf_remove_dentry_xattr_locked;
+ } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) {
+ if (!env->insn_aux_data[insn_idx].non_sleepable)
+ addr = (unsigned long)bpf_dynptr_from_file_sleepable;
}
-
- if (func_id == special_kfunc_list[KF_bpf_set_dentry_xattr] &&
- bpf_lsm_has_d_inode_locked(prog))
- *addr = (unsigned long)bpf_set_dentry_xattr_locked;
-
- if (func_id == special_kfunc_list[KF_bpf_remove_dentry_xattr] &&
- bpf_lsm_has_d_inode_locked(prog))
- *addr = (unsigned long)bpf_remove_dentry_xattr_locked;
+ desc->addr = addr;
+ return 0;
}
static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux,
@@ -21870,7 +22466,8 @@ static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux,
static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
struct bpf_insn *insn_buf, int insn_idx, int *cnt)
{
- const struct bpf_kfunc_desc *desc;
+ struct bpf_kfunc_desc *desc;
+ int err;
if (!insn->imm) {
verbose(env, "invalid kernel function call not eliminated in verifier pass\n");
@@ -21890,6 +22487,10 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -EFAULT;
}
+ err = specialize_kfunc(env, desc, insn_idx);
+ if (err)
+ return err;
+
if (!bpf_jit_supports_far_kfunc_call())
insn->imm = BPF_CALL_IMM(desc->addr);
if (insn->off)
@@ -22485,8 +23086,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
}
if (is_storage_get_function(insn->imm)) {
- if (!in_sleepable(env) ||
- env->insn_aux_data[i + delta].storage_get_func_atomic)
+ if (env->insn_aux_data[i + delta].non_sleepable)
insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC);
else
insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL);
@@ -22919,7 +23519,9 @@ next_insn:
}
}
- sort_kfunc_descs_by_imm_off(env->prog);
+ ret = sort_kfunc_descs_by_imm_off(env);
+ if (ret)
+ return ret;
return 0;
}
@@ -23156,6 +23758,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
state->curframe = 0;
state->speculative = false;
state->branches = 1;
+ state->in_sleepable = env->prog->sleepable;
state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL_ACCOUNT);
if (!state->frame[0]) {
kfree(state);
@@ -23175,7 +23778,8 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
struct bpf_subprog_arg_info *arg;
struct bpf_reg_state *reg;
- verbose(env, "Validating %s() func#%d...\n", sub_name, subprog);
+ if (env->log.level & BPF_LOG_LEVEL)
+ verbose(env, "Validating %s() func#%d...\n", sub_name, subprog);
ret = btf_prepare_func_args(env, subprog);
if (ret)
goto out;
@@ -24278,14 +24882,13 @@ static int compute_live_registers(struct bpf_verifier_env *env)
for (i = 0; i < env->cfg.cur_postorder; ++i) {
int insn_idx = env->cfg.insn_postorder[i];
struct insn_live_regs *live = &state[insn_idx];
- int succ_num;
- u32 succ[2];
+ struct bpf_iarray *succ;
u16 new_out = 0;
u16 new_in = 0;
- succ_num = bpf_insn_successors(env->prog, insn_idx, succ);
- for (int s = 0; s < succ_num; ++s)
- new_out |= state[succ[s]].in;
+ succ = bpf_insn_successors(env, insn_idx);
+ for (int s = 0; s < succ->cnt; ++s)
+ new_out |= state[succ->items[s]].in;
new_in = (new_out & ~live->def) | live->use;
if (new_out != live->out || new_in != live->in) {
live->in = new_in;
@@ -24338,11 +24941,11 @@ static int compute_scc(struct bpf_verifier_env *env)
const u32 insn_cnt = env->prog->len;
int stack_sz, dfs_sz, err = 0;
u32 *stack, *pre, *low, *dfs;
- u32 succ_cnt, i, j, t, w;
+ u32 i, j, t, w;
u32 next_preorder_num;
u32 next_scc_id;
bool assign_scc;
- u32 succ[2];
+ struct bpf_iarray *succ;
next_preorder_num = 1;
next_scc_id = 1;
@@ -24449,12 +25052,12 @@ dfs_continue:
stack[stack_sz++] = w;
}
/* Visit 'w' successors */
- succ_cnt = bpf_insn_successors(env->prog, w, succ);
- for (j = 0; j < succ_cnt; ++j) {
- if (pre[succ[j]]) {
- low[w] = min(low[w], low[succ[j]]);
+ succ = bpf_insn_successors(env, w);
+ for (j = 0; j < succ->cnt; ++j) {
+ if (pre[succ->items[j]]) {
+ low[w] = min(low[w], low[succ->items[j]]);
} else {
- dfs[dfs_sz++] = succ[j];
+ dfs[dfs_sz++] = succ->items[j];
goto dfs_continue;
}
}
@@ -24471,8 +25074,8 @@ dfs_continue:
* or if component has a self reference.
*/
assign_scc = stack[stack_sz - 1] != w;
- for (j = 0; j < succ_cnt; ++j) {
- if (succ[j] == w) {
+ for (j = 0; j < succ->cnt; ++j) {
+ if (succ->items[j] == w) {
assign_scc = true;
break;
}
@@ -24534,6 +25137,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
goto err_free_env;
for (i = 0; i < len; i++)
env->insn_aux_data[i].orig_idx = i;
+ env->succ = iarray_realloc(NULL, 2);
+ if (!env->succ)
+ goto err_free_env;
env->prog = *prog;
env->ops = bpf_verifier_ops[env->prog->type];
@@ -24757,6 +25363,8 @@ skip_full_check:
adjust_btf_func(env);
err_release_maps:
+ if (ret)
+ release_insn_arrays(env);
if (!env->prog->aux->used_maps)
/* if we didn't copy map pointers into bpf_prog_info, release
* them now. Otherwise free_used_maps() will release them.
@@ -24777,11 +25385,14 @@ err_release_maps:
err_unlock:
if (!is_priv)
mutex_unlock(&bpf_verifier_lock);
+ clear_insn_aux_data(env, 0, env->prog->len);
vfree(env->insn_aux_data);
err_free_env:
bpf_stack_liveness_free(env);
kvfree(env->cfg.insn_postorder);
kvfree(env->scc_info);
+ kvfree(env->succ);
+ kvfree(env->gotox_tmp_buf);
kvfree(env);
return ret;
}
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index ae1eb7a85eb4..e717208cfb18 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -60,6 +60,7 @@
#include <linux/sched/deadline.h>
#include <linux/psi.h>
#include <linux/nstree.h>
+#include <linux/irq_work.h>
#include <net/sock.h>
#define CREATE_TRACE_POINTS
@@ -287,6 +288,7 @@ static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
+static void cgroup_rt_init(void);
#ifdef CONFIG_DEBUG_CGROUP_REF
#define CGROUP_REF_FN_ATTRS noinline
@@ -941,7 +943,8 @@ static void css_set_move_task(struct task_struct *task,
/*
* We are synchronized through cgroup_threadgroup_rwsem
* against PF_EXITING setting such that we can't race
- * against cgroup_exit()/cgroup_free() dropping the css_set.
+ * against cgroup_task_dead()/cgroup_task_free() dropping
+ * the css_set.
*/
WARN_ON_ONCE(task->flags & PF_EXITING);
@@ -4701,6 +4704,7 @@ void cgroup_file_notify(struct cgroup_file *cfile)
}
spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
}
+EXPORT_SYMBOL_GPL(cgroup_file_notify);
/**
* cgroup_file_show - show or hide a hidden cgroup file
@@ -6354,6 +6358,7 @@ int __init cgroup_init(void)
BUG_ON(ss_rstat_init(NULL));
get_user_ns(init_cgroup_ns.user_ns);
+ cgroup_rt_init();
cgroup_lock();
@@ -6967,19 +6972,29 @@ void cgroup_post_fork(struct task_struct *child,
}
/**
- * cgroup_exit - detach cgroup from exiting task
+ * cgroup_task_exit - detach cgroup from exiting task
* @tsk: pointer to task_struct of exiting process
*
* Description: Detach cgroup from @tsk.
*
*/
-void cgroup_exit(struct task_struct *tsk)
+void cgroup_task_exit(struct task_struct *tsk)
{
struct cgroup_subsys *ss;
- struct css_set *cset;
int i;
- spin_lock_irq(&css_set_lock);
+ /* see cgroup_post_fork() for details */
+ do_each_subsys_mask(ss, i, have_exit_callback) {
+ ss->exit(tsk);
+ } while_each_subsys_mask();
+}
+
+static void do_cgroup_task_dead(struct task_struct *tsk)
+{
+ struct css_set *cset;
+ unsigned long flags;
+
+ spin_lock_irqsave(&css_set_lock, flags);
WARN_ON_ONCE(list_empty(&tsk->cg_list));
cset = task_css_set(tsk);
@@ -6997,15 +7012,61 @@ void cgroup_exit(struct task_struct *tsk)
test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
cgroup_update_frozen(task_dfl_cgroup(tsk));
- spin_unlock_irq(&css_set_lock);
+ spin_unlock_irqrestore(&css_set_lock, flags);
+}
- /* see cgroup_post_fork() for details */
- do_each_subsys_mask(ss, i, have_exit_callback) {
- ss->exit(tsk);
- } while_each_subsys_mask();
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * cgroup_task_dead() is called from finish_task_switch() which doesn't allow
+ * scheduling even in RT. As the task_dead path requires grabbing css_set_lock,
+ * this lead to sleeping in the invalid context warning bug. css_set_lock is too
+ * big to become a raw_spinlock. The task_dead path doesn't need to run
+ * synchronously but can't be delayed indefinitely either as the dead task pins
+ * the cgroup and task_struct can be pinned indefinitely. Bounce through lazy
+ * irq_work to allow batching while ensuring timely completion.
+ */
+static DEFINE_PER_CPU(struct llist_head, cgrp_dead_tasks);
+static DEFINE_PER_CPU(struct irq_work, cgrp_dead_tasks_iwork);
+
+static void cgrp_dead_tasks_iwork_fn(struct irq_work *iwork)
+{
+ struct llist_node *lnode;
+ struct task_struct *task, *next;
+
+ lnode = llist_del_all(this_cpu_ptr(&cgrp_dead_tasks));
+ llist_for_each_entry_safe(task, next, lnode, cg_dead_lnode) {
+ do_cgroup_task_dead(task);
+ put_task_struct(task);
+ }
+}
+
+static void __init cgroup_rt_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ init_llist_head(per_cpu_ptr(&cgrp_dead_tasks, cpu));
+ per_cpu(cgrp_dead_tasks_iwork, cpu) =
+ IRQ_WORK_INIT_LAZY(cgrp_dead_tasks_iwork_fn);
+ }
+}
+
+void cgroup_task_dead(struct task_struct *task)
+{
+ get_task_struct(task);
+ llist_add(&task->cg_dead_lnode, this_cpu_ptr(&cgrp_dead_tasks));
+ irq_work_queue(this_cpu_ptr(&cgrp_dead_tasks_iwork));
}
+#else /* CONFIG_PREEMPT_RT */
+static void __init cgroup_rt_init(void) {}
-void cgroup_release(struct task_struct *task)
+void cgroup_task_dead(struct task_struct *task)
+{
+ do_cgroup_task_dead(task);
+}
+#endif /* CONFIG_PREEMPT_RT */
+
+void cgroup_task_release(struct task_struct *task)
{
struct cgroup_subsys *ss;
int ssid;
@@ -7013,6 +7074,11 @@ void cgroup_release(struct task_struct *task)
do_each_subsys_mask(ss, ssid, have_release_callback) {
ss->release(task);
} while_each_subsys_mask();
+}
+
+void cgroup_task_free(struct task_struct *task)
+{
+ struct css_set *cset = task_css_set(task);
if (!list_empty(&task->cg_list)) {
spin_lock_irq(&css_set_lock);
@@ -7020,11 +7086,7 @@ void cgroup_release(struct task_struct *task)
list_del_init(&task->cg_list);
spin_unlock_irq(&css_set_lock);
}
-}
-void cgroup_free(struct task_struct *task)
-{
- struct css_set *cset = task_css_set(task);
put_css_set(cset);
}
diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
index 337608f408ce..01976c8e7d49 100644
--- a/kernel/cgroup/cpuset-internal.h
+++ b/kernel/cgroup/cpuset-internal.h
@@ -155,13 +155,17 @@ struct cpuset {
/* for custom sched domain */
int relax_domain_level;
- /* number of valid local child partitions */
- int nr_subparts;
-
/* partition root state */
int partition_root_state;
/*
+ * Whether cpuset is a remote partition.
+ * It used to be a list anchoring all remote partitions — we can switch back
+ * to a list if we need to iterate over the remote partitions.
+ */
+ bool remote_partition;
+
+ /*
* number of SCHED_DEADLINE tasks attached to this cpuset, so that we
* know when to rebuild associated root domain bandwidth information.
*/
@@ -175,9 +179,6 @@ struct cpuset {
/* Handle for cpuset.cpus.partition */
struct cgroup_file partition_file;
- /* Remote partition silbling list anchored at remote_children */
- struct list_head remote_sibling;
-
/* Used to merge intersecting subsets for generate_sched_domains */
struct uf_node node;
};
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 4aaad07b0bd1..6e6eb09b8db6 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -82,14 +82,18 @@ static cpumask_var_t subpartitions_cpus;
static cpumask_var_t isolated_cpus;
/*
+ * isolated_cpus updating flag (protected by cpuset_mutex)
+ * Set if isolated_cpus is going to be updated in the current
+ * cpuset_mutex crtical section.
+ */
+static bool isolated_cpus_updating;
+
+/*
* Housekeeping (HK_TYPE_DOMAIN) CPUs at boot
*/
static cpumask_var_t boot_hk_cpus;
static bool have_boot_isolcpus;
-/* List of remote partition root children */
-static struct list_head remote_children;
-
/*
* A flag to force sched domain rebuild at the end of an operation.
* It can be set in
@@ -212,7 +216,7 @@ static struct cpuset top_cpuset = {
BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
.partition_root_state = PRS_ROOT,
.relax_domain_level = -1,
- .remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling),
+ .remote_partition = false,
};
/*
@@ -352,33 +356,55 @@ static inline bool is_in_v2_mode(void)
(cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
}
+static inline bool cpuset_is_populated(struct cpuset *cs)
+{
+ lockdep_assert_held(&cpuset_mutex);
+
+ /* Cpusets in the process of attaching should be considered as populated */
+ return cgroup_is_populated(cs->css.cgroup) ||
+ cs->attach_in_progress;
+}
+
/**
* partition_is_populated - check if partition has tasks
* @cs: partition root to be checked
* @excluded_child: a child cpuset to be excluded in task checking
* Return: true if there are tasks, false otherwise
*
- * It is assumed that @cs is a valid partition root. @excluded_child should
- * be non-NULL when this cpuset is going to become a partition itself.
+ * @cs should be a valid partition root or going to become a partition root.
+ * @excluded_child should be non-NULL when this cpuset is going to become a
+ * partition itself.
+ *
+ * Note that a remote partition is not allowed underneath a valid local
+ * or remote partition. So if a non-partition root child is populated,
+ * the whole partition is considered populated.
*/
static inline bool partition_is_populated(struct cpuset *cs,
struct cpuset *excluded_child)
{
- struct cgroup_subsys_state *css;
- struct cpuset *child;
+ struct cpuset *cp;
+ struct cgroup_subsys_state *pos_css;
- if (cs->css.cgroup->nr_populated_csets)
+ /*
+ * We cannot call cs_is_populated(cs) directly, as
+ * nr_populated_domain_children may include populated
+ * csets from descendants that are partitions.
+ */
+ if (cs->css.cgroup->nr_populated_csets ||
+ cs->attach_in_progress)
return true;
- if (!excluded_child && !cs->nr_subparts)
- return cgroup_is_populated(cs->css.cgroup);
rcu_read_lock();
- cpuset_for_each_child(child, css, cs) {
- if (child == excluded_child)
+ cpuset_for_each_descendant_pre(cp, pos_css, cs) {
+ if (cp == cs || cp == excluded_child)
continue;
- if (is_partition_valid(child))
+
+ if (is_partition_valid(cp)) {
+ pos_css = css_rightmost_descendant(pos_css);
continue;
- if (cgroup_is_populated(child->css.cgroup)) {
+ }
+
+ if (cpuset_is_populated(cp)) {
rcu_read_unlock();
return true;
}
@@ -663,7 +689,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
* be changed to have empty cpus_allowed or mems_allowed.
*/
ret = -ENOSPC;
- if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
+ if (cpuset_is_populated(cur)) {
if (!cpumask_empty(cur->cpus_allowed) &&
cpumask_empty(trial->cpus_allowed))
goto out;
@@ -1302,7 +1328,6 @@ static void reset_partition_data(struct cpuset *cs)
lockdep_assert_held(&callback_lock);
- cs->nr_subparts = 0;
if (cpumask_empty(cs->exclusive_cpus)) {
cpumask_clear(cs->effective_xcpus);
if (is_cpu_exclusive(cs))
@@ -1325,6 +1350,8 @@ static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus
cpumask_or(isolated_cpus, isolated_cpus, xcpus);
else
cpumask_andnot(isolated_cpus, isolated_cpus, xcpus);
+
+ isolated_cpus_updating = true;
}
/*
@@ -1332,15 +1359,12 @@ static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus
* @new_prs: new partition_root_state
* @parent: parent cpuset
* @xcpus: exclusive CPUs to be added
- * Return: true if isolated_cpus modified, false otherwise
*
* Remote partition if parent == NULL
*/
-static bool partition_xcpus_add(int new_prs, struct cpuset *parent,
+static void partition_xcpus_add(int new_prs, struct cpuset *parent,
struct cpumask *xcpus)
{
- bool isolcpus_updated;
-
WARN_ON_ONCE(new_prs < 0);
lockdep_assert_held(&callback_lock);
if (!parent)
@@ -1350,13 +1374,11 @@ static bool partition_xcpus_add(int new_prs, struct cpuset *parent,
if (parent == &top_cpuset)
cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus);
- isolcpus_updated = (new_prs != parent->partition_root_state);
- if (isolcpus_updated)
+ if (new_prs != parent->partition_root_state)
isolated_cpus_update(parent->partition_root_state, new_prs,
xcpus);
cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus);
- return isolcpus_updated;
}
/*
@@ -1364,15 +1386,12 @@ static bool partition_xcpus_add(int new_prs, struct cpuset *parent,
* @old_prs: old partition_root_state
* @parent: parent cpuset
* @xcpus: exclusive CPUs to be removed
- * Return: true if isolated_cpus modified, false otherwise
*
* Remote partition if parent == NULL
*/
-static bool partition_xcpus_del(int old_prs, struct cpuset *parent,
+static void partition_xcpus_del(int old_prs, struct cpuset *parent,
struct cpumask *xcpus)
{
- bool isolcpus_updated;
-
WARN_ON_ONCE(old_prs < 0);
lockdep_assert_held(&callback_lock);
if (!parent)
@@ -1381,30 +1400,95 @@ static bool partition_xcpus_del(int old_prs, struct cpuset *parent,
if (parent == &top_cpuset)
cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus);
- isolcpus_updated = (old_prs != parent->partition_root_state);
- if (isolcpus_updated)
+ if (old_prs != parent->partition_root_state)
isolated_cpus_update(old_prs, parent->partition_root_state,
xcpus);
cpumask_and(xcpus, xcpus, cpu_active_mask);
cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus);
- return isolcpus_updated;
}
-static void update_isolation_cpumasks(bool isolcpus_updated)
+/*
+ * isolated_cpus_can_update - check for isolated & nohz_full conflicts
+ * @add_cpus: cpu mask for cpus that are going to be isolated
+ * @del_cpus: cpu mask for cpus that are no longer isolated, can be NULL
+ * Return: false if there is conflict, true otherwise
+ *
+ * If nohz_full is enabled and we have isolated CPUs, their combination must
+ * still leave housekeeping CPUs.
+ *
+ * TBD: Should consider merging this function into
+ * prstate_housekeeping_conflict().
+ */
+static bool isolated_cpus_can_update(struct cpumask *add_cpus,
+ struct cpumask *del_cpus)
{
- int ret;
+ cpumask_var_t full_hk_cpus;
+ int res = true;
- lockdep_assert_cpus_held();
+ if (!housekeeping_enabled(HK_TYPE_KERNEL_NOISE))
+ return true;
- if (!isolcpus_updated)
+ if (del_cpus && cpumask_weight_and(del_cpus,
+ housekeeping_cpumask(HK_TYPE_KERNEL_NOISE)))
+ return true;
+
+ if (!alloc_cpumask_var(&full_hk_cpus, GFP_KERNEL))
+ return false;
+
+ cpumask_and(full_hk_cpus, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE),
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
+ cpumask_andnot(full_hk_cpus, full_hk_cpus, isolated_cpus);
+ cpumask_and(full_hk_cpus, full_hk_cpus, cpu_active_mask);
+ if (!cpumask_weight_andnot(full_hk_cpus, add_cpus))
+ res = false;
+
+ free_cpumask_var(full_hk_cpus);
+ return res;
+}
+
+/*
+ * prstate_housekeeping_conflict - check for partition & housekeeping conflicts
+ * @prstate: partition root state to be checked
+ * @new_cpus: cpu mask
+ * Return: true if there is conflict, false otherwise
+ *
+ * CPUs outside of boot_hk_cpus, if defined, can only be used in an
+ * isolated partition.
+ */
+static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
+{
+ if (!have_boot_isolcpus)
+ return false;
+
+ if ((prstate != PRS_ISOLATED) && !cpumask_subset(new_cpus, boot_hk_cpus))
+ return true;
+
+ return false;
+}
+
+/*
+ * update_isolation_cpumasks - Update external isolation related CPU masks
+ *
+ * The following external CPU masks will be updated if necessary:
+ * - workqueue unbound cpumask
+ */
+static void update_isolation_cpumasks(void)
+{
+ int ret;
+
+ if (!isolated_cpus_updating)
return;
+ lockdep_assert_cpus_held();
+
ret = workqueue_unbound_exclude_cpumask(isolated_cpus);
WARN_ON_ONCE(ret < 0);
ret = tmigr_isolated_exclude_cpumask(isolated_cpus);
WARN_ON_ONCE(ret < 0);
+
+ isolated_cpus_updating = false;
}
/**
@@ -1508,7 +1592,7 @@ static int compute_trialcs_excpus(struct cpuset *trialcs, struct cpuset *cs)
static inline bool is_remote_partition(struct cpuset *cs)
{
- return !list_empty(&cs->remote_sibling);
+ return cs->remote_partition;
}
static inline bool is_local_partition(struct cpuset *cs)
@@ -1529,8 +1613,6 @@ static inline bool is_local_partition(struct cpuset *cs)
static int remote_partition_enable(struct cpuset *cs, int new_prs,
struct tmpmasks *tmp)
{
- bool isolcpus_updated;
-
/*
* The user must have sysadmin privilege.
*/
@@ -1552,13 +1634,17 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
if (!cpumask_intersects(tmp->new_cpus, cpu_active_mask) ||
cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus))
return PERR_INVCPUS;
+ if (((new_prs == PRS_ISOLATED) &&
+ !isolated_cpus_can_update(tmp->new_cpus, NULL)) ||
+ prstate_housekeeping_conflict(new_prs, tmp->new_cpus))
+ return PERR_HKEEPING;
spin_lock_irq(&callback_lock);
- isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
- list_add(&cs->remote_sibling, &remote_children);
+ partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
+ cs->remote_partition = true;
cpumask_copy(cs->effective_xcpus, tmp->new_cpus);
spin_unlock_irq(&callback_lock);
- update_isolation_cpumasks(isolcpus_updated);
+ update_isolation_cpumasks();
cpuset_force_rebuild();
cs->prs_err = 0;
@@ -1581,15 +1667,12 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
*/
static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
{
- bool isolcpus_updated;
-
WARN_ON_ONCE(!is_remote_partition(cs));
WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus));
spin_lock_irq(&callback_lock);
- list_del_init(&cs->remote_sibling);
- isolcpus_updated = partition_xcpus_del(cs->partition_root_state,
- NULL, cs->effective_xcpus);
+ cs->remote_partition = false;
+ partition_xcpus_del(cs->partition_root_state, NULL, cs->effective_xcpus);
if (cs->prs_err)
cs->partition_root_state = -cs->partition_root_state;
else
@@ -1599,7 +1682,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
compute_excpus(cs, cs->effective_xcpus);
reset_partition_data(cs);
spin_unlock_irq(&callback_lock);
- update_isolation_cpumasks(isolcpus_updated);
+ update_isolation_cpumasks();
cpuset_force_rebuild();
/*
@@ -1624,7 +1707,6 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus,
{
bool adding, deleting;
int prs = cs->partition_root_state;
- int isolcpus_updated = 0;
if (WARN_ON_ONCE(!is_remote_partition(cs)))
return;
@@ -1651,15 +1733,18 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus,
else if (cpumask_intersects(tmp->addmask, subpartitions_cpus) ||
cpumask_subset(top_cpuset.effective_cpus, tmp->addmask))
cs->prs_err = PERR_NOCPUS;
+ else if ((prs == PRS_ISOLATED) &&
+ !isolated_cpus_can_update(tmp->addmask, tmp->delmask))
+ cs->prs_err = PERR_HKEEPING;
if (cs->prs_err)
goto invalidate;
}
spin_lock_irq(&callback_lock);
if (adding)
- isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask);
+ partition_xcpus_add(prs, NULL, tmp->addmask);
if (deleting)
- isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask);
+ partition_xcpus_del(prs, NULL, tmp->delmask);
/*
* Need to update effective_xcpus and exclusive_cpus now as
* update_sibling_cpumasks() below may iterate back to the same cs.
@@ -1668,7 +1753,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus,
if (xcpus)
cpumask_copy(cs->exclusive_cpus, xcpus);
spin_unlock_irq(&callback_lock);
- update_isolation_cpumasks(isolcpus_updated);
+ update_isolation_cpumasks();
if (adding || deleting)
cpuset_force_rebuild();
@@ -1683,26 +1768,6 @@ invalidate:
remote_partition_disable(cs, tmp);
}
-/*
- * prstate_housekeeping_conflict - check for partition & housekeeping conflicts
- * @prstate: partition root state to be checked
- * @new_cpus: cpu mask
- * Return: true if there is conflict, false otherwise
- *
- * CPUs outside of boot_hk_cpus, if defined, can only be used in an
- * isolated partition.
- */
-static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
-{
- if (!have_boot_isolcpus)
- return false;
-
- if ((prstate != PRS_ISOLATED) && !cpumask_subset(new_cpus, boot_hk_cpus))
- return true;
-
- return false;
-}
-
/**
* update_parent_effective_cpumask - update effective_cpus mask of parent cpuset
* @cs: The cpuset that requests change in partition root state
@@ -1749,9 +1814,8 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
int deleting; /* Deleting cpus from parent's effective_cpus */
int old_prs, new_prs;
int part_error = PERR_NONE; /* Partition error? */
- int subparts_delta = 0;
- int isolcpus_updated = 0;
struct cpumask *xcpus = user_xcpus(cs);
+ int parent_prs = parent->partition_root_state;
bool nocpu;
lockdep_assert_held(&cpuset_mutex);
@@ -1774,10 +1838,9 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
if (is_partition_valid(parent))
adding = cpumask_and(tmp->addmask,
xcpus, parent->effective_xcpus);
- if (old_prs > 0) {
+ if (old_prs > 0)
new_prs = -old_prs;
- subparts_delta--;
- }
+
goto write_error;
}
@@ -1816,6 +1879,10 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
if (prstate_housekeeping_conflict(new_prs, xcpus))
return PERR_HKEEPING;
+ if ((new_prs == PRS_ISOLATED) && (new_prs != parent_prs) &&
+ !isolated_cpus_can_update(xcpus, NULL))
+ return PERR_HKEEPING;
+
if (tasks_nocpu_error(parent, cs, xcpus))
return PERR_NOCPUS;
@@ -1832,7 +1899,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus));
deleting = true;
- subparts_delta++;
} else if (cmd == partcmd_disable) {
/*
* May need to add cpus back to parent's effective_cpus
@@ -1843,7 +1909,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
if (is_partition_valid(cs)) {
cpumask_copy(tmp->addmask, cs->effective_xcpus);
adding = true;
- subparts_delta--;
}
new_prs = PRS_MEMBER;
} else if (newmask) {
@@ -1871,6 +1936,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
*
* For invalid partition:
* delmask = newmask & parent->effective_xcpus
+ * The partition may become valid soon.
*/
if (is_partition_invalid(cs)) {
adding = false;
@@ -1885,6 +1951,23 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
deleting = cpumask_and(tmp->delmask, tmp->delmask,
parent->effective_xcpus);
}
+
+ /*
+ * TBD: Invalidate a currently valid child root partition may
+ * still break isolated_cpus_can_update() rule if parent is an
+ * isolated partition.
+ */
+ if (is_partition_valid(cs) && (old_prs != parent_prs)) {
+ if ((parent_prs == PRS_ROOT) &&
+ /* Adding to parent means removing isolated CPUs */
+ !isolated_cpus_can_update(tmp->delmask, tmp->addmask))
+ part_error = PERR_HKEEPING;
+ if ((parent_prs == PRS_ISOLATED) &&
+ /* Adding to parent means adding isolated CPUs */
+ !isolated_cpus_can_update(tmp->addmask, tmp->delmask))
+ part_error = PERR_HKEEPING;
+ }
+
/*
* The new CPUs to be removed from parent's effective CPUs
* must be present.
@@ -1966,17 +2049,13 @@ write_error:
switch (cs->partition_root_state) {
case PRS_ROOT:
case PRS_ISOLATED:
- if (part_error) {
+ if (part_error)
new_prs = -old_prs;
- subparts_delta--;
- }
break;
case PRS_INVALID_ROOT:
case PRS_INVALID_ISOLATED:
- if (!part_error) {
+ if (!part_error)
new_prs = -old_prs;
- subparts_delta++;
- }
break;
}
}
@@ -2005,28 +2084,20 @@ write_error:
* newly deleted ones will be added back to effective_cpus.
*/
spin_lock_irq(&callback_lock);
- if (old_prs != new_prs) {
+ if (old_prs != new_prs)
cs->partition_root_state = new_prs;
- if (new_prs <= 0)
- cs->nr_subparts = 0;
- }
+
/*
* Adding to parent's effective_cpus means deletion CPUs from cs
* and vice versa.
*/
if (adding)
- isolcpus_updated += partition_xcpus_del(old_prs, parent,
- tmp->addmask);
+ partition_xcpus_del(old_prs, parent, tmp->addmask);
if (deleting)
- isolcpus_updated += partition_xcpus_add(new_prs, parent,
- tmp->delmask);
+ partition_xcpus_add(new_prs, parent, tmp->delmask);
- if (is_partition_valid(parent)) {
- parent->nr_subparts += subparts_delta;
- WARN_ON_ONCE(parent->nr_subparts < 0);
- }
spin_unlock_irq(&callback_lock);
- update_isolation_cpumasks(isolcpus_updated);
+ update_isolation_cpumasks();
if ((old_prs != new_prs) && (cmd == partcmd_update))
update_partition_exclusive_flag(cs, new_prs);
@@ -2108,8 +2179,6 @@ static void compute_partition_effective_cpumask(struct cpuset *cs,
*/
spin_lock_irq(&callback_lock);
make_partition_invalid(child);
- cs->nr_subparts--;
- child->nr_subparts = 0;
spin_unlock_irq(&callback_lock);
notify_partition_change(child, old_prs);
continue;
@@ -2138,7 +2207,6 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
- bool need_rebuild_sched_domains = false;
int old_prs, new_prs;
rcu_read_lock();
@@ -2302,15 +2370,12 @@ get_css:
if (!cpumask_empty(cp->cpus_allowed) &&
is_sched_load_balance(cp) &&
(!cpuset_v2() || is_partition_valid(cp)))
- need_rebuild_sched_domains = true;
+ cpuset_force_rebuild();
rcu_read_lock();
css_put(&cp->css);
}
rcu_read_unlock();
-
- if (need_rebuild_sched_domains)
- cpuset_force_rebuild();
}
/**
@@ -2848,21 +2913,19 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
*/
retval = nodelist_parse(buf, trialcs->mems_allowed);
if (retval < 0)
- goto done;
+ return retval;
if (!nodes_subset(trialcs->mems_allowed,
- top_cpuset.mems_allowed)) {
- retval = -EINVAL;
- goto done;
- }
+ top_cpuset.mems_allowed))
+ return -EINVAL;
+
+ /* No change? nothing to do */
+ if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed))
+ return 0;
- if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
- retval = 0; /* Too easy - nothing to do */
- goto done;
- }
retval = validate_change(cs, trialcs);
if (retval < 0)
- goto done;
+ return retval;
check_insane_mems_config(&trialcs->mems_allowed);
@@ -2872,8 +2935,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
/* use trialcs->mems_allowed as a temp variable */
update_nodemasks_hier(cs, &trialcs->mems_allowed);
-done:
- return retval;
+ return 0;
}
bool current_cpuset_is_being_rebound(void)
@@ -3011,7 +3073,12 @@ static int update_prstate(struct cpuset *cs, int new_prs)
* A change in load balance state only, no change in cpumasks.
* Need to update isolated_cpus.
*/
- isolcpus_updated = true;
+ if (((new_prs == PRS_ISOLATED) &&
+ !isolated_cpus_can_update(cs->effective_xcpus, NULL)) ||
+ prstate_housekeeping_conflict(new_prs, cs->effective_xcpus))
+ err = PERR_HKEEPING;
+ else
+ isolcpus_updated = true;
} else {
/*
* Switching back to member is always allowed even if it
@@ -3046,7 +3113,7 @@ out:
else if (isolcpus_updated)
isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus);
spin_unlock_irq(&callback_lock);
- update_isolation_cpumasks(isolcpus_updated);
+ update_isolation_cpumasks();
/* Force update if switching back to member & update effective_xcpus */
update_cpumasks_hier(cs, &tmpmask, !new_prs);
@@ -3552,7 +3619,6 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
__set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;
- INIT_LIST_HEAD(&cs->remote_sibling);
/* Set CS_MEMORY_MIGRATE for default hierarchy */
if (cpuset_v2())
@@ -3823,7 +3889,6 @@ int __init cpuset_init(void)
nodes_setall(top_cpuset.effective_mems);
fmeter_init(&top_cpuset.fmeter);
- INIT_LIST_HEAD(&remote_children);
BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
@@ -4024,7 +4089,6 @@ static void cpuset_handle_hotplug(void)
*/
if (!cpumask_empty(subpartitions_cpus)) {
if (cpumask_subset(&new_cpus, subpartitions_cpus)) {
- top_cpuset.nr_subparts = 0;
cpumask_clear(subpartitions_cpus);
} else {
cpumask_andnot(&new_cpus, &new_cpus,
@@ -4119,24 +4183,13 @@ void __init cpuset_init_smp(void)
BUG_ON(!cpuset_migrate_mm_wq);
}
-/**
- * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
- * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
- * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
- *
- * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
- * attached to the specified @tsk. Guaranteed to return some non-empty
- * subset of cpu_active_mask, even if this means going outside the
- * tasks cpuset, except when the task is in the top cpuset.
- **/
-
-void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
+/*
+ * Return cpus_allowed mask from a task's cpuset.
+ */
+static void __cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
{
- unsigned long flags;
struct cpuset *cs;
- spin_lock_irqsave(&callback_lock, flags);
-
cs = task_cs(tsk);
if (cs != &top_cpuset)
guarantee_active_cpus(tsk, pmask);
@@ -4156,7 +4209,39 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
if (!cpumask_intersects(pmask, cpu_active_mask))
cpumask_copy(pmask, possible_mask);
}
+}
+/**
+ * cpuset_cpus_allowed_locked - return cpus_allowed mask from a task's cpuset.
+ * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
+ * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
+ *
+ * Similir to cpuset_cpus_allowed() except that the caller must have acquired
+ * cpuset_mutex.
+ */
+void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
+{
+ lockdep_assert_held(&cpuset_mutex);
+ __cpuset_cpus_allowed_locked(tsk, pmask);
+}
+
+/**
+ * cpuset_cpus_allowed - return cpus_allowed mask from a task's cpuset.
+ * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
+ * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
+ *
+ * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
+ * attached to the specified @tsk. Guaranteed to return some non-empty
+ * subset of cpu_active_mask, even if this means going outside the
+ * tasks cpuset, except when the task is in the top cpuset.
+ **/
+
+void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&callback_lock, flags);
+ __cpuset_cpus_allowed_locked(tsk, pmask);
spin_unlock_irqrestore(&callback_lock, flags);
}
diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config
index e81327d2cd63..9f6ab7dabf67 100644
--- a/kernel/configs/debug.config
+++ b/kernel/configs/debug.config
@@ -83,7 +83,7 @@ CONFIG_SLUB_DEBUG_ON=y
#
# Debug Oops, Lockups and Hangs
#
-# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
+CONFIG_BOOTPARAM_HUNG_TASK_PANIC=0
# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
CONFIG_DEBUG_ATOMIC_SLEEP=y
CONFIG_DETECT_HUNG_TASK=y
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index b0f0d15085db..7481fbb947d3 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -173,7 +173,7 @@ int cpu_cluster_pm_exit(void)
EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit);
#ifdef CONFIG_PM
-static int cpu_pm_suspend(void)
+static int cpu_pm_suspend(void *data)
{
int ret;
@@ -185,20 +185,24 @@ static int cpu_pm_suspend(void)
return ret;
}
-static void cpu_pm_resume(void)
+static void cpu_pm_resume(void *data)
{
cpu_cluster_pm_exit();
cpu_pm_exit();
}
-static struct syscore_ops cpu_pm_syscore_ops = {
+static const struct syscore_ops cpu_pm_syscore_ops = {
.suspend = cpu_pm_suspend,
.resume = cpu_pm_resume,
};
+static struct syscore cpu_pm_syscore = {
+ .ops = &cpu_pm_syscore_ops,
+};
+
static int cpu_pm_init(void)
{
- register_syscore_ops(&cpu_pm_syscore_ops);
+ register_syscore(&cpu_pm_syscore);
return 0;
}
core_initcall(cpu_pm_init);
diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c
index 87bf4d41eabb..62e60e0223cf 100644
--- a/kernel/crash_reserve.c
+++ b/kernel/crash_reserve.c
@@ -524,6 +524,9 @@ void __init reserve_crashkernel_cma(unsigned long long cma_size)
#ifndef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY
static __init int insert_crashkernel_resources(void)
{
+ if (!arch_add_crash_res_to_iomem())
+ return 0;
+
if (crashk_res.start < crashk_res.end)
insert_resource(&iomem_resource, &crashk_res);
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index b12b9db75c1d..61c1690058ed 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -589,24 +589,41 @@ static void kdb_msg_write(const char *msg, int msg_len)
*/
cookie = console_srcu_read_lock();
for_each_console_srcu(c) {
- if (!(console_srcu_read_flags(c) & CON_ENABLED))
+ short flags = console_srcu_read_flags(c);
+
+ if (!console_is_usable(c, flags, true))
continue;
if (c == dbg_io_ops->cons)
continue;
- if (!c->write)
- continue;
- /*
- * Set oops_in_progress to encourage the console drivers to
- * disregard their internal spin locks: in the current calling
- * context the risk of deadlock is a bigger problem than risks
- * due to re-entering the console driver. We operate directly on
- * oops_in_progress rather than using bust_spinlocks() because
- * the calls bust_spinlocks() makes on exit are not appropriate
- * for this calling context.
- */
- ++oops_in_progress;
- c->write(c, msg, msg_len);
- --oops_in_progress;
+
+ if (flags & CON_NBCON) {
+ struct nbcon_write_context wctxt = { };
+
+ /*
+ * Do not continue if the console is NBCON and the context
+ * can't be acquired.
+ */
+ if (!nbcon_kdb_try_acquire(c, &wctxt))
+ continue;
+
+ nbcon_write_context_set_buf(&wctxt, (char *)msg, msg_len);
+
+ c->write_atomic(c, &wctxt);
+ nbcon_kdb_release(&wctxt);
+ } else {
+ /*
+ * Set oops_in_progress to encourage the console drivers to
+ * disregard their internal spin locks: in the current calling
+ * context the risk of deadlock is a bigger problem than risks
+ * due to re-entering the console driver. We operate directly on
+ * oops_in_progress rather than using bust_spinlocks() because
+ * the calls bust_spinlocks() makes on exit are not appropriate
+ * for this calling context.
+ */
+ ++oops_in_progress;
+ c->write(c, msg, msg_len);
+ --oops_in_progress;
+ }
touch_nmi_watchdog();
}
console_srcu_read_unlock(cookie);
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index d9b9dcba6ff7..d8fd6f779f79 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -42,6 +42,7 @@
#include <linux/memblock.h>
#include <linux/err.h>
#include <linux/sizes.h>
+#include <linux/dma-buf/heaps/cma.h>
#include <linux/dma-map-ops.h>
#include <linux/cma.h>
#include <linux/nospec.h>
@@ -241,6 +242,8 @@ void __init dma_contiguous_reserve(phys_addr_t limit)
}
if (selected_size && !dma_contiguous_default_area) {
+ int ret;
+
pr_debug("%s: reserving %ld MiB for global area\n", __func__,
(unsigned long)selected_size / SZ_1M);
@@ -248,6 +251,10 @@ void __init dma_contiguous_reserve(phys_addr_t limit)
selected_limit,
&dma_contiguous_default_area,
fixed);
+
+ ret = dma_heap_cma_register_heap(dma_contiguous_default_area);
+ if (ret)
+ pr_warn("Couldn't register default CMA heap.");
}
}
@@ -493,6 +500,10 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem)
pr_info("Reserved memory: created CMA memory pool at %pa, size %ld MiB\n",
&rmem->base, (unsigned long)rmem->size / SZ_1M);
+ err = dma_heap_cma_register_heap(cma);
+ if (err)
+ pr_warn("Couldn't register CMA heap.");
+
return 0;
}
RESERVEDMEM_OF_DECLARE(cma, "shared-dma-pool", rmem_cma_setup);
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index f973e7e73c90..50c3fe2a1d55 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -479,8 +479,8 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
}
break;
case PCI_P2PDMA_MAP_BUS_ADDR:
- sg->dma_address = pci_p2pdma_bus_addr_map(&p2pdma_state,
- sg_phys(sg));
+ sg->dma_address = pci_p2pdma_bus_addr_map(
+ p2pdma_state.mem, sg_phys(sg));
sg_dma_len(sg) = sg->length;
sg_dma_mark_bus_address(sg);
continue;
diff --git a/kernel/dma/dummy.c b/kernel/dma/dummy.c
index 92de80e5b057..16a51736a2a3 100644
--- a/kernel/dma/dummy.c
+++ b/kernel/dma/dummy.c
@@ -11,17 +11,16 @@ static int dma_dummy_mmap(struct device *dev, struct vm_area_struct *vma,
return -ENXIO;
}
-static dma_addr_t dma_dummy_map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size, enum dma_data_direction dir,
- unsigned long attrs)
+static dma_addr_t dma_dummy_map_phys(struct device *dev, phys_addr_t phys,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
{
return DMA_MAPPING_ERROR;
}
-static void dma_dummy_unmap_page(struct device *dev, dma_addr_t dma_handle,
+static void dma_dummy_unmap_phys(struct device *dev, dma_addr_t dma_handle,
size_t size, enum dma_data_direction dir, unsigned long attrs)
{
/*
- * Dummy ops doesn't support map_page, so unmap_page should never be
+ * Dummy ops doesn't support map_phys, so unmap_page should never be
* called.
*/
WARN_ON_ONCE(true);
@@ -51,8 +50,8 @@ static int dma_dummy_supported(struct device *hwdev, u64 mask)
const struct dma_map_ops dma_dummy_ops = {
.mmap = dma_dummy_mmap,
- .map_page = dma_dummy_map_page,
- .unmap_page = dma_dummy_unmap_page,
+ .map_phys = dma_dummy_map_phys,
+ .unmap_phys = dma_dummy_unmap_phys,
.map_sg = dma_dummy_map_sg,
.unmap_sg = dma_dummy_unmap_sg,
.dma_supported = dma_dummy_supported,
diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c
index cc19a3efea89..794041a39e65 100644
--- a/kernel/dma/map_benchmark.c
+++ b/kernel/dma/map_benchmark.c
@@ -11,13 +11,13 @@
#include <linux/dma-mapping.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
-#include <linux/map_benchmark.h>
#include <linux/math64.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/platform_device.h>
#include <linux/slab.h>
#include <linux/timekeeping.h>
+#include <uapi/linux/map_benchmark.h>
struct map_benchmark_data {
struct map_benchmark bparam;
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index fe7472f13b10..37163eb49f9f 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -157,7 +157,7 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size,
{
const struct dma_map_ops *ops = get_dma_ops(dev);
bool is_mmio = attrs & DMA_ATTR_MMIO;
- dma_addr_t addr;
+ dma_addr_t addr = DMA_MAPPING_ERROR;
BUG_ON(!valid_dma_direction(dir));
@@ -169,21 +169,8 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size,
addr = dma_direct_map_phys(dev, phys, size, dir, attrs);
else if (use_dma_iommu(dev))
addr = iommu_dma_map_phys(dev, phys, size, dir, attrs);
- else if (is_mmio) {
- if (!ops->map_resource)
- return DMA_MAPPING_ERROR;
-
- addr = ops->map_resource(dev, phys, size, dir, attrs);
- } else {
- struct page *page = phys_to_page(phys);
- size_t offset = offset_in_page(phys);
-
- /*
- * The dma_ops API contract for ops->map_page() requires
- * kmappable memory, while ops->map_resource() does not.
- */
- addr = ops->map_page(dev, page, offset, size, dir, attrs);
- }
+ else if (ops->map_phys)
+ addr = ops->map_phys(dev, phys, size, dir, attrs);
if (!is_mmio)
kmsan_handle_dma(phys, size, dir);
@@ -223,11 +210,8 @@ void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size,
dma_direct_unmap_phys(dev, addr, size, dir, attrs);
else if (use_dma_iommu(dev))
iommu_dma_unmap_phys(dev, addr, size, dir, attrs);
- else if (is_mmio) {
- if (ops->unmap_resource)
- ops->unmap_resource(dev, addr, size, dir, attrs);
- } else
- ops->unmap_page(dev, addr, size, dir, attrs);
+ else if (ops->unmap_phys)
+ ops->unmap_phys(dev, addr, size, dir, attrs);
trace_dma_unmap_phys(dev, addr, size, dir, attrs);
debug_dma_unmap_phys(dev, addr, size, dir);
}
diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c
index 6f9d604d9d40..20caf9cabf69 100644
--- a/kernel/dma/ops_helpers.c
+++ b/kernel/dma/ops_helpers.c
@@ -64,6 +64,7 @@ struct page *dma_common_alloc_pages(struct device *dev, size_t size,
{
const struct dma_map_ops *ops = get_dma_ops(dev);
struct page *page;
+ phys_addr_t phys;
page = dma_alloc_contiguous(dev, size, gfp);
if (!page)
@@ -71,11 +72,12 @@ struct page *dma_common_alloc_pages(struct device *dev, size_t size,
if (!page)
return NULL;
+ phys = page_to_phys(page);
if (use_dma_iommu(dev))
- *dma_handle = iommu_dma_map_phys(dev, page_to_phys(page), size,
- dir, DMA_ATTR_SKIP_CPU_SYNC);
+ *dma_handle = iommu_dma_map_phys(dev, phys, size, dir,
+ DMA_ATTR_SKIP_CPU_SYNC);
else
- *dma_handle = ops->map_page(dev, page, 0, size, dir,
+ *dma_handle = ops->map_phys(dev, phys, size, dir,
DMA_ATTR_SKIP_CPU_SYNC);
if (*dma_handle == DMA_MAPPING_ERROR) {
dma_free_contiguous(dev, page, size);
@@ -94,8 +96,8 @@ void dma_common_free_pages(struct device *dev, size_t size, struct page *page,
if (use_dma_iommu(dev))
iommu_dma_unmap_phys(dev, dma_handle, size, dir,
DMA_ATTR_SKIP_CPU_SYNC);
- else if (ops->unmap_page)
- ops->unmap_page(dev, dma_handle, size, dir,
+ else if (ops->unmap_phys)
+ ops->unmap_phys(dev, dma_handle, size, dir,
DMA_ATTR_SKIP_CPU_SYNC);
dma_free_contiguous(dev, page, size);
}
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index ee45dee33d49..26392badc36b 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -93,7 +93,7 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
page = dma_alloc_from_contiguous(NULL, 1 << order,
order, false);
if (!page)
- page = alloc_pages(gfp, order);
+ page = alloc_pages(gfp | __GFP_NOWARN, order);
} while (!page && order-- > 0);
if (!page)
goto out;
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 0d37da3d95b6..a547c7693135 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -61,8 +61,6 @@
*/
#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT)
-#define INVALID_PHYS_ADDR (~(phys_addr_t)0)
-
/**
* struct io_tlb_slot - IO TLB slot descriptor
* @orig_addr: The original address corresponding to a mapped entry.
diff --git a/kernel/exit.c b/kernel/exit.c
index b9667ffcf7b3..8a87021211ae 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -251,13 +251,11 @@ repeat:
memset(&post, 0, sizeof(post));
/* don't need to get the RCU readlock here - the process is dead and
- * can't be modifying its own credentials. But shut RCU-lockdep up */
- rcu_read_lock();
+ * can't be modifying its own credentials. */
dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
- rcu_read_unlock();
pidfs_exit(p);
- cgroup_release(p);
+ cgroup_task_release(p);
/* Retrieve @thread_pid before __unhash_process() may set it to NULL. */
thread_pid = task_pid(p);
@@ -974,7 +972,7 @@ void __noreturn do_exit(long code)
exit_thread(tsk);
sched_autogroup_exit_task(tsk);
- cgroup_exit(tsk);
+ cgroup_task_exit(tsk);
/*
* FIXME: do that only when needed, using sched_exit tracepoint
diff --git a/kernel/fork.c b/kernel/fork.c
index 83e05d6f2307..b1f3915d5f8e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -106,9 +106,9 @@
#include <linux/pidfs.h>
#include <linux/tick.h>
#include <linux/unwind_deferred.h>
-
-#include <asm/pgalloc.h>
+#include <linux/pgalloc.h>
#include <linux/uaccess.h>
+
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
@@ -208,15 +208,62 @@ struct vm_stack {
struct vm_struct *stack_vm_area;
};
+static struct vm_struct *alloc_thread_stack_node_from_cache(struct task_struct *tsk, int node)
+{
+ struct vm_struct *vm_area;
+ unsigned int i;
+
+ /*
+ * If the node has memory, we are guaranteed the stacks are backed by local pages.
+ * Otherwise the pages are arbitrary.
+ *
+ * Note that depending on cpuset it is possible we will get migrated to a different
+ * node immediately after allocating here, so this does *not* guarantee locality for
+ * arbitrary callers.
+ */
+ scoped_guard(preempt) {
+ if (node != NUMA_NO_NODE && numa_node_id() != node)
+ return NULL;
+
+ for (i = 0; i < NR_CACHED_STACKS; i++) {
+ vm_area = this_cpu_xchg(cached_stacks[i], NULL);
+ if (vm_area)
+ return vm_area;
+ }
+ }
+
+ return NULL;
+}
+
static bool try_release_thread_stack_to_cache(struct vm_struct *vm_area)
{
unsigned int i;
+ int nid;
- for (i = 0; i < NR_CACHED_STACKS; i++) {
- struct vm_struct *tmp = NULL;
+ /*
+ * Don't cache stacks if any of the pages don't match the local domain, unless
+ * there is no local memory to begin with.
+ *
+ * Note that lack of local memory does not automatically mean it makes no difference
+ * performance-wise which other domain backs the stack. In this case we are merely
+ * trying to avoid constantly going to vmalloc.
+ */
+ scoped_guard(preempt) {
+ nid = numa_node_id();
+ if (node_state(nid, N_MEMORY)) {
+ for (i = 0; i < vm_area->nr_pages; i++) {
+ struct page *page = vm_area->pages[i];
+ if (page_to_nid(page) != nid)
+ return false;
+ }
+ }
- if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area))
- return true;
+ for (i = 0; i < NR_CACHED_STACKS; i++) {
+ struct vm_struct *tmp = NULL;
+
+ if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area))
+ return true;
+ }
}
return false;
}
@@ -283,13 +330,9 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
struct vm_struct *vm_area;
void *stack;
- int i;
-
- for (i = 0; i < NR_CACHED_STACKS; i++) {
- vm_area = this_cpu_xchg(cached_stacks[i], NULL);
- if (!vm_area)
- continue;
+ vm_area = alloc_thread_stack_node_from_cache(tsk, node);
+ if (vm_area) {
if (memcg_charge_kernel_stack(vm_area)) {
vfree(vm_area->addr);
return -ENOMEM;
@@ -736,9 +779,8 @@ void __put_task_struct(struct task_struct *tsk)
WARN_ON(tsk == current);
unwind_task_free(tsk);
- sched_ext_free(tsk);
io_uring_free(tsk);
- cgroup_free(tsk);
+ cgroup_task_free(tsk);
task_numa_free(tsk, true);
security_task_free(tsk);
exit_creds(tsk);
@@ -1059,10 +1101,10 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
if (current->mm) {
unsigned long flags = __mm_flags_get_word(current->mm);
- __mm_flags_set_word(mm, mmf_init_legacy_flags(flags));
+ __mm_flags_overwrite_word(mm, mmf_init_legacy_flags(flags));
mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
} else {
- __mm_flags_set_word(mm, default_dump_filter);
+ __mm_flags_overwrite_word(mm, default_dump_filter);
mm->def_flags = 0;
}
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index e2bbe5509ec2..1c2dd03f11ec 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -738,12 +738,11 @@ int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time
static long futex_wait_restart(struct restart_block *restart)
{
u32 __user *uaddr = restart->futex.uaddr;
- ktime_t t, *tp = NULL;
+ ktime_t *tp = NULL;
+
+ if (restart->futex.flags & FLAGS_HAS_TIMEOUT)
+ tp = &restart->futex.time;
- if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
- t = restart->futex.time;
- tp = &t;
- }
restart->fn = do_no_restart_syscall;
return (long)futex_wait(uaddr, restart->futex.flags,
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index b2c1f14b8129..d2254c91450b 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -24,6 +24,7 @@
#include <linux/sched/sysctl.h>
#include <linux/hung_task.h>
#include <linux/rwsem.h>
+#include <linux/sys_info.h>
#include <trace/events/sched.h>
@@ -50,7 +51,6 @@ static unsigned long __read_mostly sysctl_hung_task_detect_count;
* Zero means infinite timeout - no checking done:
*/
unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT;
-EXPORT_SYMBOL_GPL(sysctl_hung_task_timeout_secs);
/*
* Zero (default value) means use sysctl_hung_task_timeout_secs:
@@ -60,12 +60,17 @@ static unsigned long __read_mostly sysctl_hung_task_check_interval_secs;
static int __read_mostly sysctl_hung_task_warnings = 10;
static int __read_mostly did_panic;
-static bool hung_task_show_lock;
static bool hung_task_call_panic;
-static bool hung_task_show_all_bt;
static struct task_struct *watchdog_task;
+/*
+ * A bitmask to control what kinds of system info to be printed when
+ * a hung task is detected, it could be task, memory, lock etc. Refer
+ * include/linux/sys_info.h for detailed bit definition.
+ */
+static unsigned long hung_task_si_mask;
+
#ifdef CONFIG_SMP
/*
* Should we dump all CPUs backtraces in a hung task event?
@@ -81,7 +86,7 @@ static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace;
* hung task is detected:
*/
static unsigned int __read_mostly sysctl_hung_task_panic =
- IS_ENABLED(CONFIG_BOOTPARAM_HUNG_TASK_PANIC);
+ CONFIG_BOOTPARAM_HUNG_TASK_PANIC;
static int
hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
@@ -218,8 +223,11 @@ static inline void debug_show_blocker(struct task_struct *task, unsigned long ti
}
#endif
-static void check_hung_task(struct task_struct *t, unsigned long timeout)
+static void check_hung_task(struct task_struct *t, unsigned long timeout,
+ unsigned long prev_detect_count)
{
+ unsigned long total_hung_task;
+
if (!task_is_hung(t, timeout))
return;
@@ -229,11 +237,11 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
*/
sysctl_hung_task_detect_count++;
+ total_hung_task = sysctl_hung_task_detect_count - prev_detect_count;
trace_sched_process_hang(t);
- if (sysctl_hung_task_panic) {
+ if (sysctl_hung_task_panic && total_hung_task >= sysctl_hung_task_panic) {
console_verbose();
- hung_task_show_lock = true;
hung_task_call_panic = true;
}
@@ -256,10 +264,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
" disables this message.\n");
sched_show_task(t);
debug_show_blocker(t, timeout);
- hung_task_show_lock = true;
- if (sysctl_hung_task_all_cpu_backtrace)
- hung_task_show_all_bt = true;
if (!sysctl_hung_task_warnings)
pr_info("Future hung task reports are suppressed, see sysctl kernel.hung_task_warnings\n");
}
@@ -300,6 +305,9 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
int max_count = sysctl_hung_task_check_count;
unsigned long last_break = jiffies;
struct task_struct *g, *t;
+ unsigned long prev_detect_count = sysctl_hung_task_detect_count;
+ int need_warning = sysctl_hung_task_warnings;
+ unsigned long si_mask = hung_task_si_mask;
/*
* If the system crashed already then all bets are off,
@@ -308,7 +316,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
if (test_taint(TAINT_DIE) || did_panic)
return;
- hung_task_show_lock = false;
+
rcu_read_lock();
for_each_process_thread(g, t) {
@@ -320,18 +328,23 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
last_break = jiffies;
}
- check_hung_task(t, timeout);
+ check_hung_task(t, timeout, prev_detect_count);
}
unlock:
rcu_read_unlock();
- if (hung_task_show_lock)
- debug_show_all_locks();
- if (hung_task_show_all_bt) {
- hung_task_show_all_bt = false;
- trigger_all_cpu_backtrace();
+ if (!(sysctl_hung_task_detect_count - prev_detect_count))
+ return;
+
+ if (need_warning || hung_task_call_panic) {
+ si_mask |= SYS_INFO_LOCKS;
+
+ if (sysctl_hung_task_all_cpu_backtrace)
+ si_mask |= SYS_INFO_ALL_BT;
}
+ sys_info(si_mask);
+
if (hung_task_call_panic)
panic("hung_task: blocked tasks");
}
@@ -389,7 +402,7 @@ static const struct ctl_table hung_task_sysctls[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
},
{
.procname = "hung_task_check_count",
@@ -430,6 +443,13 @@ static const struct ctl_table hung_task_sysctls[] = {
.mode = 0444,
.proc_handler = proc_doulongvec_minmax,
},
+ {
+ .procname = "hung_task_sys_info",
+ .data = &hung_task_si_mask,
+ .maxlen = sizeof(hung_task_si_mask),
+ .mode = 0644,
+ .proc_handler = sysctl_sys_info_handler,
+ },
};
static void __init hung_task_sysctl_init(void)
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index bf59e37d650a..3cd0c40282c0 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -650,7 +650,7 @@ static struct irq_data *irq_gc_get_irq_data(struct irq_chip_generic *gc)
}
#ifdef CONFIG_PM
-static int irq_gc_suspend(void)
+static int irq_gc_suspend(void *data)
{
struct irq_chip_generic *gc;
@@ -670,7 +670,7 @@ static int irq_gc_suspend(void)
return 0;
}
-static void irq_gc_resume(void)
+static void irq_gc_resume(void *data)
{
struct irq_chip_generic *gc;
@@ -693,7 +693,7 @@ static void irq_gc_resume(void)
#define irq_gc_resume NULL
#endif
-static void irq_gc_shutdown(void)
+static void irq_gc_shutdown(void *data)
{
struct irq_chip_generic *gc;
@@ -709,15 +709,19 @@ static void irq_gc_shutdown(void)
}
}
-static struct syscore_ops irq_gc_syscore_ops = {
+static const struct syscore_ops irq_gc_syscore_ops = {
.suspend = irq_gc_suspend,
.resume = irq_gc_resume,
.shutdown = irq_gc_shutdown,
};
+static struct syscore irq_gc_syscore = {
+ .ops = &irq_gc_syscore_ops,
+};
+
static int __init irq_gc_init_ops(void)
{
- register_syscore_ops(&irq_gc_syscore_ops);
+ register_syscore(&irq_gc_syscore);
return 0;
}
device_initcall(irq_gc_init_ops);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 6acf268f005b..f8e4e13dbe33 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -720,7 +720,7 @@ EXPORT_SYMBOL_GPL(generic_handle_irq_safe);
* This function must be called from an IRQ context with irq regs
* initialized.
*/
-int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq)
+int generic_handle_domain_irq(struct irq_domain *domain, irq_hw_number_t hwirq)
{
return handle_irq_desc(irq_resolve_mapping(domain, hwirq));
}
@@ -738,7 +738,7 @@ EXPORT_SYMBOL_GPL(generic_handle_domain_irq);
* context). If the interrupt is marked as 'enforce IRQ-context only' then
* the function must be invoked from hard interrupt context.
*/
-int generic_handle_domain_irq_safe(struct irq_domain *domain, unsigned int hwirq)
+int generic_handle_domain_irq_safe(struct irq_domain *domain, irq_hw_number_t hwirq)
{
unsigned long flags;
int ret;
@@ -761,7 +761,7 @@ EXPORT_SYMBOL_GPL(generic_handle_domain_irq_safe);
* This function must be called from an NMI context with irq regs
* initialized.
**/
-int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq)
+int generic_handle_domain_nmi(struct irq_domain *domain, irq_hw_number_t hwirq)
{
WARN_ON_ONCE(!in_nmi());
return handle_irq_desc(irq_resolve_mapping(domain, hwirq));
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index f7394729cedc..99ff65466d87 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -211,21 +211,26 @@ void rearm_wake_irq(unsigned int irq)
/**
* irq_pm_syscore_resume - enable interrupt lines early
+ * @data: syscore context
*
* Enable all interrupt lines with %IRQF_EARLY_RESUME set.
*/
-static void irq_pm_syscore_resume(void)
+static void irq_pm_syscore_resume(void *data)
{
resume_irqs(true);
}
-static struct syscore_ops irq_pm_syscore_ops = {
+static const struct syscore_ops irq_pm_syscore_ops = {
.resume = irq_pm_syscore_resume,
};
+static struct syscore irq_pm_syscore = {
+ .ops = &irq_pm_syscore_ops,
+};
+
static int __init irq_pm_init_ops(void)
{
- register_syscore_ops(&irq_pm_syscore_ops);
+ register_syscore(&irq_pm_syscore);
return 0;
}
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 1e7635864124..049e296f586c 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -103,8 +103,11 @@ static char kallsyms_get_symbol_type(unsigned int off)
{
/*
* Get just the first code, look it up in the token table,
- * and return the first char from this token.
+ * and return the first char from this token. If MSB of length
+ * is 1, it is a "big" symbol, so needs an additional byte.
*/
+ if (kallsyms_names[off] & 0x80)
+ off++;
return kallsyms_token_table[kallsyms_token_index[kallsyms_names[off + 1]]];
}
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index fa00b239c5d9..0f92acdd354d 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -15,6 +15,7 @@
#include <linux/kexec.h>
#include <linux/mutex.h>
#include <linux/list.h>
+#include <linux/liveupdate.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
#include <linux/reboot.h>
@@ -41,6 +42,7 @@
#include <linux/objtool.h>
#include <linux/kmsg_dump.h>
#include <linux/dma-map-ops.h>
+#include <linux/sysfs.h>
#include <asm/page.h>
#include <asm/sections.h>
@@ -742,7 +744,6 @@ static int kimage_load_cma_segment(struct kimage *image, int idx)
struct kexec_segment *segment = &image->segment[idx];
struct page *cma = image->segment_cma[idx];
char *ptr = page_address(cma);
- unsigned long maddr;
size_t ubytes, mbytes;
int result = 0;
unsigned char __user *buf = NULL;
@@ -754,15 +755,12 @@ static int kimage_load_cma_segment(struct kimage *image, int idx)
buf = segment->buf;
ubytes = segment->bufsz;
mbytes = segment->memsz;
- maddr = segment->mem;
/* Then copy from source buffer to the CMA one */
while (mbytes) {
size_t uchunk, mchunk;
- ptr += maddr & ~PAGE_MASK;
- mchunk = min_t(size_t, mbytes,
- PAGE_SIZE - (maddr & ~PAGE_MASK));
+ mchunk = min_t(size_t, mbytes, PAGE_SIZE);
uchunk = min(ubytes, mchunk);
if (uchunk) {
@@ -784,7 +782,6 @@ static int kimage_load_cma_segment(struct kimage *image, int idx)
}
ptr += mchunk;
- maddr += mchunk;
mbytes -= mchunk;
cond_resched();
@@ -839,9 +836,7 @@ static int kimage_load_normal_segment(struct kimage *image, int idx)
ptr = kmap_local_page(page);
/* Start with a clear page */
clear_page(ptr);
- ptr += maddr & ~PAGE_MASK;
- mchunk = min_t(size_t, mbytes,
- PAGE_SIZE - (maddr & ~PAGE_MASK));
+ mchunk = min_t(size_t, mbytes, PAGE_SIZE);
uchunk = min(ubytes, mchunk);
if (uchunk) {
@@ -904,9 +899,7 @@ static int kimage_load_crash_segment(struct kimage *image, int idx)
}
arch_kexec_post_alloc_pages(page_address(page), 1, 0);
ptr = kmap_local_page(page);
- ptr += maddr & ~PAGE_MASK;
- mchunk = min_t(size_t, mbytes,
- PAGE_SIZE - (maddr & ~PAGE_MASK));
+ mchunk = min_t(size_t, mbytes, PAGE_SIZE);
uchunk = min(ubytes, mchunk);
if (mchunk > uchunk) {
/* Zero the trailing part of the page */
@@ -1146,6 +1139,10 @@ int kernel_kexec(void)
goto Unlock;
}
+ error = liveupdate_reboot();
+ if (error)
+ goto Unlock;
+
#ifdef CONFIG_KEXEC_JUMP
if (kexec_image->preserve_context) {
/*
@@ -1229,3 +1226,143 @@ int kernel_kexec(void)
kexec_unlock();
return error;
}
+
+static ssize_t loaded_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", !!kexec_image);
+}
+static struct kobj_attribute loaded_attr = __ATTR_RO(loaded);
+
+#ifdef CONFIG_CRASH_DUMP
+static ssize_t crash_loaded_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", kexec_crash_loaded());
+}
+static struct kobj_attribute crash_loaded_attr = __ATTR_RO(crash_loaded);
+
+#ifdef CONFIG_CRASH_RESERVE
+static ssize_t crash_cma_ranges_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+
+ ssize_t len = 0;
+ int i;
+
+ for (i = 0; i < crashk_cma_cnt; ++i) {
+ len += sysfs_emit_at(buf, len, "%08llx-%08llx\n",
+ crashk_cma_ranges[i].start,
+ crashk_cma_ranges[i].end);
+ }
+ return len;
+}
+static struct kobj_attribute crash_cma_ranges_attr = __ATTR_RO(crash_cma_ranges);
+#endif
+
+static ssize_t crash_size_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ ssize_t size = crash_get_memory_size();
+
+ if (size < 0)
+ return size;
+
+ return sysfs_emit(buf, "%zd\n", size);
+}
+static ssize_t crash_size_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long cnt;
+ int ret;
+
+ if (kstrtoul(buf, 0, &cnt))
+ return -EINVAL;
+
+ ret = crash_shrink_memory(cnt);
+ return ret < 0 ? ret : count;
+}
+static struct kobj_attribute crash_size_attr = __ATTR_RW(crash_size);
+
+#ifdef CONFIG_CRASH_HOTPLUG
+static ssize_t crash_elfcorehdr_size_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ unsigned int sz = crash_get_elfcorehdr_size();
+
+ return sysfs_emit(buf, "%u\n", sz);
+}
+static struct kobj_attribute crash_elfcorehdr_size_attr = __ATTR_RO(crash_elfcorehdr_size);
+
+#endif /* CONFIG_CRASH_HOTPLUG */
+#endif /* CONFIG_CRASH_DUMP */
+
+static struct attribute *kexec_attrs[] = {
+ &loaded_attr.attr,
+#ifdef CONFIG_CRASH_DUMP
+ &crash_loaded_attr.attr,
+ &crash_size_attr.attr,
+#ifdef CONFIG_CRASH_RESERVE
+ &crash_cma_ranges_attr.attr,
+#endif
+#ifdef CONFIG_CRASH_HOTPLUG
+ &crash_elfcorehdr_size_attr.attr,
+#endif
+#endif
+ NULL
+};
+
+struct kexec_link_entry {
+ const char *target;
+ const char *name;
+};
+
+static struct kexec_link_entry kexec_links[] = {
+ { "loaded", "kexec_loaded" },
+#ifdef CONFIG_CRASH_DUMP
+ { "crash_loaded", "kexec_crash_loaded" },
+ { "crash_size", "kexec_crash_size" },
+#ifdef CONFIG_CRASH_RESERVE
+ {"crash_cma_ranges", "kexec_crash_cma_ranges"},
+#endif
+#ifdef CONFIG_CRASH_HOTPLUG
+ { "crash_elfcorehdr_size", "crash_elfcorehdr_size" },
+#endif
+#endif
+};
+
+static struct kobject *kexec_kobj;
+ATTRIBUTE_GROUPS(kexec);
+
+static int __init init_kexec_sysctl(void)
+{
+ int error;
+ int i;
+
+ kexec_kobj = kobject_create_and_add("kexec", kernel_kobj);
+ if (!kexec_kobj) {
+ pr_err("failed to create kexec kobject\n");
+ return -ENOMEM;
+ }
+
+ error = sysfs_create_groups(kexec_kobj, kexec_groups);
+ if (error)
+ goto kset_exit;
+
+ for (i = 0; i < ARRAY_SIZE(kexec_links); i++) {
+ error = compat_only_sysfs_link_entry_to_kobj(kernel_kobj, kexec_kobj,
+ kexec_links[i].target,
+ kexec_links[i].name);
+ if (error)
+ pr_err("Unable to create %s symlink (%d)", kexec_links[i].name, error);
+ }
+
+ return 0;
+
+kset_exit:
+ kobject_put(kexec_kobj);
+ return error;
+}
+
+subsys_initcall(init_kexec_sysctl);
diff --git a/kernel/kexec_handover_internal.h b/kernel/kexec_handover_internal.h
deleted file mode 100644
index 3c3c7148ceed..000000000000
--- a/kernel/kexec_handover_internal.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef LINUX_KEXEC_HANDOVER_INTERNAL_H
-#define LINUX_KEXEC_HANDOVER_INTERNAL_H
-
-#include <linux/kexec_handover.h>
-#include <linux/types.h>
-
-extern struct kho_scratch *kho_scratch;
-extern unsigned int kho_scratch_cnt;
-
-#ifdef CONFIG_KEXEC_HANDOVER_DEBUG
-bool kho_scratch_overlap(phys_addr_t phys, size_t size);
-#else
-static inline bool kho_scratch_overlap(phys_addr_t phys, size_t size)
-{
- return false;
-}
-#endif /* CONFIG_KEXEC_HANDOVER_DEBUG */
-
-#endif /* LINUX_KEXEC_HANDOVER_INTERNAL_H */
diff --git a/kernel/kstack_erase.c b/kernel/kstack_erase.c
index e49bb88b4f0a..d4449884084c 100644
--- a/kernel/kstack_erase.c
+++ b/kernel/kstack_erase.c
@@ -23,7 +23,7 @@ static DEFINE_STATIC_KEY_FALSE(stack_erasing_bypass);
#ifdef CONFIG_SYSCTL
static int stack_erasing_sysctl(const struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret = 0;
int state = !static_branch_unlikely(&stack_erasing_bypass);
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index eefb67d9883c..a9e6354d9e25 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -12,7 +12,7 @@
#include <linux/sysfs.h>
#include <linux/export.h>
#include <linux/init.h>
-#include <linux/kexec.h>
+#include <linux/vmcore_info.h>
#include <linux/profile.h>
#include <linux/stat.h>
#include <linux/sched.h>
@@ -119,50 +119,6 @@ static ssize_t profiling_store(struct kobject *kobj,
KERNEL_ATTR_RW(profiling);
#endif
-#ifdef CONFIG_KEXEC_CORE
-static ssize_t kexec_loaded_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- return sysfs_emit(buf, "%d\n", !!kexec_image);
-}
-KERNEL_ATTR_RO(kexec_loaded);
-
-#ifdef CONFIG_CRASH_DUMP
-static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- return sysfs_emit(buf, "%d\n", kexec_crash_loaded());
-}
-KERNEL_ATTR_RO(kexec_crash_loaded);
-
-static ssize_t kexec_crash_size_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- ssize_t size = crash_get_memory_size();
-
- if (size < 0)
- return size;
-
- return sysfs_emit(buf, "%zd\n", size);
-}
-static ssize_t kexec_crash_size_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
-{
- unsigned long cnt;
- int ret;
-
- if (kstrtoul(buf, 0, &cnt))
- return -EINVAL;
-
- ret = crash_shrink_memory(cnt);
- return ret < 0 ? ret : count;
-}
-KERNEL_ATTR_RW(kexec_crash_size);
-
-#endif /* CONFIG_CRASH_DUMP*/
-#endif /* CONFIG_KEXEC_CORE */
-
#ifdef CONFIG_VMCORE_INFO
static ssize_t vmcoreinfo_show(struct kobject *kobj,
@@ -174,18 +130,6 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
}
KERNEL_ATTR_RO(vmcoreinfo);
-#ifdef CONFIG_CRASH_HOTPLUG
-static ssize_t crash_elfcorehdr_size_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- unsigned int sz = crash_get_elfcorehdr_size();
-
- return sysfs_emit(buf, "%u\n", sz);
-}
-KERNEL_ATTR_RO(crash_elfcorehdr_size);
-
-#endif
-
#endif /* CONFIG_VMCORE_INFO */
/* whether file capabilities are enabled */
@@ -255,18 +199,8 @@ static struct attribute * kernel_attrs[] = {
#ifdef CONFIG_PROFILING
&profiling_attr.attr,
#endif
-#ifdef CONFIG_KEXEC_CORE
- &kexec_loaded_attr.attr,
-#ifdef CONFIG_CRASH_DUMP
- &kexec_crash_loaded_attr.attr,
- &kexec_crash_size_attr.attr,
-#endif
-#endif
#ifdef CONFIG_VMCORE_INFO
&vmcoreinfo_attr.attr,
-#ifdef CONFIG_CRASH_HOTPLUG
- &crash_elfcorehdr_size_attr.attr,
-#endif
#endif
#ifndef CONFIG_TINY_RCU
&rcu_expedited_attr.attr,
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 0044a8125013..9917756dae46 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -88,8 +88,14 @@ static struct klp_func *klp_find_func(struct klp_object *obj,
struct klp_func *func;
klp_for_each_func(obj, func) {
+ /*
+ * Besides identical old_sympos, also consider old_sympos
+ * of 0 and 1 are identical.
+ */
if ((strcmp(old_func->old_name, func->old_name) == 0) &&
- (old_func->old_sympos == func->old_sympos)) {
+ ((old_func->old_sympos == func->old_sympos) ||
+ (old_func->old_sympos == 0 && func->old_sympos == 1) ||
+ (old_func->old_sympos == 1 && func->old_sympos == 0))) {
return func;
}
}
diff --git a/kernel/liveupdate/Kconfig b/kernel/liveupdate/Kconfig
new file mode 100644
index 000000000000..9b2515f31afb
--- /dev/null
+++ b/kernel/liveupdate/Kconfig
@@ -0,0 +1,75 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+menu "Live Update and Kexec HandOver"
+ depends on !DEFERRED_STRUCT_PAGE_INIT
+
+config KEXEC_HANDOVER
+ bool "kexec handover"
+ depends on ARCH_SUPPORTS_KEXEC_HANDOVER && ARCH_SUPPORTS_KEXEC_FILE
+ depends on !DEFERRED_STRUCT_PAGE_INIT
+ select MEMBLOCK_KHO_SCRATCH
+ select KEXEC_FILE
+ select LIBFDT
+ select CMA
+ help
+ Allow kexec to hand over state across kernels by generating and
+ passing additional metadata to the target kernel. This is useful
+ to keep data or state alive across the kexec. For this to work,
+ both source and target kernels need to have this option enabled.
+
+config KEXEC_HANDOVER_DEBUG
+ bool "Enable Kexec Handover debug checks"
+ depends on KEXEC_HANDOVER
+ help
+ This option enables extra sanity checks for the Kexec Handover
+ subsystem. Since, KHO performance is crucial in live update
+ scenarios and the extra code might be adding overhead it is
+ only optionally enabled.
+
+config KEXEC_HANDOVER_DEBUGFS
+ bool "kexec handover debugfs interface"
+ default KEXEC_HANDOVER
+ depends on KEXEC_HANDOVER
+ select DEBUG_FS
+ help
+ Allow to control kexec handover device tree via debugfs
+ interface, i.e. finalize the state or aborting the finalization.
+ Also, enables inspecting the KHO fdt trees with the debugfs binary
+ blobs.
+
+config KEXEC_HANDOVER_ENABLE_DEFAULT
+ bool "Enable kexec handover by default"
+ depends on KEXEC_HANDOVER
+ help
+ Enable Kexec Handover by default. This avoids the need to
+ explicitly pass 'kho=on' on the kernel command line.
+
+ This is useful for systems where KHO is a prerequisite for other
+ features, such as Live Update, ensuring the mechanism is always
+ active.
+
+ The default behavior can still be overridden at boot time by
+ passing 'kho=off'.
+
+config LIVEUPDATE
+ bool "Live Update Orchestrator"
+ depends on KEXEC_HANDOVER
+ help
+ Enable the Live Update Orchestrator. Live Update is a mechanism,
+ typically based on kexec, that allows the kernel to be updated
+ while keeping selected devices operational across the transition.
+ These devices are intended to be reclaimed by the new kernel and
+ re-attached to their original workload without requiring a device
+ reset.
+
+ Ability to handover a device from current to the next kernel depends
+ on specific support within device drivers and related kernel
+ subsystems.
+
+ This feature primarily targets virtual machine hosts to quickly update
+ the kernel hypervisor with minimal disruption to the running virtual
+ machines.
+
+ If unsure, say N.
+
+endmenu
diff --git a/kernel/liveupdate/Makefile b/kernel/liveupdate/Makefile
new file mode 100644
index 000000000000..7cad2eece32d
--- /dev/null
+++ b/kernel/liveupdate/Makefile
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0
+
+luo-y := \
+ luo_core.o \
+ luo_file.o \
+ luo_session.o
+
+obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o
+obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o
+obj-$(CONFIG_KEXEC_HANDOVER_DEBUGFS) += kexec_handover_debugfs.o
+
+obj-$(CONFIG_LIVEUPDATE) += luo.o
diff --git a/kernel/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
index 03d12e27189f..9dc51fab604f 100644
--- a/kernel/kexec_handover.c
+++ b/kernel/liveupdate/kexec_handover.c
@@ -4,21 +4,22 @@
* Copyright (C) 2023 Alexander Graf <graf@amazon.com>
* Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org>
* Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com>
+ * Copyright (C) 2025 Pasha Tatashin <pasha.tatashin@soleen.com>
*/
#define pr_fmt(fmt) "KHO: " fmt
#include <linux/cleanup.h>
#include <linux/cma.h>
+#include <linux/kmemleak.h>
#include <linux/count_zeros.h>
-#include <linux/debugfs.h>
#include <linux/kexec.h>
#include <linux/kexec_handover.h>
#include <linux/libfdt.h>
#include <linux/list.h>
#include <linux/memblock.h>
-#include <linux/notifier.h>
#include <linux/page-isolation.h>
+#include <linux/unaligned.h>
#include <linux/vmalloc.h>
#include <asm/early_ioremap.h>
@@ -28,8 +29,9 @@
* KHO is tightly coupled with mm init and needs access to some of mm
* internal APIs.
*/
-#include "../mm/internal.h"
-#include "kexec_internal.h"
+#include "../../mm/internal.h"
+#include "../kexec_internal.h"
+#include "kexec_handover_internal.h"
#define KHO_FDT_COMPATIBLE "kho-v1"
#define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map"
@@ -51,7 +53,7 @@ union kho_page_info {
static_assert(sizeof(union kho_page_info) == sizeof(((struct page *)0)->private));
-static bool kho_enable __ro_after_init;
+static bool kho_enable __ro_after_init = IS_ENABLED(CONFIG_KEXEC_HANDOVER_ENABLE_DEFAULT);
bool kho_is_enabled(void)
{
@@ -103,34 +105,19 @@ struct kho_mem_track {
struct khoser_mem_chunk;
-struct kho_serialization {
- struct page *fdt;
- struct list_head fdt_list;
- struct dentry *sub_fdt_dir;
- struct kho_mem_track track;
- /* First chunk of serialized preserved memory map */
- struct khoser_mem_chunk *preserved_mem_map;
-};
-
struct kho_out {
- struct blocking_notifier_head chain_head;
-
- struct dentry *dir;
-
+ void *fdt;
+ bool finalized;
struct mutex lock; /* protects KHO FDT finalization */
- struct kho_serialization ser;
- bool finalized;
+ struct kho_mem_track track;
+ struct kho_debugfs dbg;
};
static struct kho_out kho_out = {
- .chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head),
.lock = __MUTEX_INITIALIZER(kho_out.lock),
- .ser = {
- .fdt_list = LIST_HEAD_INIT(kho_out.ser.fdt_list),
- .track = {
- .orders = XARRAY_INIT(kho_out.ser.track.orders, 0),
- },
+ .track = {
+ .orders = XARRAY_INIT(kho_out.track.orders, 0),
},
.finalized = false,
};
@@ -159,26 +146,33 @@ static void *xa_load_or_alloc(struct xarray *xa, unsigned long index)
return no_free_ptr(elm);
}
-static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn,
- unsigned long end_pfn)
+static void __kho_unpreserve_order(struct kho_mem_track *track, unsigned long pfn,
+ unsigned int order)
{
struct kho_mem_phys_bits *bits;
struct kho_mem_phys *physxa;
+ const unsigned long pfn_high = pfn >> order;
- while (pfn < end_pfn) {
- const unsigned int order =
- min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
- const unsigned long pfn_high = pfn >> order;
+ physxa = xa_load(&track->orders, order);
+ if (WARN_ON_ONCE(!physxa))
+ return;
- physxa = xa_load(&track->orders, order);
- if (WARN_ON_ONCE(!physxa))
- return;
+ bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
+ if (WARN_ON_ONCE(!bits))
+ return;
- bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
- if (WARN_ON_ONCE(!bits))
- return;
+ clear_bit(pfn_high % PRESERVE_BITS, bits->preserve);
+}
- clear_bit(pfn_high % PRESERVE_BITS, bits->preserve);
+static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn,
+ unsigned long end_pfn)
+{
+ unsigned int order;
+
+ while (pfn < end_pfn) {
+ order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
+
+ __kho_unpreserve_order(track, pfn, order);
pfn += 1 << order;
}
@@ -192,10 +186,6 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
const unsigned long pfn_high = pfn >> order;
might_sleep();
-
- if (kho_out.finalized)
- return -EBUSY;
-
physxa = xa_load(&track->orders, order);
if (!physxa) {
int err;
@@ -229,11 +219,11 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
return 0;
}
-static struct page *kho_restore_page(phys_addr_t phys)
+static struct page *kho_restore_page(phys_addr_t phys, bool is_folio)
{
struct page *page = pfn_to_online_page(PHYS_PFN(phys));
+ unsigned int nr_pages, ref_cnt;
union kho_page_info info;
- unsigned int nr_pages;
if (!page)
return NULL;
@@ -253,11 +243,16 @@ static struct page *kho_restore_page(phys_addr_t phys)
/* Head page gets refcount of 1. */
set_page_count(page, 1);
- /* For higher order folios, tail pages get a page count of zero. */
+ /*
+ * For higher order folios, tail pages get a page count of zero.
+ * For physically contiguous order-0 pages every pages gets a page
+ * count of 1
+ */
+ ref_cnt = is_folio ? 0 : 1;
for (unsigned int i = 1; i < nr_pages; i++)
- set_page_count(page + i, 0);
+ set_page_count(page + i, ref_cnt);
- if (info.order > 0)
+ if (is_folio && info.order)
prep_compound_page(page, info.order);
adjust_managed_page_count(page, nr_pages);
@@ -272,7 +267,7 @@ static struct page *kho_restore_page(phys_addr_t phys)
*/
struct folio *kho_restore_folio(phys_addr_t phys)
{
- struct page *page = kho_restore_page(phys);
+ struct page *page = kho_restore_page(phys, true);
return page ? page_folio(page) : NULL;
}
@@ -297,11 +292,10 @@ struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages)
while (pfn < end_pfn) {
const unsigned int order =
min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
- struct page *page = kho_restore_page(PFN_PHYS(pfn));
+ struct page *page = kho_restore_page(PFN_PHYS(pfn), false);
if (!page)
return NULL;
- split_page(page, order);
pfn += 1 << order;
}
@@ -371,11 +365,32 @@ static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk)
struct khoser_mem_chunk *tmp = chunk;
chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
- kfree(tmp);
+ free_page((unsigned long)tmp);
}
}
-static int kho_mem_serialize(struct kho_serialization *ser)
+/*
+ * Update memory map property, if old one is found discard it via
+ * kho_mem_ser_free().
+ */
+static void kho_update_memory_map(struct khoser_mem_chunk *first_chunk)
+{
+ void *ptr;
+ u64 phys;
+
+ ptr = fdt_getprop_w(kho_out.fdt, 0, PROP_PRESERVED_MEMORY_MAP, NULL);
+
+ /* Check and discard previous memory map */
+ phys = get_unaligned((u64 *)ptr);
+ if (phys)
+ kho_mem_ser_free((struct khoser_mem_chunk *)phys_to_virt(phys));
+
+ /* Update with the new value */
+ phys = first_chunk ? (u64)virt_to_phys(first_chunk) : 0;
+ put_unaligned(phys, (u64 *)ptr);
+}
+
+static int kho_mem_serialize(struct kho_out *kho_out)
{
struct khoser_mem_chunk *first_chunk = NULL;
struct khoser_mem_chunk *chunk = NULL;
@@ -383,7 +398,7 @@ static int kho_mem_serialize(struct kho_serialization *ser)
unsigned long order;
int err = -ENOMEM;
- xa_for_each(&ser->track.orders, order, physxa) {
+ xa_for_each(&kho_out->track.orders, order, physxa) {
struct kho_mem_phys_bits *bits;
unsigned long phys;
@@ -415,7 +430,7 @@ static int kho_mem_serialize(struct kho_serialization *ser)
}
}
- ser->preserved_mem_map = first_chunk;
+ kho_update_memory_map(first_chunk);
return 0;
@@ -445,20 +460,27 @@ static void __init deserialize_bitmap(unsigned int order,
}
}
-static void __init kho_mem_deserialize(const void *fdt)
+/* Return true if memory was deserizlied */
+static bool __init kho_mem_deserialize(const void *fdt)
{
struct khoser_mem_chunk *chunk;
- const phys_addr_t *mem;
+ const void *mem_ptr;
+ u64 mem;
int len;
- mem = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len);
-
- if (!mem || len != sizeof(*mem)) {
+ mem_ptr = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len);
+ if (!mem_ptr || len != sizeof(u64)) {
pr_err("failed to get preserved memory bitmaps\n");
- return;
+ return false;
}
- chunk = *mem ? phys_to_virt(*mem) : NULL;
+ mem = get_unaligned((const u64 *)mem_ptr);
+ chunk = mem ? phys_to_virt(mem) : NULL;
+
+ /* No preserved physical pages were passed, no deserialization */
+ if (!chunk)
+ return false;
+
while (chunk) {
unsigned int i;
@@ -467,6 +489,8 @@ static void __init kho_mem_deserialize(const void *fdt)
&chunk->bitmaps[i]);
chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
}
+
+ return true;
}
/*
@@ -674,40 +698,8 @@ err_disable_kho:
kho_enable = false;
}
-struct fdt_debugfs {
- struct list_head list;
- struct debugfs_blob_wrapper wrapper;
- struct dentry *file;
-};
-
-static int kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir,
- const char *name, const void *fdt)
-{
- struct fdt_debugfs *f;
- struct dentry *file;
-
- f = kmalloc(sizeof(*f), GFP_KERNEL);
- if (!f)
- return -ENOMEM;
-
- f->wrapper.data = (void *)fdt;
- f->wrapper.size = fdt_totalsize(fdt);
-
- file = debugfs_create_blob(name, 0400, dir, &f->wrapper);
- if (IS_ERR(file)) {
- kfree(f);
- return PTR_ERR(file);
- }
-
- f->file = file;
- list_add(&f->list, list);
-
- return 0;
-}
-
/**
* kho_add_subtree - record the physical address of a sub FDT in KHO root tree.
- * @ser: serialization control object passed by KHO notifiers.
* @name: name of the sub tree.
* @fdt: the sub tree blob.
*
@@ -716,38 +708,76 @@ static int kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir,
* by KHO for the new kernel to retrieve it after kexec.
*
* A debugfs blob entry is also created at
- * ``/sys/kernel/debug/kho/out/sub_fdts/@name``.
+ * ``/sys/kernel/debug/kho/out/sub_fdts/@name`` when kernel is configured with
+ * CONFIG_KEXEC_HANDOVER_DEBUGFS
*
* Return: 0 on success, error code on failure
*/
-int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt)
+int kho_add_subtree(const char *name, void *fdt)
{
- int err = 0;
- u64 phys = (u64)virt_to_phys(fdt);
- void *root = page_to_virt(ser->fdt);
+ phys_addr_t phys = virt_to_phys(fdt);
+ void *root_fdt = kho_out.fdt;
+ int err = -ENOMEM;
+ int off, fdt_err;
- err |= fdt_begin_node(root, name);
- err |= fdt_property(root, PROP_SUB_FDT, &phys, sizeof(phys));
- err |= fdt_end_node(root);
+ guard(mutex)(&kho_out.lock);
- if (err)
+ fdt_err = fdt_open_into(root_fdt, root_fdt, PAGE_SIZE);
+ if (fdt_err < 0)
return err;
- return kho_debugfs_fdt_add(&ser->fdt_list, ser->sub_fdt_dir, name, fdt);
+ off = fdt_add_subnode(root_fdt, 0, name);
+ if (off < 0) {
+ if (off == -FDT_ERR_EXISTS)
+ err = -EEXIST;
+ goto out_pack;
+ }
+
+ err = fdt_setprop(root_fdt, off, PROP_SUB_FDT, &phys, sizeof(phys));
+ if (err < 0)
+ goto out_pack;
+
+ WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, name, fdt, false));
+
+out_pack:
+ fdt_pack(root_fdt);
+
+ return err;
}
EXPORT_SYMBOL_GPL(kho_add_subtree);
-int register_kho_notifier(struct notifier_block *nb)
+void kho_remove_subtree(void *fdt)
{
- return blocking_notifier_chain_register(&kho_out.chain_head, nb);
-}
-EXPORT_SYMBOL_GPL(register_kho_notifier);
+ phys_addr_t target_phys = virt_to_phys(fdt);
+ void *root_fdt = kho_out.fdt;
+ int off;
+ int err;
-int unregister_kho_notifier(struct notifier_block *nb)
-{
- return blocking_notifier_chain_unregister(&kho_out.chain_head, nb);
+ guard(mutex)(&kho_out.lock);
+
+ err = fdt_open_into(root_fdt, root_fdt, PAGE_SIZE);
+ if (err < 0)
+ return;
+
+ for (off = fdt_first_subnode(root_fdt, 0); off >= 0;
+ off = fdt_next_subnode(root_fdt, off)) {
+ const u64 *val;
+ int len;
+
+ val = fdt_getprop(root_fdt, off, PROP_SUB_FDT, &len);
+ if (!val || len != sizeof(phys_addr_t))
+ continue;
+
+ if ((phys_addr_t)*val == target_phys) {
+ fdt_del_node(root_fdt, off);
+ kho_debugfs_fdt_remove(&kho_out.dbg, fdt);
+ break;
+ }
+ }
+
+ fdt_pack(root_fdt);
}
-EXPORT_SYMBOL_GPL(unregister_kho_notifier);
+EXPORT_SYMBOL_GPL(kho_remove_subtree);
/**
* kho_preserve_folio - preserve a folio across kexec.
@@ -762,7 +792,7 @@ int kho_preserve_folio(struct folio *folio)
{
const unsigned long pfn = folio_pfn(folio);
const unsigned int order = folio_order(folio);
- struct kho_mem_track *track = &kho_out.ser.track;
+ struct kho_mem_track *track = &kho_out.track;
if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order)))
return -EINVAL;
@@ -772,6 +802,24 @@ int kho_preserve_folio(struct folio *folio)
EXPORT_SYMBOL_GPL(kho_preserve_folio);
/**
+ * kho_unpreserve_folio - unpreserve a folio.
+ * @folio: folio to unpreserve.
+ *
+ * Instructs KHO to unpreserve a folio that was preserved by
+ * kho_preserve_folio() before. The provided @folio (pfn and order)
+ * must exactly match a previously preserved folio.
+ */
+void kho_unpreserve_folio(struct folio *folio)
+{
+ const unsigned long pfn = folio_pfn(folio);
+ const unsigned int order = folio_order(folio);
+ struct kho_mem_track *track = &kho_out.track;
+
+ __kho_unpreserve_order(track, pfn, order);
+}
+EXPORT_SYMBOL_GPL(kho_unpreserve_folio);
+
+/**
* kho_preserve_pages - preserve contiguous pages across kexec
* @page: first page in the list.
* @nr_pages: number of pages.
@@ -783,7 +831,7 @@ EXPORT_SYMBOL_GPL(kho_preserve_folio);
*/
int kho_preserve_pages(struct page *page, unsigned int nr_pages)
{
- struct kho_mem_track *track = &kho_out.ser.track;
+ struct kho_mem_track *track = &kho_out.track;
const unsigned long start_pfn = page_to_pfn(page);
const unsigned long end_pfn = start_pfn + nr_pages;
unsigned long pfn = start_pfn;
@@ -815,6 +863,26 @@ int kho_preserve_pages(struct page *page, unsigned int nr_pages)
}
EXPORT_SYMBOL_GPL(kho_preserve_pages);
+/**
+ * kho_unpreserve_pages - unpreserve contiguous pages.
+ * @page: first page in the list.
+ * @nr_pages: number of pages.
+ *
+ * Instructs KHO to unpreserve @nr_pages contiguous pages starting from @page.
+ * This must be called with the same @page and @nr_pages as the corresponding
+ * kho_preserve_pages() call. Unpreserving arbitrary sub-ranges of larger
+ * preserved blocks is not supported.
+ */
+void kho_unpreserve_pages(struct page *page, unsigned int nr_pages)
+{
+ struct kho_mem_track *track = &kho_out.track;
+ const unsigned long start_pfn = page_to_pfn(page);
+ const unsigned long end_pfn = start_pfn + nr_pages;
+
+ __kho_unpreserve(track, start_pfn, end_pfn);
+}
+EXPORT_SYMBOL_GPL(kho_unpreserve_pages);
+
struct kho_vmalloc_hdr {
DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *);
};
@@ -885,7 +953,7 @@ err_free:
static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk,
unsigned short order)
{
- struct kho_mem_track *track = &kho_out.ser.track;
+ struct kho_mem_track *track = &kho_out.track;
unsigned long pfn = PHYS_PFN(virt_to_phys(chunk));
__kho_unpreserve(track, pfn, pfn + 1);
@@ -896,20 +964,6 @@ static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk,
}
}
-static void kho_vmalloc_free_chunks(struct kho_vmalloc *kho_vmalloc)
-{
- struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(kho_vmalloc->first);
-
- while (chunk) {
- struct kho_vmalloc_chunk *tmp = chunk;
-
- kho_vmalloc_unpreserve_chunk(chunk, kho_vmalloc->order);
-
- chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
- free_page((unsigned long)tmp);
- }
-}
-
/**
* kho_preserve_vmalloc - preserve memory allocated with vmalloc() across kexec
* @ptr: pointer to the area in vmalloc address space
@@ -971,12 +1025,34 @@ int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation)
return 0;
err_free:
- kho_vmalloc_free_chunks(preservation);
+ kho_unpreserve_vmalloc(preservation);
return err;
}
EXPORT_SYMBOL_GPL(kho_preserve_vmalloc);
/**
+ * kho_unpreserve_vmalloc - unpreserve memory allocated with vmalloc()
+ * @preservation: preservation metadata returned by kho_preserve_vmalloc()
+ *
+ * Instructs KHO to unpreserve the area in vmalloc address space that was
+ * previously preserved with kho_preserve_vmalloc().
+ */
+void kho_unpreserve_vmalloc(struct kho_vmalloc *preservation)
+{
+ struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first);
+
+ while (chunk) {
+ struct kho_vmalloc_chunk *tmp = chunk;
+
+ kho_vmalloc_unpreserve_chunk(chunk, preservation->order);
+
+ chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
+ free_page((unsigned long)tmp);
+ }
+}
+EXPORT_SYMBOL_GPL(kho_unpreserve_vmalloc);
+
+/**
* kho_restore_vmalloc - recreates and populates an area in vmalloc address
* space from the preserved memory.
* @preservation: preservation metadata.
@@ -1024,7 +1100,7 @@ void *kho_restore_vmalloc(const struct kho_vmalloc *preservation)
goto err_free_pages_array;
for (int j = 0; j < contig_pages; j++)
- pages[idx++] = page;
+ pages[idx++] = page + j;
phys += contig_pages * PAGE_SIZE;
}
@@ -1065,217 +1141,122 @@ err_free_pages_array:
}
EXPORT_SYMBOL_GPL(kho_restore_vmalloc);
-/* Handling for debug/kho/out */
-
-static struct dentry *debugfs_root;
-
-static int kho_out_update_debugfs_fdt(void)
-{
- int err = 0;
- struct fdt_debugfs *ff, *tmp;
-
- if (kho_out.finalized) {
- err = kho_debugfs_fdt_add(&kho_out.ser.fdt_list, kho_out.dir,
- "fdt", page_to_virt(kho_out.ser.fdt));
- } else {
- list_for_each_entry_safe(ff, tmp, &kho_out.ser.fdt_list, list) {
- debugfs_remove(ff->file);
- list_del(&ff->list);
- kfree(ff);
- }
- }
-
- return err;
-}
-
-static int kho_abort(void)
+/**
+ * kho_alloc_preserve - Allocate, zero, and preserve memory.
+ * @size: The number of bytes to allocate.
+ *
+ * Allocates a physically contiguous block of zeroed pages that is large
+ * enough to hold @size bytes. The allocated memory is then registered with
+ * KHO for preservation across a kexec.
+ *
+ * Note: The actual allocated size will be rounded up to the nearest
+ * power-of-two page boundary.
+ *
+ * @return A virtual pointer to the allocated and preserved memory on success,
+ * or an ERR_PTR() encoded error on failure.
+ */
+void *kho_alloc_preserve(size_t size)
{
- int err;
- unsigned long order;
- struct kho_mem_phys *physxa;
+ struct folio *folio;
+ int order, ret;
- xa_for_each(&kho_out.ser.track.orders, order, physxa) {
- struct kho_mem_phys_bits *bits;
- unsigned long phys;
+ if (!size)
+ return ERR_PTR(-EINVAL);
- xa_for_each(&physxa->phys_bits, phys, bits)
- kfree(bits);
+ order = get_order(size);
+ if (order > MAX_PAGE_ORDER)
+ return ERR_PTR(-E2BIG);
- xa_destroy(&physxa->phys_bits);
- kfree(physxa);
- }
- xa_destroy(&kho_out.ser.track.orders);
+ folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, order);
+ if (!folio)
+ return ERR_PTR(-ENOMEM);
- if (kho_out.ser.preserved_mem_map) {
- kho_mem_ser_free(kho_out.ser.preserved_mem_map);
- kho_out.ser.preserved_mem_map = NULL;
+ ret = kho_preserve_folio(folio);
+ if (ret) {
+ folio_put(folio);
+ return ERR_PTR(ret);
}
- err = blocking_notifier_call_chain(&kho_out.chain_head, KEXEC_KHO_ABORT,
- NULL);
- err = notifier_to_errno(err);
-
- if (err)
- pr_err("Failed to abort KHO finalization: %d\n", err);
-
- return err;
+ return folio_address(folio);
}
+EXPORT_SYMBOL_GPL(kho_alloc_preserve);
-static int kho_finalize(void)
+/**
+ * kho_unpreserve_free - Unpreserve and free memory.
+ * @mem: Pointer to the memory allocated by kho_alloc_preserve().
+ *
+ * Unregisters the memory from KHO preservation and frees the underlying
+ * pages back to the system. This function should be called to clean up
+ * memory allocated with kho_alloc_preserve().
+ */
+void kho_unpreserve_free(void *mem)
{
- int err = 0;
- u64 *preserved_mem_map;
- void *fdt = page_to_virt(kho_out.ser.fdt);
-
- err |= fdt_create(fdt, PAGE_SIZE);
- err |= fdt_finish_reservemap(fdt);
- err |= fdt_begin_node(fdt, "");
- err |= fdt_property_string(fdt, "compatible", KHO_FDT_COMPATIBLE);
- /**
- * Reserve the preserved-memory-map property in the root FDT, so
- * that all property definitions will precede subnodes created by
- * KHO callers.
- */
- err |= fdt_property_placeholder(fdt, PROP_PRESERVED_MEMORY_MAP,
- sizeof(*preserved_mem_map),
- (void **)&preserved_mem_map);
- if (err)
- goto abort;
-
- err = kho_preserve_folio(page_folio(kho_out.ser.fdt));
- if (err)
- goto abort;
-
- err = blocking_notifier_call_chain(&kho_out.chain_head,
- KEXEC_KHO_FINALIZE, &kho_out.ser);
- err = notifier_to_errno(err);
- if (err)
- goto abort;
-
- err = kho_mem_serialize(&kho_out.ser);
- if (err)
- goto abort;
-
- *preserved_mem_map = (u64)virt_to_phys(kho_out.ser.preserved_mem_map);
-
- err |= fdt_end_node(fdt);
- err |= fdt_finish(fdt);
+ struct folio *folio;
-abort:
- if (err) {
- pr_err("Failed to convert KHO state tree: %d\n", err);
- kho_abort();
- }
+ if (!mem)
+ return;
- return err;
+ folio = virt_to_folio(mem);
+ kho_unpreserve_folio(folio);
+ folio_put(folio);
}
+EXPORT_SYMBOL_GPL(kho_unpreserve_free);
-static int kho_out_finalize_get(void *data, u64 *val)
+/**
+ * kho_restore_free - Restore and free memory after kexec.
+ * @mem: Pointer to the memory (in the new kernel's address space)
+ * that was allocated by the old kernel.
+ *
+ * This function is intended to be called in the new kernel (post-kexec)
+ * to take ownership of and free a memory region that was preserved by the
+ * old kernel using kho_alloc_preserve().
+ *
+ * It first restores the pages from KHO (using their physical address)
+ * and then frees the pages back to the new kernel's page allocator.
+ */
+void kho_restore_free(void *mem)
{
- mutex_lock(&kho_out.lock);
- *val = kho_out.finalized;
- mutex_unlock(&kho_out.lock);
+ struct folio *folio;
- return 0;
+ if (!mem)
+ return;
+
+ folio = kho_restore_folio(__pa(mem));
+ if (!WARN_ON(!folio))
+ folio_put(folio);
}
+EXPORT_SYMBOL_GPL(kho_restore_free);
-static int kho_out_finalize_set(void *data, u64 _val)
+int kho_finalize(void)
{
- int ret = 0;
- bool val = !!_val;
-
- mutex_lock(&kho_out.lock);
-
- if (val == kho_out.finalized) {
- if (kho_out.finalized)
- ret = -EEXIST;
- else
- ret = -ENOENT;
- goto unlock;
- }
+ int ret;
- if (val)
- ret = kho_finalize();
- else
- ret = kho_abort();
+ if (!kho_enable)
+ return -EOPNOTSUPP;
+ guard(mutex)(&kho_out.lock);
+ ret = kho_mem_serialize(&kho_out);
if (ret)
- goto unlock;
-
- kho_out.finalized = val;
- ret = kho_out_update_debugfs_fdt();
-
-unlock:
- mutex_unlock(&kho_out.lock);
- return ret;
-}
-
-DEFINE_DEBUGFS_ATTRIBUTE(fops_kho_out_finalize, kho_out_finalize_get,
- kho_out_finalize_set, "%llu\n");
+ return ret;
-static int scratch_phys_show(struct seq_file *m, void *v)
-{
- for (int i = 0; i < kho_scratch_cnt; i++)
- seq_printf(m, "0x%llx\n", kho_scratch[i].addr);
+ kho_out.finalized = true;
return 0;
}
-DEFINE_SHOW_ATTRIBUTE(scratch_phys);
-static int scratch_len_show(struct seq_file *m, void *v)
+bool kho_finalized(void)
{
- for (int i = 0; i < kho_scratch_cnt; i++)
- seq_printf(m, "0x%llx\n", kho_scratch[i].size);
-
- return 0;
-}
-DEFINE_SHOW_ATTRIBUTE(scratch_len);
-
-static __init int kho_out_debugfs_init(void)
-{
- struct dentry *dir, *f, *sub_fdt_dir;
-
- dir = debugfs_create_dir("out", debugfs_root);
- if (IS_ERR(dir))
- return -ENOMEM;
-
- sub_fdt_dir = debugfs_create_dir("sub_fdts", dir);
- if (IS_ERR(sub_fdt_dir))
- goto err_rmdir;
-
- f = debugfs_create_file("scratch_phys", 0400, dir, NULL,
- &scratch_phys_fops);
- if (IS_ERR(f))
- goto err_rmdir;
-
- f = debugfs_create_file("scratch_len", 0400, dir, NULL,
- &scratch_len_fops);
- if (IS_ERR(f))
- goto err_rmdir;
-
- f = debugfs_create_file("finalize", 0600, dir, NULL,
- &fops_kho_out_finalize);
- if (IS_ERR(f))
- goto err_rmdir;
-
- kho_out.dir = dir;
- kho_out.ser.sub_fdt_dir = sub_fdt_dir;
- return 0;
-
-err_rmdir:
- debugfs_remove_recursive(dir);
- return -ENOENT;
+ guard(mutex)(&kho_out.lock);
+ return kho_out.finalized;
}
struct kho_in {
- struct dentry *dir;
phys_addr_t fdt_phys;
phys_addr_t scratch_phys;
- struct list_head fdt_list;
+ struct kho_debugfs dbg;
};
static struct kho_in kho_in = {
- .fdt_list = LIST_HEAD_INIT(kho_in.fdt_list),
};
static const void *kho_get_fdt(void)
@@ -1339,91 +1320,52 @@ int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
}
EXPORT_SYMBOL_GPL(kho_retrieve_subtree);
-/* Handling for debugfs/kho/in */
-
-static __init int kho_in_debugfs_init(const void *fdt)
+static __init int kho_out_fdt_setup(void)
{
- struct dentry *sub_fdt_dir;
- int err, child;
-
- kho_in.dir = debugfs_create_dir("in", debugfs_root);
- if (IS_ERR(kho_in.dir))
- return PTR_ERR(kho_in.dir);
-
- sub_fdt_dir = debugfs_create_dir("sub_fdts", kho_in.dir);
- if (IS_ERR(sub_fdt_dir)) {
- err = PTR_ERR(sub_fdt_dir);
- goto err_rmdir;
- }
-
- err = kho_debugfs_fdt_add(&kho_in.fdt_list, kho_in.dir, "fdt", fdt);
- if (err)
- goto err_rmdir;
-
- fdt_for_each_subnode(child, fdt, 0) {
- int len = 0;
- const char *name = fdt_get_name(fdt, child, NULL);
- const u64 *fdt_phys;
-
- fdt_phys = fdt_getprop(fdt, child, "fdt", &len);
- if (!fdt_phys)
- continue;
- if (len != sizeof(*fdt_phys)) {
- pr_warn("node `%s`'s prop `fdt` has invalid length: %d\n",
- name, len);
- continue;
- }
- err = kho_debugfs_fdt_add(&kho_in.fdt_list, sub_fdt_dir, name,
- phys_to_virt(*fdt_phys));
- if (err) {
- pr_warn("failed to add fdt `%s` to debugfs: %d\n", name,
- err);
- continue;
- }
- }
+ void *root = kho_out.fdt;
+ u64 empty_mem_map = 0;
+ int err;
- return 0;
+ err = fdt_create(root, PAGE_SIZE);
+ err |= fdt_finish_reservemap(root);
+ err |= fdt_begin_node(root, "");
+ err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE);
+ err |= fdt_property(root, PROP_PRESERVED_MEMORY_MAP, &empty_mem_map,
+ sizeof(empty_mem_map));
+ err |= fdt_end_node(root);
+ err |= fdt_finish(root);
-err_rmdir:
- debugfs_remove_recursive(kho_in.dir);
return err;
}
static __init int kho_init(void)
{
- int err = 0;
const void *fdt = kho_get_fdt();
+ int err = 0;
if (!kho_enable)
return 0;
- kho_out.ser.fdt = alloc_page(GFP_KERNEL);
- if (!kho_out.ser.fdt) {
- err = -ENOMEM;
+ kho_out.fdt = kho_alloc_preserve(PAGE_SIZE);
+ if (IS_ERR(kho_out.fdt)) {
+ err = PTR_ERR(kho_out.fdt);
goto err_free_scratch;
}
- debugfs_root = debugfs_create_dir("kho", NULL);
- if (IS_ERR(debugfs_root)) {
- err = -ENOENT;
+ err = kho_debugfs_init();
+ if (err)
goto err_free_fdt;
- }
- err = kho_out_debugfs_init();
+ err = kho_out_debugfs_init(&kho_out.dbg);
if (err)
goto err_free_fdt;
- if (fdt) {
- err = kho_in_debugfs_init(fdt);
- /*
- * Failure to create /sys/kernel/debug/kho/in does not prevent
- * reviving state from KHO and setting up KHO for the next
- * kexec.
- */
- if (err)
- pr_err("failed exposing handover FDT in debugfs: %d\n",
- err);
+ err = kho_out_fdt_setup();
+ if (err)
+ goto err_free_fdt;
+ if (fdt) {
+ kho_in_debugfs_init(&kho_in.dbg, fdt);
return 0;
}
@@ -1432,17 +1374,29 @@ static __init int kho_init(void)
unsigned long count = kho_scratch[i].size >> PAGE_SHIFT;
unsigned long pfn;
+ /*
+ * When debug_pagealloc is enabled, __free_pages() clears the
+ * corresponding PRESENT bit in the kernel page table.
+ * Subsequent kmemleak scans of these pages cause the
+ * non-PRESENT page faults.
+ * Mark scratch areas with kmemleak_ignore_phys() to exclude
+ * them from kmemleak scanning.
+ */
+ kmemleak_ignore_phys(kho_scratch[i].addr);
for (pfn = base_pfn; pfn < base_pfn + count;
pfn += pageblock_nr_pages)
init_cma_reserved_pageblock(pfn_to_page(pfn));
}
+ WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, "fdt",
+ kho_out.fdt, true));
+
return 0;
err_free_fdt:
- put_page(kho_out.ser.fdt);
- kho_out.ser.fdt = NULL;
+ kho_unpreserve_free(kho_out.fdt);
err_free_scratch:
+ kho_out.fdt = NULL;
for (int i = 0; i < kho_scratch_cnt; i++) {
void *start = __va(kho_scratch[i].addr);
void *end = start + kho_scratch[i].size;
@@ -1452,7 +1406,7 @@ err_free_scratch:
kho_enable = false;
return err;
}
-late_initcall(kho_init);
+fs_initcall(kho_init);
static void __init kho_release_scratch(void)
{
@@ -1480,16 +1434,12 @@ static void __init kho_release_scratch(void)
void __init kho_memory_init(void)
{
- struct folio *folio;
-
if (kho_in.scratch_phys) {
kho_scratch = phys_to_virt(kho_in.scratch_phys);
kho_release_scratch();
- kho_mem_deserialize(kho_get_fdt());
- folio = kho_restore_folio(kho_in.fdt_phys);
- if (!folio)
- pr_warn("failed to restore folio for KHO fdt\n");
+ if (!kho_mem_deserialize(kho_get_fdt()))
+ kho_in.fdt_phys = 0;
} else {
kho_reserve_scratch();
}
@@ -1545,8 +1495,8 @@ void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
memblock_add(area->addr, size);
err = memblock_mark_kho_scratch(area->addr, size);
if (WARN_ON(err)) {
- pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %d",
- &area->addr, &size, err);
+ pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %pe",
+ &area->addr, &size, ERR_PTR(err));
goto out;
}
pr_debug("Marked 0x%pa+0x%pa as scratch", &area->addr, &size);
@@ -1566,7 +1516,7 @@ void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
kho_in.fdt_phys = fdt_phys;
kho_in.scratch_phys = scratch_phys;
kho_scratch_cnt = scratch_cnt;
- pr_info("found kexec handover data. Will skip init for some devices\n");
+ pr_info("found kexec handover data.\n");
out:
if (fdt)
@@ -1585,10 +1535,10 @@ int kho_fill_kimage(struct kimage *image)
int err = 0;
struct kexec_buf scratch;
- if (!kho_out.finalized)
+ if (!kho_enable)
return 0;
- image->kho.fdt = page_to_phys(kho_out.ser.fdt);
+ image->kho.fdt = virt_to_phys(kho_out.fdt);
scratch_size = sizeof(*kho_scratch) * kho_scratch_cnt;
scratch = (struct kexec_buf){
diff --git a/kernel/kexec_handover_debug.c b/kernel/liveupdate/kexec_handover_debug.c
index 6efb696f5426..6efb696f5426 100644
--- a/kernel/kexec_handover_debug.c
+++ b/kernel/liveupdate/kexec_handover_debug.c
diff --git a/kernel/liveupdate/kexec_handover_debugfs.c b/kernel/liveupdate/kexec_handover_debugfs.c
new file mode 100644
index 000000000000..2abbf62ba942
--- /dev/null
+++ b/kernel/liveupdate/kexec_handover_debugfs.c
@@ -0,0 +1,221 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * kexec_handover_debugfs.c - kexec handover debugfs interfaces
+ * Copyright (C) 2023 Alexander Graf <graf@amazon.com>
+ * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org>
+ * Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com>
+ * Copyright (C) 2025 Google LLC, Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+#define pr_fmt(fmt) "KHO: " fmt
+
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/libfdt.h>
+#include <linux/mm.h>
+#include "kexec_handover_internal.h"
+
+static struct dentry *debugfs_root;
+
+struct fdt_debugfs {
+ struct list_head list;
+ struct debugfs_blob_wrapper wrapper;
+ struct dentry *file;
+};
+
+static int __kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir,
+ const char *name, const void *fdt)
+{
+ struct fdt_debugfs *f;
+ struct dentry *file;
+
+ f = kmalloc(sizeof(*f), GFP_KERNEL);
+ if (!f)
+ return -ENOMEM;
+
+ f->wrapper.data = (void *)fdt;
+ f->wrapper.size = fdt_totalsize(fdt);
+
+ file = debugfs_create_blob(name, 0400, dir, &f->wrapper);
+ if (IS_ERR(file)) {
+ kfree(f);
+ return PTR_ERR(file);
+ }
+
+ f->file = file;
+ list_add(&f->list, list);
+
+ return 0;
+}
+
+int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name,
+ const void *fdt, bool root)
+{
+ struct dentry *dir;
+
+ if (root)
+ dir = dbg->dir;
+ else
+ dir = dbg->sub_fdt_dir;
+
+ return __kho_debugfs_fdt_add(&dbg->fdt_list, dir, name, fdt);
+}
+
+void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt)
+{
+ struct fdt_debugfs *ff;
+
+ list_for_each_entry(ff, &dbg->fdt_list, list) {
+ if (ff->wrapper.data == fdt) {
+ debugfs_remove(ff->file);
+ list_del(&ff->list);
+ kfree(ff);
+ break;
+ }
+ }
+}
+
+static int kho_out_finalize_get(void *data, u64 *val)
+{
+ *val = kho_finalized();
+
+ return 0;
+}
+
+static int kho_out_finalize_set(void *data, u64 val)
+{
+ if (val)
+ return kho_finalize();
+ else
+ return -EINVAL;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(kho_out_finalize_fops, kho_out_finalize_get,
+ kho_out_finalize_set, "%llu\n");
+
+static int scratch_phys_show(struct seq_file *m, void *v)
+{
+ for (int i = 0; i < kho_scratch_cnt; i++)
+ seq_printf(m, "0x%llx\n", kho_scratch[i].addr);
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(scratch_phys);
+
+static int scratch_len_show(struct seq_file *m, void *v)
+{
+ for (int i = 0; i < kho_scratch_cnt; i++)
+ seq_printf(m, "0x%llx\n", kho_scratch[i].size);
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(scratch_len);
+
+__init void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt)
+{
+ struct dentry *dir, *sub_fdt_dir;
+ int err, child;
+
+ INIT_LIST_HEAD(&dbg->fdt_list);
+
+ dir = debugfs_create_dir("in", debugfs_root);
+ if (IS_ERR(dir)) {
+ err = PTR_ERR(dir);
+ goto err_out;
+ }
+
+ sub_fdt_dir = debugfs_create_dir("sub_fdts", dir);
+ if (IS_ERR(sub_fdt_dir)) {
+ err = PTR_ERR(sub_fdt_dir);
+ goto err_rmdir;
+ }
+
+ err = __kho_debugfs_fdt_add(&dbg->fdt_list, dir, "fdt", fdt);
+ if (err)
+ goto err_rmdir;
+
+ fdt_for_each_subnode(child, fdt, 0) {
+ int len = 0;
+ const char *name = fdt_get_name(fdt, child, NULL);
+ const u64 *fdt_phys;
+
+ fdt_phys = fdt_getprop(fdt, child, "fdt", &len);
+ if (!fdt_phys)
+ continue;
+ if (len != sizeof(*fdt_phys)) {
+ pr_warn("node %s prop fdt has invalid length: %d\n",
+ name, len);
+ continue;
+ }
+ err = __kho_debugfs_fdt_add(&dbg->fdt_list, sub_fdt_dir, name,
+ phys_to_virt(*fdt_phys));
+ if (err) {
+ pr_warn("failed to add fdt %s to debugfs: %pe\n", name,
+ ERR_PTR(err));
+ continue;
+ }
+ }
+
+ dbg->dir = dir;
+ dbg->sub_fdt_dir = sub_fdt_dir;
+
+ return;
+err_rmdir:
+ debugfs_remove_recursive(dir);
+err_out:
+ /*
+ * Failure to create /sys/kernel/debug/kho/in does not prevent
+ * reviving state from KHO and setting up KHO for the next
+ * kexec.
+ */
+ if (err) {
+ pr_err("failed exposing handover FDT in debugfs: %pe\n",
+ ERR_PTR(err));
+ }
+}
+
+__init int kho_out_debugfs_init(struct kho_debugfs *dbg)
+{
+ struct dentry *dir, *f, *sub_fdt_dir;
+
+ INIT_LIST_HEAD(&dbg->fdt_list);
+
+ dir = debugfs_create_dir("out", debugfs_root);
+ if (IS_ERR(dir))
+ return -ENOMEM;
+
+ sub_fdt_dir = debugfs_create_dir("sub_fdts", dir);
+ if (IS_ERR(sub_fdt_dir))
+ goto err_rmdir;
+
+ f = debugfs_create_file("scratch_phys", 0400, dir, NULL,
+ &scratch_phys_fops);
+ if (IS_ERR(f))
+ goto err_rmdir;
+
+ f = debugfs_create_file("scratch_len", 0400, dir, NULL,
+ &scratch_len_fops);
+ if (IS_ERR(f))
+ goto err_rmdir;
+
+ f = debugfs_create_file("finalize", 0600, dir, NULL,
+ &kho_out_finalize_fops);
+ if (IS_ERR(f))
+ goto err_rmdir;
+
+ dbg->dir = dir;
+ dbg->sub_fdt_dir = sub_fdt_dir;
+ return 0;
+
+err_rmdir:
+ debugfs_remove_recursive(dir);
+ return -ENOENT;
+}
+
+__init int kho_debugfs_init(void)
+{
+ debugfs_root = debugfs_create_dir("kho", NULL);
+ if (IS_ERR(debugfs_root))
+ return -ENOENT;
+ return 0;
+}
diff --git a/kernel/liveupdate/kexec_handover_internal.h b/kernel/liveupdate/kexec_handover_internal.h
new file mode 100644
index 000000000000..0202c85ad14f
--- /dev/null
+++ b/kernel/liveupdate/kexec_handover_internal.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef LINUX_KEXEC_HANDOVER_INTERNAL_H
+#define LINUX_KEXEC_HANDOVER_INTERNAL_H
+
+#include <linux/kexec_handover.h>
+#include <linux/list.h>
+#include <linux/types.h>
+
+#ifdef CONFIG_KEXEC_HANDOVER_DEBUGFS
+#include <linux/debugfs.h>
+
+struct kho_debugfs {
+ struct dentry *dir;
+ struct dentry *sub_fdt_dir;
+ struct list_head fdt_list;
+};
+
+#else
+struct kho_debugfs {};
+#endif
+
+extern struct kho_scratch *kho_scratch;
+extern unsigned int kho_scratch_cnt;
+
+bool kho_finalized(void);
+int kho_finalize(void);
+
+#ifdef CONFIG_KEXEC_HANDOVER_DEBUGFS
+int kho_debugfs_init(void);
+void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt);
+int kho_out_debugfs_init(struct kho_debugfs *dbg);
+int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name,
+ const void *fdt, bool root);
+void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt);
+#else
+static inline int kho_debugfs_init(void) { return 0; }
+static inline void kho_in_debugfs_init(struct kho_debugfs *dbg,
+ const void *fdt) { }
+static inline int kho_out_debugfs_init(struct kho_debugfs *dbg) { return 0; }
+static inline int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name,
+ const void *fdt, bool root) { return 0; }
+static inline void kho_debugfs_fdt_remove(struct kho_debugfs *dbg,
+ void *fdt) { }
+#endif /* CONFIG_KEXEC_HANDOVER_DEBUGFS */
+
+#ifdef CONFIG_KEXEC_HANDOVER_DEBUG
+bool kho_scratch_overlap(phys_addr_t phys, size_t size);
+#else
+static inline bool kho_scratch_overlap(phys_addr_t phys, size_t size)
+{
+ return false;
+}
+#endif /* CONFIG_KEXEC_HANDOVER_DEBUG */
+
+#endif /* LINUX_KEXEC_HANDOVER_INTERNAL_H */
diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c
new file mode 100644
index 000000000000..f7ecaf7740d1
--- /dev/null
+++ b/kernel/liveupdate/luo_core.c
@@ -0,0 +1,450 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+/**
+ * DOC: Live Update Orchestrator (LUO)
+ *
+ * Live Update is a specialized, kexec-based reboot process that allows a
+ * running kernel to be updated from one version to another while preserving
+ * the state of selected resources and keeping designated hardware devices
+ * operational. For these devices, DMA activity may continue throughout the
+ * kernel transition.
+ *
+ * While the primary use case driving this work is supporting live updates of
+ * the Linux kernel when it is used as a hypervisor in cloud environments, the
+ * LUO framework itself is designed to be workload-agnostic. Live Update
+ * facilitates a full kernel version upgrade for any type of system.
+ *
+ * For example, a non-hypervisor system running an in-memory cache like
+ * memcached with many gigabytes of data can use LUO. The userspace service
+ * can place its cache into a memfd, have its state preserved by LUO, and
+ * restore it immediately after the kernel kexec.
+ *
+ * Whether the system is running virtual machines, containers, a
+ * high-performance database, or networking services, LUO's primary goal is to
+ * enable a full kernel update by preserving critical userspace state and
+ * keeping essential devices operational.
+ *
+ * The core of LUO is a mechanism that tracks the progress of a live update,
+ * along with a callback API that allows other kernel subsystems to participate
+ * in the process. Example subsystems that can hook into LUO include: kvm,
+ * iommu, interrupts, vfio, participating filesystems, and memory management.
+ *
+ * LUO uses Kexec Handover to transfer memory state from the current kernel to
+ * the next kernel. For more details see
+ * Documentation/core-api/kho/concepts.rst.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/atomic.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/kexec_handover.h>
+#include <linux/kho/abi/luo.h>
+#include <linux/kobject.h>
+#include <linux/libfdt.h>
+#include <linux/liveupdate.h>
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/sizes.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
+
+#include "kexec_handover_internal.h"
+#include "luo_internal.h"
+
+static struct {
+ bool enabled;
+ void *fdt_out;
+ void *fdt_in;
+ u64 liveupdate_num;
+} luo_global;
+
+static int __init early_liveupdate_param(char *buf)
+{
+ return kstrtobool(buf, &luo_global.enabled);
+}
+early_param("liveupdate", early_liveupdate_param);
+
+static int __init luo_early_startup(void)
+{
+ phys_addr_t fdt_phys;
+ int err, ln_size;
+ const void *ptr;
+
+ if (!kho_is_enabled()) {
+ if (liveupdate_enabled())
+ pr_warn("Disabling liveupdate because KHO is disabled\n");
+ luo_global.enabled = false;
+ return 0;
+ }
+
+ /* Retrieve LUO subtree, and verify its format. */
+ err = kho_retrieve_subtree(LUO_FDT_KHO_ENTRY_NAME, &fdt_phys);
+ if (err) {
+ if (err != -ENOENT) {
+ pr_err("failed to retrieve FDT '%s' from KHO: %pe\n",
+ LUO_FDT_KHO_ENTRY_NAME, ERR_PTR(err));
+ return err;
+ }
+
+ return 0;
+ }
+
+ luo_global.fdt_in = phys_to_virt(fdt_phys);
+ err = fdt_node_check_compatible(luo_global.fdt_in, 0,
+ LUO_FDT_COMPATIBLE);
+ if (err) {
+ pr_err("FDT '%s' is incompatible with '%s' [%d]\n",
+ LUO_FDT_KHO_ENTRY_NAME, LUO_FDT_COMPATIBLE, err);
+
+ return -EINVAL;
+ }
+
+ ln_size = 0;
+ ptr = fdt_getprop(luo_global.fdt_in, 0, LUO_FDT_LIVEUPDATE_NUM,
+ &ln_size);
+ if (!ptr || ln_size != sizeof(luo_global.liveupdate_num)) {
+ pr_err("Unable to get live update number '%s' [%d]\n",
+ LUO_FDT_LIVEUPDATE_NUM, ln_size);
+
+ return -EINVAL;
+ }
+
+ luo_global.liveupdate_num = get_unaligned((u64 *)ptr);
+ pr_info("Retrieved live update data, liveupdate number: %lld\n",
+ luo_global.liveupdate_num);
+
+ err = luo_session_setup_incoming(luo_global.fdt_in);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static int __init liveupdate_early_init(void)
+{
+ int err;
+
+ err = luo_early_startup();
+ if (err) {
+ luo_global.enabled = false;
+ luo_restore_fail("The incoming tree failed to initialize properly [%pe], disabling live update\n",
+ ERR_PTR(err));
+ }
+
+ return err;
+}
+early_initcall(liveupdate_early_init);
+
+/* Called during boot to create outgoing LUO fdt tree */
+static int __init luo_fdt_setup(void)
+{
+ const u64 ln = luo_global.liveupdate_num + 1;
+ void *fdt_out;
+ int err;
+
+ fdt_out = kho_alloc_preserve(LUO_FDT_SIZE);
+ if (IS_ERR(fdt_out)) {
+ pr_err("failed to allocate/preserve FDT memory\n");
+ return PTR_ERR(fdt_out);
+ }
+
+ err = fdt_create(fdt_out, LUO_FDT_SIZE);
+ err |= fdt_finish_reservemap(fdt_out);
+ err |= fdt_begin_node(fdt_out, "");
+ err |= fdt_property_string(fdt_out, "compatible", LUO_FDT_COMPATIBLE);
+ err |= fdt_property(fdt_out, LUO_FDT_LIVEUPDATE_NUM, &ln, sizeof(ln));
+ err |= luo_session_setup_outgoing(fdt_out);
+ err |= fdt_end_node(fdt_out);
+ err |= fdt_finish(fdt_out);
+ if (err)
+ goto exit_free;
+
+ err = kho_add_subtree(LUO_FDT_KHO_ENTRY_NAME, fdt_out);
+ if (err)
+ goto exit_free;
+ luo_global.fdt_out = fdt_out;
+
+ return 0;
+
+exit_free:
+ kho_unpreserve_free(fdt_out);
+ pr_err("failed to prepare LUO FDT: %d\n", err);
+
+ return err;
+}
+
+/*
+ * late initcall because it initializes the outgoing tree that is needed only
+ * once userspace starts using /dev/liveupdate.
+ */
+static int __init luo_late_startup(void)
+{
+ int err;
+
+ if (!liveupdate_enabled())
+ return 0;
+
+ err = luo_fdt_setup();
+ if (err)
+ luo_global.enabled = false;
+
+ return err;
+}
+late_initcall(luo_late_startup);
+
+/* Public Functions */
+
+/**
+ * liveupdate_reboot() - Kernel reboot notifier for live update final
+ * serialization.
+ *
+ * This function is invoked directly from the reboot() syscall pathway
+ * if kexec is in progress.
+ *
+ * If any callback fails, this function aborts KHO, undoes the freeze()
+ * callbacks, and returns an error.
+ */
+int liveupdate_reboot(void)
+{
+ int err;
+
+ if (!liveupdate_enabled())
+ return 0;
+
+ err = luo_session_serialize();
+ if (err)
+ return err;
+
+ err = kho_finalize();
+ if (err) {
+ pr_err("kho_finalize failed %d\n", err);
+ /*
+ * kho_finalize() may return libfdt errors, to aboid passing to
+ * userspace unknown errors, change this to EAGAIN.
+ */
+ err = -EAGAIN;
+ }
+
+ return err;
+}
+
+/**
+ * liveupdate_enabled - Check if the live update feature is enabled.
+ *
+ * This function returns the state of the live update feature flag, which
+ * can be controlled via the ``liveupdate`` kernel command-line parameter.
+ *
+ * @return true if live update is enabled, false otherwise.
+ */
+bool liveupdate_enabled(void)
+{
+ return luo_global.enabled;
+}
+
+/**
+ * DOC: LUO ioctl Interface
+ *
+ * The IOCTL user-space control interface for the LUO subsystem.
+ * It registers a character device, typically found at ``/dev/liveupdate``,
+ * which allows a userspace agent to manage the LUO state machine and its
+ * associated resources, such as preservable file descriptors.
+ *
+ * To ensure that the state machine is controlled by a single entity, access
+ * to this device is exclusive: only one process is permitted to have
+ * ``/dev/liveupdate`` open at any given time. Subsequent open attempts will
+ * fail with -EBUSY until the first process closes its file descriptor.
+ * This singleton model simplifies state management by preventing conflicting
+ * commands from multiple userspace agents.
+ */
+
+struct luo_device_state {
+ struct miscdevice miscdev;
+ atomic_t in_use;
+};
+
+static int luo_ioctl_create_session(struct luo_ucmd *ucmd)
+{
+ struct liveupdate_ioctl_create_session *argp = ucmd->cmd;
+ struct file *file;
+ int err;
+
+ argp->fd = get_unused_fd_flags(O_CLOEXEC);
+ if (argp->fd < 0)
+ return argp->fd;
+
+ err = luo_session_create(argp->name, &file);
+ if (err)
+ goto err_put_fd;
+
+ err = luo_ucmd_respond(ucmd, sizeof(*argp));
+ if (err)
+ goto err_put_file;
+
+ fd_install(argp->fd, file);
+
+ return 0;
+
+err_put_file:
+ fput(file);
+err_put_fd:
+ put_unused_fd(argp->fd);
+
+ return err;
+}
+
+static int luo_ioctl_retrieve_session(struct luo_ucmd *ucmd)
+{
+ struct liveupdate_ioctl_retrieve_session *argp = ucmd->cmd;
+ struct file *file;
+ int err;
+
+ argp->fd = get_unused_fd_flags(O_CLOEXEC);
+ if (argp->fd < 0)
+ return argp->fd;
+
+ err = luo_session_retrieve(argp->name, &file);
+ if (err < 0)
+ goto err_put_fd;
+
+ err = luo_ucmd_respond(ucmd, sizeof(*argp));
+ if (err)
+ goto err_put_file;
+
+ fd_install(argp->fd, file);
+
+ return 0;
+
+err_put_file:
+ fput(file);
+err_put_fd:
+ put_unused_fd(argp->fd);
+
+ return err;
+}
+
+static int luo_open(struct inode *inodep, struct file *filep)
+{
+ struct luo_device_state *ldev = container_of(filep->private_data,
+ struct luo_device_state,
+ miscdev);
+
+ if (atomic_cmpxchg(&ldev->in_use, 0, 1))
+ return -EBUSY;
+
+ /* Always return -EIO to user if deserialization fail */
+ if (luo_session_deserialize()) {
+ atomic_set(&ldev->in_use, 0);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static int luo_release(struct inode *inodep, struct file *filep)
+{
+ struct luo_device_state *ldev = container_of(filep->private_data,
+ struct luo_device_state,
+ miscdev);
+ atomic_set(&ldev->in_use, 0);
+
+ return 0;
+}
+
+union ucmd_buffer {
+ struct liveupdate_ioctl_create_session create;
+ struct liveupdate_ioctl_retrieve_session retrieve;
+};
+
+struct luo_ioctl_op {
+ unsigned int size;
+ unsigned int min_size;
+ unsigned int ioctl_num;
+ int (*execute)(struct luo_ucmd *ucmd);
+};
+
+#define IOCTL_OP(_ioctl, _fn, _struct, _last) \
+ [_IOC_NR(_ioctl) - LIVEUPDATE_CMD_BASE] = { \
+ .size = sizeof(_struct) + \
+ BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) < \
+ sizeof(_struct)), \
+ .min_size = offsetofend(_struct, _last), \
+ .ioctl_num = _ioctl, \
+ .execute = _fn, \
+ }
+
+static const struct luo_ioctl_op luo_ioctl_ops[] = {
+ IOCTL_OP(LIVEUPDATE_IOCTL_CREATE_SESSION, luo_ioctl_create_session,
+ struct liveupdate_ioctl_create_session, name),
+ IOCTL_OP(LIVEUPDATE_IOCTL_RETRIEVE_SESSION, luo_ioctl_retrieve_session,
+ struct liveupdate_ioctl_retrieve_session, name),
+};
+
+static long luo_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
+{
+ const struct luo_ioctl_op *op;
+ struct luo_ucmd ucmd = {};
+ union ucmd_buffer buf;
+ unsigned int nr;
+ int err;
+
+ nr = _IOC_NR(cmd);
+ if (nr < LIVEUPDATE_CMD_BASE ||
+ (nr - LIVEUPDATE_CMD_BASE) >= ARRAY_SIZE(luo_ioctl_ops)) {
+ return -EINVAL;
+ }
+
+ ucmd.ubuffer = (void __user *)arg;
+ err = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer);
+ if (err)
+ return err;
+
+ op = &luo_ioctl_ops[nr - LIVEUPDATE_CMD_BASE];
+ if (op->ioctl_num != cmd)
+ return -ENOIOCTLCMD;
+ if (ucmd.user_size < op->min_size)
+ return -EINVAL;
+
+ ucmd.cmd = &buf;
+ err = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer,
+ ucmd.user_size);
+ if (err)
+ return err;
+
+ return op->execute(&ucmd);
+}
+
+static const struct file_operations luo_fops = {
+ .owner = THIS_MODULE,
+ .open = luo_open,
+ .release = luo_release,
+ .unlocked_ioctl = luo_ioctl,
+};
+
+static struct luo_device_state luo_dev = {
+ .miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "liveupdate",
+ .fops = &luo_fops,
+ },
+ .in_use = ATOMIC_INIT(0),
+};
+
+static int __init liveupdate_ioctl_init(void)
+{
+ if (!liveupdate_enabled())
+ return 0;
+
+ return misc_register(&luo_dev.miscdev);
+}
+late_initcall(liveupdate_ioctl_init);
diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c
new file mode 100644
index 000000000000..ddff87917b21
--- /dev/null
+++ b/kernel/liveupdate/luo_file.c
@@ -0,0 +1,889 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+/**
+ * DOC: LUO File Descriptors
+ *
+ * LUO provides the infrastructure to preserve specific, stateful file
+ * descriptors across a kexec-based live update. The primary goal is to allow
+ * workloads, such as virtual machines using vfio, memfd, or iommufd, to
+ * retain access to their essential resources without interruption.
+ *
+ * The framework is built around a callback-based handler model and a well-
+ * defined lifecycle for each preserved file.
+ *
+ * Handler Registration:
+ * Kernel modules responsible for a specific file type (e.g., memfd, vfio)
+ * register a &struct liveupdate_file_handler. This handler provides a set of
+ * callbacks that LUO invokes at different stages of the update process, most
+ * notably:
+ *
+ * - can_preserve(): A lightweight check to determine if the handler is
+ * compatible with a given 'struct file'.
+ * - preserve(): The heavyweight operation that saves the file's state and
+ * returns an opaque u64 handle. This is typically performed while the
+ * workload is still active to minimize the downtime during the
+ * actual reboot transition.
+ * - unpreserve(): Cleans up any resources allocated by .preserve(), called
+ * if the preservation process is aborted before the reboot (i.e. session is
+ * closed).
+ * - freeze(): A final pre-reboot opportunity to prepare the state for kexec.
+ * We are already in reboot syscall, and therefore userspace cannot mutate
+ * the file anymore.
+ * - unfreeze(): Undoes the actions of .freeze(), called if the live update
+ * is aborted after the freeze phase.
+ * - retrieve(): Reconstructs the file in the new kernel from the preserved
+ * handle.
+ * - finish(): Performs final check and cleanup in the new kernel. After
+ * succesul finish call, LUO gives up ownership to this file.
+ *
+ * File Preservation Lifecycle happy path:
+ *
+ * 1. Preserve (Normal Operation): A userspace agent preserves files one by one
+ * via an ioctl. For each file, luo_preserve_file() finds a compatible
+ * handler, calls its .preserve() operation, and creates an internal &struct
+ * luo_file to track the live state.
+ *
+ * 2. Freeze (Pre-Reboot): Just before the kexec, luo_file_freeze() is called.
+ * It iterates through all preserved files, calls their respective .freeze()
+ * operation, and serializes their final metadata (compatible string, token,
+ * and data handle) into a contiguous memory block for KHO.
+ *
+ * 3. Deserialize: After kexec, luo_file_deserialize() runs when session gets
+ * deserialized (which is when /dev/liveupdate is first opened). It reads the
+ * serialized data from the KHO memory region and reconstructs the in-memory
+ * list of &struct luo_file instances for the new kernel, linking them to
+ * their corresponding handlers.
+ *
+ * 4. Retrieve (New Kernel - Userspace Ready): The userspace agent can now
+ * restore file descriptors by providing a token. luo_retrieve_file()
+ * searches for the matching token, calls the handler's .retrieve() op to
+ * re-create the 'struct file', and returns a new FD. Files can be
+ * retrieved in ANY order.
+ *
+ * 5. Finish (New Kernel - Cleanup): Once a session retrival is complete,
+ * luo_file_finish() is called. It iterates through all files, invokes their
+ * .finish() operations for final cleanup, and releases all associated kernel
+ * resources.
+ *
+ * File Preservation Lifecycle unhappy paths:
+ *
+ * 1. Abort Before Reboot: If the userspace agent aborts the live update
+ * process before calling reboot (e.g., by closing the session file
+ * descriptor), the session's release handler calls
+ * luo_file_unpreserve_files(). This invokes the .unpreserve() callback on
+ * all preserved files, ensuring all allocated resources are cleaned up and
+ * returning the system to a clean state.
+ *
+ * 2. Freeze Failure: During the reboot() syscall, if any handler's .freeze()
+ * op fails, the .unfreeze() op is invoked on all previously *successful*
+ * freezes to roll back their state. The reboot() syscall then returns an
+ * error to userspace, canceling the live update.
+ *
+ * 3. Finish Failure: In the new kernel, if a handler's .finish() op fails,
+ * the luo_file_finish() operation is aborted. LUO retains ownership of
+ * all files within that session, including those that were not yet
+ * processed. The userspace agent can attempt to call the finish operation
+ * again later. If the issue cannot be resolved, these resources will be held
+ * by LUO until the next live update cycle, at which point they will be
+ * discarded.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cleanup.h>
+#include <linux/compiler.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/io.h>
+#include <linux/kexec_handover.h>
+#include <linux/kho/abi/luo.h>
+#include <linux/liveupdate.h>
+#include <linux/module.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include "luo_internal.h"
+
+static LIST_HEAD(luo_file_handler_list);
+
+/* 2 4K pages, give space for 128 files per file_set */
+#define LUO_FILE_PGCNT 2ul
+#define LUO_FILE_MAX \
+ ((LUO_FILE_PGCNT << PAGE_SHIFT) / sizeof(struct luo_file_ser))
+
+/**
+ * struct luo_file - Represents a single preserved file instance.
+ * @fh: Pointer to the &struct liveupdate_file_handler that manages
+ * this type of file.
+ * @file: Pointer to the kernel's &struct file that is being preserved.
+ * This is NULL in the new kernel until the file is successfully
+ * retrieved.
+ * @serialized_data: The opaque u64 handle to the serialized state of the file.
+ * This handle is passed back to the handler's .freeze(),
+ * .retrieve(), and .finish() callbacks, allowing it to track
+ * and update its serialized state across phases.
+ * @private_data: Pointer to the private data for the file used to hold runtime
+ * state that is not preserved. Set by the handler's .preserve()
+ * callback, and must be freed in the handler's .unpreserve()
+ * callback.
+ * @retrieved: A flag indicating whether a user/kernel in the new kernel has
+ * successfully called retrieve() on this file. This prevents
+ * multiple retrieval attempts.
+ * @mutex: A mutex that protects the fields of this specific instance
+ * (e.g., @retrieved, @file), ensuring that operations like
+ * retrieving or finishing a file are atomic.
+ * @list: The list_head linking this instance into its parent
+ * file_set's list of preserved files.
+ * @token: The user-provided unique token used to identify this file.
+ *
+ * This structure is the core in-kernel representation of a single file being
+ * managed through a live update. An instance is created by luo_preserve_file()
+ * to link a 'struct file' to its corresponding handler, a user-provided token,
+ * and the serialized state handle returned by the handler's .preserve()
+ * operation.
+ *
+ * These instances are tracked in a per-file_set list. The @serialized_data
+ * field, which holds a handle to the file's serialized state, may be updated
+ * during the .freeze() callback before being serialized for the next kernel.
+ * After reboot, these structures are recreated by luo_file_deserialize() and
+ * are finally cleaned up by luo_file_finish().
+ */
+struct luo_file {
+ struct liveupdate_file_handler *fh;
+ struct file *file;
+ u64 serialized_data;
+ void *private_data;
+ bool retrieved;
+ struct mutex mutex;
+ struct list_head list;
+ u64 token;
+};
+
+static int luo_alloc_files_mem(struct luo_file_set *file_set)
+{
+ size_t size;
+ void *mem;
+
+ if (file_set->files)
+ return 0;
+
+ WARN_ON_ONCE(file_set->count);
+
+ size = LUO_FILE_PGCNT << PAGE_SHIFT;
+ mem = kho_alloc_preserve(size);
+ if (IS_ERR(mem))
+ return PTR_ERR(mem);
+
+ file_set->files = mem;
+
+ return 0;
+}
+
+static void luo_free_files_mem(struct luo_file_set *file_set)
+{
+ /* If file_set has files, no need to free preservation memory */
+ if (file_set->count)
+ return;
+
+ if (!file_set->files)
+ return;
+
+ kho_unpreserve_free(file_set->files);
+ file_set->files = NULL;
+}
+
+static bool luo_token_is_used(struct luo_file_set *file_set, u64 token)
+{
+ struct luo_file *iter;
+
+ list_for_each_entry(iter, &file_set->files_list, list) {
+ if (iter->token == token)
+ return true;
+ }
+
+ return false;
+}
+
+/**
+ * luo_preserve_file - Initiate the preservation of a file descriptor.
+ * @file_set: The file_set to which the preserved file will be added.
+ * @token: A unique, user-provided identifier for the file.
+ * @fd: The file descriptor to be preserved.
+ *
+ * This function orchestrates the first phase of preserving a file. Upon entry,
+ * it takes a reference to the 'struct file' via fget(), effectively making LUO
+ * a co-owner of the file. This reference is held until the file is either
+ * unpreserved or successfully finished in the next kernel, preventing the file
+ * from being prematurely destroyed.
+ *
+ * This function orchestrates the first phase of preserving a file. It performs
+ * the following steps:
+ *
+ * 1. Validates that the @token is not already in use within the file_set.
+ * 2. Ensures the file_set's memory for files serialization is allocated
+ * (allocates if needed).
+ * 3. Iterates through registered handlers, calling can_preserve() to find one
+ * compatible with the given @fd.
+ * 4. Calls the handler's .preserve() operation, which saves the file's state
+ * and returns an opaque private data handle.
+ * 5. Adds the new instance to the file_set's internal list.
+ *
+ * On success, LUO takes a reference to the 'struct file' and considers it
+ * under its management until it is unpreserved or finished.
+ *
+ * In case of any failure, all intermediate allocations (file reference, memory
+ * for the 'luo_file' struct, etc.) are cleaned up before returning an error.
+ *
+ * Context: Can be called from an ioctl handler during normal system operation.
+ * Return: 0 on success. Returns a negative errno on failure:
+ * -EEXIST if the token is already used.
+ * -EBADF if the file descriptor is invalid.
+ * -ENOSPC if the file_set is full.
+ * -ENOENT if no compatible handler is found.
+ * -ENOMEM on memory allocation failure.
+ * Other erros might be returned by .preserve().
+ */
+int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd)
+{
+ struct liveupdate_file_op_args args = {0};
+ struct liveupdate_file_handler *fh;
+ struct luo_file *luo_file;
+ struct file *file;
+ int err;
+
+ if (luo_token_is_used(file_set, token))
+ return -EEXIST;
+
+ if (file_set->count == LUO_FILE_MAX)
+ return -ENOSPC;
+
+ file = fget(fd);
+ if (!file)
+ return -EBADF;
+
+ err = luo_alloc_files_mem(file_set);
+ if (err)
+ goto err_fput;
+
+ err = -ENOENT;
+ luo_list_for_each_private(fh, &luo_file_handler_list, list) {
+ if (fh->ops->can_preserve(fh, file)) {
+ err = 0;
+ break;
+ }
+ }
+
+ /* err is still -ENOENT if no handler was found */
+ if (err)
+ goto err_free_files_mem;
+
+ luo_file = kzalloc(sizeof(*luo_file), GFP_KERNEL);
+ if (!luo_file) {
+ err = -ENOMEM;
+ goto err_free_files_mem;
+ }
+
+ luo_file->file = file;
+ luo_file->fh = fh;
+ luo_file->token = token;
+ luo_file->retrieved = false;
+ mutex_init(&luo_file->mutex);
+
+ args.handler = fh;
+ args.file = file;
+ err = fh->ops->preserve(&args);
+ if (err)
+ goto err_kfree;
+
+ luo_file->serialized_data = args.serialized_data;
+ luo_file->private_data = args.private_data;
+ list_add_tail(&luo_file->list, &file_set->files_list);
+ file_set->count++;
+
+ return 0;
+
+err_kfree:
+ kfree(luo_file);
+err_free_files_mem:
+ luo_free_files_mem(file_set);
+err_fput:
+ fput(file);
+
+ return err;
+}
+
+/**
+ * luo_file_unpreserve_files - Unpreserves all files from a file_set.
+ * @file_set: The files to be cleaned up.
+ *
+ * This function serves as the primary cleanup path for a file_set. It is
+ * invoked when the userspace agent closes the file_set's file descriptor.
+ *
+ * For each file, it performs the following cleanup actions:
+ * 1. Calls the handler's .unpreserve() callback to allow the handler to
+ * release any resources it allocated.
+ * 2. Removes the file from the file_set's internal tracking list.
+ * 3. Releases the reference to the 'struct file' that was taken by
+ * luo_preserve_file() via fput(), returning ownership.
+ * 4. Frees the memory associated with the internal 'struct luo_file'.
+ *
+ * After all individual files are unpreserved, it frees the contiguous memory
+ * block that was allocated to hold their serialization data.
+ */
+void luo_file_unpreserve_files(struct luo_file_set *file_set)
+{
+ struct luo_file *luo_file;
+
+ while (!list_empty(&file_set->files_list)) {
+ struct liveupdate_file_op_args args = {0};
+
+ luo_file = list_last_entry(&file_set->files_list,
+ struct luo_file, list);
+
+ args.handler = luo_file->fh;
+ args.file = luo_file->file;
+ args.serialized_data = luo_file->serialized_data;
+ args.private_data = luo_file->private_data;
+ luo_file->fh->ops->unpreserve(&args);
+
+ list_del(&luo_file->list);
+ file_set->count--;
+
+ fput(luo_file->file);
+ mutex_destroy(&luo_file->mutex);
+ kfree(luo_file);
+ }
+
+ luo_free_files_mem(file_set);
+}
+
+static int luo_file_freeze_one(struct luo_file_set *file_set,
+ struct luo_file *luo_file)
+{
+ int err = 0;
+
+ guard(mutex)(&luo_file->mutex);
+
+ if (luo_file->fh->ops->freeze) {
+ struct liveupdate_file_op_args args = {0};
+
+ args.handler = luo_file->fh;
+ args.file = luo_file->file;
+ args.serialized_data = luo_file->serialized_data;
+ args.private_data = luo_file->private_data;
+
+ err = luo_file->fh->ops->freeze(&args);
+ if (!err)
+ luo_file->serialized_data = args.serialized_data;
+ }
+
+ return err;
+}
+
+static void luo_file_unfreeze_one(struct luo_file_set *file_set,
+ struct luo_file *luo_file)
+{
+ guard(mutex)(&luo_file->mutex);
+
+ if (luo_file->fh->ops->unfreeze) {
+ struct liveupdate_file_op_args args = {0};
+
+ args.handler = luo_file->fh;
+ args.file = luo_file->file;
+ args.serialized_data = luo_file->serialized_data;
+ args.private_data = luo_file->private_data;
+
+ luo_file->fh->ops->unfreeze(&args);
+ }
+
+ luo_file->serialized_data = 0;
+}
+
+static void __luo_file_unfreeze(struct luo_file_set *file_set,
+ struct luo_file *failed_entry)
+{
+ struct list_head *files_list = &file_set->files_list;
+ struct luo_file *luo_file;
+
+ list_for_each_entry(luo_file, files_list, list) {
+ if (luo_file == failed_entry)
+ break;
+
+ luo_file_unfreeze_one(file_set, luo_file);
+ }
+
+ memset(file_set->files, 0, LUO_FILE_PGCNT << PAGE_SHIFT);
+}
+
+/**
+ * luo_file_freeze - Freezes all preserved files and serializes their metadata.
+ * @file_set: The file_set whose files are to be frozen.
+ * @file_set_ser: Where to put the serialized file_set.
+ *
+ * This function is called from the reboot() syscall path, just before the
+ * kernel transitions to the new image via kexec. Its purpose is to perform the
+ * final preparation and serialization of all preserved files in the file_set.
+ *
+ * It iterates through each preserved file in FIFO order (the order of
+ * preservation) and performs two main actions:
+ *
+ * 1. Freezes the File: It calls the handler's .freeze() callback for each
+ * file. This gives the handler a final opportunity to quiesce the device or
+ * prepare its state for the upcoming reboot. The handler may update its
+ * private data handle during this step.
+ *
+ * 2. Serializes Metadata: After a successful freeze, it copies the final file
+ * metadata—the handler's compatible string, the user token, and the final
+ * private data handle—into the pre-allocated contiguous memory buffer
+ * (file_set->files) that will be handed over to the next kernel via KHO.
+ *
+ * Error Handling (Rollback):
+ * This function is atomic. If any handler's .freeze() operation fails, the
+ * entire live update is aborted. The __luo_file_unfreeze() helper is
+ * immediately called to invoke the .unfreeze() op on all files that were
+ * successfully frozen before the point of failure, rolling them back to a
+ * running state. The function then returns an error, causing the reboot()
+ * syscall to fail.
+ *
+ * Context: Called only from the liveupdate_reboot() path.
+ * Return: 0 on success, or a negative errno on failure.
+ */
+int luo_file_freeze(struct luo_file_set *file_set,
+ struct luo_file_set_ser *file_set_ser)
+{
+ struct luo_file_ser *file_ser = file_set->files;
+ struct luo_file *luo_file;
+ int err;
+ int i;
+
+ if (!file_set->count)
+ return 0;
+
+ if (WARN_ON(!file_ser))
+ return -EINVAL;
+
+ i = 0;
+ list_for_each_entry(luo_file, &file_set->files_list, list) {
+ err = luo_file_freeze_one(file_set, luo_file);
+ if (err < 0) {
+ pr_warn("Freeze failed for token[%#0llx] handler[%s] err[%pe]\n",
+ luo_file->token, luo_file->fh->compatible,
+ ERR_PTR(err));
+ goto err_unfreeze;
+ }
+
+ strscpy(file_ser[i].compatible, luo_file->fh->compatible,
+ sizeof(file_ser[i].compatible));
+ file_ser[i].data = luo_file->serialized_data;
+ file_ser[i].token = luo_file->token;
+ i++;
+ }
+
+ file_set_ser->count = file_set->count;
+ if (file_set->files)
+ file_set_ser->files = virt_to_phys(file_set->files);
+
+ return 0;
+
+err_unfreeze:
+ __luo_file_unfreeze(file_set, luo_file);
+
+ return err;
+}
+
+/**
+ * luo_file_unfreeze - Unfreezes all files in a file_set and clear serialization
+ * @file_set: The file_set whose files are to be unfrozen.
+ * @file_set_ser: Serialized file_set.
+ *
+ * This function rolls back the state of all files in a file_set after the
+ * freeze phase has begun but must be aborted. It is the counterpart to
+ * luo_file_freeze().
+ *
+ * It invokes the __luo_file_unfreeze() helper with a NULL argument, which
+ * signals the helper to iterate through all files in the file_set and call
+ * their respective .unfreeze() handler callbacks.
+ *
+ * Context: This is called when the live update is aborted during
+ * the reboot() syscall, after luo_file_freeze() has been called.
+ */
+void luo_file_unfreeze(struct luo_file_set *file_set,
+ struct luo_file_set_ser *file_set_ser)
+{
+ if (!file_set->count)
+ return;
+
+ __luo_file_unfreeze(file_set, NULL);
+ memset(file_set_ser, 0, sizeof(*file_set_ser));
+}
+
+/**
+ * luo_retrieve_file - Restores a preserved file from a file_set by its token.
+ * @file_set: The file_set from which to retrieve the file.
+ * @token: The unique token identifying the file to be restored.
+ * @filep: Output parameter; on success, this is populated with a pointer
+ * to the newly retrieved 'struct file'.
+ *
+ * This function is the primary mechanism for recreating a file in the new
+ * kernel after a live update. It searches the file_set's list of deserialized
+ * files for an entry matching the provided @token.
+ *
+ * The operation is idempotent: if a file has already been successfully
+ * retrieved, this function will simply return a pointer to the existing
+ * 'struct file' and report success without re-executing the retrieve
+ * operation. This is handled by checking the 'retrieved' flag under a lock.
+ *
+ * File retrieval can happen in any order; it is not bound by the order of
+ * preservation.
+ *
+ * Context: Can be called from an ioctl or other in-kernel code in the new
+ * kernel.
+ * Return: 0 on success. Returns a negative errno on failure:
+ * -ENOENT if no file with the matching token is found.
+ * Any error code returned by the handler's .retrieve() op.
+ */
+int luo_retrieve_file(struct luo_file_set *file_set, u64 token,
+ struct file **filep)
+{
+ struct liveupdate_file_op_args args = {0};
+ struct luo_file *luo_file;
+ int err;
+
+ if (list_empty(&file_set->files_list))
+ return -ENOENT;
+
+ list_for_each_entry(luo_file, &file_set->files_list, list) {
+ if (luo_file->token == token)
+ break;
+ }
+
+ if (luo_file->token != token)
+ return -ENOENT;
+
+ guard(mutex)(&luo_file->mutex);
+ if (luo_file->retrieved) {
+ /*
+ * Someone is asking for this file again, so get a reference
+ * for them.
+ */
+ get_file(luo_file->file);
+ *filep = luo_file->file;
+ return 0;
+ }
+
+ args.handler = luo_file->fh;
+ args.serialized_data = luo_file->serialized_data;
+ err = luo_file->fh->ops->retrieve(&args);
+ if (!err) {
+ luo_file->file = args.file;
+
+ /* Get reference so we can keep this file in LUO until finish */
+ get_file(luo_file->file);
+ *filep = luo_file->file;
+ luo_file->retrieved = true;
+ }
+
+ return err;
+}
+
+static int luo_file_can_finish_one(struct luo_file_set *file_set,
+ struct luo_file *luo_file)
+{
+ bool can_finish = true;
+
+ guard(mutex)(&luo_file->mutex);
+
+ if (luo_file->fh->ops->can_finish) {
+ struct liveupdate_file_op_args args = {0};
+
+ args.handler = luo_file->fh;
+ args.file = luo_file->file;
+ args.serialized_data = luo_file->serialized_data;
+ args.retrieved = luo_file->retrieved;
+ can_finish = luo_file->fh->ops->can_finish(&args);
+ }
+
+ return can_finish ? 0 : -EBUSY;
+}
+
+static void luo_file_finish_one(struct luo_file_set *file_set,
+ struct luo_file *luo_file)
+{
+ struct liveupdate_file_op_args args = {0};
+
+ guard(mutex)(&luo_file->mutex);
+
+ args.handler = luo_file->fh;
+ args.file = luo_file->file;
+ args.serialized_data = luo_file->serialized_data;
+ args.retrieved = luo_file->retrieved;
+
+ luo_file->fh->ops->finish(&args);
+}
+
+/**
+ * luo_file_finish - Completes the lifecycle for all files in a file_set.
+ * @file_set: The file_set to be finalized.
+ *
+ * This function orchestrates the final teardown of a live update file_set in
+ * the new kernel. It should be called after all necessary files have been
+ * retrieved and the userspace agent is ready to release the preserved state.
+ *
+ * The function iterates through all tracked files. For each file, it performs
+ * the following sequence of cleanup actions:
+ *
+ * 1. If file is not yet retrieved, retrieves it, and calls can_finish() on
+ * every file in the file_set. If all can_finish return true, continue to
+ * finish.
+ * 2. Calls the handler's .finish() callback (via luo_file_finish_one) to
+ * allow for final resource cleanup within the handler.
+ * 3. Releases LUO's ownership reference on the 'struct file' via fput(). This
+ * is the counterpart to the get_file() call in luo_retrieve_file().
+ * 4. Removes the 'struct luo_file' from the file_set's internal list.
+ * 5. Frees the memory for the 'struct luo_file' instance itself.
+ *
+ * After successfully finishing all individual files, it frees the
+ * contiguous memory block that was used to transfer the serialized metadata
+ * from the previous kernel.
+ *
+ * Error Handling (Atomic Failure):
+ * This operation is atomic. If any handler's .can_finish() op fails, the entire
+ * function aborts immediately and returns an error.
+ *
+ * Context: Can be called from an ioctl handler in the new kernel.
+ * Return: 0 on success, or a negative errno on failure.
+ */
+int luo_file_finish(struct luo_file_set *file_set)
+{
+ struct list_head *files_list = &file_set->files_list;
+ struct luo_file *luo_file;
+ int err;
+
+ if (!file_set->count)
+ return 0;
+
+ list_for_each_entry(luo_file, files_list, list) {
+ err = luo_file_can_finish_one(file_set, luo_file);
+ if (err)
+ return err;
+ }
+
+ while (!list_empty(&file_set->files_list)) {
+ luo_file = list_last_entry(&file_set->files_list,
+ struct luo_file, list);
+
+ luo_file_finish_one(file_set, luo_file);
+
+ if (luo_file->file)
+ fput(luo_file->file);
+ list_del(&luo_file->list);
+ file_set->count--;
+ mutex_destroy(&luo_file->mutex);
+ kfree(luo_file);
+ }
+
+ if (file_set->files) {
+ kho_restore_free(file_set->files);
+ file_set->files = NULL;
+ }
+
+ return 0;
+}
+
+/**
+ * luo_file_deserialize - Reconstructs the list of preserved files in the new kernel.
+ * @file_set: The incoming file_set to fill with deserialized data.
+ * @file_set_ser: Serialized KHO file_set data from the previous kernel.
+ *
+ * This function is called during the early boot process of the new kernel. It
+ * takes the raw, contiguous memory block of 'struct luo_file_ser' entries,
+ * provided by the previous kernel, and transforms it back into a live,
+ * in-memory linked list of 'struct luo_file' instances.
+ *
+ * For each serialized entry, it performs the following steps:
+ * 1. Reads the 'compatible' string.
+ * 2. Searches the global list of registered file handlers for one that
+ * matches the compatible string.
+ * 3. Allocates a new 'struct luo_file'.
+ * 4. Populates the new structure with the deserialized data (token, private
+ * data handle) and links it to the found handler. The 'file' pointer is
+ * initialized to NULL, as the file has not been retrieved yet.
+ * 5. Adds the new 'struct luo_file' to the file_set's files_list.
+ *
+ * This prepares the file_set for userspace, which can later call
+ * luo_retrieve_file() to restore the actual file descriptors.
+ *
+ * Context: Called from session deserialization.
+ */
+int luo_file_deserialize(struct luo_file_set *file_set,
+ struct luo_file_set_ser *file_set_ser)
+{
+ struct luo_file_ser *file_ser;
+ u64 i;
+
+ if (!file_set_ser->files) {
+ WARN_ON(file_set_ser->count);
+ return 0;
+ }
+
+ file_set->count = file_set_ser->count;
+ file_set->files = phys_to_virt(file_set_ser->files);
+
+ /*
+ * Note on error handling:
+ *
+ * If deserialization fails (e.g., allocation failure or corrupt data),
+ * we intentionally skip cleanup of files that were already restored.
+ *
+ * A partial failure leaves the preserved state inconsistent.
+ * Implementing a safe "undo" to unwind complex dependencies (sessions,
+ * files, hardware state) is error-prone and provides little value, as
+ * the system is effectively in a broken state.
+ *
+ * We treat these resources as leaked. The expected recovery path is for
+ * userspace to detect the failure and trigger a reboot, which will
+ * reliably reset devices and reclaim memory.
+ */
+ file_ser = file_set->files;
+ for (i = 0; i < file_set->count; i++) {
+ struct liveupdate_file_handler *fh;
+ bool handler_found = false;
+ struct luo_file *luo_file;
+
+ luo_list_for_each_private(fh, &luo_file_handler_list, list) {
+ if (!strcmp(fh->compatible, file_ser[i].compatible)) {
+ handler_found = true;
+ break;
+ }
+ }
+
+ if (!handler_found) {
+ pr_warn("No registered handler for compatible '%s'\n",
+ file_ser[i].compatible);
+ return -ENOENT;
+ }
+
+ luo_file = kzalloc(sizeof(*luo_file), GFP_KERNEL);
+ if (!luo_file)
+ return -ENOMEM;
+
+ luo_file->fh = fh;
+ luo_file->file = NULL;
+ luo_file->serialized_data = file_ser[i].data;
+ luo_file->token = file_ser[i].token;
+ luo_file->retrieved = false;
+ mutex_init(&luo_file->mutex);
+ list_add_tail(&luo_file->list, &file_set->files_list);
+ }
+
+ return 0;
+}
+
+void luo_file_set_init(struct luo_file_set *file_set)
+{
+ INIT_LIST_HEAD(&file_set->files_list);
+}
+
+void luo_file_set_destroy(struct luo_file_set *file_set)
+{
+ WARN_ON(file_set->count);
+ WARN_ON(!list_empty(&file_set->files_list));
+}
+
+/**
+ * liveupdate_register_file_handler - Register a file handler with LUO.
+ * @fh: Pointer to a caller-allocated &struct liveupdate_file_handler.
+ * The caller must initialize this structure, including a unique
+ * 'compatible' string and a valid 'fh' callbacks. This function adds the
+ * handler to the global list of supported file handlers.
+ *
+ * Context: Typically called during module initialization for file types that
+ * support live update preservation.
+ *
+ * Return: 0 on success. Negative errno on failure.
+ */
+int liveupdate_register_file_handler(struct liveupdate_file_handler *fh)
+{
+ struct liveupdate_file_handler *fh_iter;
+ int err;
+
+ if (!liveupdate_enabled())
+ return -EOPNOTSUPP;
+
+ /* Sanity check that all required callbacks are set */
+ if (!fh->ops->preserve || !fh->ops->unpreserve || !fh->ops->retrieve ||
+ !fh->ops->finish || !fh->ops->can_preserve) {
+ return -EINVAL;
+ }
+
+ /*
+ * Ensure the system is quiescent (no active sessions).
+ * This prevents registering new handlers while sessions are active or
+ * while deserialization is in progress.
+ */
+ if (!luo_session_quiesce())
+ return -EBUSY;
+
+ /* Check for duplicate compatible strings */
+ luo_list_for_each_private(fh_iter, &luo_file_handler_list, list) {
+ if (!strcmp(fh_iter->compatible, fh->compatible)) {
+ pr_err("File handler registration failed: Compatible string '%s' already registered.\n",
+ fh->compatible);
+ err = -EEXIST;
+ goto err_resume;
+ }
+ }
+
+ /* Pin the module implementing the handler */
+ if (!try_module_get(fh->ops->owner)) {
+ err = -EAGAIN;
+ goto err_resume;
+ }
+
+ INIT_LIST_HEAD(&ACCESS_PRIVATE(fh, list));
+ list_add_tail(&ACCESS_PRIVATE(fh, list), &luo_file_handler_list);
+ luo_session_resume();
+
+ return 0;
+
+err_resume:
+ luo_session_resume();
+ return err;
+}
+
+/**
+ * liveupdate_unregister_file_handler - Unregister a liveupdate file handler
+ * @fh: The file handler to unregister
+ *
+ * Unregisters the file handler from the liveupdate core. This function
+ * reverses the operations of liveupdate_register_file_handler().
+ *
+ * It ensures safe removal by checking that:
+ * No live update session is currently in progress.
+ *
+ * If the unregistration fails, the internal test state is reverted.
+ *
+ * Return: 0 Success. -EOPNOTSUPP when live update is not enabled. -EBUSY A live
+ * update is in progress, can't quiesce live update.
+ */
+int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh)
+{
+ if (!liveupdate_enabled())
+ return -EOPNOTSUPP;
+
+ if (!luo_session_quiesce())
+ return -EBUSY;
+
+ list_del(&ACCESS_PRIVATE(fh, list));
+ module_put(fh->ops->owner);
+ luo_session_resume();
+
+ return 0;
+}
diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h
new file mode 100644
index 000000000000..c8973b543d1d
--- /dev/null
+++ b/kernel/liveupdate/luo_internal.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+#ifndef _LINUX_LUO_INTERNAL_H
+#define _LINUX_LUO_INTERNAL_H
+
+#include <linux/liveupdate.h>
+#include <linux/uaccess.h>
+
+struct luo_ucmd {
+ void __user *ubuffer;
+ u32 user_size;
+ void *cmd;
+};
+
+static inline int luo_ucmd_respond(struct luo_ucmd *ucmd,
+ size_t kernel_cmd_size)
+{
+ /*
+ * Copy the minimum of what the user provided and what we actually
+ * have.
+ */
+ if (copy_to_user(ucmd->ubuffer, ucmd->cmd,
+ min_t(size_t, ucmd->user_size, kernel_cmd_size))) {
+ return -EFAULT;
+ }
+ return 0;
+}
+
+/*
+ * Handles a deserialization failure: devices and memory is in unpredictable
+ * state.
+ *
+ * Continuing the boot process after a failure is dangerous because it could
+ * lead to leaks of private data.
+ */
+#define luo_restore_fail(__fmt, ...) panic(__fmt, ##__VA_ARGS__)
+
+/* Mimics list_for_each_entry() but for private list head entries */
+#define luo_list_for_each_private(pos, head, member) \
+ for (struct list_head *__iter = (head)->next; \
+ __iter != (head) && \
+ ({ pos = container_of(__iter, typeof(*(pos)), member); 1; }); \
+ __iter = __iter->next)
+
+/**
+ * struct luo_file_set - A set of files that belong to the same sessions.
+ * @files_list: An ordered list of files associated with this session, it is
+ * ordered by preservation time.
+ * @files: The physically contiguous memory block that holds the serialized
+ * state of files.
+ * @count: A counter tracking the number of files currently stored in the
+ * @files_list for this session.
+ */
+struct luo_file_set {
+ struct list_head files_list;
+ struct luo_file_ser *files;
+ long count;
+};
+
+/**
+ * struct luo_session - Represents an active or incoming Live Update session.
+ * @name: A unique name for this session, used for identification and
+ * retrieval.
+ * @ser: Pointer to the serialized data for this session.
+ * @list: A list_head member used to link this session into a global list
+ * of either outgoing (to be preserved) or incoming (restored from
+ * previous kernel) sessions.
+ * @retrieved: A boolean flag indicating whether this session has been
+ * retrieved by a consumer in the new kernel.
+ * @file_set: A set of files that belong to this session.
+ * @mutex: protects fields in the luo_session.
+ */
+struct luo_session {
+ char name[LIVEUPDATE_SESSION_NAME_LENGTH];
+ struct luo_session_ser *ser;
+ struct list_head list;
+ bool retrieved;
+ struct luo_file_set file_set;
+ struct mutex mutex;
+};
+
+int luo_session_create(const char *name, struct file **filep);
+int luo_session_retrieve(const char *name, struct file **filep);
+int __init luo_session_setup_outgoing(void *fdt);
+int __init luo_session_setup_incoming(void *fdt);
+int luo_session_serialize(void);
+int luo_session_deserialize(void);
+bool luo_session_quiesce(void);
+void luo_session_resume(void);
+
+int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd);
+void luo_file_unpreserve_files(struct luo_file_set *file_set);
+int luo_file_freeze(struct luo_file_set *file_set,
+ struct luo_file_set_ser *file_set_ser);
+void luo_file_unfreeze(struct luo_file_set *file_set,
+ struct luo_file_set_ser *file_set_ser);
+int luo_retrieve_file(struct luo_file_set *file_set, u64 token,
+ struct file **filep);
+int luo_file_finish(struct luo_file_set *file_set);
+int luo_file_deserialize(struct luo_file_set *file_set,
+ struct luo_file_set_ser *file_set_ser);
+void luo_file_set_init(struct luo_file_set *file_set);
+void luo_file_set_destroy(struct luo_file_set *file_set);
+
+#endif /* _LINUX_LUO_INTERNAL_H */
diff --git a/kernel/liveupdate/luo_session.c b/kernel/liveupdate/luo_session.c
new file mode 100644
index 000000000000..dbdbc3bd7929
--- /dev/null
+++ b/kernel/liveupdate/luo_session.c
@@ -0,0 +1,646 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+/**
+ * DOC: LUO Sessions
+ *
+ * LUO Sessions provide the core mechanism for grouping and managing `struct
+ * file *` instances that need to be preserved across a kexec-based live
+ * update. Each session acts as a named container for a set of file objects,
+ * allowing a userspace agent to manage the lifecycle of resources critical to a
+ * workload.
+ *
+ * Core Concepts:
+ *
+ * - Named Containers: Sessions are identified by a unique, user-provided name,
+ * which is used for both creation in the current kernel and retrieval in the
+ * next kernel.
+ *
+ * - Userspace Interface: Session management is driven from userspace via
+ * ioctls on /dev/liveupdate.
+ *
+ * - Serialization: Session metadata is preserved using the KHO framework. When
+ * a live update is triggered via kexec, an array of `struct luo_session_ser`
+ * is populated and placed in a preserved memory region. An FDT node is also
+ * created, containing the count of sessions and the physical address of this
+ * array.
+ *
+ * Session Lifecycle:
+ *
+ * 1. Creation: A userspace agent calls `luo_session_create()` to create a
+ * new, empty session and receives a file descriptor for it.
+ *
+ * 2. Serialization: When the `reboot(LINUX_REBOOT_CMD_KEXEC)` syscall is
+ * made, `luo_session_serialize()` is called. It iterates through all
+ * active sessions and writes their metadata into a memory area preserved
+ * by KHO.
+ *
+ * 3. Deserialization (in new kernel): After kexec, `luo_session_deserialize()`
+ * runs, reading the serialized data and creating a list of `struct
+ * luo_session` objects representing the preserved sessions.
+ *
+ * 4. Retrieval: A userspace agent in the new kernel can then call
+ * `luo_session_retrieve()` with a session name to get a new file
+ * descriptor and access the preserved state.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/anon_inodes.h>
+#include <linux/cleanup.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/io.h>
+#include <linux/kexec_handover.h>
+#include <linux/kho/abi/luo.h>
+#include <linux/libfdt.h>
+#include <linux/list.h>
+#include <linux/liveupdate.h>
+#include <linux/mutex.h>
+#include <linux/rwsem.h>
+#include <linux/slab.h>
+#include <linux/unaligned.h>
+#include <uapi/linux/liveupdate.h>
+#include "luo_internal.h"
+
+/* 16 4K pages, give space for 744 sessions */
+#define LUO_SESSION_PGCNT 16ul
+#define LUO_SESSION_MAX (((LUO_SESSION_PGCNT << PAGE_SHIFT) - \
+ sizeof(struct luo_session_header_ser)) / \
+ sizeof(struct luo_session_ser))
+
+/**
+ * struct luo_session_header - Header struct for managing LUO sessions.
+ * @count: The number of sessions currently tracked in the @list.
+ * @list: The head of the linked list of `struct luo_session` instances.
+ * @rwsem: A read-write semaphore providing synchronized access to the
+ * session list and other fields in this structure.
+ * @header_ser: The header data of serialization array.
+ * @ser: The serialized session data (an array of
+ * `struct luo_session_ser`).
+ * @active: Set to true when first initialized. If previous kernel did not
+ * send session data, active stays false for incoming.
+ */
+struct luo_session_header {
+ long count;
+ struct list_head list;
+ struct rw_semaphore rwsem;
+ struct luo_session_header_ser *header_ser;
+ struct luo_session_ser *ser;
+ bool active;
+};
+
+/**
+ * struct luo_session_global - Global container for managing LUO sessions.
+ * @incoming: The sessions passed from the previous kernel.
+ * @outgoing: The sessions that are going to be passed to the next kernel.
+ */
+struct luo_session_global {
+ struct luo_session_header incoming;
+ struct luo_session_header outgoing;
+};
+
+static struct luo_session_global luo_session_global = {
+ .incoming = {
+ .list = LIST_HEAD_INIT(luo_session_global.incoming.list),
+ .rwsem = __RWSEM_INITIALIZER(luo_session_global.incoming.rwsem),
+ },
+ .outgoing = {
+ .list = LIST_HEAD_INIT(luo_session_global.outgoing.list),
+ .rwsem = __RWSEM_INITIALIZER(luo_session_global.outgoing.rwsem),
+ },
+};
+
+static struct luo_session *luo_session_alloc(const char *name)
+{
+ struct luo_session *session = kzalloc(sizeof(*session), GFP_KERNEL);
+
+ if (!session)
+ return ERR_PTR(-ENOMEM);
+
+ strscpy(session->name, name, sizeof(session->name));
+ INIT_LIST_HEAD(&session->file_set.files_list);
+ luo_file_set_init(&session->file_set);
+ INIT_LIST_HEAD(&session->list);
+ mutex_init(&session->mutex);
+
+ return session;
+}
+
+static void luo_session_free(struct luo_session *session)
+{
+ luo_file_set_destroy(&session->file_set);
+ mutex_destroy(&session->mutex);
+ kfree(session);
+}
+
+static int luo_session_insert(struct luo_session_header *sh,
+ struct luo_session *session)
+{
+ struct luo_session *it;
+
+ guard(rwsem_write)(&sh->rwsem);
+
+ /*
+ * For outgoing we should make sure there is room in serialization array
+ * for new session.
+ */
+ if (sh == &luo_session_global.outgoing) {
+ if (sh->count == LUO_SESSION_MAX)
+ return -ENOMEM;
+ }
+
+ /*
+ * For small number of sessions this loop won't hurt performance
+ * but if we ever start using a lot of sessions, this might
+ * become a bottle neck during deserialization time, as it would
+ * cause O(n*n) complexity.
+ */
+ list_for_each_entry(it, &sh->list, list) {
+ if (!strncmp(it->name, session->name, sizeof(it->name)))
+ return -EEXIST;
+ }
+ list_add_tail(&session->list, &sh->list);
+ sh->count++;
+
+ return 0;
+}
+
+static void luo_session_remove(struct luo_session_header *sh,
+ struct luo_session *session)
+{
+ guard(rwsem_write)(&sh->rwsem);
+ list_del(&session->list);
+ sh->count--;
+}
+
+static int luo_session_finish_one(struct luo_session *session)
+{
+ guard(mutex)(&session->mutex);
+ return luo_file_finish(&session->file_set);
+}
+
+static void luo_session_unfreeze_one(struct luo_session *session,
+ struct luo_session_ser *ser)
+{
+ guard(mutex)(&session->mutex);
+ luo_file_unfreeze(&session->file_set, &ser->file_set_ser);
+}
+
+static int luo_session_freeze_one(struct luo_session *session,
+ struct luo_session_ser *ser)
+{
+ guard(mutex)(&session->mutex);
+ return luo_file_freeze(&session->file_set, &ser->file_set_ser);
+}
+
+static int luo_session_release(struct inode *inodep, struct file *filep)
+{
+ struct luo_session *session = filep->private_data;
+ struct luo_session_header *sh;
+
+ /* If retrieved is set, it means this session is from incoming list */
+ if (session->retrieved) {
+ int err = luo_session_finish_one(session);
+
+ if (err) {
+ pr_warn("Unable to finish session [%s] on release\n",
+ session->name);
+ return err;
+ }
+ sh = &luo_session_global.incoming;
+ } else {
+ scoped_guard(mutex, &session->mutex)
+ luo_file_unpreserve_files(&session->file_set);
+ sh = &luo_session_global.outgoing;
+ }
+
+ luo_session_remove(sh, session);
+ luo_session_free(session);
+
+ return 0;
+}
+
+static int luo_session_preserve_fd(struct luo_session *session,
+ struct luo_ucmd *ucmd)
+{
+ struct liveupdate_session_preserve_fd *argp = ucmd->cmd;
+ int err;
+
+ guard(mutex)(&session->mutex);
+ err = luo_preserve_file(&session->file_set, argp->token, argp->fd);
+ if (err)
+ return err;
+
+ err = luo_ucmd_respond(ucmd, sizeof(*argp));
+ if (err)
+ pr_warn("The file was successfully preserved, but response to user failed\n");
+
+ return err;
+}
+
+static int luo_session_retrieve_fd(struct luo_session *session,
+ struct luo_ucmd *ucmd)
+{
+ struct liveupdate_session_retrieve_fd *argp = ucmd->cmd;
+ struct file *file;
+ int err;
+
+ argp->fd = get_unused_fd_flags(O_CLOEXEC);
+ if (argp->fd < 0)
+ return argp->fd;
+
+ guard(mutex)(&session->mutex);
+ err = luo_retrieve_file(&session->file_set, argp->token, &file);
+ if (err < 0)
+ goto err_put_fd;
+
+ err = luo_ucmd_respond(ucmd, sizeof(*argp));
+ if (err)
+ goto err_put_file;
+
+ fd_install(argp->fd, file);
+
+ return 0;
+
+err_put_file:
+ fput(file);
+err_put_fd:
+ put_unused_fd(argp->fd);
+
+ return err;
+}
+
+static int luo_session_finish(struct luo_session *session,
+ struct luo_ucmd *ucmd)
+{
+ struct liveupdate_session_finish *argp = ucmd->cmd;
+ int err = luo_session_finish_one(session);
+
+ if (err)
+ return err;
+
+ return luo_ucmd_respond(ucmd, sizeof(*argp));
+}
+
+union ucmd_buffer {
+ struct liveupdate_session_finish finish;
+ struct liveupdate_session_preserve_fd preserve;
+ struct liveupdate_session_retrieve_fd retrieve;
+};
+
+struct luo_ioctl_op {
+ unsigned int size;
+ unsigned int min_size;
+ unsigned int ioctl_num;
+ int (*execute)(struct luo_session *session, struct luo_ucmd *ucmd);
+};
+
+#define IOCTL_OP(_ioctl, _fn, _struct, _last) \
+ [_IOC_NR(_ioctl) - LIVEUPDATE_CMD_SESSION_BASE] = { \
+ .size = sizeof(_struct) + \
+ BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) < \
+ sizeof(_struct)), \
+ .min_size = offsetofend(_struct, _last), \
+ .ioctl_num = _ioctl, \
+ .execute = _fn, \
+ }
+
+static const struct luo_ioctl_op luo_session_ioctl_ops[] = {
+ IOCTL_OP(LIVEUPDATE_SESSION_FINISH, luo_session_finish,
+ struct liveupdate_session_finish, reserved),
+ IOCTL_OP(LIVEUPDATE_SESSION_PRESERVE_FD, luo_session_preserve_fd,
+ struct liveupdate_session_preserve_fd, token),
+ IOCTL_OP(LIVEUPDATE_SESSION_RETRIEVE_FD, luo_session_retrieve_fd,
+ struct liveupdate_session_retrieve_fd, token),
+};
+
+static long luo_session_ioctl(struct file *filep, unsigned int cmd,
+ unsigned long arg)
+{
+ struct luo_session *session = filep->private_data;
+ const struct luo_ioctl_op *op;
+ struct luo_ucmd ucmd = {};
+ union ucmd_buffer buf;
+ unsigned int nr;
+ int ret;
+
+ nr = _IOC_NR(cmd);
+ if (nr < LIVEUPDATE_CMD_SESSION_BASE || (nr - LIVEUPDATE_CMD_SESSION_BASE) >=
+ ARRAY_SIZE(luo_session_ioctl_ops)) {
+ return -EINVAL;
+ }
+
+ ucmd.ubuffer = (void __user *)arg;
+ ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer);
+ if (ret)
+ return ret;
+
+ op = &luo_session_ioctl_ops[nr - LIVEUPDATE_CMD_SESSION_BASE];
+ if (op->ioctl_num != cmd)
+ return -ENOIOCTLCMD;
+ if (ucmd.user_size < op->min_size)
+ return -EINVAL;
+
+ ucmd.cmd = &buf;
+ ret = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer,
+ ucmd.user_size);
+ if (ret)
+ return ret;
+
+ return op->execute(session, &ucmd);
+}
+
+static const struct file_operations luo_session_fops = {
+ .owner = THIS_MODULE,
+ .release = luo_session_release,
+ .unlocked_ioctl = luo_session_ioctl,
+};
+
+/* Create a "struct file" for session */
+static int luo_session_getfile(struct luo_session *session, struct file **filep)
+{
+ char name_buf[128];
+ struct file *file;
+
+ lockdep_assert_held(&session->mutex);
+ snprintf(name_buf, sizeof(name_buf), "[luo_session] %s", session->name);
+ file = anon_inode_getfile(name_buf, &luo_session_fops, session, O_RDWR);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ *filep = file;
+
+ return 0;
+}
+
+int luo_session_create(const char *name, struct file **filep)
+{
+ struct luo_session *session;
+ int err;
+
+ session = luo_session_alloc(name);
+ if (IS_ERR(session))
+ return PTR_ERR(session);
+
+ err = luo_session_insert(&luo_session_global.outgoing, session);
+ if (err)
+ goto err_free;
+
+ scoped_guard(mutex, &session->mutex)
+ err = luo_session_getfile(session, filep);
+ if (err)
+ goto err_remove;
+
+ return 0;
+
+err_remove:
+ luo_session_remove(&luo_session_global.outgoing, session);
+err_free:
+ luo_session_free(session);
+
+ return err;
+}
+
+int luo_session_retrieve(const char *name, struct file **filep)
+{
+ struct luo_session_header *sh = &luo_session_global.incoming;
+ struct luo_session *session = NULL;
+ struct luo_session *it;
+ int err;
+
+ scoped_guard(rwsem_read, &sh->rwsem) {
+ list_for_each_entry(it, &sh->list, list) {
+ if (!strncmp(it->name, name, sizeof(it->name))) {
+ session = it;
+ break;
+ }
+ }
+ }
+
+ if (!session)
+ return -ENOENT;
+
+ guard(mutex)(&session->mutex);
+ if (session->retrieved)
+ return -EINVAL;
+
+ err = luo_session_getfile(session, filep);
+ if (!err)
+ session->retrieved = true;
+
+ return err;
+}
+
+int __init luo_session_setup_outgoing(void *fdt_out)
+{
+ struct luo_session_header_ser *header_ser;
+ u64 header_ser_pa;
+ int err;
+
+ header_ser = kho_alloc_preserve(LUO_SESSION_PGCNT << PAGE_SHIFT);
+ if (IS_ERR(header_ser))
+ return PTR_ERR(header_ser);
+ header_ser_pa = virt_to_phys(header_ser);
+
+ err = fdt_begin_node(fdt_out, LUO_FDT_SESSION_NODE_NAME);
+ err |= fdt_property_string(fdt_out, "compatible",
+ LUO_FDT_SESSION_COMPATIBLE);
+ err |= fdt_property(fdt_out, LUO_FDT_SESSION_HEADER, &header_ser_pa,
+ sizeof(header_ser_pa));
+ err |= fdt_end_node(fdt_out);
+
+ if (err)
+ goto err_unpreserve;
+
+ luo_session_global.outgoing.header_ser = header_ser;
+ luo_session_global.outgoing.ser = (void *)(header_ser + 1);
+ luo_session_global.outgoing.active = true;
+
+ return 0;
+
+err_unpreserve:
+ kho_unpreserve_free(header_ser);
+ return err;
+}
+
+int __init luo_session_setup_incoming(void *fdt_in)
+{
+ struct luo_session_header_ser *header_ser;
+ int err, header_size, offset;
+ u64 header_ser_pa;
+ const void *ptr;
+
+ offset = fdt_subnode_offset(fdt_in, 0, LUO_FDT_SESSION_NODE_NAME);
+ if (offset < 0) {
+ pr_err("Unable to get session node: [%s]\n",
+ LUO_FDT_SESSION_NODE_NAME);
+ return -EINVAL;
+ }
+
+ err = fdt_node_check_compatible(fdt_in, offset,
+ LUO_FDT_SESSION_COMPATIBLE);
+ if (err) {
+ pr_err("Session node incompatible [%s]\n",
+ LUO_FDT_SESSION_COMPATIBLE);
+ return -EINVAL;
+ }
+
+ header_size = 0;
+ ptr = fdt_getprop(fdt_in, offset, LUO_FDT_SESSION_HEADER, &header_size);
+ if (!ptr || header_size != sizeof(u64)) {
+ pr_err("Unable to get session header '%s' [%d]\n",
+ LUO_FDT_SESSION_HEADER, header_size);
+ return -EINVAL;
+ }
+
+ header_ser_pa = get_unaligned((u64 *)ptr);
+ header_ser = phys_to_virt(header_ser_pa);
+
+ luo_session_global.incoming.header_ser = header_ser;
+ luo_session_global.incoming.ser = (void *)(header_ser + 1);
+ luo_session_global.incoming.active = true;
+
+ return 0;
+}
+
+int luo_session_deserialize(void)
+{
+ struct luo_session_header *sh = &luo_session_global.incoming;
+ static bool is_deserialized;
+ static int err;
+
+ /* If has been deserialized, always return the same error code */
+ if (is_deserialized)
+ return err;
+
+ is_deserialized = true;
+ if (!sh->active)
+ return 0;
+
+ /*
+ * Note on error handling:
+ *
+ * If deserialization fails (e.g., allocation failure or corrupt data),
+ * we intentionally skip cleanup of sessions that were already restored.
+ *
+ * A partial failure leaves the preserved state inconsistent.
+ * Implementing a safe "undo" to unwind complex dependencies (sessions,
+ * files, hardware state) is error-prone and provides little value, as
+ * the system is effectively in a broken state.
+ *
+ * We treat these resources as leaked. The expected recovery path is for
+ * userspace to detect the failure and trigger a reboot, which will
+ * reliably reset devices and reclaim memory.
+ */
+ for (int i = 0; i < sh->header_ser->count; i++) {
+ struct luo_session *session;
+
+ session = luo_session_alloc(sh->ser[i].name);
+ if (IS_ERR(session)) {
+ pr_warn("Failed to allocate session [%s] during deserialization %pe\n",
+ sh->ser[i].name, session);
+ return PTR_ERR(session);
+ }
+
+ err = luo_session_insert(sh, session);
+ if (err) {
+ pr_warn("Failed to insert session [%s] %pe\n",
+ session->name, ERR_PTR(err));
+ luo_session_free(session);
+ return err;
+ }
+
+ scoped_guard(mutex, &session->mutex) {
+ luo_file_deserialize(&session->file_set,
+ &sh->ser[i].file_set_ser);
+ }
+ }
+
+ kho_restore_free(sh->header_ser);
+ sh->header_ser = NULL;
+ sh->ser = NULL;
+
+ return 0;
+}
+
+int luo_session_serialize(void)
+{
+ struct luo_session_header *sh = &luo_session_global.outgoing;
+ struct luo_session *session;
+ int i = 0;
+ int err;
+
+ guard(rwsem_write)(&sh->rwsem);
+ list_for_each_entry(session, &sh->list, list) {
+ err = luo_session_freeze_one(session, &sh->ser[i]);
+ if (err)
+ goto err_undo;
+
+ strscpy(sh->ser[i].name, session->name,
+ sizeof(sh->ser[i].name));
+ i++;
+ }
+ sh->header_ser->count = sh->count;
+
+ return 0;
+
+err_undo:
+ list_for_each_entry_continue_reverse(session, &sh->list, list) {
+ i--;
+ luo_session_unfreeze_one(session, &sh->ser[i]);
+ memset(sh->ser[i].name, 0, sizeof(sh->ser[i].name));
+ }
+
+ return err;
+}
+
+/**
+ * luo_session_quiesce - Ensure no active sessions exist and lock session lists.
+ *
+ * Acquires exclusive write locks on both incoming and outgoing session lists.
+ * It then validates no sessions exist in either list.
+ *
+ * This mechanism is used during file handler un/registration to ensure that no
+ * sessions are currently using the handler, and no new sessions can be created
+ * while un/registration is in progress.
+ *
+ * This prevents registering new handlers while sessions are active or
+ * while deserialization is in progress.
+ *
+ * Return:
+ * true - System is quiescent (0 sessions) and locked.
+ * false - Active sessions exist. The locks are released internally.
+ */
+bool luo_session_quiesce(void)
+{
+ down_write(&luo_session_global.incoming.rwsem);
+ down_write(&luo_session_global.outgoing.rwsem);
+
+ if (luo_session_global.incoming.count ||
+ luo_session_global.outgoing.count) {
+ up_write(&luo_session_global.outgoing.rwsem);
+ up_write(&luo_session_global.incoming.rwsem);
+ return false;
+ }
+
+ return true;
+}
+
+/**
+ * luo_session_resume - Unlock session lists and resume normal activity.
+ *
+ * Releases the exclusive locks acquired by a successful call to
+ * luo_session_quiesce().
+ */
+void luo_session_resume(void)
+{
+ up_write(&luo_session_global.outgoing.rwsem);
+ up_write(&luo_session_global.incoming.rwsem);
+}
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index ce0362f0a871..6567e5eeacc0 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -103,8 +103,8 @@ static const struct kernel_param_ops lt_bind_ops = {
.get = param_get_cpumask,
};
-module_param_cb(bind_readers, &lt_bind_ops, &bind_readers, 0644);
-module_param_cb(bind_writers, &lt_bind_ops, &bind_writers, 0644);
+module_param_cb(bind_readers, &lt_bind_ops, &bind_readers, 0444);
+module_param_cb(bind_writers, &lt_bind_ops, &bind_writers, 0444);
long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask, bool dowarn);
@@ -1211,6 +1211,10 @@ end:
cxt.cur_ops->exit();
cxt.init_called = false;
}
+
+ free_cpumask_var(bind_readers);
+ free_cpumask_var(bind_writers);
+
torture_cleanup_end();
}
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 7b3ec2fa6e7c..710ee30b3bea 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -954,7 +954,7 @@ size_t module_flags_taint(unsigned long taints, char *buf)
int i;
for (i = 0; i < TAINT_FLAGS_COUNT; i++) {
- if (taint_flags[i].module && test_bit(i, &taints))
+ if (test_bit(i, &taints))
buf[l++] = taint_flags[i].c_true;
}
diff --git a/kernel/panic.c b/kernel/panic.c
index b2f2470af7e5..0d52210a9e2b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -401,7 +401,7 @@ static void panic_trigger_all_cpu_backtrace(void)
*/
static void panic_other_cpus_shutdown(bool crash_kexec)
{
- if (panic_print & SYS_INFO_ALL_CPU_BT)
+ if (panic_print & SYS_INFO_ALL_BT)
panic_trigger_all_cpu_backtrace();
/*
@@ -628,38 +628,40 @@ void panic(const char *fmt, ...)
}
EXPORT_SYMBOL(panic);
-#define TAINT_FLAG(taint, _c_true, _c_false, _module) \
+#define TAINT_FLAG(taint, _c_true, _c_false) \
[ TAINT_##taint ] = { \
.c_true = _c_true, .c_false = _c_false, \
- .module = _module, \
.desc = #taint, \
}
/*
- * TAINT_FORCED_RMMOD could be a per-module flag but the module
- * is being removed anyway.
+ * NOTE: if you modify the taint_flags or TAINT_FLAGS_COUNT,
+ * please also modify tools/debugging/kernel-chktaint and
+ * Documentation/admin-guide/tainted-kernels.rst, including its
+ * small shell script that prints the TAINT_FLAGS_COUNT bits of
+ * /proc/sys/kernel/tainted.
*/
const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
- TAINT_FLAG(PROPRIETARY_MODULE, 'P', 'G', true),
- TAINT_FLAG(FORCED_MODULE, 'F', ' ', true),
- TAINT_FLAG(CPU_OUT_OF_SPEC, 'S', ' ', false),
- TAINT_FLAG(FORCED_RMMOD, 'R', ' ', false),
- TAINT_FLAG(MACHINE_CHECK, 'M', ' ', false),
- TAINT_FLAG(BAD_PAGE, 'B', ' ', false),
- TAINT_FLAG(USER, 'U', ' ', false),
- TAINT_FLAG(DIE, 'D', ' ', false),
- TAINT_FLAG(OVERRIDDEN_ACPI_TABLE, 'A', ' ', false),
- TAINT_FLAG(WARN, 'W', ' ', false),
- TAINT_FLAG(CRAP, 'C', ' ', true),
- TAINT_FLAG(FIRMWARE_WORKAROUND, 'I', ' ', false),
- TAINT_FLAG(OOT_MODULE, 'O', ' ', true),
- TAINT_FLAG(UNSIGNED_MODULE, 'E', ' ', true),
- TAINT_FLAG(SOFTLOCKUP, 'L', ' ', false),
- TAINT_FLAG(LIVEPATCH, 'K', ' ', true),
- TAINT_FLAG(AUX, 'X', ' ', true),
- TAINT_FLAG(RANDSTRUCT, 'T', ' ', true),
- TAINT_FLAG(TEST, 'N', ' ', true),
- TAINT_FLAG(FWCTL, 'J', ' ', true),
+ TAINT_FLAG(PROPRIETARY_MODULE, 'P', 'G'),
+ TAINT_FLAG(FORCED_MODULE, 'F', ' '),
+ TAINT_FLAG(CPU_OUT_OF_SPEC, 'S', ' '),
+ TAINT_FLAG(FORCED_RMMOD, 'R', ' '),
+ TAINT_FLAG(MACHINE_CHECK, 'M', ' '),
+ TAINT_FLAG(BAD_PAGE, 'B', ' '),
+ TAINT_FLAG(USER, 'U', ' '),
+ TAINT_FLAG(DIE, 'D', ' '),
+ TAINT_FLAG(OVERRIDDEN_ACPI_TABLE, 'A', ' '),
+ TAINT_FLAG(WARN, 'W', ' '),
+ TAINT_FLAG(CRAP, 'C', ' '),
+ TAINT_FLAG(FIRMWARE_WORKAROUND, 'I', ' '),
+ TAINT_FLAG(OOT_MODULE, 'O', ' '),
+ TAINT_FLAG(UNSIGNED_MODULE, 'E', ' '),
+ TAINT_FLAG(SOFTLOCKUP, 'L', ' '),
+ TAINT_FLAG(LIVEPATCH, 'K', ' '),
+ TAINT_FLAG(AUX, 'X', ' '),
+ TAINT_FLAG(RANDSTRUCT, 'T', ' '),
+ TAINT_FLAG(TEST, 'N', ' '),
+ TAINT_FLAG(FWCTL, 'J', ' '),
};
#undef TAINT_FLAG
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index f72bbfa266d6..5f5f626f4279 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -3,7 +3,6 @@
* internal.h - printk internal definitions
*/
#include <linux/console.h>
-#include <linux/percpu.h>
#include <linux/types.h>
#if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL)
@@ -112,47 +111,6 @@ bool nbcon_kthread_create(struct console *con);
void nbcon_kthread_stop(struct console *con);
void nbcon_kthreads_wake(void);
-/*
- * Check if the given console is currently capable and allowed to print
- * records. Note that this function does not consider the current context,
- * which can also play a role in deciding if @con can be used to print
- * records.
- */
-static inline bool console_is_usable(struct console *con, short flags, bool use_atomic)
-{
- if (!(flags & CON_ENABLED))
- return false;
-
- if ((flags & CON_SUSPENDED))
- return false;
-
- if (flags & CON_NBCON) {
- /* The write_atomic() callback is optional. */
- if (use_atomic && !con->write_atomic)
- return false;
-
- /*
- * For the !use_atomic case, @printk_kthreads_running is not
- * checked because the write_thread() callback is also used
- * via the legacy loop when the printer threads are not
- * available.
- */
- } else {
- if (!con->write)
- return false;
- }
-
- /*
- * Console drivers may assume that per-cpu resources have been
- * allocated. So unless they're explicitly marked as being able to
- * cope (CON_ANYTIME) don't call them until this CPU is officially up.
- */
- if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME))
- return false;
-
- return true;
-}
-
/**
* nbcon_kthread_wake - Wake up a console printing thread
* @con: Console to operate on
@@ -204,9 +162,6 @@ static inline bool nbcon_legacy_emit_next_record(struct console *con, bool *hand
static inline void nbcon_kthread_wake(struct console *con) { }
static inline void nbcon_kthreads_wake(void) { }
-static inline bool console_is_usable(struct console *con, short flags,
- bool use_atomic) { return false; }
-
#endif /* CONFIG_PRINTK */
extern bool have_boot_console;
@@ -230,6 +185,8 @@ struct console_flush_type {
bool legacy_offload;
};
+extern bool console_irqwork_blocked;
+
/*
* Identify which console flushing methods should be used in the context of
* the caller.
@@ -241,7 +198,7 @@ static inline void printk_get_console_flush_type(struct console_flush_type *ft)
switch (nbcon_get_default_prio()) {
case NBCON_PRIO_NORMAL:
if (have_nbcon_console && !have_boot_console) {
- if (printk_kthreads_running)
+ if (printk_kthreads_running && !console_irqwork_blocked)
ft->nbcon_offload = true;
else
ft->nbcon_atomic = true;
@@ -251,7 +208,7 @@ static inline void printk_get_console_flush_type(struct console_flush_type *ft)
if (have_legacy_console || have_boot_console) {
if (!is_printk_legacy_deferred())
ft->legacy_direct = true;
- else
+ else if (!console_irqwork_blocked)
ft->legacy_offload = true;
}
break;
@@ -264,7 +221,7 @@ static inline void printk_get_console_flush_type(struct console_flush_type *ft)
if (have_legacy_console || have_boot_console) {
if (!is_printk_legacy_deferred())
ft->legacy_direct = true;
- else
+ else if (!console_irqwork_blocked)
ft->legacy_offload = true;
}
break;
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
index 558ef3177976..3fa403f9831f 100644
--- a/kernel/printk/nbcon.c
+++ b/kernel/printk/nbcon.c
@@ -10,6 +10,7 @@
#include <linux/export.h>
#include <linux/init.h>
#include <linux/irqflags.h>
+#include <linux/kdb.h>
#include <linux/kthread.h>
#include <linux/minmax.h>
#include <linux/panic.h>
@@ -118,6 +119,9 @@
* from scratch.
*/
+/* Counter of active nbcon emergency contexts. */
+static atomic_t nbcon_cpu_emergency_cnt = ATOMIC_INIT(0);
+
/**
* nbcon_state_set - Helper function to set the console state
* @con: Console to update
@@ -249,13 +253,16 @@ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt,
* since all non-panic CPUs are stopped during panic(), it
* is safer to have them avoid gaining console ownership.
*
- * If this acquire is a reacquire (and an unsafe takeover
+ * One exception is when kdb has locked for printing on this CPU.
+ *
+ * Second exception is a reacquire (and an unsafe takeover
* has not previously occurred) then it is allowed to attempt
* a direct acquire in panic. This gives console drivers an
* opportunity to perform any necessary cleanup if they were
* interrupted by the panic CPU while printing.
*/
if (panic_on_other_cpu() &&
+ !kdb_printf_on_this_cpu() &&
(!is_reacquire || cur->unsafe_takeover)) {
return -EPERM;
}
@@ -850,8 +857,8 @@ out:
return nbcon_context_can_proceed(ctxt, &cur);
}
-static void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt,
- char *buf, unsigned int len)
+void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt,
+ char *buf, unsigned int len)
{
struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
struct console *con = ctxt->console;
@@ -1163,6 +1170,17 @@ static bool nbcon_kthread_should_wakeup(struct console *con, struct nbcon_contex
if (kthread_should_stop())
return true;
+ /*
+ * Block the kthread when the system is in an emergency or panic mode.
+ * It increases the chance that these contexts would be able to show
+ * the messages directly. And it reduces the risk of interrupted writes
+ * where the context with a higher priority takes over the nbcon console
+ * ownership in the middle of a message.
+ */
+ if (unlikely(atomic_read(&nbcon_cpu_emergency_cnt)) ||
+ unlikely(panic_in_progress()))
+ return false;
+
cookie = console_srcu_read_lock();
flags = console_srcu_read_flags(con);
@@ -1214,6 +1232,14 @@ wait_for_event:
if (kthread_should_stop())
return 0;
+ /*
+ * Block the kthread when the system is in an emergency or panic
+ * mode. See nbcon_kthread_should_wakeup() for more details.
+ */
+ if (unlikely(atomic_read(&nbcon_cpu_emergency_cnt)) ||
+ unlikely(panic_in_progress()))
+ goto wait_for_event;
+
backlog = false;
/*
@@ -1276,6 +1302,13 @@ void nbcon_kthreads_wake(void)
if (!printk_kthreads_running)
return;
+ /*
+ * It is not allowed to call this function when console irq_work
+ * is blocked.
+ */
+ if (WARN_ON_ONCE(console_irqwork_blocked))
+ return;
+
cookie = console_srcu_read_lock();
for_each_console_srcu(con) {
if (!(console_srcu_read_flags(con) & CON_NBCON))
@@ -1404,6 +1437,26 @@ enum nbcon_prio nbcon_get_default_prio(void)
return NBCON_PRIO_NORMAL;
}
+/*
+ * Track if it is allowed to perform unsafe hostile takeovers of console
+ * ownership. When true, console drivers might perform unsafe actions while
+ * printing. It is externally available via nbcon_allow_unsafe_takeover().
+ */
+static bool panic_nbcon_allow_unsafe_takeover;
+
+/**
+ * nbcon_allow_unsafe_takeover - Check if unsafe console takeovers are allowed
+ *
+ * Return: True, when it is permitted to perform unsafe console printing
+ *
+ * This is also used by console_is_usable() to determine if it is allowed to
+ * call write_atomic() callbacks flagged as unsafe (CON_NBCON_ATOMIC_UNSAFE).
+ */
+bool nbcon_allow_unsafe_takeover(void)
+{
+ return panic_on_this_cpu() && panic_nbcon_allow_unsafe_takeover;
+}
+
/**
* nbcon_legacy_emit_next_record - Print one record for an nbcon console
* in legacy contexts
@@ -1474,7 +1527,6 @@ bool nbcon_legacy_emit_next_record(struct console *con, bool *handover,
* write_atomic() callback
* @con: The nbcon console to flush
* @stop_seq: Flush up until this record
- * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers
*
* Return: 0 if @con was flushed up to @stop_seq Otherwise, error code on
* failure.
@@ -1493,8 +1545,7 @@ bool nbcon_legacy_emit_next_record(struct console *con, bool *handover,
* returned, it cannot be expected that the unfinalized record will become
* available.
*/
-static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq,
- bool allow_unsafe_takeover)
+static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq)
{
struct nbcon_write_context wctxt = { };
struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt);
@@ -1503,12 +1554,12 @@ static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq,
ctxt->console = con;
ctxt->spinwait_max_us = 2000;
ctxt->prio = nbcon_get_default_prio();
- ctxt->allow_unsafe_takeover = allow_unsafe_takeover;
-
- if (!nbcon_context_try_acquire(ctxt, false))
- return -EPERM;
+ ctxt->allow_unsafe_takeover = nbcon_allow_unsafe_takeover();
while (nbcon_seq_read(con) < stop_seq) {
+ if (!nbcon_context_try_acquire(ctxt, false))
+ return -EPERM;
+
/*
* nbcon_emit_next_record() returns false when the console was
* handed over or taken over. In both cases the context is no
@@ -1517,6 +1568,8 @@ static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq,
if (!nbcon_emit_next_record(&wctxt, true))
return -EAGAIN;
+ nbcon_context_release(ctxt);
+
if (!ctxt->backlog) {
/* Are there reserved but not yet finalized records? */
if (nbcon_seq_read(con) < stop_seq)
@@ -1525,7 +1578,6 @@ static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq,
}
}
- nbcon_context_release(ctxt);
return err;
}
@@ -1534,15 +1586,13 @@ static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq,
* write_atomic() callback
* @con: The nbcon console to flush
* @stop_seq: Flush up until this record
- * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers
*
* This will stop flushing before @stop_seq if another context has ownership.
* That context is then responsible for the flushing. Likewise, if new records
* are added while this context was flushing and there is no other context
* to handle the printing, this context must also flush those records.
*/
-static void nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq,
- bool allow_unsafe_takeover)
+static void nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq)
{
struct console_flush_type ft;
unsigned long flags;
@@ -1557,7 +1607,7 @@ again:
*/
local_irq_save(flags);
- err = __nbcon_atomic_flush_pending_con(con, stop_seq, allow_unsafe_takeover);
+ err = __nbcon_atomic_flush_pending_con(con, stop_seq);
local_irq_restore(flags);
@@ -1589,9 +1639,8 @@ again:
* __nbcon_atomic_flush_pending - Flush all nbcon consoles using their
* write_atomic() callback
* @stop_seq: Flush up until this record
- * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers
*/
-static void __nbcon_atomic_flush_pending(u64 stop_seq, bool allow_unsafe_takeover)
+static void __nbcon_atomic_flush_pending(u64 stop_seq)
{
struct console *con;
int cookie;
@@ -1609,7 +1658,7 @@ static void __nbcon_atomic_flush_pending(u64 stop_seq, bool allow_unsafe_takeove
if (nbcon_seq_read(con) >= stop_seq)
continue;
- nbcon_atomic_flush_pending_con(con, stop_seq, allow_unsafe_takeover);
+ nbcon_atomic_flush_pending_con(con, stop_seq);
}
console_srcu_read_unlock(cookie);
}
@@ -1625,7 +1674,7 @@ static void __nbcon_atomic_flush_pending(u64 stop_seq, bool allow_unsafe_takeove
*/
void nbcon_atomic_flush_pending(void)
{
- __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb), false);
+ __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb));
}
/**
@@ -1637,7 +1686,9 @@ void nbcon_atomic_flush_pending(void)
*/
void nbcon_atomic_flush_unsafe(void)
{
- __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb), true);
+ panic_nbcon_allow_unsafe_takeover = true;
+ __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb));
+ panic_nbcon_allow_unsafe_takeover = false;
}
/**
@@ -1655,6 +1706,8 @@ void nbcon_cpu_emergency_enter(void)
preempt_disable();
+ atomic_inc(&nbcon_cpu_emergency_cnt);
+
cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting();
(*cpu_emergency_nesting)++;
}
@@ -1669,10 +1722,24 @@ void nbcon_cpu_emergency_exit(void)
unsigned int *cpu_emergency_nesting;
cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting();
-
if (!WARN_ON_ONCE(*cpu_emergency_nesting == 0))
(*cpu_emergency_nesting)--;
+ /*
+ * Wake up kthreads because there might be some pending messages
+ * added by other CPUs with normal priority since the last flush
+ * in the emergency context.
+ */
+ if (!WARN_ON_ONCE(atomic_read(&nbcon_cpu_emergency_cnt) == 0)) {
+ if (atomic_dec_return(&nbcon_cpu_emergency_cnt) == 0) {
+ struct console_flush_type ft;
+
+ printk_get_console_flush_type(&ft);
+ if (ft.nbcon_offload)
+ nbcon_kthreads_wake();
+ }
+ }
+
preempt_enable();
}
@@ -1844,14 +1911,75 @@ void nbcon_device_release(struct console *con)
* using the legacy loop.
*/
if (ft.nbcon_atomic) {
- __nbcon_atomic_flush_pending_con(con, prb_next_reserve_seq(prb), false);
+ __nbcon_atomic_flush_pending_con(con, prb_next_reserve_seq(prb));
} else if (ft.legacy_direct) {
if (console_trylock())
console_unlock();
} else if (ft.legacy_offload) {
- printk_trigger_flush();
+ defer_console_output();
}
}
console_srcu_read_unlock(cookie);
}
EXPORT_SYMBOL_GPL(nbcon_device_release);
+
+/**
+ * nbcon_kdb_try_acquire - Try to acquire nbcon console and enter unsafe
+ * section
+ * @con: The nbcon console to acquire
+ * @wctxt: The nbcon write context to be used on success
+ *
+ * Context: Under console_srcu_read_lock() for emitting a single kdb message
+ * using the given con->write_atomic() callback. Can be called
+ * only when the console is usable at the moment.
+ *
+ * Return: True if the console was acquired. False otherwise.
+ *
+ * kdb emits messages on consoles registered for printk() without
+ * storing them into the ring buffer. It has to acquire the console
+ * ownerhip so that it could call con->write_atomic() callback a safe way.
+ *
+ * This function acquires the nbcon console using priority NBCON_PRIO_EMERGENCY
+ * and marks it unsafe for handover/takeover.
+ */
+bool nbcon_kdb_try_acquire(struct console *con,
+ struct nbcon_write_context *wctxt)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+
+ memset(ctxt, 0, sizeof(*ctxt));
+ ctxt->console = con;
+ ctxt->prio = NBCON_PRIO_EMERGENCY;
+
+ if (!nbcon_context_try_acquire(ctxt, false))
+ return false;
+
+ if (!nbcon_context_enter_unsafe(ctxt))
+ return false;
+
+ return true;
+}
+
+/**
+ * nbcon_kdb_release - Exit unsafe section and release the nbcon console
+ *
+ * @wctxt: The nbcon write context initialized by a successful
+ * nbcon_kdb_try_acquire()
+ */
+void nbcon_kdb_release(struct nbcon_write_context *wctxt)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+
+ if (!nbcon_context_exit_unsafe(ctxt))
+ return;
+
+ nbcon_context_release(ctxt);
+
+ /*
+ * Flush any new printk() messages added when the console was blocked.
+ * Only the console used by the given write context was blocked.
+ * The console was locked only when the write_atomic() callback
+ * was usable.
+ */
+ __nbcon_atomic_flush_pending_con(ctxt->console, prb_next_reserve_seq(prb));
+}
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 5aee9ffb16b9..1d765ad242b8 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -462,6 +462,9 @@ bool have_boot_console;
/* See printk_legacy_allow_panic_sync() for details. */
bool legacy_allow_panic_sync;
+/* Avoid using irq_work when suspending. */
+bool console_irqwork_blocked;
+
#ifdef CONFIG_PRINTK
DECLARE_WAIT_QUEUE_HEAD(log_wait);
static DECLARE_WAIT_QUEUE_HEAD(legacy_wait);
@@ -2390,7 +2393,7 @@ asmlinkage int vprintk_emit(int facility, int level,
/* If called from the scheduler, we can not call up(). */
if (level == LOGLEVEL_SCHED) {
level = LOGLEVEL_DEFAULT;
- ft.legacy_offload |= ft.legacy_direct;
+ ft.legacy_offload |= ft.legacy_direct && !console_irqwork_blocked;
ft.legacy_direct = false;
}
@@ -2426,7 +2429,7 @@ asmlinkage int vprintk_emit(int facility, int level,
if (ft.legacy_offload)
defer_console_output();
- else
+ else if (!console_irqwork_blocked)
wake_up_klogd();
return printed_len;
@@ -2730,10 +2733,20 @@ void console_suspend_all(void)
{
struct console *con;
+ if (console_suspend_enabled)
+ pr_info("Suspending console(s) (use no_console_suspend to debug)\n");
+
+ /*
+ * Flush any console backlog and then avoid queueing irq_work until
+ * console_resume_all(). Until then deferred printing is no longer
+ * triggered, NBCON consoles transition to atomic flushing, and
+ * any klogd waiters are not triggered.
+ */
+ pr_flush(1000, true);
+ console_irqwork_blocked = true;
+
if (!console_suspend_enabled)
return;
- pr_info("Suspending console(s) (use no_console_suspend to debug)\n");
- pr_flush(1000, true);
console_list_lock();
for_each_console(con)
@@ -2754,26 +2767,34 @@ void console_resume_all(void)
struct console_flush_type ft;
struct console *con;
- if (!console_suspend_enabled)
- return;
-
- console_list_lock();
- for_each_console(con)
- console_srcu_write_flags(con, con->flags & ~CON_SUSPENDED);
- console_list_unlock();
-
/*
- * Ensure that all SRCU list walks have completed. All printing
- * contexts must be able to see they are no longer suspended so
- * that they are guaranteed to wake up and resume printing.
+ * Allow queueing irq_work. After restoring console state, deferred
+ * printing and any klogd waiters need to be triggered in case there
+ * is now a console backlog.
*/
- synchronize_srcu(&console_srcu);
+ console_irqwork_blocked = false;
+
+ if (console_suspend_enabled) {
+ console_list_lock();
+ for_each_console(con)
+ console_srcu_write_flags(con, con->flags & ~CON_SUSPENDED);
+ console_list_unlock();
+
+ /*
+ * Ensure that all SRCU list walks have completed. All printing
+ * contexts must be able to see they are no longer suspended so
+ * that they are guaranteed to wake up and resume printing.
+ */
+ synchronize_srcu(&console_srcu);
+ }
printk_get_console_flush_type(&ft);
if (ft.nbcon_offload)
nbcon_kthreads_wake();
if (ft.legacy_offload)
defer_console_output();
+ else
+ wake_up_klogd();
pr_flush(1000, true);
}
@@ -3002,21 +3023,18 @@ out:
}
/*
- * Legacy console printing from printk() caller context does not respect
- * raw_spinlock/spinlock nesting. For !PREEMPT_RT the lockdep warning is a
- * false positive. For PREEMPT_RT the false positive condition does not
- * occur.
- *
- * This map is used to temporarily establish LD_WAIT_SLEEP context for the
- * console write() callback when legacy printing to avoid false positive
- * lockdep complaints, thus allowing lockdep to continue to function for
- * real issues.
+ * The legacy console always acquires a spinlock_t from its printing
+ * callback. This violates lock nesting if the caller acquired an always
+ * spinning lock (raw_spinlock_t) while invoking printk(). This is not a
+ * problem on PREEMPT_RT because legacy consoles print always from a
+ * dedicated thread and never from within printk(). Therefore we tell
+ * lockdep that a sleeping spin lock (spinlock_t) is valid here.
*/
#ifdef CONFIG_PREEMPT_RT
static inline void printk_legacy_allow_spinlock_enter(void) { }
static inline void printk_legacy_allow_spinlock_exit(void) { }
#else
-static DEFINE_WAIT_OVERRIDE_MAP(printk_legacy_map, LD_WAIT_SLEEP);
+static DEFINE_WAIT_OVERRIDE_MAP(printk_legacy_map, LD_WAIT_CONFIG);
static inline void printk_legacy_allow_spinlock_enter(void)
{
@@ -3134,104 +3152,147 @@ static inline void printk_kthreads_check_locked(void) { }
#endif /* CONFIG_PRINTK */
+
/*
- * Print out all remaining records to all consoles.
+ * Print out one record for each console.
*
* @do_cond_resched is set by the caller. It can be true only in schedulable
* context.
*
* @next_seq is set to the sequence number after the last available record.
- * The value is valid only when this function returns true. It means that all
- * usable consoles are completely flushed.
+ * The value is valid only when all usable consoles were flushed. It is
+ * when the function returns true (can do the job) and @try_again parameter
+ * is set to false, see below.
*
* @handover will be set to true if a printk waiter has taken over the
* console_lock, in which case the caller is no longer holding the
* console_lock. Otherwise it is set to false.
*
- * Returns true when there was at least one usable console and all messages
- * were flushed to all usable consoles. A returned false informs the caller
- * that everything was not flushed (either there were no usable consoles or
- * another context has taken over printing or it is a panic situation and this
- * is not the panic CPU). Regardless the reason, the caller should assume it
- * is not useful to immediately try again.
+ * @try_again will be set to true when it still makes sense to call this
+ * function again. The function could do the job, see the return value.
+ * And some consoles still make progress.
+ *
+ * Returns true when the function could do the job. Some consoles are usable,
+ * and there was no takeover and no panic_on_other_cpu().
*
* Requires the console_lock.
*/
-static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover)
+static bool console_flush_one_record(bool do_cond_resched, u64 *next_seq, bool *handover,
+ bool *try_again)
{
struct console_flush_type ft;
bool any_usable = false;
struct console *con;
- bool any_progress;
int cookie;
- *next_seq = 0;
- *handover = false;
+ *try_again = false;
- do {
- any_progress = false;
+ printk_get_console_flush_type(&ft);
- printk_get_console_flush_type(&ft);
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(con) {
+ short flags = console_srcu_read_flags(con);
+ u64 printk_seq;
+ bool progress;
- cookie = console_srcu_read_lock();
- for_each_console_srcu(con) {
- short flags = console_srcu_read_flags(con);
- u64 printk_seq;
- bool progress;
+ /*
+ * console_flush_one_record() is only responsible for
+ * nbcon consoles when the nbcon consoles cannot print via
+ * their atomic or threaded flushing.
+ */
+ if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload))
+ continue;
- /*
- * console_flush_all() is only responsible for nbcon
- * consoles when the nbcon consoles cannot print via
- * their atomic or threaded flushing.
- */
- if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload))
- continue;
+ if (!console_is_usable(con, flags, !do_cond_resched))
+ continue;
+ any_usable = true;
- if (!console_is_usable(con, flags, !do_cond_resched))
- continue;
- any_usable = true;
+ if (flags & CON_NBCON) {
+ progress = nbcon_legacy_emit_next_record(con, handover, cookie,
+ !do_cond_resched);
+ printk_seq = nbcon_seq_read(con);
+ } else {
+ progress = console_emit_next_record(con, handover, cookie);
+ printk_seq = con->seq;
+ }
- if (flags & CON_NBCON) {
- progress = nbcon_legacy_emit_next_record(con, handover, cookie,
- !do_cond_resched);
- printk_seq = nbcon_seq_read(con);
- } else {
- progress = console_emit_next_record(con, handover, cookie);
- printk_seq = con->seq;
- }
+ /*
+ * If a handover has occurred, the SRCU read lock
+ * is already released.
+ */
+ if (*handover)
+ goto fail;
- /*
- * If a handover has occurred, the SRCU read lock
- * is already released.
- */
- if (*handover)
- return false;
+ /* Track the next of the highest seq flushed. */
+ if (printk_seq > *next_seq)
+ *next_seq = printk_seq;
- /* Track the next of the highest seq flushed. */
- if (printk_seq > *next_seq)
- *next_seq = printk_seq;
+ if (!progress)
+ continue;
- if (!progress)
- continue;
- any_progress = true;
+ /*
+ * An usable console made a progress. There might still be
+ * pending messages.
+ */
+ *try_again = true;
- /* Allow panic_cpu to take over the consoles safely. */
- if (panic_on_other_cpu())
- goto abandon;
+ /* Allow panic_cpu to take over the consoles safely. */
+ if (panic_on_other_cpu())
+ goto fail_srcu;
- if (do_cond_resched)
- cond_resched();
- }
- console_srcu_read_unlock(cookie);
- } while (any_progress);
+ if (do_cond_resched)
+ cond_resched();
+ }
+ console_srcu_read_unlock(cookie);
return any_usable;
-abandon:
+fail_srcu:
console_srcu_read_unlock(cookie);
+fail:
+ *try_again = false;
return false;
}
+/*
+ * Print out all remaining records to all consoles.
+ *
+ * @do_cond_resched is set by the caller. It can be true only in schedulable
+ * context.
+ *
+ * @next_seq is set to the sequence number after the last available record.
+ * The value is valid only when this function returns true. It means that all
+ * usable consoles are completely flushed.
+ *
+ * @handover will be set to true if a printk waiter has taken over the
+ * console_lock, in which case the caller is no longer holding the
+ * console_lock. Otherwise it is set to false.
+ *
+ * Returns true when there was at least one usable console and all messages
+ * were flushed to all usable consoles. A returned false informs the caller
+ * that everything was not flushed (either there were no usable consoles or
+ * another context has taken over printing or it is a panic situation and this
+ * is not the panic CPU). Regardless the reason, the caller should assume it
+ * is not useful to immediately try again.
+ *
+ * Requires the console_lock.
+ */
+static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover)
+{
+ bool try_again;
+ bool ret;
+
+ *next_seq = 0;
+ *handover = false;
+
+ do {
+ ret = console_flush_one_record(do_cond_resched, next_seq,
+ handover, &try_again);
+ } while (try_again);
+
+ return ret;
+}
+
static void __console_flush_and_unlock(void)
{
bool do_cond_resched;
@@ -3331,12 +3392,10 @@ void console_unblank(void)
*/
cookie = console_srcu_read_lock();
for_each_console_srcu(c) {
- short flags = console_srcu_read_flags(c);
-
- if (flags & CON_SUSPENDED)
+ if (!console_is_usable(c, console_srcu_read_flags(c), true))
continue;
- if ((flags & CON_ENABLED) && c->unblank) {
+ if (c->unblank) {
found_unblank = true;
break;
}
@@ -3373,12 +3432,10 @@ void console_unblank(void)
cookie = console_srcu_read_lock();
for_each_console_srcu(c) {
- short flags = console_srcu_read_flags(c);
-
- if (flags & CON_SUSPENDED)
+ if (!console_is_usable(c, console_srcu_read_flags(c), true))
continue;
- if ((flags & CON_ENABLED) && c->unblank)
+ if (c->unblank)
c->unblank();
}
console_srcu_read_unlock(cookie);
@@ -3601,17 +3658,26 @@ static bool legacy_kthread_should_wakeup(void)
static int legacy_kthread_func(void *unused)
{
- for (;;) {
- wait_event_interruptible(legacy_wait, legacy_kthread_should_wakeup());
+ bool try_again;
+
+wait_for_event:
+ wait_event_interruptible(legacy_wait, legacy_kthread_should_wakeup());
+
+ do {
+ bool handover = false;
+ u64 next_seq = 0;
if (kthread_should_stop())
- break;
+ return 0;
console_lock();
- __console_flush_and_unlock();
- }
+ console_flush_one_record(true, &next_seq, &handover, &try_again);
+ if (!handover)
+ __console_unlock();
- return 0;
+ } while (try_again);
+
+ goto wait_for_event;
}
static bool legacy_kthread_create(void)
@@ -3639,12 +3705,13 @@ static bool legacy_kthread_create(void)
/**
* printk_kthreads_shutdown - shutdown all threaded printers
+ * @data: syscore context
*
* On system shutdown all threaded printers are stopped. This allows printk
* to transition back to atomic printing, thus providing a robust mechanism
* for the final shutdown/reboot messages to be output.
*/
-static void printk_kthreads_shutdown(void)
+static void printk_kthreads_shutdown(void *data)
{
struct console *con;
@@ -3666,10 +3733,14 @@ static void printk_kthreads_shutdown(void)
console_list_unlock();
}
-static struct syscore_ops printk_syscore_ops = {
+static const struct syscore_ops printk_syscore_ops = {
.shutdown = printk_kthreads_shutdown,
};
+static struct syscore printk_syscore = {
+ .ops = &printk_syscore_ops,
+};
+
/*
* If appropriate, start nbcon kthreads and set @printk_kthreads_running.
* If any kthreads fail to start, those consoles are unregistered.
@@ -3737,7 +3808,7 @@ static void printk_kthreads_check_locked(void)
static int __init printk_set_kthreads_ready(void)
{
- register_syscore_ops(&printk_syscore_ops);
+ register_syscore(&printk_syscore);
console_list_lock();
printk_kthreads_ready = true;
@@ -4511,6 +4582,13 @@ static void __wake_up_klogd(int val)
if (!printk_percpu_data_ready())
return;
+ /*
+ * It is not allowed to call this function when console irq_work
+ * is blocked.
+ */
+ if (WARN_ON_ONCE(console_irqwork_blocked))
+ return;
+
preempt_disable();
/*
* Guarantee any new records can be seen by tasks preparing to wait
@@ -4567,9 +4645,30 @@ void defer_console_output(void)
__wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT);
}
+/**
+ * printk_trigger_flush - Attempt to flush printk buffer to consoles.
+ *
+ * If possible, flush the printk buffer to all consoles in the caller's
+ * context. If offloading is available, trigger deferred printing.
+ *
+ * This is best effort. Depending on the system state, console states,
+ * and caller context, no actual flushing may result from this call.
+ */
void printk_trigger_flush(void)
{
- defer_console_output();
+ struct console_flush_type ft;
+
+ printk_get_console_flush_type(&ft);
+ if (ft.nbcon_atomic)
+ nbcon_atomic_flush_pending();
+ if (ft.nbcon_offload)
+ nbcon_kthreads_wake();
+ if (ft.legacy_direct) {
+ if (console_trylock())
+ console_unlock();
+ }
+ if (ft.legacy_offload)
+ defer_console_output();
}
int vprintk_deferred(const char *fmt, va_list args)
diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c
index 40198bffb7d0..56c8e3d031f4 100644
--- a/kernel/printk/printk_ringbuffer.c
+++ b/kernel/printk/printk_ringbuffer.c
@@ -411,6 +411,23 @@ static bool data_check_size(struct prb_data_ring *data_ring, unsigned int size)
return to_blk_size(size) <= DATA_SIZE(data_ring) / 2;
}
+/*
+ * Compare the current and requested logical position and decide
+ * whether more space is needed.
+ *
+ * Return false when @lpos_current is already at or beyond @lpos_target.
+ *
+ * Also return false when the difference between the positions is bigger
+ * than the size of the data buffer. It might happen only when the caller
+ * raced with another CPU(s) which already made and used the space.
+ */
+static bool need_more_space(struct prb_data_ring *data_ring,
+ unsigned long lpos_current,
+ unsigned long lpos_target)
+{
+ return lpos_target - lpos_current - 1 < DATA_SIZE(data_ring);
+}
+
/* Query the state of a descriptor. */
static enum desc_state get_desc_state(unsigned long id,
unsigned long state_val)
@@ -577,7 +594,7 @@ static bool data_make_reusable(struct printk_ringbuffer *rb,
unsigned long id;
/* Loop until @lpos_begin has advanced to or beyond @lpos_end. */
- while ((lpos_end - lpos_begin) - 1 < DATA_SIZE(data_ring)) {
+ while (need_more_space(data_ring, lpos_begin, lpos_end)) {
blk = to_block(data_ring, lpos_begin);
/*
@@ -668,7 +685,7 @@ static bool data_push_tail(struct printk_ringbuffer *rb, unsigned long lpos)
* sees the new tail lpos, any descriptor states that transitioned to
* the reusable state must already be visible.
*/
- while ((lpos - tail_lpos) - 1 < DATA_SIZE(data_ring)) {
+ while (need_more_space(data_ring, tail_lpos, lpos)) {
/*
* Make all descriptors reusable that are associated with
* data blocks before @lpos.
@@ -999,6 +1016,17 @@ static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out)
return true;
}
+static bool is_blk_wrapped(struct prb_data_ring *data_ring,
+ unsigned long begin_lpos, unsigned long next_lpos)
+{
+ /*
+ * Subtract one from next_lpos since it's not actually part of this data
+ * block. This allows perfectly fitting records to not wrap.
+ */
+ return DATA_WRAPS(data_ring, begin_lpos) !=
+ DATA_WRAPS(data_ring, next_lpos - 1);
+}
+
/* Determine the end of a data block. */
static unsigned long get_next_lpos(struct prb_data_ring *data_ring,
unsigned long lpos, unsigned int size)
@@ -1010,7 +1038,7 @@ static unsigned long get_next_lpos(struct prb_data_ring *data_ring,
next_lpos = lpos + size;
/* First check if the data block does not wrap. */
- if (DATA_WRAPS(data_ring, begin_lpos) == DATA_WRAPS(data_ring, next_lpos))
+ if (!is_blk_wrapped(data_ring, begin_lpos, next_lpos))
return next_lpos;
/* Wrapping data blocks store their data at the beginning. */
@@ -1087,7 +1115,7 @@ static char *data_alloc(struct printk_ringbuffer *rb, unsigned int size,
blk = to_block(data_ring, begin_lpos);
blk->id = id; /* LMM(data_alloc:B) */
- if (DATA_WRAPS(data_ring, begin_lpos) != DATA_WRAPS(data_ring, next_lpos)) {
+ if (is_blk_wrapped(data_ring, begin_lpos, next_lpos)) {
/* Wrapping data blocks store their data at the beginning. */
blk = to_block(data_ring, 0);
@@ -1131,14 +1159,21 @@ static char *data_realloc(struct printk_ringbuffer *rb, unsigned int size,
return NULL;
/* Keep track if @blk_lpos was a wrapping data block. */
- wrapped = (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, blk_lpos->next));
+ wrapped = is_blk_wrapped(data_ring, blk_lpos->begin, blk_lpos->next);
size = to_blk_size(size);
next_lpos = get_next_lpos(data_ring, blk_lpos->begin, size);
- /* If the data block does not increase, there is nothing to do. */
- if (head_lpos - next_lpos < DATA_SIZE(data_ring)) {
+ /*
+ * Use the current data block when the size does not increase, i.e.
+ * when @head_lpos is already able to accommodate the new @next_lpos.
+ *
+ * Note that need_more_space() could never return false here because
+ * the difference between the positions was bigger than the data
+ * buffer size. The data block is reopened and can't get reused.
+ */
+ if (!need_more_space(data_ring, head_lpos, next_lpos)) {
if (wrapped)
blk = to_block(data_ring, 0);
else
@@ -1167,7 +1202,7 @@ static char *data_realloc(struct printk_ringbuffer *rb, unsigned int size,
blk = to_block(data_ring, blk_lpos->begin);
- if (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, next_lpos)) {
+ if (is_blk_wrapped(data_ring, blk_lpos->begin, next_lpos)) {
struct prb_data_block *old_blk = blk;
/* Wrapping data blocks store their data at the beginning. */
@@ -1203,7 +1238,7 @@ static unsigned int space_used(struct prb_data_ring *data_ring,
if (BLK_DATALESS(blk_lpos))
return 0;
- if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next)) {
+ if (!is_blk_wrapped(data_ring, blk_lpos->begin, blk_lpos->next)) {
/* Data block does not wrap. */
return (DATA_INDEX(data_ring, blk_lpos->next) -
DATA_INDEX(data_ring, blk_lpos->begin));
@@ -1249,15 +1284,15 @@ static const char *get_data(struct prb_data_ring *data_ring,
return NULL;
}
- /* Regular data block: @begin less than @next and in same wrap. */
- if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next) &&
- blk_lpos->begin < blk_lpos->next) {
+ /* Regular data block: @begin and @next in the same wrap. */
+ if (!is_blk_wrapped(data_ring, blk_lpos->begin, blk_lpos->next)) {
db = to_block(data_ring, blk_lpos->begin);
*data_size = blk_lpos->next - blk_lpos->begin;
/* Wrapping data block: @begin is one wrap behind @next. */
- } else if (DATA_WRAPS(data_ring, blk_lpos->begin + DATA_SIZE(data_ring)) ==
- DATA_WRAPS(data_ring, blk_lpos->next)) {
+ } else if (!is_blk_wrapped(data_ring,
+ blk_lpos->begin + DATA_SIZE(data_ring),
+ blk_lpos->next)) {
db = to_block(data_ring, 0);
*data_size = DATA_INDEX(data_ring, blk_lpos->next);
@@ -1267,6 +1302,10 @@ static const char *get_data(struct prb_data_ring *data_ring,
return NULL;
}
+ /* Sanity check. Data-less blocks were handled earlier. */
+ if (WARN_ON_ONCE(!data_check_size(data_ring, *data_size) || !*data_size))
+ return NULL;
+
/* A valid data block will always be aligned to the ID size. */
if (WARN_ON_ONCE(blk_lpos->begin != ALIGN(blk_lpos->begin, sizeof(db->id))) ||
WARN_ON_ONCE(blk_lpos->next != ALIGN(blk_lpos->next, sizeof(db->id)))) {
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 12e4c64ebae1..625d75392647 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -213,4 +213,19 @@ config RCU_STRICT_GRACE_PERIOD
when looking for certain types of RCU usage bugs, for example,
too-short RCU read-side critical sections.
+
+config RCU_DYNTICKS_TORTURE
+ bool "Minimize RCU dynticks counter size"
+ depends on RCU_EXPERT && !COMPILE_TEST
+ default n
+ help
+ This option sets the width of the dynticks counter to its
+ minimum usable value. This minimum width greatly increases
+ the probability of flushing out bugs involving counter wrap,
+ but it also increases the probability of extending grace period
+ durations. This Kconfig option should therefore be avoided in
+ production due to the consequent increased probability of OOMs.
+
+ This has no value for production and is only for testing.
+
endmenu # "RCU Debugging"
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 29fe3c01312f..07e51974b06b 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -389,6 +389,7 @@ struct rcu_torture_ops {
void (*deferred_free)(struct rcu_torture *p);
void (*sync)(void);
void (*exp_sync)(void);
+ void (*exp_current)(void);
unsigned long (*get_gp_state_exp)(void);
unsigned long (*start_gp_poll_exp)(void);
void (*start_gp_poll_exp_full)(struct rcu_gp_oldstate *rgosp);
@@ -691,10 +692,29 @@ static struct rcu_torture_ops rcu_busted_ops = {
*/
DEFINE_STATIC_SRCU(srcu_ctl);
+DEFINE_STATIC_SRCU_FAST(srcu_ctlf);
+DEFINE_STATIC_SRCU_FAST_UPDOWN(srcu_ctlfud);
static struct srcu_struct srcu_ctld;
static struct srcu_struct *srcu_ctlp = &srcu_ctl;
static struct rcu_torture_ops srcud_ops;
+static void srcu_torture_init(void)
+{
+ rcu_sync_torture_init();
+ if (!reader_flavor || (reader_flavor & SRCU_READ_FLAVOR_NORMAL))
+ VERBOSE_TOROUT_STRING("srcu_torture_init normal SRCU");
+ if (reader_flavor & SRCU_READ_FLAVOR_NMI)
+ VERBOSE_TOROUT_STRING("srcu_torture_init NMI-safe SRCU");
+ if (reader_flavor & SRCU_READ_FLAVOR_FAST) {
+ srcu_ctlp = &srcu_ctlf;
+ VERBOSE_TOROUT_STRING("srcu_torture_init fast SRCU");
+ }
+ if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) {
+ srcu_ctlp = &srcu_ctlfud;
+ VERBOSE_TOROUT_STRING("srcu_torture_init fast-up/down SRCU");
+ }
+}
+
static void srcu_get_gp_data(int *flags, unsigned long *gp_seq)
{
srcutorture_get_gp_data(srcu_ctlp, flags, gp_seq);
@@ -722,6 +742,12 @@ static int srcu_torture_read_lock(void)
scp = srcu_read_lock_fast(srcu_ctlp);
idx = __srcu_ptr_to_ctr(srcu_ctlp, scp);
WARN_ON_ONCE(idx & ~0x1);
+ ret += idx << 2;
+ }
+ if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) {
+ scp = srcu_read_lock_fast_updown(srcu_ctlp);
+ idx = __srcu_ptr_to_ctr(srcu_ctlp, scp);
+ WARN_ON_ONCE(idx & ~0x1);
ret += idx << 3;
}
return ret;
@@ -749,8 +775,11 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
static void srcu_torture_read_unlock(int idx)
{
WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1)));
+ if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN)
+ srcu_read_unlock_fast_updown(srcu_ctlp,
+ __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3));
if (reader_flavor & SRCU_READ_FLAVOR_FAST)
- srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3));
+ srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x4) >> 2));
if (reader_flavor & SRCU_READ_FLAVOR_NMI)
srcu_read_unlock_nmisafe(srcu_ctlp, (idx & 0x2) >> 1);
if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL))
@@ -784,7 +813,7 @@ static int srcu_torture_down_read(void)
WARN_ON_ONCE(idx & ~0x1);
return idx;
}
- if (reader_flavor & SRCU_READ_FLAVOR_FAST) {
+ if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) {
scp = srcu_down_read_fast(srcu_ctlp);
idx = __srcu_ptr_to_ctr(srcu_ctlp, scp);
WARN_ON_ONCE(idx & ~0x1);
@@ -797,7 +826,7 @@ static int srcu_torture_down_read(void)
static void srcu_torture_up_read(int idx)
{
WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1)));
- if (reader_flavor & SRCU_READ_FLAVOR_FAST)
+ if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN)
srcu_up_read_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3));
else if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) ||
!(reader_flavor & SRCU_READ_FLAVOR_ALL))
@@ -857,9 +886,14 @@ static void srcu_torture_synchronize_expedited(void)
synchronize_srcu_expedited(srcu_ctlp);
}
+static void srcu_torture_expedite_current(void)
+{
+ srcu_expedite_current(srcu_ctlp);
+}
+
static struct rcu_torture_ops srcu_ops = {
.ttype = SRCU_FLAVOR,
- .init = rcu_sync_torture_init,
+ .init = srcu_torture_init,
.readlock = srcu_torture_read_lock,
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock,
@@ -871,6 +905,7 @@ static struct rcu_torture_ops srcu_ops = {
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
.exp_sync = srcu_torture_synchronize_expedited,
+ .exp_current = srcu_torture_expedite_current,
.same_gp_state = same_state_synchronize_srcu,
.get_comp_state = get_completed_synchronize_srcu,
.get_gp_state = srcu_torture_get_gp_state,
@@ -886,14 +921,28 @@ static struct rcu_torture_ops srcu_ops = {
.no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
.debug_objects = 1,
.have_up_down = IS_ENABLED(CONFIG_TINY_SRCU)
- ? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST,
+ ? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST_UPDOWN,
.name = "srcu"
};
-static void srcu_torture_init(void)
+static void srcud_torture_init(void)
{
rcu_sync_torture_init();
- WARN_ON(init_srcu_struct(&srcu_ctld));
+ if (!reader_flavor || (reader_flavor & SRCU_READ_FLAVOR_NORMAL)) {
+ WARN_ON(init_srcu_struct(&srcu_ctld));
+ VERBOSE_TOROUT_STRING("srcud_torture_init normal SRCU");
+ } else if (reader_flavor & SRCU_READ_FLAVOR_NMI) {
+ WARN_ON(init_srcu_struct(&srcu_ctld));
+ VERBOSE_TOROUT_STRING("srcud_torture_init NMI-safe SRCU");
+ } else if (reader_flavor & SRCU_READ_FLAVOR_FAST) {
+ WARN_ON(init_srcu_struct_fast(&srcu_ctld));
+ VERBOSE_TOROUT_STRING("srcud_torture_init fast SRCU");
+ } else if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) {
+ WARN_ON(init_srcu_struct_fast_updown(&srcu_ctld));
+ VERBOSE_TOROUT_STRING("srcud_torture_init fast-up/down SRCU");
+ } else {
+ WARN_ON(init_srcu_struct(&srcu_ctld));
+ }
srcu_ctlp = &srcu_ctld;
}
@@ -906,7 +955,7 @@ static void srcu_torture_cleanup(void)
/* As above, but dynamically allocated. */
static struct rcu_torture_ops srcud_ops = {
.ttype = SRCU_FLAVOR,
- .init = srcu_torture_init,
+ .init = srcud_torture_init,
.cleanup = srcu_torture_cleanup,
.readlock = srcu_torture_read_lock,
.read_delay = srcu_read_delay,
@@ -919,6 +968,7 @@ static struct rcu_torture_ops srcud_ops = {
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
.exp_sync = srcu_torture_synchronize_expedited,
+ .exp_current = srcu_torture_expedite_current,
.same_gp_state = same_state_synchronize_srcu,
.get_comp_state = get_completed_synchronize_srcu,
.get_gp_state = srcu_torture_get_gp_state,
@@ -934,7 +984,7 @@ static struct rcu_torture_ops srcud_ops = {
.no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
.debug_objects = 1,
.have_up_down = IS_ENABLED(CONFIG_TINY_SRCU)
- ? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST,
+ ? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST_UPDOWN,
.name = "srcud"
};
@@ -1700,6 +1750,8 @@ rcu_torture_writer(void *arg)
ulo[i] = cur_ops->get_comp_state();
gp_snap = cur_ops->start_gp_poll();
rcu_torture_writer_state = RTWS_POLL_WAIT;
+ if (cur_ops->exp_current && !torture_random(&rand) % 0xff)
+ cur_ops->exp_current();
while (!cur_ops->poll_gp_state(gp_snap)) {
gp_snap1 = cur_ops->get_gp_state();
for (i = 0; i < ulo_size; i++)
@@ -1720,6 +1772,8 @@ rcu_torture_writer(void *arg)
cur_ops->get_comp_state_full(&rgo[i]);
cur_ops->start_gp_poll_full(&gp_snap_full);
rcu_torture_writer_state = RTWS_POLL_WAIT_FULL;
+ if (cur_ops->exp_current && !torture_random(&rand) % 0xff)
+ cur_ops->exp_current();
while (!cur_ops->poll_gp_state_full(&gp_snap_full)) {
cur_ops->get_gp_state_full(&gp_snap1_full);
for (i = 0; i < rgo_size; i++)
@@ -2384,10 +2438,8 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
newstate = rcutorture_extend_mask(rtors.readstate, trsp);
WARN_ON_ONCE(newstate & RCUTORTURE_RDR_UPDOWN);
rcutorture_one_extend(&rtors.readstate, newstate, trsp, rtors.rtrsp++);
- if (!rcu_torture_one_read_start(&rtors, trsp, myid)) {
- rcutorture_one_extend(&rtors.readstate, 0, trsp, rtors.rtrsp);
+ if (!rcu_torture_one_read_start(&rtors, trsp, myid))
return false;
- }
rtors.rtrsp = rcutorture_loop_extend(&rtors.readstate, trsp, rtors.rtrsp);
rcu_torture_one_read_end(&rtors, trsp);
return true;
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 19841704d8f5..07a313782dfd 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -136,6 +136,7 @@ struct ref_scale_ops {
void (*cleanup)(void);
void (*readsection)(const int nloops);
void (*delaysection)(const int nloops, const int udl, const int ndl);
+ bool enable_irqs;
const char *name;
};
@@ -184,6 +185,8 @@ static const struct ref_scale_ops rcu_ops = {
// Definitions for SRCU ref scale testing.
DEFINE_STATIC_SRCU(srcu_refctl_scale);
+DEFINE_STATIC_SRCU_FAST(srcu_fast_refctl_scale);
+DEFINE_STATIC_SRCU_FAST_UPDOWN(srcu_fast_updown_refctl_scale);
static struct srcu_struct *srcu_ctlp = &srcu_refctl_scale;
static void srcu_ref_scale_read_section(const int nloops)
@@ -216,6 +219,12 @@ static const struct ref_scale_ops srcu_ops = {
.name = "srcu"
};
+static bool srcu_fast_sync_scale_init(void)
+{
+ srcu_ctlp = &srcu_fast_refctl_scale;
+ return true;
+}
+
static void srcu_fast_ref_scale_read_section(const int nloops)
{
int i;
@@ -240,12 +249,48 @@ static void srcu_fast_ref_scale_delay_section(const int nloops, const int udl, c
}
static const struct ref_scale_ops srcu_fast_ops = {
- .init = rcu_sync_scale_init,
+ .init = srcu_fast_sync_scale_init,
.readsection = srcu_fast_ref_scale_read_section,
.delaysection = srcu_fast_ref_scale_delay_section,
.name = "srcu-fast"
};
+static bool srcu_fast_updown_sync_scale_init(void)
+{
+ srcu_ctlp = &srcu_fast_updown_refctl_scale;
+ return true;
+}
+
+static void srcu_fast_updown_ref_scale_read_section(const int nloops)
+{
+ int i;
+ struct srcu_ctr __percpu *scp;
+
+ for (i = nloops; i >= 0; i--) {
+ scp = srcu_read_lock_fast_updown(srcu_ctlp);
+ srcu_read_unlock_fast_updown(srcu_ctlp, scp);
+ }
+}
+
+static void srcu_fast_updown_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+ struct srcu_ctr __percpu *scp;
+
+ for (i = nloops; i >= 0; i--) {
+ scp = srcu_read_lock_fast_updown(srcu_ctlp);
+ un_delay(udl, ndl);
+ srcu_read_unlock_fast_updown(srcu_ctlp, scp);
+ }
+}
+
+static const struct ref_scale_ops srcu_fast_updown_ops = {
+ .init = srcu_fast_updown_sync_scale_init,
+ .readsection = srcu_fast_updown_ref_scale_read_section,
+ .delaysection = srcu_fast_updown_ref_scale_delay_section,
+ .name = "srcu-fast-updown"
+};
+
#ifdef CONFIG_TASKS_RCU
// Definitions for RCU Tasks ref scale testing: Empty read markers.
@@ -323,6 +368,9 @@ static const struct ref_scale_ops rcu_trace_ops = {
// Definitions for reference count
static atomic_t refcnt;
+// Definitions acquire-release.
+static DEFINE_PER_CPU(unsigned long, test_acqrel);
+
static void ref_refcnt_section(const int nloops)
{
int i;
@@ -351,6 +399,184 @@ static const struct ref_scale_ops refcnt_ops = {
.name = "refcnt"
};
+static void ref_percpuinc_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ this_cpu_inc(test_acqrel);
+ this_cpu_dec(test_acqrel);
+ }
+}
+
+static void ref_percpuinc_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ this_cpu_inc(test_acqrel);
+ un_delay(udl, ndl);
+ this_cpu_dec(test_acqrel);
+ }
+}
+
+static const struct ref_scale_ops percpuinc_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = ref_percpuinc_section,
+ .delaysection = ref_percpuinc_delay_section,
+ .name = "percpuinc"
+};
+
+// Note that this can lose counts in preemptible kernels.
+static void ref_incpercpu_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ unsigned long *tap = this_cpu_ptr(&test_acqrel);
+
+ WRITE_ONCE(*tap, READ_ONCE(*tap) + 1);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) - 1);
+ }
+}
+
+static void ref_incpercpu_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ unsigned long *tap = this_cpu_ptr(&test_acqrel);
+
+ WRITE_ONCE(*tap, READ_ONCE(*tap) + 1);
+ un_delay(udl, ndl);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) - 1);
+ }
+}
+
+static const struct ref_scale_ops incpercpu_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = ref_incpercpu_section,
+ .delaysection = ref_incpercpu_delay_section,
+ .name = "incpercpu"
+};
+
+static void ref_incpercpupreempt_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ unsigned long *tap;
+
+ preempt_disable();
+ tap = this_cpu_ptr(&test_acqrel);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) + 1);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) - 1);
+ preempt_enable();
+ }
+}
+
+static void ref_incpercpupreempt_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ unsigned long *tap;
+
+ preempt_disable();
+ tap = this_cpu_ptr(&test_acqrel);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) + 1);
+ un_delay(udl, ndl);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) - 1);
+ preempt_enable();
+ }
+}
+
+static const struct ref_scale_ops incpercpupreempt_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = ref_incpercpupreempt_section,
+ .delaysection = ref_incpercpupreempt_delay_section,
+ .name = "incpercpupreempt"
+};
+
+static void ref_incpercpubh_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ unsigned long *tap;
+
+ local_bh_disable();
+ tap = this_cpu_ptr(&test_acqrel);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) + 1);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) - 1);
+ local_bh_enable();
+ }
+}
+
+static void ref_incpercpubh_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ unsigned long *tap;
+
+ local_bh_disable();
+ tap = this_cpu_ptr(&test_acqrel);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) + 1);
+ un_delay(udl, ndl);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) - 1);
+ local_bh_enable();
+ }
+}
+
+static const struct ref_scale_ops incpercpubh_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = ref_incpercpubh_section,
+ .delaysection = ref_incpercpubh_delay_section,
+ .enable_irqs = true,
+ .name = "incpercpubh"
+};
+
+static void ref_incpercpuirqsave_section(const int nloops)
+{
+ int i;
+ unsigned long flags;
+
+ for (i = nloops; i >= 0; i--) {
+ unsigned long *tap;
+
+ local_irq_save(flags);
+ tap = this_cpu_ptr(&test_acqrel);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) + 1);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) - 1);
+ local_irq_restore(flags);
+ }
+}
+
+static void ref_incpercpuirqsave_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+ unsigned long flags;
+
+ for (i = nloops; i >= 0; i--) {
+ unsigned long *tap;
+
+ local_irq_save(flags);
+ tap = this_cpu_ptr(&test_acqrel);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) + 1);
+ un_delay(udl, ndl);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) - 1);
+ local_irq_restore(flags);
+ }
+}
+
+static const struct ref_scale_ops incpercpuirqsave_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = ref_incpercpuirqsave_section,
+ .delaysection = ref_incpercpuirqsave_delay_section,
+ .name = "incpercpuirqsave"
+};
+
// Definitions for rwlock
static rwlock_t test_rwlock;
@@ -494,9 +720,6 @@ static const struct ref_scale_ops lock_irq_ops = {
.name = "lock-irq"
};
-// Definitions acquire-release.
-static DEFINE_PER_CPU(unsigned long, test_acqrel);
-
static void ref_acqrel_section(const int nloops)
{
unsigned long x;
@@ -629,6 +852,133 @@ static const struct ref_scale_ops jiffies_ops = {
.name = "jiffies"
};
+static void ref_preempt_section(const int nloops)
+{
+ int i;
+
+ migrate_disable();
+ for (i = nloops; i >= 0; i--) {
+ preempt_disable();
+ preempt_enable();
+ }
+ migrate_enable();
+}
+
+static void ref_preempt_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ migrate_disable();
+ for (i = nloops; i >= 0; i--) {
+ preempt_disable();
+ un_delay(udl, ndl);
+ preempt_enable();
+ }
+ migrate_enable();
+}
+
+static const struct ref_scale_ops preempt_ops = {
+ .readsection = ref_preempt_section,
+ .delaysection = ref_preempt_delay_section,
+ .name = "preempt"
+};
+
+static void ref_bh_section(const int nloops)
+{
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ local_bh_disable();
+ local_bh_enable();
+ }
+ preempt_enable();
+}
+
+static void ref_bh_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ local_bh_disable();
+ un_delay(udl, ndl);
+ local_bh_enable();
+ }
+ preempt_enable();
+}
+
+static const struct ref_scale_ops bh_ops = {
+ .readsection = ref_bh_section,
+ .delaysection = ref_bh_delay_section,
+ .enable_irqs = true,
+ .name = "bh"
+};
+
+static void ref_irq_section(const int nloops)
+{
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ local_irq_disable();
+ local_irq_enable();
+ }
+ preempt_enable();
+}
+
+static void ref_irq_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ local_irq_disable();
+ un_delay(udl, ndl);
+ local_irq_enable();
+ }
+ preempt_enable();
+}
+
+static const struct ref_scale_ops irq_ops = {
+ .readsection = ref_irq_section,
+ .delaysection = ref_irq_delay_section,
+ .name = "irq"
+};
+
+static void ref_irqsave_section(const int nloops)
+{
+ unsigned long flags;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ local_irq_save(flags);
+ local_irq_restore(flags);
+ }
+ preempt_enable();
+}
+
+static void ref_irqsave_delay_section(const int nloops, const int udl, const int ndl)
+{
+ unsigned long flags;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ local_irq_save(flags);
+ un_delay(udl, ndl);
+ local_irq_restore(flags);
+ }
+ preempt_enable();
+}
+
+static const struct ref_scale_ops irqsave_ops = {
+ .readsection = ref_irqsave_section,
+ .delaysection = ref_irqsave_delay_section,
+ .name = "irqsave"
+};
+
////////////////////////////////////////////////////////////////////////
//
// Methods leveraging SLAB_TYPESAFE_BY_RCU.
@@ -924,15 +1274,18 @@ repeat:
if (!atomic_dec_return(&n_warmedup))
while (atomic_read_acquire(&n_warmedup))
rcu_scale_one_reader();
- // Also keep interrupts disabled. This also has the effect
- // of preventing entries into slow path for rcu_read_unlock().
- local_irq_save(flags);
+ // Also keep interrupts disabled when it is safe to do so, which
+ // it is not for local_bh_enable(). This also has the effect of
+ // preventing entries into slow path for rcu_read_unlock().
+ if (!cur_ops->enable_irqs)
+ local_irq_save(flags);
start = ktime_get_mono_fast_ns();
rcu_scale_one_reader();
duration = ktime_get_mono_fast_ns() - start;
- local_irq_restore(flags);
+ if (!cur_ops->enable_irqs)
+ local_irq_restore(flags);
rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 0 : duration;
// To reduce runtime-skew noise, do maintain-load invocations until
@@ -1163,9 +1516,13 @@ ref_scale_init(void)
long i;
int firsterr = 0;
static const struct ref_scale_ops *scale_ops[] = {
- &rcu_ops, &srcu_ops, &srcu_fast_ops, RCU_TRACE_OPS RCU_TASKS_OPS
- &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops,
- &acqrel_ops, &sched_clock_ops, &clock_ops, &jiffies_ops,
+ &rcu_ops, &srcu_ops, &srcu_fast_ops, &srcu_fast_updown_ops,
+ RCU_TRACE_OPS RCU_TASKS_OPS
+ &refcnt_ops, &percpuinc_ops, &incpercpu_ops, &incpercpupreempt_ops,
+ &incpercpubh_ops, &incpercpuirqsave_ops,
+ &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops,
+ &sched_clock_ops, &clock_ops, &jiffies_ops,
+ &preempt_ops, &bh_ops, &irq_ops, &irqsave_ops,
&typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops,
};
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index e3b64a5e0ec7..3450c3751ef7 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -106,15 +106,15 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1;
WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval);
preempt_enable();
- if (!newval && READ_ONCE(ssp->srcu_gp_waiting) && in_task())
+ if (!newval && READ_ONCE(ssp->srcu_gp_waiting) && in_task() && !irqs_disabled())
swake_up_one(&ssp->srcu_wq);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
/*
* Workqueue handler to drive one grace period and invoke any callbacks
- * that become ready as a result. Single-CPU and !PREEMPTION operation
- * means that we get away with murder on synchronization. ;-)
+ * that become ready as a result. Single-CPU operation and preemption
+ * disabling mean that we get away with murder on synchronization. ;-)
*/
void srcu_drive_gp(struct work_struct *wp)
{
@@ -141,7 +141,12 @@ void srcu_drive_gp(struct work_struct *wp)
WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */
preempt_enable();
- swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx]));
+ do {
+ // Deadlock issues prevent __srcu_read_unlock() from
+ // doing an unconditional wakeup, so polling is required.
+ swait_event_timeout_exclusive(ssp->srcu_wq,
+ !READ_ONCE(ssp->srcu_lock_nesting[idx]), HZ / 10);
+ } while (READ_ONCE(ssp->srcu_lock_nesting[idx]));
preempt_disable(); // Needed for PREEMPT_LAZY
WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 1ff94b76d91f..ea3f128de06f 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -286,32 +286,92 @@ err_free_sup:
#ifdef CONFIG_DEBUG_LOCK_ALLOC
-int __init_srcu_struct(struct srcu_struct *ssp, const char *name,
- struct lock_class_key *key)
+static int
+__init_srcu_struct_common(struct srcu_struct *ssp, const char *name, struct lock_class_key *key)
{
/* Don't re-initialize a lock while it is held. */
debug_check_no_locks_freed((void *)ssp, sizeof(*ssp));
lockdep_init_map(&ssp->dep_map, name, key, 0);
return init_srcu_struct_fields(ssp, false);
}
+
+int __init_srcu_struct(struct srcu_struct *ssp, const char *name, struct lock_class_key *key)
+{
+ ssp->srcu_reader_flavor = 0;
+ return __init_srcu_struct_common(ssp, name, key);
+}
EXPORT_SYMBOL_GPL(__init_srcu_struct);
+int __init_srcu_struct_fast(struct srcu_struct *ssp, const char *name, struct lock_class_key *key)
+{
+ ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST;
+ return __init_srcu_struct_common(ssp, name, key);
+}
+EXPORT_SYMBOL_GPL(__init_srcu_struct_fast);
+
+int __init_srcu_struct_fast_updown(struct srcu_struct *ssp, const char *name,
+ struct lock_class_key *key)
+{
+ ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST_UPDOWN;
+ return __init_srcu_struct_common(ssp, name, key);
+}
+EXPORT_SYMBOL_GPL(__init_srcu_struct_fast_updown);
+
#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
/**
* init_srcu_struct - initialize a sleep-RCU structure
* @ssp: structure to initialize.
*
- * Must invoke this on a given srcu_struct before passing that srcu_struct
+ * Use this in place of DEFINE_SRCU() and DEFINE_STATIC_SRCU()
+ * for non-static srcu_struct structures that are to be passed to
+ * srcu_read_lock(), srcu_read_lock_nmisafe(), and friends. It is necessary
+ * to invoke this on a given srcu_struct before passing that srcu_struct
* to any other function. Each srcu_struct represents a separate domain
* of SRCU protection.
*/
int init_srcu_struct(struct srcu_struct *ssp)
{
+ ssp->srcu_reader_flavor = 0;
return init_srcu_struct_fields(ssp, false);
}
EXPORT_SYMBOL_GPL(init_srcu_struct);
+/**
+ * init_srcu_struct_fast - initialize a fast-reader sleep-RCU structure
+ * @ssp: structure to initialize.
+ *
+ * Use this in place of DEFINE_SRCU_FAST() and DEFINE_STATIC_SRCU_FAST()
+ * for non-static srcu_struct structures that are to be passed to
+ * srcu_read_lock_fast() and friends. It is necessary to invoke this on a
+ * given srcu_struct before passing that srcu_struct to any other function.
+ * Each srcu_struct represents a separate domain of SRCU protection.
+ */
+int init_srcu_struct_fast(struct srcu_struct *ssp)
+{
+ ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST;
+ return init_srcu_struct_fields(ssp, false);
+}
+EXPORT_SYMBOL_GPL(init_srcu_struct_fast);
+
+/**
+ * init_srcu_struct_fast_updown - initialize a fast-reader up/down sleep-RCU structure
+ * @ssp: structure to initialize.
+ *
+ * Use this function in place of DEFINE_SRCU_FAST_UPDOWN() and
+ * DEFINE_STATIC_SRCU_FAST_UPDOWN() for non-static srcu_struct
+ * structures that are to be passed to srcu_read_lock_fast_updown(),
+ * srcu_down_read_fast(), and friends. It is necessary to invoke this on a
+ * given srcu_struct before passing that srcu_struct to any other function.
+ * Each srcu_struct represents a separate domain of SRCU protection.
+ */
+int init_srcu_struct_fast_updown(struct srcu_struct *ssp)
+{
+ ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST_UPDOWN;
+ return init_srcu_struct_fields(ssp, false);
+}
+EXPORT_SYMBOL_GPL(init_srcu_struct_fast_updown);
+
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
/*
@@ -461,7 +521,7 @@ static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, uns
static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx, unsigned long *rdm)
{
int cpu;
- unsigned long mask = 0;
+ unsigned long mask = ssp->srcu_reader_flavor;
unsigned long sum = 0;
for_each_possible_cpu(cpu) {
@@ -734,6 +794,10 @@ void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor)
sdp = raw_cpu_ptr(ssp->sda);
old_read_flavor = READ_ONCE(sdp->srcu_reader_flavor);
+ WARN_ON_ONCE(ssp->srcu_reader_flavor && read_flavor != ssp->srcu_reader_flavor);
+ WARN_ON_ONCE(old_read_flavor && ssp->srcu_reader_flavor &&
+ old_read_flavor != ssp->srcu_reader_flavor);
+ WARN_ON_ONCE(read_flavor == SRCU_READ_FLAVOR_FAST && !ssp->srcu_reader_flavor);
if (!old_read_flavor) {
old_read_flavor = cmpxchg(&sdp->srcu_reader_flavor, 0, read_flavor);
if (!old_read_flavor)
@@ -1688,6 +1752,64 @@ void srcu_barrier(struct srcu_struct *ssp)
}
EXPORT_SYMBOL_GPL(srcu_barrier);
+/* Callback for srcu_expedite_current() usage. */
+static void srcu_expedite_current_cb(struct rcu_head *rhp)
+{
+ unsigned long flags;
+ bool needcb = false;
+ struct srcu_data *sdp = container_of(rhp, struct srcu_data, srcu_ec_head);
+
+ spin_lock_irqsave_sdp_contention(sdp, &flags);
+ if (sdp->srcu_ec_state == SRCU_EC_IDLE) {
+ WARN_ON_ONCE(1);
+ } else if (sdp->srcu_ec_state == SRCU_EC_PENDING) {
+ sdp->srcu_ec_state = SRCU_EC_IDLE;
+ } else {
+ WARN_ON_ONCE(sdp->srcu_ec_state != SRCU_EC_REPOST);
+ sdp->srcu_ec_state = SRCU_EC_PENDING;
+ needcb = true;
+ }
+ spin_unlock_irqrestore_rcu_node(sdp, flags);
+ // If needed, requeue ourselves as an expedited SRCU callback.
+ if (needcb)
+ __call_srcu(sdp->ssp, &sdp->srcu_ec_head, srcu_expedite_current_cb, false);
+}
+
+/**
+ * srcu_expedite_current - Expedite the current SRCU grace period
+ * @ssp: srcu_struct to expedite.
+ *
+ * Cause the current SRCU grace period to become expedited. The grace
+ * period following the current one might also be expedited. If there is
+ * no current grace period, one might be created. If the current grace
+ * period is currently sleeping, that sleep will complete before expediting
+ * will take effect.
+ */
+void srcu_expedite_current(struct srcu_struct *ssp)
+{
+ unsigned long flags;
+ bool needcb = false;
+ struct srcu_data *sdp;
+
+ migrate_disable();
+ sdp = this_cpu_ptr(ssp->sda);
+ spin_lock_irqsave_sdp_contention(sdp, &flags);
+ if (sdp->srcu_ec_state == SRCU_EC_IDLE) {
+ sdp->srcu_ec_state = SRCU_EC_PENDING;
+ needcb = true;
+ } else if (sdp->srcu_ec_state == SRCU_EC_PENDING) {
+ sdp->srcu_ec_state = SRCU_EC_REPOST;
+ } else {
+ WARN_ON_ONCE(sdp->srcu_ec_state != SRCU_EC_REPOST);
+ }
+ spin_unlock_irqrestore_rcu_node(sdp, flags);
+ // If needed, queue an expedited SRCU callback.
+ if (needcb)
+ __call_srcu(ssp, &sdp->srcu_ec_head, srcu_expedite_current_cb, false);
+ migrate_enable();
+}
+EXPORT_SYMBOL_GPL(srcu_expedite_current);
+
/**
* srcu_batches_completed - return batches completed.
* @ssp: srcu_struct on which to report batch completion.
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 85b82a7007b9..293bbd9ac3f4 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4017,7 +4017,7 @@ bool rcu_cpu_online(int cpu)
* RCU on an offline processor during initial boot, hence the check for
* rcu_scheduler_fully_active.
*/
-bool rcu_lockdep_current_cpu_online(void)
+bool notrace rcu_lockdep_current_cpu_online(void)
{
struct rcu_data *rdp;
bool ret = false;
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index c912b594ba98..dfeba9b35395 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -117,7 +117,7 @@ static bool rcu_read_lock_held_common(bool *ret)
return false;
}
-int rcu_read_lock_sched_held(void)
+int notrace rcu_read_lock_sched_held(void)
{
bool ret;
@@ -342,7 +342,7 @@ EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
* Note that rcu_read_lock() is disallowed if the CPU is either idle or
* offline from an RCU perspective, so check for those as well.
*/
-int rcu_read_lock_held(void)
+int notrace rcu_read_lock_held(void)
{
bool ret;
@@ -367,7 +367,7 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_held);
* Note that rcu_read_lock_bh() is disallowed if the CPU is either idle or
* offline from an RCU perspective, so check for those as well.
*/
-int rcu_read_lock_bh_held(void)
+int notrace rcu_read_lock_bh_held(void)
{
bool ret;
@@ -377,7 +377,7 @@ int rcu_read_lock_bh_held(void)
}
EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
-int rcu_read_lock_any_held(void)
+int notrace rcu_read_lock_any_held(void)
{
bool ret;
diff --git a/kernel/relay.c b/kernel/relay.c
index 8d915fe98198..e36f6b926f7f 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -72,17 +72,18 @@ static void relay_free_page_array(struct page **array)
}
/**
- * relay_mmap_buf: - mmap channel buffer to process address space
- * @buf: relay channel buffer
- * @vma: vm_area_struct describing memory to be mapped
+ * relay_mmap_prepare_buf: - mmap channel buffer to process address space
+ * @buf: the relay channel buffer
+ * @desc: describing what to map
*
* Returns 0 if ok, negative on error
*
* Caller should already have grabbed mmap_lock.
*/
-static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
+static int relay_mmap_prepare_buf(struct rchan_buf *buf,
+ struct vm_area_desc *desc)
{
- unsigned long length = vma->vm_end - vma->vm_start;
+ unsigned long length = vma_desc_size(desc);
if (!buf)
return -EBADF;
@@ -90,9 +91,9 @@ static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
if (length != (unsigned long)buf->chan->alloc_size)
return -EINVAL;
- vma->vm_ops = &relay_file_mmap_ops;
- vm_flags_set(vma, VM_DONTEXPAND);
- vma->vm_private_data = buf;
+ desc->vm_ops = &relay_file_mmap_ops;
+ desc->vm_flags |= VM_DONTEXPAND;
+ desc->private_data = buf;
return 0;
}
@@ -749,16 +750,16 @@ static int relay_file_open(struct inode *inode, struct file *filp)
}
/**
- * relay_file_mmap - mmap file op for relay files
- * @filp: the file
- * @vma: the vma describing what to map
+ * relay_file_mmap_prepare - mmap file op for relay files
+ * @desc: describing what to map
*
- * Calls upon relay_mmap_buf() to map the file into user space.
+ * Calls upon relay_mmap_prepare_buf() to map the file into user space.
*/
-static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma)
+static int relay_file_mmap_prepare(struct vm_area_desc *desc)
{
- struct rchan_buf *buf = filp->private_data;
- return relay_mmap_buf(buf, vma);
+ struct rchan_buf *buf = desc->file->private_data;
+
+ return relay_mmap_prepare_buf(buf, desc);
}
/**
@@ -1006,7 +1007,7 @@ static ssize_t relay_file_read(struct file *filp,
const struct file_operations relay_file_operations = {
.open = relay_file_open,
.poll = relay_file_poll,
- .mmap = relay_file_mmap,
+ .mmap_prepare = relay_file_mmap_prepare,
.read = relay_file_read,
.release = relay_file_release,
};
diff --git a/kernel/resource.c b/kernel/resource.c
index b9fa2a4ce089..e4e9bac12e6e 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -341,6 +341,8 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
unsigned long flags, unsigned long desc,
struct resource *res)
{
+ /* Skip children until we find a top level range that matches */
+ bool skip_children = true;
struct resource *p;
if (!res)
@@ -351,7 +353,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
read_lock(&resource_lock);
- for_each_resource(&iomem_resource, p, false) {
+ for_each_resource(&iomem_resource, p, skip_children) {
/* If we passed the resource we are looking for, stop */
if (p->start > end) {
p = NULL;
@@ -362,6 +364,12 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
if (p->end < start)
continue;
+ /*
+ * We found a top level range that matches what we are looking
+ * for. Time to start checking children too.
+ */
+ skip_children = false;
+
/* Found a match, break */
if (is_type_match(p, flags, desc))
break;
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index cdea931aae30..954137775f38 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -178,8 +178,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
* this process can already run with task_group() == prev->tg or we can
* race with cgroup code which can read autogroup = prev under rq->lock.
* In the latter case for_each_thread() can not miss a migrating thread,
- * cpu_cgroup_attach() must not be possible after cgroup_exit() and it
- * can't be removed from thread list, we hold ->siglock.
+ * cpu_cgroup_attach() must not be possible after cgroup_task_exit()
+ * and it can't be removed from thread list, we hold ->siglock.
*
* If an exiting thread was already removed from thread list we rely on
* sched_autogroup_exit_task().
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fc358c1b6ca9..41ba0be16911 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -878,7 +878,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
rq_lock(rq, &rf);
update_rq_clock(rq);
- rq->donor->sched_class->task_tick(rq, rq->curr, 1);
+ rq->donor->sched_class->task_tick(rq, rq->donor, 1);
rq_unlock(rq, &rf);
return HRTIMER_NORESTART;
@@ -5143,6 +5143,14 @@ static struct rq *finish_task_switch(struct task_struct *prev)
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
+ /*
+ * sched_ext_dead() must come before cgroup_task_dead() to
+ * prevent cgroups from being removed while its member tasks are
+ * visible to SCX schedulers.
+ */
+ sched_ext_dead(prev);
+ cgroup_task_dead(prev);
+
/* Task is done with its stack. */
put_task_stack(prev);
@@ -7352,15 +7360,12 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
p->prio = prio;
}
out_unlock:
- /* Avoid rq from going away on us: */
- preempt_disable();
+ /* Caller holds task_struct::pi_lock, IRQs are still disabled */
rq_unpin_lock(rq, &rf);
__balance_callbacks(rq);
rq_repin_lock(rq, &rf);
__task_rq_unlock(rq, p, &rf);
-
- preempt_enable();
}
#endif /* CONFIG_RT_MUTEXES */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 67f540c23717..319439fe1870 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2675,6 +2675,7 @@ static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu
return NULL;
}
+/* Access rule: must be called on local CPU with preemption disabled */
static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
static int find_later_rq(struct task_struct *task)
@@ -3117,11 +3118,43 @@ void __init init_sched_dl_class(void)
GFP_KERNEL, cpu_to_node(i));
}
+/*
+ * This function always returns a non-empty bitmap in @cpus. This is because
+ * if a root domain has reserved bandwidth for DL tasks, the DL bandwidth
+ * check will prevent CPU hotplug from deactivating all CPUs in that domain.
+ */
+static void dl_get_task_effective_cpus(struct task_struct *p, struct cpumask *cpus)
+{
+ const struct cpumask *hk_msk;
+
+ hk_msk = housekeeping_cpumask(HK_TYPE_DOMAIN);
+ if (housekeeping_enabled(HK_TYPE_DOMAIN)) {
+ if (!cpumask_intersects(p->cpus_ptr, hk_msk)) {
+ /*
+ * CPUs isolated by isolcpu="domain" always belong to
+ * def_root_domain.
+ */
+ cpumask_andnot(cpus, cpu_active_mask, hk_msk);
+ return;
+ }
+ }
+
+ /*
+ * If a root domain holds a DL task, it must have active CPUs. So
+ * active CPUs can always be found by walking up the task's cpuset
+ * hierarchy up to the partition root.
+ */
+ cpuset_cpus_allowed_locked(p, cpus);
+}
+
+/* The caller should hold cpuset_mutex */
void dl_add_task_root_domain(struct task_struct *p)
{
struct rq_flags rf;
struct rq *rq;
struct dl_bw *dl_b;
+ unsigned int cpu;
+ struct cpumask *msk = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
if (!dl_task(p) || dl_entity_is_special(&p->dl)) {
@@ -3129,16 +3162,25 @@ void dl_add_task_root_domain(struct task_struct *p)
return;
}
- rq = __task_rq_lock(p, &rf);
-
+ /*
+ * Get an active rq, whose rq->rd traces the correct root
+ * domain.
+ * Ideally this would be under cpuset reader lock until rq->rd is
+ * fetched. However, sleepable locks cannot nest inside pi_lock, so we
+ * rely on the caller of dl_add_task_root_domain() holds 'cpuset_mutex'
+ * to guarantee the CPU stays in the cpuset.
+ */
+ dl_get_task_effective_cpus(p, msk);
+ cpu = cpumask_first_and(cpu_active_mask, msk);
+ BUG_ON(cpu >= nr_cpu_ids);
+ rq = cpu_rq(cpu);
dl_b = &rq->rd->dl_bw;
- raw_spin_lock(&dl_b->lock);
+ /* End of fetching rd */
+ raw_spin_lock(&dl_b->lock);
__dl_add(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
-
raw_spin_unlock(&dl_b->lock);
-
- task_rq_unlock(rq, p, &rf);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
}
void dl_clear_root_domain(struct root_domain *rd)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 6827689a0966..05f5a49e9649 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -33,9 +33,10 @@ static DEFINE_MUTEX(scx_enable_mutex);
DEFINE_STATIC_KEY_FALSE(__scx_enabled);
DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED);
-static unsigned long scx_in_softlockup;
-static atomic_t scx_breather_depth = ATOMIC_INIT(0);
static int scx_bypass_depth;
+static cpumask_var_t scx_bypass_lb_donee_cpumask;
+static cpumask_var_t scx_bypass_lb_resched_cpumask;
+static bool scx_aborting;
static bool scx_init_task_enabled;
static bool scx_switching_all;
DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
@@ -68,18 +69,18 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;
static struct delayed_work scx_watchdog_work;
/*
- * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of pick_task sequence
+ * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of kick_sync sequence
* numbers. The arrays are allocated with kvzalloc() as size can exceed percpu
* allocator limits on large machines. O(nr_cpu_ids^2) allocation, allocated
* lazily when enabling and freed when disabling to avoid waste when sched_ext
* isn't active.
*/
-struct scx_kick_pseqs {
+struct scx_kick_syncs {
struct rcu_head rcu;
- unsigned long seqs[];
+ unsigned long syncs[];
};
-static DEFINE_PER_CPU(struct scx_kick_pseqs __rcu *, scx_kick_pseqs);
+static DEFINE_PER_CPU(struct scx_kick_syncs __rcu *, scx_kick_syncs);
/*
* Direct dispatch marker.
@@ -143,26 +144,70 @@ static struct scx_dump_data scx_dump_data = {
/* /sys/kernel/sched_ext interface */
static struct kset *scx_kset;
+/*
+ * Parameters that can be adjusted through /sys/module/sched_ext/parameters.
+ * There usually is no reason to modify these as normal scheduler operation
+ * shouldn't be affected by them. The knobs are primarily for debugging.
+ */
+static u64 scx_slice_dfl = SCX_SLICE_DFL;
+static unsigned int scx_slice_bypass_us = SCX_SLICE_BYPASS / NSEC_PER_USEC;
+static unsigned int scx_bypass_lb_intv_us = SCX_BYPASS_LB_DFL_INTV_US;
+
+static int set_slice_us(const char *val, const struct kernel_param *kp)
+{
+ return param_set_uint_minmax(val, kp, 100, 100 * USEC_PER_MSEC);
+}
+
+static const struct kernel_param_ops slice_us_param_ops = {
+ .set = set_slice_us,
+ .get = param_get_uint,
+};
+
+static int set_bypass_lb_intv_us(const char *val, const struct kernel_param *kp)
+{
+ return param_set_uint_minmax(val, kp, 0, 10 * USEC_PER_SEC);
+}
+
+static const struct kernel_param_ops bypass_lb_intv_us_param_ops = {
+ .set = set_bypass_lb_intv_us,
+ .get = param_get_uint,
+};
+
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "sched_ext."
+
+module_param_cb(slice_bypass_us, &slice_us_param_ops, &scx_slice_bypass_us, 0600);
+MODULE_PARM_DESC(slice_bypass_us, "bypass slice in microseconds, applied on [un]load (100us to 100ms)");
+module_param_cb(bypass_lb_intv_us, &bypass_lb_intv_us_param_ops, &scx_bypass_lb_intv_us, 0600);
+MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microseconds (0 (disable) to 10s)");
+
+#undef MODULE_PARAM_PREFIX
+
#define CREATE_TRACE_POINTS
#include <trace/events/sched_ext.h>
static void process_ddsp_deferred_locals(struct rq *rq);
+static u32 reenq_local(struct rq *rq);
static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags);
-static void scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
+static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
s64 exit_code, const char *fmt, va_list args);
-static __printf(4, 5) void scx_exit(struct scx_sched *sch,
+static __printf(4, 5) bool scx_exit(struct scx_sched *sch,
enum scx_exit_kind kind, s64 exit_code,
const char *fmt, ...)
{
va_list args;
+ bool ret;
va_start(args, fmt);
- scx_vexit(sch, kind, exit_code, fmt, args);
+ ret = scx_vexit(sch, kind, exit_code, fmt, args);
va_end(args);
+
+ return ret;
}
#define scx_error(sch, fmt, args...) scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args)
+#define scx_verror(sch, fmt, args) scx_vexit((sch), SCX_EXIT_ERROR, 0, fmt, args)
#define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op)
@@ -200,7 +245,15 @@ static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch,
static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id)
{
- return rhashtable_lookup_fast(&sch->dsq_hash, &dsq_id, dsq_hash_params);
+ return rhashtable_lookup(&sch->dsq_hash, &dsq_id, dsq_hash_params);
+}
+
+static const struct sched_class *scx_setscheduler_class(struct task_struct *p)
+{
+ if (p->sched_class == &stop_sched_class)
+ return &stop_sched_class;
+
+ return __setscheduler_class(p->policy, p->prio);
}
/*
@@ -469,19 +522,16 @@ struct scx_task_iter {
* RCU read lock or obtaining a reference count.
*
* All tasks which existed when the iteration started are guaranteed to be
- * visited as long as they still exist.
+ * visited as long as they are not dead.
*/
static void scx_task_iter_start(struct scx_task_iter *iter)
{
- BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS &
- ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1));
+ memset(iter, 0, sizeof(*iter));
raw_spin_lock_irq(&scx_tasks_lock);
iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
list_add(&iter->cursor.tasks_node, &scx_tasks);
- iter->locked_task = NULL;
- iter->cnt = 0;
iter->list_locked = true;
}
@@ -547,14 +597,13 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
struct list_head *cursor = &iter->cursor.tasks_node;
struct sched_ext_entity *pos;
- __scx_task_iter_maybe_relock(iter);
-
if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) {
scx_task_iter_unlock(iter);
cond_resched();
- __scx_task_iter_maybe_relock(iter);
}
+ __scx_task_iter_maybe_relock(iter);
+
list_for_each_entry(pos, cursor, tasks_node) {
if (&pos->tasks_node == &scx_tasks)
return NULL;
@@ -755,6 +804,11 @@ static int ops_sanitize_err(struct scx_sched *sch, const char *ops_name, s32 err
static void run_deferred(struct rq *rq)
{
process_ddsp_deferred_locals(rq);
+
+ if (local_read(&rq->scx.reenq_local_deferred)) {
+ local_set(&rq->scx.reenq_local_deferred, 0);
+ reenq_local(rq);
+ }
}
static void deferred_bal_cb_workfn(struct rq *rq)
@@ -775,12 +829,28 @@ static void deferred_irq_workfn(struct irq_work *irq_work)
* schedule_deferred - Schedule execution of deferred actions on an rq
* @rq: target rq
*
- * Schedule execution of deferred actions on @rq. Must be called with @rq
- * locked. Deferred actions are executed with @rq locked but unpinned, and thus
- * can unlock @rq to e.g. migrate tasks to other rqs.
+ * Schedule execution of deferred actions on @rq. Deferred actions are executed
+ * with @rq locked but unpinned, and thus can unlock @rq to e.g. migrate tasks
+ * to other rqs.
*/
static void schedule_deferred(struct rq *rq)
{
+ /*
+ * Queue an irq work. They are executed on IRQ re-enable which may take
+ * a bit longer than the scheduler hook in schedule_deferred_locked().
+ */
+ irq_work_queue(&rq->scx.deferred_irq_work);
+}
+
+/**
+ * schedule_deferred_locked - Schedule execution of deferred actions on an rq
+ * @rq: target rq
+ *
+ * Schedule execution of deferred actions on @rq. Equivalent to
+ * schedule_deferred() but requires @rq to be locked and can be more efficient.
+ */
+static void schedule_deferred_locked(struct rq *rq)
+{
lockdep_assert_rq_held(rq);
/*
@@ -812,12 +882,11 @@ static void schedule_deferred(struct rq *rq)
}
/*
- * No scheduler hooks available. Queue an irq work. They are executed on
- * IRQ re-enable which may take a bit longer than the scheduler hooks.
- * The above WAKEUP and BALANCE paths should cover most of the cases and
- * the time to IRQ re-enable shouldn't be long.
+ * No scheduler hooks available. Use the generic irq_work path. The
+ * above WAKEUP and BALANCE paths should cover most of the cases and the
+ * time to IRQ re-enable shouldn't be long.
*/
- irq_work_queue(&rq->scx.deferred_irq_work);
+ schedule_deferred(rq);
}
/**
@@ -902,7 +971,7 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p)
{
- p->scx.slice = SCX_SLICE_DFL;
+ p->scx.slice = READ_ONCE(scx_slice_dfl);
__scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1);
}
@@ -916,7 +985,9 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
!RB_EMPTY_NODE(&p->scx.dsq_priq));
if (!is_local) {
- raw_spin_lock(&dsq->lock);
+ raw_spin_lock_nested(&dsq->lock,
+ (enq_flags & SCX_ENQ_NESTED) ? SINGLE_DEPTH_NESTING : 0);
+
if (unlikely(dsq->id == SCX_DSQ_INVALID)) {
scx_error(sch, "attempting to dispatch to a destroyed dsq");
/* fall back to the global dsq */
@@ -965,8 +1036,11 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
container_of(rbp, struct task_struct,
scx.dsq_priq);
list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node);
+ /* first task unchanged - no update needed */
} else {
list_add(&p->scx.dsq_list.node, &dsq->list);
+ /* not builtin and new task is at head - use fastpath */
+ rcu_assign_pointer(dsq->first_task, p);
}
} else {
/* a FIFO DSQ shouldn't be using PRIQ enqueuing */
@@ -974,10 +1048,19 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
scx_error(sch, "DSQ ID 0x%016llx already had PRIQ-enqueued tasks",
dsq->id);
- if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT))
+ if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) {
list_add(&p->scx.dsq_list.node, &dsq->list);
- else
+ /* new task inserted at head - use fastpath */
+ if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN))
+ rcu_assign_pointer(dsq->first_task, p);
+ } else {
+ bool was_empty;
+
+ was_empty = list_empty(&dsq->list);
list_add_tail(&p->scx.dsq_list.node, &dsq->list);
+ if (was_empty && !(dsq->id & SCX_DSQ_FLAG_BUILTIN))
+ rcu_assign_pointer(dsq->first_task, p);
+ }
}
/* seq records the order tasks are queued, used by BPF DSQ iterator */
@@ -1034,6 +1117,13 @@ static void task_unlink_from_dsq(struct task_struct *p,
list_del_init(&p->scx.dsq_list.node);
dsq_mod_nr(dsq, -1);
+
+ if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN) && dsq->first_task == p) {
+ struct task_struct *first_task;
+
+ first_task = nldsq_next_task(dsq, NULL, false);
+ rcu_assign_pointer(dsq->first_task, first_task);
+ }
}
static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
@@ -1041,6 +1131,8 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
struct scx_dispatch_q *dsq = p->scx.dsq;
bool is_local = dsq == &rq->scx.local_dsq;
+ lockdep_assert_rq_held(rq);
+
if (!dsq) {
/*
* If !dsq && on-list, @p is on @rq's ddsp_deferred_locals.
@@ -1087,6 +1179,20 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
raw_spin_unlock(&dsq->lock);
}
+/*
+ * Abbreviated version of dispatch_dequeue() that can be used when both @p's rq
+ * and dsq are locked.
+ */
+static void dispatch_dequeue_locked(struct task_struct *p,
+ struct scx_dispatch_q *dsq)
+{
+ lockdep_assert_rq_held(task_rq(p));
+ lockdep_assert_held(&dsq->lock);
+
+ task_unlink_from_dsq(p, dsq);
+ p->scx.dsq = NULL;
+}
+
static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch,
struct rq *rq, u64 dsq_id,
struct task_struct *p)
@@ -1192,7 +1298,7 @@ static void direct_dispatch(struct scx_sched *sch, struct task_struct *p,
WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node));
list_add_tail(&p->scx.dsq_list.node,
&rq->scx.ddsp_deferred_locals);
- schedule_deferred(rq);
+ schedule_deferred_locked(rq);
return;
}
@@ -1217,6 +1323,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
{
struct scx_sched *sch = scx_root;
struct task_struct **ddsp_taskp;
+ struct scx_dispatch_q *dsq;
unsigned long qseq;
WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED));
@@ -1235,7 +1342,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
if (scx_rq_bypassing(rq)) {
__scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
- goto global;
+ goto bypass;
}
if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
@@ -1284,8 +1391,20 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
direct:
direct_dispatch(sch, p, enq_flags);
return;
-
+local_norefill:
+ dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags);
+ return;
local:
+ dsq = &rq->scx.local_dsq;
+ goto enqueue;
+global:
+ dsq = find_global_dsq(sch, p);
+ goto enqueue;
+bypass:
+ dsq = &task_rq(p)->scx.bypass_dsq;
+ goto enqueue;
+
+enqueue:
/*
* For task-ordering, slice refill must be treated as implying the end
* of the current slice. Otherwise, the longer @p stays on the CPU, the
@@ -1293,14 +1412,7 @@ local:
*/
touch_core_sched(rq, p);
refill_task_slice_dfl(sch, p);
-local_norefill:
- dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags);
- return;
-
-global:
- touch_core_sched(rq, p); /* see the comment in local: */
- refill_task_slice_dfl(sch, p);
- dispatch_enqueue(sch, find_global_dsq(sch, p), p, enq_flags);
+ dispatch_enqueue(sch, dsq, p, enq_flags);
}
static bool task_runnable(const struct task_struct *p)
@@ -1741,8 +1853,7 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch,
* @p is going from a non-local DSQ to a non-local DSQ. As
* $src_dsq is already locked, do an abbreviated dequeue.
*/
- task_unlink_from_dsq(p, src_dsq);
- p->scx.dsq = NULL;
+ dispatch_dequeue_locked(p, src_dsq);
raw_spin_unlock(&src_dsq->lock);
dispatch_enqueue(sch, dst_dsq, p, enq_flags);
@@ -1751,49 +1862,12 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch,
return dst_rq;
}
-/*
- * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly
- * banging on the same DSQ on a large NUMA system to the point where switching
- * to the bypass mode can take a long time. Inject artificial delays while the
- * bypass mode is switching to guarantee timely completion.
- */
-static void scx_breather(struct rq *rq)
-{
- u64 until;
-
- lockdep_assert_rq_held(rq);
-
- if (likely(!atomic_read(&scx_breather_depth)))
- return;
-
- raw_spin_rq_unlock(rq);
-
- until = ktime_get_ns() + NSEC_PER_MSEC;
-
- do {
- int cnt = 1024;
- while (atomic_read(&scx_breather_depth) && --cnt)
- cpu_relax();
- } while (atomic_read(&scx_breather_depth) &&
- time_before64(ktime_get_ns(), until));
-
- raw_spin_rq_lock(rq);
-}
-
static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq,
struct scx_dispatch_q *dsq)
{
struct task_struct *p;
retry:
/*
- * This retry loop can repeatedly race against scx_bypass() dequeueing
- * tasks from @dsq trying to put the system into the bypass mode. On
- * some multi-socket machines (e.g. 2x Intel 8480c), this can live-lock
- * the machine into soft lockups. Give a breather.
- */
- scx_breather(rq);
-
- /*
* The caller can't expect to successfully consume a task if the task's
* addition to @dsq isn't guaranteed to be visible somehow. Test
* @dsq->list without locking and skip if it seems empty.
@@ -1806,6 +1880,17 @@ retry:
nldsq_for_each_task(p, dsq) {
struct rq *task_rq = task_rq(p);
+ /*
+ * This loop can lead to multiple lockup scenarios, e.g. the BPF
+ * scheduler can put an enormous number of affinitized tasks into
+ * a contended DSQ, or the outer retry loop can repeatedly race
+ * against scx_bypass() dequeueing tasks from @dsq trying to put
+ * the system into the bypass mode. This can easily live-lock the
+ * machine. If aborting, exit from all non-bypass DSQs.
+ */
+ if (unlikely(READ_ONCE(scx_aborting)) && dsq->id != SCX_DSQ_BYPASS)
+ break;
+
if (rq == task_rq) {
task_unlink_from_dsq(p, dsq);
move_local_task_to_local_dsq(p, 0, dsq, rq);
@@ -2089,8 +2174,14 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
if (consume_global_dsq(sch, rq))
goto has_tasks;
- if (unlikely(!SCX_HAS_OP(sch, dispatch)) ||
- scx_rq_bypassing(rq) || !scx_rq_online(rq))
+ if (scx_rq_bypassing(rq)) {
+ if (consume_dispatch_q(sch, rq, &rq->scx.bypass_dsq))
+ goto has_tasks;
+ else
+ goto no_tasks;
+ }
+
+ if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq))
goto no_tasks;
dspc->rq = rq;
@@ -2241,12 +2332,6 @@ static void switch_class(struct rq *rq, struct task_struct *next)
struct scx_sched *sch = scx_root;
const struct sched_class *next_class = next->sched_class;
- /*
- * Pairs with the smp_load_acquire() issued by a CPU in
- * kick_cpus_irq_workfn() who is waiting for this CPU to perform a
- * resched.
- */
- smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);
if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT))
return;
@@ -2286,6 +2371,10 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
struct task_struct *next)
{
struct scx_sched *sch = scx_root;
+
+ /* see kick_cpus_irq_workfn() */
+ smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
+
update_curr_scx(rq);
/* see dequeue_task_scx() on why we skip when !QUEUED */
@@ -2332,18 +2421,32 @@ static struct task_struct *first_local_task(struct rq *rq)
struct task_struct, scx.dsq_list.node);
}
-static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf)
+static struct task_struct *
+do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
{
struct task_struct *prev = rq->curr;
bool keep_prev, kick_idle = false;
struct task_struct *p;
+ /* see kick_cpus_irq_workfn() */
+ smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
+
rq_modified_clear(rq);
+
rq_unpin_lock(rq, rf);
balance_one(rq, prev);
rq_repin_lock(rq, rf);
maybe_queue_balance_callback(rq);
- if (rq_modified_above(rq, &ext_sched_class))
+
+ /*
+ * If any higher-priority sched class enqueued a runnable task on
+ * this rq during balance_one(), abort and return RETRY_TASK, so
+ * that the scheduler loop can restart.
+ *
+ * If @force_scx is true, always try to pick a SCHED_EXT task,
+ * regardless of any higher-priority sched classes activity.
+ */
+ if (!force_scx && rq_modified_above(rq, &ext_sched_class))
return RETRY_TASK;
keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
@@ -2386,6 +2489,11 @@ static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf)
return p;
}
+static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf)
+{
+ return do_pick_task_scx(rq, rf, false);
+}
+
#ifdef CONFIG_SCHED_CORE
/**
* scx_prio_less - Task ordering for core-sched
@@ -2842,7 +2950,7 @@ void init_scx_entity(struct sched_ext_entity *scx)
INIT_LIST_HEAD(&scx->runnable_node);
scx->runnable_at = jiffies;
scx->ddsp_dsq_id = SCX_DSQ_INVALID;
- scx->slice = SCX_SLICE_DFL;
+ scx->slice = READ_ONCE(scx_slice_dfl);
}
void scx_pre_fork(struct task_struct *p)
@@ -2908,7 +3016,7 @@ void scx_cancel_fork(struct task_struct *p)
percpu_up_read(&scx_fork_rwsem);
}
-void sched_ext_free(struct task_struct *p)
+void sched_ext_dead(struct task_struct *p)
{
unsigned long flags;
@@ -3012,6 +3120,7 @@ void scx_tg_init(struct task_group *tg)
tg->scx.weight = CGROUP_WEIGHT_DFL;
tg->scx.bw_period_us = default_bw_period_us();
tg->scx.bw_quota_us = RUNTIME_INF;
+ tg->scx.idle = false;
}
int scx_tg_online(struct task_group *tg)
@@ -3160,7 +3269,18 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight)
void scx_group_set_idle(struct task_group *tg, bool idle)
{
- /* TODO: Implement ops->cgroup_set_idle() */
+ struct scx_sched *sch = scx_root;
+
+ percpu_down_read(&scx_cgroup_ops_rwsem);
+
+ if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_idle))
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_idle, NULL,
+ tg_cgrp(tg), idle);
+
+ /* Update the task group's idle state */
+ tg->scx.idle = idle;
+
+ percpu_up_read(&scx_cgroup_ops_rwsem);
}
void scx_group_set_bandwidth(struct task_group *tg,
@@ -3575,38 +3695,55 @@ bool scx_allow_ttwu_queue(const struct task_struct *p)
}
/**
- * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler
+ * handle_lockup - sched_ext common lockup handler
+ * @fmt: format string
*
- * While there are various reasons why RCU CPU stalls can occur on a system
- * that may not be caused by the current BPF scheduler, try kicking out the
- * current scheduler in an attempt to recover the system to a good state before
- * issuing panics.
+ * Called on system stall or lockup condition and initiates abort of sched_ext
+ * if enabled, which may resolve the reported lockup.
+ *
+ * Returns %true if sched_ext is enabled and abort was initiated, which may
+ * resolve the lockup. %false if sched_ext is not enabled or abort was already
+ * initiated by someone else.
*/
-bool scx_rcu_cpu_stall(void)
+static __printf(1, 2) bool handle_lockup(const char *fmt, ...)
{
struct scx_sched *sch;
+ va_list args;
+ bool ret;
- rcu_read_lock();
+ guard(rcu)();
sch = rcu_dereference(scx_root);
- if (unlikely(!sch)) {
- rcu_read_unlock();
+ if (unlikely(!sch))
return false;
- }
switch (scx_enable_state()) {
case SCX_ENABLING:
case SCX_ENABLED:
- break;
+ va_start(args, fmt);
+ ret = scx_verror(sch, fmt, args);
+ va_end(args);
+ return ret;
default:
- rcu_read_unlock();
return false;
}
+}
- scx_error(sch, "RCU CPU stall detected!");
- rcu_read_unlock();
-
- return true;
+/**
+ * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler
+ *
+ * While there are various reasons why RCU CPU stalls can occur on a system
+ * that may not be caused by the current BPF scheduler, try kicking out the
+ * current scheduler in an attempt to recover the system to a good state before
+ * issuing panics.
+ *
+ * Returns %true if sched_ext is enabled and abort was initiated, which may
+ * resolve the reported RCU stall. %false if sched_ext is not enabled or someone
+ * else already initiated abort.
+ */
+bool scx_rcu_cpu_stall(void)
+{
+ return handle_lockup("RCU CPU stall detected!");
}
/**
@@ -3617,50 +3754,240 @@ bool scx_rcu_cpu_stall(void)
* live-lock the system by making many CPUs target the same DSQ to the point
* where soft-lockup detection triggers. This function is called from
* soft-lockup watchdog when the triggering point is close and tries to unjam
- * the system by enabling the breather and aborting the BPF scheduler.
+ * the system and aborting the BPF scheduler.
*/
void scx_softlockup(u32 dur_s)
{
- struct scx_sched *sch;
+ if (!handle_lockup("soft lockup - CPU %d stuck for %us", smp_processor_id(), dur_s))
+ return;
- rcu_read_lock();
+ printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU %d stuck for %us, disabling BPF scheduler\n",
+ smp_processor_id(), dur_s);
+}
- sch = rcu_dereference(scx_root);
- if (unlikely(!sch))
- goto out_unlock;
+/**
+ * scx_hardlockup - sched_ext hardlockup handler
+ *
+ * A poorly behaving BPF scheduler can trigger hard lockup by e.g. putting
+ * numerous affinitized tasks in a single queue and directing all CPUs at it.
+ * Try kicking out the current scheduler in an attempt to recover the system to
+ * a good state before taking more drastic actions.
+ *
+ * Returns %true if sched_ext is enabled and abort was initiated, which may
+ * resolve the reported hardlockdup. %false if sched_ext is not enabled or
+ * someone else already initiated abort.
+ */
+bool scx_hardlockup(int cpu)
+{
+ if (!handle_lockup("hard lockup - CPU %d", cpu))
+ return false;
- switch (scx_enable_state()) {
- case SCX_ENABLING:
- case SCX_ENABLED:
- break;
- default:
- goto out_unlock;
+ printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n",
+ cpu);
+ return true;
+}
+
+static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq,
+ struct cpumask *donee_mask, struct cpumask *resched_mask,
+ u32 nr_donor_target, u32 nr_donee_target)
+{
+ struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq;
+ struct task_struct *p, *n;
+ struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, 0, 0);
+ s32 delta = READ_ONCE(donor_dsq->nr) - nr_donor_target;
+ u32 nr_balanced = 0, min_delta_us;
+
+ /*
+ * All we want to guarantee is reasonable forward progress. No reason to
+ * fine tune. Assuming every task on @donor_dsq runs their full slice,
+ * consider offloading iff the total queued duration is over the
+ * threshold.
+ */
+ min_delta_us = scx_bypass_lb_intv_us / SCX_BYPASS_LB_MIN_DELTA_DIV;
+ if (delta < DIV_ROUND_UP(min_delta_us, scx_slice_bypass_us))
+ return 0;
+
+ raw_spin_rq_lock_irq(rq);
+ raw_spin_lock(&donor_dsq->lock);
+ list_add(&cursor.node, &donor_dsq->list);
+resume:
+ n = container_of(&cursor, struct task_struct, scx.dsq_list);
+ n = nldsq_next_task(donor_dsq, n, false);
+
+ while ((p = n)) {
+ struct rq *donee_rq;
+ struct scx_dispatch_q *donee_dsq;
+ int donee;
+
+ n = nldsq_next_task(donor_dsq, n, false);
+
+ if (donor_dsq->nr <= nr_donor_target)
+ break;
+
+ if (cpumask_empty(donee_mask))
+ break;
+
+ donee = cpumask_any_and_distribute(donee_mask, p->cpus_ptr);
+ if (donee >= nr_cpu_ids)
+ continue;
+
+ donee_rq = cpu_rq(donee);
+ donee_dsq = &donee_rq->scx.bypass_dsq;
+
+ /*
+ * $p's rq is not locked but $p's DSQ lock protects its
+ * scheduling properties making this test safe.
+ */
+ if (!task_can_run_on_remote_rq(sch, p, donee_rq, false))
+ continue;
+
+ /*
+ * Moving $p from one non-local DSQ to another. The source rq
+ * and DSQ are already locked. Do an abbreviated dequeue and
+ * then perform enqueue without unlocking $donor_dsq.
+ *
+ * We don't want to drop and reacquire the lock on each
+ * iteration as @donor_dsq can be very long and potentially
+ * highly contended. Donee DSQs are less likely to be contended.
+ * The nested locking is safe as only this LB moves tasks
+ * between bypass DSQs.
+ */
+ dispatch_dequeue_locked(p, donor_dsq);
+ dispatch_enqueue(sch, donee_dsq, p, SCX_ENQ_NESTED);
+
+ /*
+ * $donee might have been idle and need to be woken up. No need
+ * to be clever. Kick every CPU that receives tasks.
+ */
+ cpumask_set_cpu(donee, resched_mask);
+
+ if (READ_ONCE(donee_dsq->nr) >= nr_donee_target)
+ cpumask_clear_cpu(donee, donee_mask);
+
+ nr_balanced++;
+ if (!(nr_balanced % SCX_BYPASS_LB_BATCH) && n) {
+ list_move_tail(&cursor.node, &n->scx.dsq_list.node);
+ raw_spin_unlock(&donor_dsq->lock);
+ raw_spin_rq_unlock_irq(rq);
+ cpu_relax();
+ raw_spin_rq_lock_irq(rq);
+ raw_spin_lock(&donor_dsq->lock);
+ goto resume;
+ }
}
- /* allow only one instance, cleared at the end of scx_bypass() */
- if (test_and_set_bit(0, &scx_in_softlockup))
- goto out_unlock;
+ list_del_init(&cursor.node);
+ raw_spin_unlock(&donor_dsq->lock);
+ raw_spin_rq_unlock_irq(rq);
- printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n",
- smp_processor_id(), dur_s, scx_root->ops.name);
+ return nr_balanced;
+}
+
+static void bypass_lb_node(struct scx_sched *sch, int node)
+{
+ const struct cpumask *node_mask = cpumask_of_node(node);
+ struct cpumask *donee_mask = scx_bypass_lb_donee_cpumask;
+ struct cpumask *resched_mask = scx_bypass_lb_resched_cpumask;
+ u32 nr_tasks = 0, nr_cpus = 0, nr_balanced = 0;
+ u32 nr_target, nr_donor_target;
+ u32 before_min = U32_MAX, before_max = 0;
+ u32 after_min = U32_MAX, after_max = 0;
+ int cpu;
+
+ /* count the target tasks and CPUs */
+ for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
+ u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr);
+
+ nr_tasks += nr;
+ nr_cpus++;
+
+ before_min = min(nr, before_min);
+ before_max = max(nr, before_max);
+ }
+
+ if (!nr_cpus)
+ return;
/*
- * Some CPUs may be trapped in the dispatch paths. Enable breather
- * immediately; otherwise, we might even be able to get to scx_bypass().
+ * We don't want CPUs to have more than $nr_donor_target tasks and
+ * balancing to fill donee CPUs upto $nr_target. Once targets are
+ * calculated, find the donee CPUs.
*/
- atomic_inc(&scx_breather_depth);
+ nr_target = DIV_ROUND_UP(nr_tasks, nr_cpus);
+ nr_donor_target = DIV_ROUND_UP(nr_target * SCX_BYPASS_LB_DONOR_PCT, 100);
- scx_error(sch, "soft lockup - CPU#%d stuck for %us", smp_processor_id(), dur_s);
-out_unlock:
- rcu_read_unlock();
+ cpumask_clear(donee_mask);
+ for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
+ if (READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr) < nr_target)
+ cpumask_set_cpu(cpu, donee_mask);
+ }
+
+ /* iterate !donee CPUs and see if they should be offloaded */
+ cpumask_clear(resched_mask);
+ for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
+ struct rq *rq = cpu_rq(cpu);
+ struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq;
+
+ if (cpumask_empty(donee_mask))
+ break;
+ if (cpumask_test_cpu(cpu, donee_mask))
+ continue;
+ if (READ_ONCE(donor_dsq->nr) <= nr_donor_target)
+ continue;
+
+ nr_balanced += bypass_lb_cpu(sch, rq, donee_mask, resched_mask,
+ nr_donor_target, nr_target);
+ }
+
+ for_each_cpu(cpu, resched_mask) {
+ struct rq *rq = cpu_rq(cpu);
+
+ raw_spin_rq_lock_irq(rq);
+ resched_curr(rq);
+ raw_spin_rq_unlock_irq(rq);
+ }
+
+ for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
+ u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr);
+
+ after_min = min(nr, after_min);
+ after_max = max(nr, after_max);
+
+ }
+
+ trace_sched_ext_bypass_lb(node, nr_cpus, nr_tasks, nr_balanced,
+ before_min, before_max, after_min, after_max);
}
-static void scx_clear_softlockup(void)
+/*
+ * In bypass mode, all tasks are put on the per-CPU bypass DSQs. If the machine
+ * is over-saturated and the BPF scheduler skewed tasks into few CPUs, some
+ * bypass DSQs can be overloaded. If there are enough tasks to saturate other
+ * lightly loaded CPUs, such imbalance can lead to very high execution latency
+ * on the overloaded CPUs and thus to hung tasks and RCU stalls. To avoid such
+ * outcomes, a simple load balancing mechanism is implemented by the following
+ * timer which runs periodically while bypass mode is in effect.
+ */
+static void scx_bypass_lb_timerfn(struct timer_list *timer)
{
- if (test_and_clear_bit(0, &scx_in_softlockup))
- atomic_dec(&scx_breather_depth);
+ struct scx_sched *sch;
+ int node;
+ u32 intv_us;
+
+ sch = rcu_dereference_all(scx_root);
+ if (unlikely(!sch) || !READ_ONCE(scx_bypass_depth))
+ return;
+
+ for_each_node_with_cpus(node)
+ bypass_lb_node(sch, node);
+
+ intv_us = READ_ONCE(scx_bypass_lb_intv_us);
+ if (intv_us)
+ mod_timer(timer, jiffies + usecs_to_jiffies(intv_us));
}
+static DEFINE_TIMER(scx_bypass_lb_timer, scx_bypass_lb_timerfn);
+
/**
* scx_bypass - [Un]bypass scx_ops and guarantee forward progress
* @bypass: true for bypass, false for unbypass
@@ -3704,25 +4031,34 @@ static void scx_bypass(bool bypass)
sch = rcu_dereference_bh(scx_root);
if (bypass) {
- scx_bypass_depth++;
+ u32 intv_us;
+
+ WRITE_ONCE(scx_bypass_depth, scx_bypass_depth + 1);
WARN_ON_ONCE(scx_bypass_depth <= 0);
if (scx_bypass_depth != 1)
goto unlock;
+ WRITE_ONCE(scx_slice_dfl, scx_slice_bypass_us * NSEC_PER_USEC);
bypass_timestamp = ktime_get_ns();
if (sch)
scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
+
+ intv_us = READ_ONCE(scx_bypass_lb_intv_us);
+ if (intv_us && !timer_pending(&scx_bypass_lb_timer)) {
+ scx_bypass_lb_timer.expires =
+ jiffies + usecs_to_jiffies(intv_us);
+ add_timer_global(&scx_bypass_lb_timer);
+ }
} else {
- scx_bypass_depth--;
+ WRITE_ONCE(scx_bypass_depth, scx_bypass_depth - 1);
WARN_ON_ONCE(scx_bypass_depth < 0);
if (scx_bypass_depth != 0)
goto unlock;
+ WRITE_ONCE(scx_slice_dfl, SCX_SLICE_DFL);
if (sch)
scx_add_event(sch, SCX_EV_BYPASS_DURATION,
ktime_get_ns() - bypass_timestamp);
}
- atomic_inc(&scx_breather_depth);
-
/*
* No task property is changing. We just need to make sure all currently
* queued tasks are re-queued according to the new scx_rq_bypassing()
@@ -3778,10 +4114,8 @@ static void scx_bypass(bool bypass)
raw_spin_rq_unlock(rq);
}
- atomic_dec(&scx_breather_depth);
unlock:
raw_spin_unlock_irqrestore(&bypass_lock, flags);
- scx_clear_softlockup();
}
static void free_exit_info(struct scx_exit_info *ei)
@@ -3834,24 +4168,17 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
}
}
-static void free_kick_pseqs_rcu(struct rcu_head *rcu)
-{
- struct scx_kick_pseqs *pseqs = container_of(rcu, struct scx_kick_pseqs, rcu);
-
- kvfree(pseqs);
-}
-
-static void free_kick_pseqs(void)
+static void free_kick_syncs(void)
{
int cpu;
for_each_possible_cpu(cpu) {
- struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu);
- struct scx_kick_pseqs *to_free;
+ struct scx_kick_syncs **ksyncs = per_cpu_ptr(&scx_kick_syncs, cpu);
+ struct scx_kick_syncs *to_free;
- to_free = rcu_replace_pointer(*pseqs, NULL, true);
+ to_free = rcu_replace_pointer(*ksyncs, NULL, true);
if (to_free)
- call_rcu(&to_free->rcu, free_kick_pseqs_rcu);
+ kvfree_rcu(to_free, rcu);
}
}
@@ -3876,6 +4203,7 @@ static void scx_disable_workfn(struct kthread_work *work)
/* guarantee forward progress by bypassing scx_ops */
scx_bypass(true);
+ WRITE_ONCE(scx_aborting, false);
switch (scx_set_enable_state(SCX_DISABLING)) {
case SCX_DISABLING:
@@ -3920,8 +4248,7 @@ static void scx_disable_workfn(struct kthread_work *work)
while ((p = scx_task_iter_next_locked(&sti))) {
unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
const struct sched_class *old_class = p->sched_class;
- const struct sched_class *new_class =
- __setscheduler_class(p->policy, p->prio);
+ const struct sched_class *new_class = scx_setscheduler_class(p);
update_rq_clock(task_rq(p));
@@ -3989,7 +4316,7 @@ static void scx_disable_workfn(struct kthread_work *work)
free_percpu(scx_dsp_ctx);
scx_dsp_ctx = NULL;
scx_dsp_max_batch = 0;
- free_kick_pseqs();
+ free_kick_syncs();
mutex_unlock(&scx_enable_mutex);
@@ -3998,9 +4325,24 @@ done:
scx_bypass(false);
}
-static void scx_disable(enum scx_exit_kind kind)
+static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind)
{
int none = SCX_EXIT_NONE;
+
+ if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind))
+ return false;
+
+ /*
+ * Some CPUs may be trapped in the dispatch paths. Set the aborting
+ * flag to break potential live-lock scenarios, ensuring we can
+ * successfully reach scx_bypass().
+ */
+ WRITE_ONCE(scx_aborting, true);
+ return true;
+}
+
+static void scx_disable(enum scx_exit_kind kind)
+{
struct scx_sched *sch;
if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
@@ -4009,7 +4351,7 @@ static void scx_disable(enum scx_exit_kind kind)
rcu_read_lock();
sch = rcu_dereference(scx_root);
if (sch) {
- atomic_try_cmpxchg(&sch->exit_kind, &none, kind);
+ scx_claim_exit(sch, kind);
kthread_queue_work(sch->helper, &sch->disable_work);
}
rcu_read_unlock();
@@ -4238,10 +4580,10 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
seq_buf_init(&ns, buf, avail);
dump_newline(&ns);
- dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu pnt_seq=%lu",
+ dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu ksync=%lu",
cpu, rq->scx.nr_running, rq->scx.flags,
rq->scx.cpu_released, rq->scx.ops_qseq,
- rq->scx.pnt_seq);
+ rq->scx.kick_sync);
dump_line(&ns, " curr=%s[%d] class=%ps",
rq->curr->comm, rq->curr->pid,
rq->curr->sched_class);
@@ -4325,15 +4667,14 @@ static void scx_error_irq_workfn(struct irq_work *irq_work)
kthread_queue_work(sch->helper, &sch->disable_work);
}
-static void scx_vexit(struct scx_sched *sch,
+static bool scx_vexit(struct scx_sched *sch,
enum scx_exit_kind kind, s64 exit_code,
const char *fmt, va_list args)
{
struct scx_exit_info *ei = sch->exit_info;
- int none = SCX_EXIT_NONE;
- if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind))
- return;
+ if (!scx_claim_exit(sch, kind))
+ return false;
ei->exit_code = exit_code;
#ifdef CONFIG_STACKTRACE
@@ -4350,9 +4691,10 @@ static void scx_vexit(struct scx_sched *sch,
ei->reason = scx_exit_reason(ei->kind);
irq_work_queue(&sch->error_irq_work);
+ return true;
}
-static int alloc_kick_pseqs(void)
+static int alloc_kick_syncs(void)
{
int cpu;
@@ -4361,19 +4703,19 @@ static int alloc_kick_pseqs(void)
* can exceed percpu allocator limits on large machines.
*/
for_each_possible_cpu(cpu) {
- struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu);
- struct scx_kick_pseqs *new_pseqs;
+ struct scx_kick_syncs **ksyncs = per_cpu_ptr(&scx_kick_syncs, cpu);
+ struct scx_kick_syncs *new_ksyncs;
- WARN_ON_ONCE(rcu_access_pointer(*pseqs));
+ WARN_ON_ONCE(rcu_access_pointer(*ksyncs));
- new_pseqs = kvzalloc_node(struct_size(new_pseqs, seqs, nr_cpu_ids),
- GFP_KERNEL, cpu_to_node(cpu));
- if (!new_pseqs) {
- free_kick_pseqs();
+ new_ksyncs = kvzalloc_node(struct_size(new_ksyncs, syncs, nr_cpu_ids),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!new_ksyncs) {
+ free_kick_syncs();
return -ENOMEM;
}
- rcu_assign_pointer(*pseqs, new_pseqs);
+ rcu_assign_pointer(*ksyncs, new_ksyncs);
}
return 0;
@@ -4460,7 +4802,7 @@ err_free_sch:
return ERR_PTR(ret);
}
-static void check_hotplug_seq(struct scx_sched *sch,
+static int check_hotplug_seq(struct scx_sched *sch,
const struct sched_ext_ops *ops)
{
unsigned long long global_hotplug_seq;
@@ -4477,8 +4819,11 @@ static void check_hotplug_seq(struct scx_sched *sch,
SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
"expected hotplug seq %llu did not match actual %llu",
ops->hotplug_seq, global_hotplug_seq);
+ return -EBUSY;
}
}
+
+ return 0;
}
static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops)
@@ -4505,6 +4850,9 @@ static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops)
if (ops->flags & SCX_OPS_HAS_CGROUP_WEIGHT)
pr_warn("SCX_OPS_HAS_CGROUP_WEIGHT is deprecated and a noop\n");
+ if (ops->cpu_acquire || ops->cpu_release)
+ pr_warn("ops->cpu_acquire/release() are deprecated, use sched_switch TP instead\n");
+
return 0;
}
@@ -4529,14 +4877,14 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
goto err_unlock;
}
- ret = alloc_kick_pseqs();
+ ret = alloc_kick_syncs();
if (ret)
goto err_unlock;
sch = scx_alloc_and_add_sched(ops);
if (IS_ERR(sch)) {
ret = PTR_ERR(sch);
- goto err_free_pseqs;
+ goto err_free_ksyncs;
}
/*
@@ -4545,6 +4893,8 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
*/
WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED);
WARN_ON_ONCE(scx_root);
+ if (WARN_ON_ONCE(READ_ONCE(scx_aborting)))
+ WRITE_ONCE(scx_aborting, false);
atomic_long_set(&scx_nr_rejected, 0);
@@ -4580,7 +4930,11 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
if (((void (**)(void))ops)[i])
set_bit(i, sch->has_op);
- check_hotplug_seq(sch, ops);
+ ret = check_hotplug_seq(sch, ops);
+ if (ret) {
+ cpus_read_unlock();
+ goto err_disable;
+ }
scx_idle_update_selcpu_topology(ops);
cpus_read_unlock();
@@ -4697,21 +5051,18 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
while ((p = scx_task_iter_next_locked(&sti))) {
unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
const struct sched_class *old_class = p->sched_class;
- const struct sched_class *new_class =
- __setscheduler_class(p->policy, p->prio);
+ const struct sched_class *new_class = scx_setscheduler_class(p);
- if (!tryget_task_struct(p))
+ if (scx_get_task_state(p) != SCX_TASK_READY)
continue;
if (old_class != new_class)
queue_flags |= DEQUEUE_CLASS;
scoped_guard (sched_change, p, queue_flags) {
- p->scx.slice = SCX_SLICE_DFL;
+ p->scx.slice = READ_ONCE(scx_slice_dfl);
p->sched_class = new_class;
}
-
- put_task_struct(p);
}
scx_task_iter_stop(&sti);
percpu_up_write(&scx_fork_rwsem);
@@ -4735,8 +5086,8 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
return 0;
-err_free_pseqs:
- free_kick_pseqs();
+err_free_ksyncs:
+ free_kick_syncs();
err_unlock:
mutex_unlock(&scx_enable_mutex);
return ret;
@@ -4953,6 +5304,7 @@ static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *fro
static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {}
static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {}
+static void sched_ext_ops__cgroup_set_idle(struct cgroup *cgrp, bool idle) {}
#endif
static void sched_ext_ops__cpu_online(s32 cpu) {}
static void sched_ext_ops__cpu_offline(s32 cpu) {}
@@ -4991,6 +5343,7 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
.cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move,
.cgroup_set_weight = sched_ext_ops__cgroup_set_weight,
.cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth,
+ .cgroup_set_idle = sched_ext_ops__cgroup_set_idle,
#endif
.cpu_online = sched_ext_ops__cpu_online,
.cpu_offline = sched_ext_ops__cpu_offline,
@@ -5064,29 +5417,38 @@ static bool can_skip_idle_kick(struct rq *rq)
return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_IN_BALANCE);
}
-static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *pseqs)
+static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs)
{
struct rq *rq = cpu_rq(cpu);
struct scx_rq *this_scx = &this_rq->scx;
+ const struct sched_class *cur_class;
bool should_wait = false;
unsigned long flags;
raw_spin_rq_lock_irqsave(rq, flags);
+ cur_class = rq->curr->sched_class;
/*
* During CPU hotplug, a CPU may depend on kicking itself to make
- * forward progress. Allow kicking self regardless of online state.
+ * forward progress. Allow kicking self regardless of online state. If
+ * @cpu is running a higher class task, we have no control over @cpu.
+ * Skip kicking.
*/
- if (cpu_online(cpu) || cpu == cpu_of(this_rq)) {
+ if ((cpu_online(cpu) || cpu == cpu_of(this_rq)) &&
+ !sched_class_above(cur_class, &ext_sched_class)) {
if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) {
- if (rq->curr->sched_class == &ext_sched_class)
+ if (cur_class == &ext_sched_class)
rq->curr->scx.slice = 0;
cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt);
}
if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) {
- pseqs[cpu] = rq->scx.pnt_seq;
- should_wait = true;
+ if (cur_class == &ext_sched_class) {
+ ksyncs[cpu] = rq->scx.kick_sync;
+ should_wait = true;
+ } else {
+ cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
+ }
}
resched_curr(rq);
@@ -5118,20 +5480,20 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
{
struct rq *this_rq = this_rq();
struct scx_rq *this_scx = &this_rq->scx;
- struct scx_kick_pseqs __rcu *pseqs_pcpu = __this_cpu_read(scx_kick_pseqs);
+ struct scx_kick_syncs __rcu *ksyncs_pcpu = __this_cpu_read(scx_kick_syncs);
bool should_wait = false;
- unsigned long *pseqs;
+ unsigned long *ksyncs;
s32 cpu;
- if (unlikely(!pseqs_pcpu)) {
- pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_pseqs");
+ if (unlikely(!ksyncs_pcpu)) {
+ pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_syncs");
return;
}
- pseqs = rcu_dereference_bh(pseqs_pcpu)->seqs;
+ ksyncs = rcu_dereference_bh(ksyncs_pcpu)->syncs;
for_each_cpu(cpu, this_scx->cpus_to_kick) {
- should_wait |= kick_one_cpu(cpu, this_rq, pseqs);
+ should_wait |= kick_one_cpu(cpu, this_rq, ksyncs);
cpumask_clear_cpu(cpu, this_scx->cpus_to_kick);
cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
}
@@ -5145,20 +5507,21 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
return;
for_each_cpu(cpu, this_scx->cpus_to_wait) {
- unsigned long *wait_pnt_seq = &cpu_rq(cpu)->scx.pnt_seq;
+ unsigned long *wait_kick_sync = &cpu_rq(cpu)->scx.kick_sync;
- if (cpu != cpu_of(this_rq)) {
- /*
- * Pairs with smp_store_release() issued by this CPU in
- * switch_class() on the resched path.
- *
- * We busy-wait here to guarantee that no other task can
- * be scheduled on our core before the target CPU has
- * entered the resched path.
- */
- while (smp_load_acquire(wait_pnt_seq) == pseqs[cpu])
- cpu_relax();
- }
+ /*
+ * Busy-wait until the task running at the time of kicking is no
+ * longer running. This can be used to implement e.g. core
+ * scheduling.
+ *
+ * smp_cond_load_acquire() pairs with store_releases in
+ * pick_task_scx() and put_prev_task_scx(). The former breaks
+ * the wait if SCX's scheduling path is entered even if the same
+ * task is picked subsequently. The latter is necessary to break
+ * the wait when $cpu is taken by a higher sched class.
+ */
+ if (cpu != cpu_of(this_rq))
+ smp_cond_load_acquire(wait_kick_sync, VAL != ksyncs[cpu]);
cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
}
@@ -5257,6 +5620,7 @@ void __init init_sched_ext_class(void)
int n = cpu_to_node(cpu);
init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL);
+ init_dsq(&rq->scx.bypass_dsq, SCX_DSQ_BYPASS);
INIT_LIST_HEAD(&rq->scx.runnable_list);
INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals);
@@ -5362,19 +5726,23 @@ __bpf_kfunc_start_defs();
* exhaustion. If zero, the current residual slice is maintained. If
* %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with
* scx_bpf_kick_cpu() to trigger scheduling.
+ *
+ * Returns %true on successful insertion, %false on failure. On the root
+ * scheduler, %false return triggers scheduler abort and the caller doesn't need
+ * to check the return value.
*/
-__bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice,
- u64 enq_flags)
+__bpf_kfunc bool scx_bpf_dsq_insert___v2(struct task_struct *p, u64 dsq_id,
+ u64 slice, u64 enq_flags)
{
struct scx_sched *sch;
guard(rcu)();
sch = rcu_dereference(scx_root);
if (unlikely(!sch))
- return;
+ return false;
if (!scx_dsq_insert_preamble(sch, p, enq_flags))
- return;
+ return false;
if (slice)
p->scx.slice = slice;
@@ -5382,56 +5750,114 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice
p->scx.slice = p->scx.slice ?: 1;
scx_dsq_insert_commit(sch, p, dsq_id, enq_flags);
+
+ return true;
+}
+
+/*
+ * COMPAT: Will be removed in v6.23 along with the ___v2 suffix.
+ */
+__bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id,
+ u64 slice, u64 enq_flags)
+{
+ scx_bpf_dsq_insert___v2(p, dsq_id, slice, enq_flags);
+}
+
+static bool scx_dsq_insert_vtime(struct scx_sched *sch, struct task_struct *p,
+ u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags)
+{
+ if (!scx_dsq_insert_preamble(sch, p, enq_flags))
+ return false;
+
+ if (slice)
+ p->scx.slice = slice;
+ else
+ p->scx.slice = p->scx.slice ?: 1;
+
+ p->scx.dsq_vtime = vtime;
+
+ scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
+
+ return true;
}
+struct scx_bpf_dsq_insert_vtime_args {
+ /* @p can't be packed together as KF_RCU is not transitive */
+ u64 dsq_id;
+ u64 slice;
+ u64 vtime;
+ u64 enq_flags;
+};
+
/**
- * scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ
+ * __scx_bpf_dsq_insert_vtime - Arg-wrapped vtime DSQ insertion
* @p: task_struct to insert
- * @dsq_id: DSQ to insert into
- * @slice: duration @p can run for in nsecs, 0 to keep the current value
- * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ
- * @enq_flags: SCX_ENQ_*
+ * @args: struct containing the rest of the arguments
+ * @args->dsq_id: DSQ to insert into
+ * @args->slice: duration @p can run for in nsecs, 0 to keep the current value
+ * @args->vtime: @p's ordering inside the vtime-sorted queue of the target DSQ
+ * @args->enq_flags: SCX_ENQ_*
+ *
+ * Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument
+ * limit. BPF programs should use scx_bpf_dsq_insert_vtime() which is provided
+ * as an inline wrapper in common.bpf.h.
*
- * Insert @p into the vtime priority queue of the DSQ identified by @dsq_id.
- * Tasks queued into the priority queue are ordered by @vtime. All other aspects
- * are identical to scx_bpf_dsq_insert().
+ * Insert @p into the vtime priority queue of the DSQ identified by
+ * @args->dsq_id. Tasks queued into the priority queue are ordered by
+ * @args->vtime. All other aspects are identical to scx_bpf_dsq_insert().
*
- * @vtime ordering is according to time_before64() which considers wrapping. A
- * numerically larger vtime may indicate an earlier position in the ordering and
- * vice-versa.
+ * @args->vtime ordering is according to time_before64() which considers
+ * wrapping. A numerically larger vtime may indicate an earlier position in the
+ * ordering and vice-versa.
*
* A DSQ can only be used as a FIFO or priority queue at any given time and this
* function must not be called on a DSQ which already has one or more FIFO tasks
* queued and vice-versa. Also, the built-in DSQs (SCX_DSQ_LOCAL and
* SCX_DSQ_GLOBAL) cannot be used as priority queues.
+ *
+ * Returns %true on successful insertion, %false on failure. On the root
+ * scheduler, %false return triggers scheduler abort and the caller doesn't need
+ * to check the return value.
*/
-__bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
- u64 slice, u64 vtime, u64 enq_flags)
+__bpf_kfunc bool
+__scx_bpf_dsq_insert_vtime(struct task_struct *p,
+ struct scx_bpf_dsq_insert_vtime_args *args)
{
struct scx_sched *sch;
guard(rcu)();
+
sch = rcu_dereference(scx_root);
if (unlikely(!sch))
- return;
+ return false;
- if (!scx_dsq_insert_preamble(sch, p, enq_flags))
- return;
+ return scx_dsq_insert_vtime(sch, p, args->dsq_id, args->slice,
+ args->vtime, args->enq_flags);
+}
- if (slice)
- p->scx.slice = slice;
- else
- p->scx.slice = p->scx.slice ?: 1;
+/*
+ * COMPAT: Will be removed in v6.23.
+ */
+__bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
+ u64 slice, u64 vtime, u64 enq_flags)
+{
+ struct scx_sched *sch;
- p->scx.dsq_vtime = vtime;
+ guard(rcu)();
- scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return;
+
+ scx_dsq_insert_vtime(sch, p, dsq_id, slice, vtime, enq_flags);
}
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch)
BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_insert___v2, KF_RCU)
+BTF_ID_FLAGS(func, __scx_bpf_dsq_insert_vtime, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch)
@@ -5455,6 +5881,13 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
return false;
/*
+ * If the BPF scheduler keeps calling this function repeatedly, it can
+ * cause similar live-lock conditions as consume_dispatch_q().
+ */
+ if (unlikely(READ_ONCE(scx_aborting)))
+ return false;
+
+ /*
* Can be called from either ops.dispatch() locking this_rq() or any
* context where no rq lock is held. If latter, lock @p's task_rq which
* we'll likely need anyway.
@@ -5474,13 +5907,6 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
raw_spin_rq_lock(src_rq);
}
- /*
- * If the BPF scheduler keeps calling this function repeatedly, it can
- * cause similar live-lock conditions as consume_dispatch_q(). Insert a
- * breather if necessary.
- */
- scx_breather(src_rq);
-
locked_rq = src_rq;
raw_spin_lock(&src_dsq->lock);
@@ -5685,8 +6111,9 @@ __bpf_kfunc void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter,
* Can be called from ops.dispatch() or any BPF context which doesn't hold a rq
* lock (e.g. BPF timers or SYSCALL programs).
*
- * Returns %true if @p has been consumed, %false if @p had already been consumed
- * or dequeued.
+ * Returns %true if @p has been consumed, %false if @p had already been
+ * consumed, dequeued, or, for sub-scheds, @dsq_id points to a disallowed local
+ * DSQ.
*/
__bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter,
struct task_struct *p, u64 dsq_id,
@@ -5738,32 +6165,12 @@ static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
.set = &scx_kfunc_ids_dispatch,
};
-__bpf_kfunc_start_defs();
-
-/**
- * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
- *
- * Iterate over all of the tasks currently enqueued on the local DSQ of the
- * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of
- * processed tasks. Can only be called from ops.cpu_release().
- */
-__bpf_kfunc u32 scx_bpf_reenqueue_local(void)
+static u32 reenq_local(struct rq *rq)
{
- struct scx_sched *sch;
LIST_HEAD(tasks);
u32 nr_enqueued = 0;
- struct rq *rq;
struct task_struct *p, *n;
- guard(rcu)();
- sch = rcu_dereference(scx_root);
- if (unlikely(!sch))
- return 0;
-
- if (!scx_kf_allowed(sch, SCX_KF_CPU_RELEASE))
- return 0;
-
- rq = cpu_rq(smp_processor_id());
lockdep_assert_rq_held(rq);
/*
@@ -5800,6 +6207,37 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void)
return nr_enqueued;
}
+__bpf_kfunc_start_defs();
+
+/**
+ * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
+ *
+ * Iterate over all of the tasks currently enqueued on the local DSQ of the
+ * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of
+ * processed tasks. Can only be called from ops.cpu_release().
+ *
+ * COMPAT: Will be removed in v6.23 along with the ___v2 suffix on the void
+ * returning variant that can be called from anywhere.
+ */
+__bpf_kfunc u32 scx_bpf_reenqueue_local(void)
+{
+ struct scx_sched *sch;
+ struct rq *rq;
+
+ guard(rcu)();
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return 0;
+
+ if (!scx_kf_allowed(sch, SCX_KF_CPU_RELEASE))
+ return 0;
+
+ rq = cpu_rq(smp_processor_id());
+ lockdep_assert_rq_held(rq);
+
+ return reenq_local(rq);
+}
+
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_cpu_release)
@@ -5872,6 +6310,34 @@ static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = {
__bpf_kfunc_start_defs();
+/**
+ * scx_bpf_task_set_slice - Set task's time slice
+ * @p: task of interest
+ * @slice: time slice to set in nsecs
+ *
+ * Set @p's time slice to @slice. Returns %true on success, %false if the
+ * calling scheduler doesn't have authority over @p.
+ */
+__bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice)
+{
+ p->scx.slice = slice;
+ return true;
+}
+
+/**
+ * scx_bpf_task_set_dsq_vtime - Set task's virtual time for DSQ ordering
+ * @p: task of interest
+ * @vtime: virtual time to set
+ *
+ * Set @p's virtual time to @vtime. Returns %true on success, %false if the
+ * calling scheduler doesn't have authority over @p.
+ */
+__bpf_kfunc bool scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime)
+{
+ p->scx.dsq_vtime = vtime;
+ return true;
+}
+
static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags)
{
struct rq *this_rq;
@@ -6029,6 +6495,8 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
sizeof(struct bpf_iter_scx_dsq));
BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) !=
__alignof__(struct bpf_iter_scx_dsq));
+ BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS &
+ ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1));
/*
* next() and destroy() will be called regardless of the return value.
@@ -6047,9 +6515,8 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
if (!kit->dsq)
return -ENOENT;
- INIT_LIST_HEAD(&kit->cursor.node);
- kit->cursor.flags = SCX_DSQ_LNODE_ITER_CURSOR | flags;
- kit->cursor.priv = READ_ONCE(kit->dsq->seq);
+ kit->cursor = INIT_DSQ_LIST_CURSOR(kit->cursor, flags,
+ READ_ONCE(kit->dsq->seq));
return 0;
}
@@ -6123,6 +6590,40 @@ __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it)
kit->dsq = NULL;
}
+/**
+ * scx_bpf_dsq_peek - Lockless peek at the first element.
+ * @dsq_id: DSQ to examine.
+ *
+ * Read the first element in the DSQ. This is semantically equivalent to using
+ * the DSQ iterator, but is lockfree. Of course, like any lockless operation,
+ * this provides only a point-in-time snapshot, and the contents may change
+ * by the time any subsequent locking operation reads the queue.
+ *
+ * Returns the pointer, or NULL indicates an empty queue OR internal error.
+ */
+__bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id)
+{
+ struct scx_sched *sch;
+ struct scx_dispatch_q *dsq;
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return NULL;
+
+ if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN)) {
+ scx_error(sch, "peek disallowed on builtin DSQ 0x%llx", dsq_id);
+ return NULL;
+ }
+
+ dsq = find_user_dsq(sch, dsq_id);
+ if (unlikely(!dsq)) {
+ scx_error(sch, "peek on non-existent DSQ 0x%llx", dsq_id);
+ return NULL;
+ }
+
+ return rcu_dereference(dsq->first_task);
+}
+
__bpf_kfunc_end_defs();
static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf,
@@ -6277,6 +6778,24 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
}
/**
+ * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
+ *
+ * Iterate over all of the tasks currently enqueued on the local DSQ of the
+ * caller's CPU, and re-enqueue them in the BPF scheduler. Can be called from
+ * anywhere.
+ */
+__bpf_kfunc void scx_bpf_reenqueue_local___v2(void)
+{
+ struct rq *rq;
+
+ guard(preempt)();
+
+ rq = this_rq();
+ local_set(&rq->scx.reenq_local_deferred, 1);
+ schedule_deferred(rq);
+}
+
+/**
* scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU
* @cpu: CPU of interest
*
@@ -6677,15 +7196,19 @@ __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events,
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_any)
+BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_RCU);
+BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_RCU);
BTF_ID_FLAGS(func, scx_bpf_kick_cpu)
BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued)
BTF_ID_FLAGS(func, scx_bpf_destroy_dsq)
+BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_RCU_PROTECTED | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED)
BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY)
BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2)
BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap)
BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur)
BTF_ID_FLAGS(func, scx_bpf_cpuperf_set)
@@ -6776,6 +7299,12 @@ static int __init scx_init(void)
return ret;
}
+ if (!alloc_cpumask_var(&scx_bypass_lb_donee_cpumask, GFP_KERNEL) ||
+ !alloc_cpumask_var(&scx_bypass_lb_resched_cpumask, GFP_KERNEL)) {
+ pr_err("sched_ext: Failed to allocate cpumasks\n");
+ return -ENOMEM;
+ }
+
return 0;
}
__initcall(scx_init);
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index d2434c954848..3d9d404d5cd2 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -995,26 +995,56 @@ __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
return prev_cpu;
}
+struct scx_bpf_select_cpu_and_args {
+ /* @p and @cpus_allowed can't be packed together as KF_RCU is not transitive */
+ s32 prev_cpu;
+ u64 wake_flags;
+ u64 flags;
+};
+
/**
- * scx_bpf_select_cpu_and - Pick an idle CPU usable by task @p,
- * prioritizing those in @cpus_allowed
+ * __scx_bpf_select_cpu_and - Arg-wrapped CPU selection with cpumask
* @p: task_struct to select a CPU for
- * @prev_cpu: CPU @p was on previously
- * @wake_flags: %SCX_WAKE_* flags
* @cpus_allowed: cpumask of allowed CPUs
- * @flags: %SCX_PICK_IDLE* flags
+ * @args: struct containing the rest of the arguments
+ * @args->prev_cpu: CPU @p was on previously
+ * @args->wake_flags: %SCX_WAKE_* flags
+ * @args->flags: %SCX_PICK_IDLE* flags
+ *
+ * Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument
+ * limit. BPF programs should use scx_bpf_select_cpu_and() which is provided
+ * as an inline wrapper in common.bpf.h.
*
* Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked
* context such as a BPF test_run() call, as long as built-in CPU selection
* is enabled: ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE
* is set.
*
- * @p, @prev_cpu and @wake_flags match ops.select_cpu().
+ * @p, @args->prev_cpu and @args->wake_flags match ops.select_cpu().
*
* Returns the selected idle CPU, which will be automatically awakened upon
* returning from ops.select_cpu() and can be used for direct dispatch, or
* a negative value if no idle CPU is available.
*/
+__bpf_kfunc s32
+__scx_bpf_select_cpu_and(struct task_struct *p, const struct cpumask *cpus_allowed,
+ struct scx_bpf_select_cpu_and_args *args)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
+ return select_cpu_from_kfunc(sch, p, args->prev_cpu, args->wake_flags,
+ cpus_allowed, args->flags);
+}
+
+/*
+ * COMPAT: Will be removed in v6.22.
+ */
__bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
const struct cpumask *cpus_allowed, u64 flags)
{
@@ -1383,6 +1413,7 @@ BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU)
+BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_idle)
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index b3617abed510..386c677e4c9a 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -23,6 +23,11 @@ enum scx_consts {
* scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
*/
SCX_TASK_ITER_BATCH = 32,
+
+ SCX_BYPASS_LB_DFL_INTV_US = 500 * USEC_PER_MSEC,
+ SCX_BYPASS_LB_DONOR_PCT = 125,
+ SCX_BYPASS_LB_MIN_DELTA_DIV = 4,
+ SCX_BYPASS_LB_BATCH = 256,
};
enum scx_exit_kind {
@@ -697,12 +702,23 @@ struct sched_ext_ops {
* 2_500_000. @cgrp is entitled to 2.5 CPUs. @burst_us can be
* interpreted in the same fashion and specifies how much @cgrp can
* burst temporarily. The specific control mechanism and thus the
- * interpretation of @period_us and burstiness is upto to the BPF
+ * interpretation of @period_us and burstiness is up to the BPF
* scheduler.
*/
void (*cgroup_set_bandwidth)(struct cgroup *cgrp,
u64 period_us, u64 quota_us, u64 burst_us);
+ /**
+ * @cgroup_set_idle: A cgroup's idle state is being changed
+ * @cgrp: cgroup whose idle state is being updated
+ * @idle: whether the cgroup is entering or exiting idle state
+ *
+ * Update @cgrp's idle state to @idle. This callback is invoked when
+ * a cgroup transitions between idle and non-idle states, allowing the
+ * BPF scheduler to adjust its behavior accordingly.
+ */
+ void (*cgroup_set_idle)(struct cgroup *cgrp, bool idle);
+
#endif /* CONFIG_EXT_GROUP_SCHED */
/*
@@ -884,6 +900,10 @@ struct scx_sched {
struct scx_dispatch_q **global_dsqs;
struct scx_sched_pcpu __percpu *pcpu;
+ /*
+ * Updates to the following warned bitfields can race causing RMW issues
+ * but it doesn't really matter.
+ */
bool warned_zero_slice:1;
bool warned_deprecated_rq:1;
@@ -948,6 +968,7 @@ enum scx_enq_flags {
SCX_ENQ_CLEAR_OPSS = 1LLU << 56,
SCX_ENQ_DSQ_PRIQ = 1LLU << 57,
+ SCX_ENQ_NESTED = 1LLU << 58,
};
enum scx_deq_flags {
@@ -986,8 +1007,10 @@ enum scx_kick_flags {
SCX_KICK_PREEMPT = 1LLU << 1,
/*
- * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will
- * return after the target CPU finishes picking the next task.
+ * The scx_bpf_kick_cpu() call will return after the current SCX task of
+ * the target CPU switches out. This can be used to implement e.g. core
+ * scheduling. This has no effect if the current task on the target CPU
+ * is not on SCX.
*/
SCX_KICK_WAIT = 1LLU << 2,
};
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 769d7b7990df..da46c3164537 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4034,6 +4034,9 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
if (child_cfs_rq_on_list(cfs_rq))
return false;
+ if (cfs_rq->tg_load_avg_contrib)
+ return false;
+
return true;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8590113e4a60..d30cca6870f5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -803,10 +803,12 @@ struct scx_rq {
cpumask_var_t cpus_to_kick_if_idle;
cpumask_var_t cpus_to_preempt;
cpumask_var_t cpus_to_wait;
- unsigned long pnt_seq;
+ unsigned long kick_sync;
+ local_t reenq_local_deferred;
struct balance_callback deferred_bal_cb;
struct irq_work deferred_irq_work;
struct irq_work kick_cpus_irq_work;
+ struct scx_dispatch_q bypass_dsq;
};
#endif /* CONFIG_SCHED_CLASS_EXT */
@@ -1165,7 +1167,7 @@ struct rq {
* one CPU and if it got migrated afterwards it may decrease
* it on another CPU. Always updated under the runqueue lock:
*/
- unsigned long nr_uninterruptible;
+ unsigned long nr_uninterruptible;
#ifdef CONFIG_SCHED_PROXY_EXEC
struct task_struct __rcu *donor; /* Scheduling context */
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index cbf7206b3f9d..c903f1a42891 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -180,8 +180,13 @@ static inline void psi_dequeue(struct task_struct *p, int flags)
* avoid walking all ancestors twice, psi_task_switch() handles
* TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
* Do nothing here.
+ *
+ * In the SCHED_PROXY_EXECUTION case we may do sleeping
+ * dequeues that are not followed by a task switch, so check
+ * TSK_ONCPU is set to ensure the task switch is imminent.
+ * Otherwise clear the flags as usual.
*/
- if (flags & DEQUEUE_SLEEP)
+ if ((flags & DEQUEUE_SLEEP) && (p->psi_flags & TSK_ONCPU))
return;
/*
diff --git a/kernel/scs.c b/kernel/scs.c
index d7809affe740..772488afd5b9 100644
--- a/kernel/scs.c
+++ b/kernel/scs.c
@@ -135,7 +135,7 @@ static void scs_check_usage(struct task_struct *tsk)
if (!IS_ENABLED(CONFIG_DEBUG_STACK_USAGE))
return;
- for (p = task_scs(tsk); p < __scs_magic(tsk); ++p) {
+ for (p = task_scs(tsk); p < __scs_magic(task_scs(tsk)); ++p) {
if (!READ_ONCE_NOCHECK(*p))
break;
used += sizeof(*p);
diff --git a/kernel/smp.c b/kernel/smp.c
index 02f52291fae4..f349960f79ca 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -1088,6 +1088,28 @@ void wake_up_all_idle_cpus(void)
EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus);
/**
+ * cpus_peek_for_pending_ipi - Check for pending IPI for CPUs
+ * @mask: The CPU mask for the CPUs to check.
+ *
+ * This function walks through the @mask to check if there are any pending IPIs
+ * scheduled, for any of the CPUs in the @mask. It does not guarantee
+ * correctness as it only provides a racy snapshot.
+ *
+ * Returns true if there is a pending IPI scheduled and false otherwise.
+ */
+bool cpus_peek_for_pending_ipi(const struct cpumask *mask)
+{
+ unsigned int cpu;
+
+ for_each_cpu(cpu, mask) {
+ if (!llist_empty(per_cpu_ptr(&call_single_queue, cpu)))
+ return true;
+ }
+
+ return false;
+}
+
+/**
* struct smp_call_on_cpu_struct - Call a function on a specific CPU
* @work: &work_struct
* @done: &completion to signal
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index cb6196e3fa99..2cd767b9680e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -13,7 +13,6 @@
#include <linux/highuid.h>
#include <linux/writeback.h>
#include <linux/initrd.h>
-#include <linux/times.h>
#include <linux/limits.h>
#include <linux/syscalls.h>
#include <linux/capability.h>
@@ -55,7 +54,8 @@ static const int cap_last_cap = CAP_LAST_CAP;
* to the buffer.
*
* These write modes control how current file position affects the behavior of
- * updating sysctl values through the proc interface on each write.
+ * updating internal kernel (SYSCTL_USER_TO_KERN) sysctl values through the proc
+ * interface on each write.
*/
enum sysctl_writes_mode {
SYSCTL_WRITES_LEGACY = -1,
@@ -73,7 +73,7 @@ static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT;
#ifdef CONFIG_PROC_SYSCTL
-static int _proc_do_string(char *data, int maxlen, int write,
+static int _proc_do_string(char *data, int maxlen, int dir,
char *buffer, size_t *lenp, loff_t *ppos)
{
size_t len;
@@ -84,7 +84,7 @@ static int _proc_do_string(char *data, int maxlen, int write,
return 0;
}
- if (write) {
+ if (SYSCTL_USER_TO_KERN(dir)) {
if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) {
/* Only continue writes not past the end of buffer. */
len = strlen(data);
@@ -172,7 +172,7 @@ static bool proc_first_pos_non_zero_ignore(loff_t *ppos,
/**
* proc_dostring - read a string sysctl
* @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
+ * @dir: %TRUE if this is a write to the sysctl file
* @buffer: the user buffer
* @lenp: the size of the user buffer
* @ppos: file position
@@ -186,13 +186,13 @@ static bool proc_first_pos_non_zero_ignore(loff_t *ppos,
*
* Returns 0 on success.
*/
-int proc_dostring(const struct ctl_table *table, int write,
+int proc_dostring(const struct ctl_table *table, int dir,
void *buffer, size_t *lenp, loff_t *ppos)
{
- if (write)
+ if (SYSCTL_USER_TO_KERN(dir))
proc_first_pos_non_zero_ignore(ppos, table);
- return _proc_do_string(table->data, table->maxlen, write, buffer, lenp,
+ return _proc_do_string(table->data, table->maxlen, dir, buffer, lenp,
ppos);
}
@@ -354,74 +354,55 @@ static void proc_put_char(void **buf, size_t *size, char c)
}
}
-static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
- int *valp,
- int write, void *data)
-{
- if (write) {
- if (*negp) {
- if (*lvalp > (unsigned long) INT_MAX + 1)
- return -EINVAL;
- WRITE_ONCE(*valp, -*lvalp);
- } else {
- if (*lvalp > (unsigned long) INT_MAX)
- return -EINVAL;
- WRITE_ONCE(*valp, *lvalp);
- }
- } else {
- int val = READ_ONCE(*valp);
- if (val < 0) {
- *negp = true;
- *lvalp = -(unsigned long)val;
- } else {
- *negp = false;
- *lvalp = (unsigned long)val;
- }
- }
- return 0;
-}
+static SYSCTL_USER_TO_KERN_INT_CONV(, SYSCTL_CONV_IDENTITY)
+static SYSCTL_KERN_TO_USER_INT_CONV(, SYSCTL_CONV_IDENTITY)
+
+static SYSCTL_INT_CONV_CUSTOM(, sysctl_user_to_kern_int_conv,
+ sysctl_kern_to_user_int_conv, false)
+static SYSCTL_INT_CONV_CUSTOM(_minmax, sysctl_user_to_kern_int_conv,
+ sysctl_kern_to_user_int_conv, true)
+
-static int do_proc_douintvec_conv(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data)
+static SYSCTL_USER_TO_KERN_UINT_CONV(, SYSCTL_CONV_IDENTITY)
+
+int sysctl_kern_to_user_uint_conv(unsigned long *u_ptr,
+ const unsigned int *k_ptr)
{
- if (write) {
- if (*lvalp > UINT_MAX)
- return -EINVAL;
- WRITE_ONCE(*valp, *lvalp);
- } else {
- unsigned int val = READ_ONCE(*valp);
- *lvalp = (unsigned long)val;
- }
+ unsigned int val = READ_ONCE(*k_ptr);
+ *u_ptr = (unsigned long)val;
return 0;
}
+static SYSCTL_UINT_CONV_CUSTOM(, sysctl_user_to_kern_uint_conv,
+ sysctl_kern_to_user_uint_conv, false)
+static SYSCTL_UINT_CONV_CUSTOM(_minmax, sysctl_user_to_kern_uint_conv,
+ sysctl_kern_to_user_uint_conv, true)
+
static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
-static int __do_proc_dointvec(void *tbl_data, const struct ctl_table *table,
- int write, void *buffer,
- size_t *lenp, loff_t *ppos,
- int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
- int write, void *data),
- void *data)
+static int do_proc_dointvec(const struct ctl_table *table, int dir,
+ void *buffer, size_t *lenp, loff_t *ppos,
+ int (*conv)(bool *negp, unsigned long *u_ptr, int *k_ptr,
+ int dir, const struct ctl_table *table))
{
int *i, vleft, first = 1, err = 0;
size_t left;
char *p;
- if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
+ if (!table->data || !table->maxlen || !*lenp ||
+ (*ppos && SYSCTL_KERN_TO_USER(dir))) {
*lenp = 0;
return 0;
}
- i = (int *) tbl_data;
+ i = (int *) table->data;
vleft = table->maxlen / sizeof(*i);
left = *lenp;
if (!conv)
- conv = do_proc_dointvec_conv;
+ conv = do_proc_int_conv;
- if (write) {
+ if (SYSCTL_USER_TO_KERN(dir)) {
if (proc_first_pos_non_zero_ignore(ppos, table))
goto out;
@@ -434,7 +415,7 @@ static int __do_proc_dointvec(void *tbl_data, const struct ctl_table *table,
unsigned long lval;
bool neg;
- if (write) {
+ if (SYSCTL_USER_TO_KERN(dir)) {
proc_skip_spaces(&p, &left);
if (!left)
@@ -444,12 +425,12 @@ static int __do_proc_dointvec(void *tbl_data, const struct ctl_table *table,
sizeof(proc_wspace_sep), NULL);
if (err)
break;
- if (conv(&neg, &lval, i, 1, data)) {
+ if (conv(&neg, &lval, i, 1, table)) {
err = -EINVAL;
break;
}
} else {
- if (conv(&neg, &lval, i, 0, data)) {
+ if (conv(&neg, &lval, i, 0, table)) {
err = -EINVAL;
break;
}
@@ -459,11 +440,11 @@ static int __do_proc_dointvec(void *tbl_data, const struct ctl_table *table,
}
}
- if (!write && !first && left && !err)
+ if (SYSCTL_KERN_TO_USER(dir) && !first && left && !err)
proc_put_char(&buffer, &left, '\n');
- if (write && !err && left)
+ if (SYSCTL_USER_TO_KERN(dir) && !err && left)
proc_skip_spaces(&p, &left);
- if (write && first)
+ if (SYSCTL_USER_TO_KERN(dir) && first)
return err ? : -EINVAL;
*lenp -= left;
out:
@@ -471,24 +452,11 @@ out:
return err;
}
-static int do_proc_dointvec(const struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos,
- int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
- int write, void *data),
- void *data)
-{
- return __do_proc_dointvec(table->data, table, write,
- buffer, lenp, ppos, conv, data);
-}
-
-static int do_proc_douintvec_w(unsigned int *tbl_data,
- const struct ctl_table *table,
- void *buffer,
+static int do_proc_douintvec_w(const struct ctl_table *table, void *buffer,
size_t *lenp, loff_t *ppos,
- int (*conv)(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data),
- void *data)
+ int (*conv)(unsigned long *u_ptr,
+ unsigned int *k_ptr, int dir,
+ const struct ctl_table *table))
{
unsigned long lval;
int err = 0;
@@ -518,7 +486,7 @@ static int do_proc_douintvec_w(unsigned int *tbl_data,
goto out_free;
}
- if (conv(&lval, tbl_data, 1, data)) {
+ if (conv(&lval, (unsigned int *) table->data, 1, table)) {
err = -EINVAL;
goto out_free;
}
@@ -532,18 +500,16 @@ out_free:
return 0;
- /* This is in keeping with old __do_proc_dointvec() */
bail_early:
*ppos += *lenp;
return err;
}
-static int do_proc_douintvec_r(unsigned int *tbl_data, void *buffer,
+static int do_proc_douintvec_r(const struct ctl_table *table, void *buffer,
size_t *lenp, loff_t *ppos,
- int (*conv)(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data),
- void *data)
+ int (*conv)(unsigned long *u_ptr,
+ unsigned int *k_ptr, int dir,
+ const struct ctl_table *table))
{
unsigned long lval;
int err = 0;
@@ -551,7 +517,7 @@ static int do_proc_douintvec_r(unsigned int *tbl_data, void *buffer,
left = *lenp;
- if (conv(&lval, tbl_data, 0, data)) {
+ if (conv(&lval, (unsigned int *) table->data, 0, table)) {
err = -EINVAL;
goto out;
}
@@ -569,23 +535,21 @@ out:
return err;
}
-static int __do_proc_douintvec(void *tbl_data, const struct ctl_table *table,
- int write, void *buffer,
- size_t *lenp, loff_t *ppos,
- int (*conv)(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data),
- void *data)
+static int do_proc_douintvec(const struct ctl_table *table, int dir,
+ void *buffer, size_t *lenp, loff_t *ppos,
+ int (*conv)(unsigned long *u_ptr,
+ unsigned int *k_ptr, int dir,
+ const struct ctl_table *table))
{
- unsigned int *i, vleft;
+ unsigned int vleft;
- if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
+ if (!table->data || !table->maxlen || !*lenp ||
+ (*ppos && SYSCTL_KERN_TO_USER(dir))) {
*lenp = 0;
return 0;
}
- i = (unsigned int *) tbl_data;
- vleft = table->maxlen / sizeof(*i);
+ vleft = table->maxlen / sizeof(unsigned int);
/*
* Arrays are not supported, keep this simple. *Do not* add
@@ -597,29 +561,26 @@ static int __do_proc_douintvec(void *tbl_data, const struct ctl_table *table,
}
if (!conv)
- conv = do_proc_douintvec_conv;
+ conv = do_proc_uint_conv;
- if (write)
- return do_proc_douintvec_w(i, table, buffer, lenp, ppos,
- conv, data);
- return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data);
+ if (SYSCTL_USER_TO_KERN(dir))
+ return do_proc_douintvec_w(table, buffer, lenp, ppos, conv);
+ return do_proc_douintvec_r(table, buffer, lenp, ppos, conv);
}
-int do_proc_douintvec(const struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos,
- int (*conv)(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data),
- void *data)
+int proc_douintvec_conv(const struct ctl_table *table, int dir, void *buffer,
+ size_t *lenp, loff_t *ppos,
+ int (*conv)(unsigned long *u_ptr, unsigned int *k_ptr,
+ int dir, const struct ctl_table *table))
{
- return __do_proc_douintvec(table->data, table, write,
- buffer, lenp, ppos, conv, data);
+ return do_proc_douintvec(table, dir, buffer, lenp, ppos, conv);
}
+
/**
* proc_dobool - read/write a bool
* @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
+ * @dir: %TRUE if this is a write to the sysctl file
* @buffer: the user buffer
* @lenp: the size of the user buffer
* @ppos: file position
@@ -632,7 +593,7 @@ int do_proc_douintvec(const struct ctl_table *table, int write,
*
* Returns 0 on success.
*/
-int proc_dobool(const struct ctl_table *table, int write, void *buffer,
+int proc_dobool(const struct ctl_table *table, int dir, void *buffer,
size_t *lenp, loff_t *ppos)
{
struct ctl_table tmp;
@@ -648,10 +609,10 @@ int proc_dobool(const struct ctl_table *table, int write, void *buffer,
tmp.data = &val;
val = READ_ONCE(*data);
- res = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+ res = proc_dointvec(&tmp, dir, buffer, lenp, ppos);
if (res)
return res;
- if (write)
+ if (SYSCTL_USER_TO_KERN(dir))
WRITE_ONCE(*data, val);
return 0;
}
@@ -659,7 +620,7 @@ int proc_dobool(const struct ctl_table *table, int write, void *buffer,
/**
* proc_dointvec - read a vector of integers
* @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
+ * @dir: %TRUE if this is a write to the sysctl file
* @buffer: the user buffer
* @lenp: the size of the user buffer
* @ppos: file position
@@ -669,16 +630,16 @@ int proc_dobool(const struct ctl_table *table, int write, void *buffer,
*
* Returns 0 on success.
*/
-int proc_dointvec(const struct ctl_table *table, int write, void *buffer,
+int proc_dointvec(const struct ctl_table *table, int dir, void *buffer,
size_t *lenp, loff_t *ppos)
{
- return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL);
+ return do_proc_dointvec(table, dir, buffer, lenp, ppos, NULL);
}
/**
* proc_douintvec - read a vector of unsigned integers
* @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
+ * @dir: %TRUE if this is a write to the sysctl file
* @buffer: the user buffer
* @lenp: the size of the user buffer
* @ppos: file position
@@ -688,57 +649,17 @@ int proc_dointvec(const struct ctl_table *table, int write, void *buffer,
*
* Returns 0 on success.
*/
-int proc_douintvec(const struct ctl_table *table, int write, void *buffer,
+int proc_douintvec(const struct ctl_table *table, int dir, void *buffer,
size_t *lenp, loff_t *ppos)
{
- return do_proc_douintvec(table, write, buffer, lenp, ppos,
- do_proc_douintvec_conv, NULL);
-}
-
-/**
- * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure
- * @min: pointer to minimum allowable value
- * @max: pointer to maximum allowable value
- *
- * The do_proc_dointvec_minmax_conv_param structure provides the
- * minimum and maximum values for doing range checking for those sysctl
- * parameters that use the proc_dointvec_minmax() handler.
- */
-struct do_proc_dointvec_minmax_conv_param {
- int *min;
- int *max;
-};
-
-static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
- int *valp,
- int write, void *data)
-{
- int tmp, ret;
- struct do_proc_dointvec_minmax_conv_param *param = data;
- /*
- * If writing, first do so via a temporary local int so we can
- * bounds-check it before touching *valp.
- */
- int *ip = write ? &tmp : valp;
-
- ret = do_proc_dointvec_conv(negp, lvalp, ip, write, data);
- if (ret)
- return ret;
-
- if (write) {
- if ((param->min && *param->min > tmp) ||
- (param->max && *param->max < tmp))
- return -EINVAL;
- WRITE_ONCE(*valp, tmp);
- }
-
- return 0;
+ return do_proc_douintvec(table, dir, buffer, lenp, ppos,
+ do_proc_uint_conv);
}
/**
* proc_dointvec_minmax - read a vector of integers with min/max values
* @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
+ * @dir: %TRUE if this is a write to the sysctl file
* @buffer: the user buffer
* @lenp: the size of the user buffer
* @ppos: file position
@@ -749,62 +670,20 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
* This routine will ensure the values are within the range specified by
* table->extra1 (min) and table->extra2 (max).
*
- * Returns 0 on success or -EINVAL on write when the range check fails.
+ * Returns 0 on success or -EINVAL when the range check fails and
+ * SYSCTL_USER_TO_KERN(dir) == true
*/
-int proc_dointvec_minmax(const struct ctl_table *table, int write,
+int proc_dointvec_minmax(const struct ctl_table *table, int dir,
void *buffer, size_t *lenp, loff_t *ppos)
{
- struct do_proc_dointvec_minmax_conv_param param = {
- .min = (int *) table->extra1,
- .max = (int *) table->extra2,
- };
- return do_proc_dointvec(table, write, buffer, lenp, ppos,
- do_proc_dointvec_minmax_conv, &param);
-}
-
-/**
- * struct do_proc_douintvec_minmax_conv_param - proc_douintvec_minmax() range checking structure
- * @min: pointer to minimum allowable value
- * @max: pointer to maximum allowable value
- *
- * The do_proc_douintvec_minmax_conv_param structure provides the
- * minimum and maximum values for doing range checking for those sysctl
- * parameters that use the proc_douintvec_minmax() handler.
- */
-struct do_proc_douintvec_minmax_conv_param {
- unsigned int *min;
- unsigned int *max;
-};
-
-static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data)
-{
- int ret;
- unsigned int tmp;
- struct do_proc_douintvec_minmax_conv_param *param = data;
- /* write via temporary local uint for bounds-checking */
- unsigned int *up = write ? &tmp : valp;
-
- ret = do_proc_douintvec_conv(lvalp, up, write, data);
- if (ret)
- return ret;
-
- if (write) {
- if ((param->min && *param->min > tmp) ||
- (param->max && *param->max < tmp))
- return -ERANGE;
-
- WRITE_ONCE(*valp, tmp);
- }
-
- return 0;
+ return do_proc_dointvec(table, dir, buffer, lenp, ppos,
+ do_proc_int_conv_minmax);
}
/**
* proc_douintvec_minmax - read a vector of unsigned ints with min/max values
* @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
+ * @dir: %TRUE if this is a write to the sysctl file
* @buffer: the user buffer
* @lenp: the size of the user buffer
* @ppos: file position
@@ -818,23 +697,20 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,
* check for UINT_MAX to avoid having to support wrap around uses from
* userspace.
*
- * Returns 0 on success or -ERANGE on write when the range check fails.
+ * Returns 0 on success or -ERANGE when range check failes and
+ * SYSCTL_USER_TO_KERN(dir) == true
*/
-int proc_douintvec_minmax(const struct ctl_table *table, int write,
+int proc_douintvec_minmax(const struct ctl_table *table, int dir,
void *buffer, size_t *lenp, loff_t *ppos)
{
- struct do_proc_douintvec_minmax_conv_param param = {
- .min = (unsigned int *) table->extra1,
- .max = (unsigned int *) table->extra2,
- };
- return do_proc_douintvec(table, write, buffer, lenp, ppos,
- do_proc_douintvec_minmax_conv, &param);
+ return do_proc_douintvec(table, dir, buffer, lenp, ppos,
+ do_proc_uint_conv_minmax);
}
/**
* proc_dou8vec_minmax - read a vector of unsigned chars with min/max values
* @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
+ * @dir: %TRUE if this is a write to the sysctl file
* @buffer: the user buffer
* @lenp: the size of the user buffer
* @ppos: file position
@@ -846,66 +722,64 @@ int proc_douintvec_minmax(const struct ctl_table *table, int write,
* This routine will ensure the values are within the range specified by
* table->extra1 (min) and table->extra2 (max).
*
- * Returns 0 on success or an error on write when the range check fails.
+ * Returns 0 on success or an error on SYSCTL_USER_TO_KERN(dir) == true
+ * and the range check fails.
*/
-int proc_dou8vec_minmax(const struct ctl_table *table, int write,
+int proc_dou8vec_minmax(const struct ctl_table *table, int dir,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table tmp;
unsigned int min = 0, max = 255U, val;
u8 *data = table->data;
- struct do_proc_douintvec_minmax_conv_param param = {
- .min = &min,
- .max = &max,
- };
int res;
/* Do not support arrays yet. */
if (table->maxlen != sizeof(u8))
return -EINVAL;
- if (table->extra1)
- min = *(unsigned int *) table->extra1;
- if (table->extra2)
- max = *(unsigned int *) table->extra2;
-
tmp = *table;
tmp.maxlen = sizeof(val);
tmp.data = &val;
+ if (!tmp.extra1)
+ tmp.extra1 = (unsigned int *) &min;
+ if (!tmp.extra2)
+ tmp.extra2 = (unsigned int *) &max;
+
val = READ_ONCE(*data);
- res = do_proc_douintvec(&tmp, write, buffer, lenp, ppos,
- do_proc_douintvec_minmax_conv, &param);
+ res = do_proc_douintvec(&tmp, dir, buffer, lenp, ppos,
+ do_proc_uint_conv_minmax);
if (res)
return res;
- if (write)
+ if (SYSCTL_USER_TO_KERN(dir))
WRITE_ONCE(*data, val);
return 0;
}
EXPORT_SYMBOL_GPL(proc_dou8vec_minmax);
-static int __do_proc_doulongvec_minmax(void *data,
- const struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos,
- unsigned long convmul, unsigned long convdiv)
+static int do_proc_doulongvec_minmax(const struct ctl_table *table, int dir,
+ void *buffer, size_t *lenp, loff_t *ppos,
+ unsigned long convmul,
+ unsigned long convdiv)
{
unsigned long *i, *min, *max;
int vleft, first = 1, err = 0;
size_t left;
char *p;
- if (!data || !table->maxlen || !*lenp || (*ppos && !write)) {
+ if (!table->data || !table->maxlen || !*lenp ||
+ (*ppos && SYSCTL_KERN_TO_USER(dir))) {
*lenp = 0;
return 0;
}
- i = data;
+ i = table->data;
min = table->extra1;
max = table->extra2;
vleft = table->maxlen / sizeof(unsigned long);
left = *lenp;
- if (write) {
+ if (SYSCTL_USER_TO_KERN(dir)) {
if (proc_first_pos_non_zero_ignore(ppos, table))
goto out;
@@ -917,7 +791,7 @@ static int __do_proc_doulongvec_minmax(void *data,
for (; left && vleft--; i++, first = 0) {
unsigned long val;
- if (write) {
+ if (SYSCTL_USER_TO_KERN(dir)) {
bool neg;
proc_skip_spaces(&p, &left);
@@ -946,11 +820,11 @@ static int __do_proc_doulongvec_minmax(void *data,
}
}
- if (!write && !first && left && !err)
+ if (SYSCTL_KERN_TO_USER(dir) && !first && left && !err)
proc_put_char(&buffer, &left, '\n');
- if (write && !err)
+ if (SYSCTL_USER_TO_KERN(dir) && !err)
proc_skip_spaces(&p, &left);
- if (write && first)
+ if (SYSCTL_USER_TO_KERN(dir) && first)
return err ? : -EINVAL;
*lenp -= left;
out:
@@ -958,18 +832,18 @@ out:
return err;
}
-static int do_proc_doulongvec_minmax(const struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul,
- unsigned long convdiv)
+int proc_doulongvec_minmax_conv(const struct ctl_table *table, int dir,
+ void *buffer, size_t *lenp, loff_t *ppos,
+ unsigned long convmul, unsigned long convdiv)
{
- return __do_proc_doulongvec_minmax(table->data, table, write,
- buffer, lenp, ppos, convmul, convdiv);
+ return do_proc_doulongvec_minmax(table, dir, buffer, lenp, ppos,
+ convmul, convdiv);
}
/**
* proc_doulongvec_minmax - read a vector of long integers with min/max values
* @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
+ * @dir: %TRUE if this is a write to the sysctl file
* @buffer: the user buffer
* @lenp: the size of the user buffer
* @ppos: file position
@@ -982,216 +856,24 @@ static int do_proc_doulongvec_minmax(const struct ctl_table *table, int write,
*
* Returns 0 on success.
*/
-int proc_doulongvec_minmax(const struct ctl_table *table, int write,
+int proc_doulongvec_minmax(const struct ctl_table *table, int dir,
void *buffer, size_t *lenp, loff_t *ppos)
{
- return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l);
-}
-
-/**
- * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
- * values from/to the user buffer, treated as an ASCII string. The values
- * are treated as milliseconds, and converted to jiffies when they are stored.
- *
- * This routine will ensure the values are within the range specified by
- * table->extra1 (min) and table->extra2 (max).
- *
- * Returns 0 on success.
- */
-int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
-{
- return do_proc_doulongvec_minmax(table, write, buffer,
- lenp, ppos, HZ, 1000l);
+ return proc_doulongvec_minmax_conv(table, dir, buffer, lenp, ppos, 1l, 1l);
}
-
-static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
- int *valp,
- int write, void *data)
+int proc_dointvec_conv(const struct ctl_table *table, int dir, void *buffer,
+ size_t *lenp, loff_t *ppos,
+ int (*conv)(bool *negp, unsigned long *u_ptr, int *k_ptr,
+ int dir, const struct ctl_table *table))
{
- if (write) {
- if (*lvalp > INT_MAX / HZ)
- return 1;
- if (*negp)
- WRITE_ONCE(*valp, -*lvalp * HZ);
- else
- WRITE_ONCE(*valp, *lvalp * HZ);
- } else {
- int val = READ_ONCE(*valp);
- unsigned long lval;
- if (val < 0) {
- *negp = true;
- lval = -(unsigned long)val;
- } else {
- *negp = false;
- lval = (unsigned long)val;
- }
- *lvalp = lval / HZ;
- }
- return 0;
-}
-
-static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp,
- int *valp,
- int write, void *data)
-{
- if (write) {
- if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ)
- return 1;
- *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp);
- } else {
- int val = *valp;
- unsigned long lval;
- if (val < 0) {
- *negp = true;
- lval = -(unsigned long)val;
- } else {
- *negp = false;
- lval = (unsigned long)val;
- }
- *lvalp = jiffies_to_clock_t(lval);
- }
- return 0;
-}
-
-static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
- int *valp,
- int write, void *data)
-{
- if (write) {
- unsigned long jif = msecs_to_jiffies(*negp ? -*lvalp : *lvalp);
-
- if (jif > INT_MAX)
- return 1;
- WRITE_ONCE(*valp, (int)jif);
- } else {
- int val = READ_ONCE(*valp);
- unsigned long lval;
- if (val < 0) {
- *negp = true;
- lval = -(unsigned long)val;
- } else {
- *negp = false;
- lval = (unsigned long)val;
- }
- *lvalp = jiffies_to_msecs(lval);
- }
- return 0;
-}
-
-static int do_proc_dointvec_ms_jiffies_minmax_conv(bool *negp, unsigned long *lvalp,
- int *valp, int write, void *data)
-{
- int tmp, ret;
- struct do_proc_dointvec_minmax_conv_param *param = data;
- /*
- * If writing, first do so via a temporary local int so we can
- * bounds-check it before touching *valp.
- */
- int *ip = write ? &tmp : valp;
-
- ret = do_proc_dointvec_ms_jiffies_conv(negp, lvalp, ip, write, data);
- if (ret)
- return ret;
-
- if (write) {
- if ((param->min && *param->min > tmp) ||
- (param->max && *param->max < tmp))
- return -EINVAL;
- *valp = tmp;
- }
- return 0;
-}
-
-/**
- * proc_dointvec_jiffies - read a vector of integers as seconds
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
- * values from/to the user buffer, treated as an ASCII string.
- * The values read are assumed to be in seconds, and are converted into
- * jiffies.
- *
- * Returns 0 on success.
- */
-int proc_dointvec_jiffies(const struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
-{
- return do_proc_dointvec(table,write,buffer,lenp,ppos,
- do_proc_dointvec_jiffies_conv,NULL);
-}
-
-int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
-{
- struct do_proc_dointvec_minmax_conv_param param = {
- .min = (int *) table->extra1,
- .max = (int *) table->extra2,
- };
- return do_proc_dointvec(table, write, buffer, lenp, ppos,
- do_proc_dointvec_ms_jiffies_minmax_conv, &param);
-}
-
-/**
- * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: pointer to the file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
- * values from/to the user buffer, treated as an ASCII string.
- * The values read are assumed to be in 1/USER_HZ seconds, and
- * are converted into jiffies.
- *
- * Returns 0 on success.
- */
-int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
-{
- return do_proc_dointvec(table, write, buffer, lenp, ppos,
- do_proc_dointvec_userhz_jiffies_conv, NULL);
-}
-
-/**
- * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: the current position in the file
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
- * values from/to the user buffer, treated as an ASCII string.
- * The values read are assumed to be in 1/1000 seconds, and
- * are converted into jiffies.
- *
- * Returns 0 on success.
- */
-int proc_dointvec_ms_jiffies(const struct ctl_table *table, int write, void *buffer,
- size_t *lenp, loff_t *ppos)
-{
- return do_proc_dointvec(table, write, buffer, lenp, ppos,
- do_proc_dointvec_ms_jiffies_conv, NULL);
+ return do_proc_dointvec(table, dir, buffer, lenp, ppos, conv);
}
/**
* proc_do_large_bitmap - read/write from/to a large bitmap
* @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
+ * @dir: %TRUE if this is a write to the sysctl file
* @buffer: the user buffer
* @lenp: the size of the user buffer
* @ppos: file position
@@ -1205,7 +887,7 @@ int proc_dointvec_ms_jiffies(const struct ctl_table *table, int write, void *buf
*
* Returns 0 on success.
*/
-int proc_do_large_bitmap(const struct ctl_table *table, int write,
+int proc_do_large_bitmap(const struct ctl_table *table, int dir,
void *buffer, size_t *lenp, loff_t *ppos)
{
int err = 0;
@@ -1215,12 +897,12 @@ int proc_do_large_bitmap(const struct ctl_table *table, int write,
unsigned long *tmp_bitmap = NULL;
char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c;
- if (!bitmap || !bitmap_len || !left || (*ppos && !write)) {
+ if (!bitmap || !bitmap_len || !left || (*ppos && SYSCTL_KERN_TO_USER(dir))) {
*lenp = 0;
return 0;
}
- if (write) {
+ if (SYSCTL_USER_TO_KERN(dir)) {
char *p = buffer;
size_t skipped = 0;
@@ -1321,7 +1003,7 @@ int proc_do_large_bitmap(const struct ctl_table *table, int write,
}
if (!err) {
- if (write) {
+ if (SYSCTL_USER_TO_KERN(dir)) {
if (*ppos)
bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len);
else
@@ -1337,85 +1019,70 @@ int proc_do_large_bitmap(const struct ctl_table *table, int write,
#else /* CONFIG_PROC_SYSCTL */
-int proc_dostring(const struct ctl_table *table, int write,
+int proc_dostring(const struct ctl_table *table, int dir,
void *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
}
-int proc_dobool(const struct ctl_table *table, int write,
+int proc_dobool(const struct ctl_table *table, int dir,
void *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
}
-int proc_dointvec(const struct ctl_table *table, int write,
+int proc_dointvec(const struct ctl_table *table, int dir,
void *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
}
-int proc_douintvec(const struct ctl_table *table, int write,
+int proc_douintvec(const struct ctl_table *table, int dir,
void *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
}
-int proc_dointvec_minmax(const struct ctl_table *table, int write,
+int proc_dointvec_minmax(const struct ctl_table *table, int dir,
void *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
}
-int proc_douintvec_minmax(const struct ctl_table *table, int write,
+int proc_douintvec_minmax(const struct ctl_table *table, int dir,
void *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
}
-int proc_dou8vec_minmax(const struct ctl_table *table, int write,
+int proc_dou8vec_minmax(const struct ctl_table *table, int dir,
void *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
}
-int proc_dointvec_jiffies(const struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int write,
+int proc_doulongvec_minmax(const struct ctl_table *table, int dir,
void *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
}
-int proc_dointvec_ms_jiffies(const struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_doulongvec_minmax(const struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+int proc_doulongvec_minmax_conv(const struct ctl_table *table, int dir,
+ void *buffer, size_t *lenp, loff_t *ppos,
+ unsigned long convmul, unsigned long convdiv)
{
return -ENOSYS;
}
-int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+int proc_dointvec_conv(const struct ctl_table *table, int dir, void *buffer,
+ size_t *lenp, loff_t *ppos,
+ int (*conv)(bool *negp, unsigned long *u_ptr, int *k_ptr,
+ int dir, const struct ctl_table *table))
{
return -ENOSYS;
}
-int proc_do_large_bitmap(const struct ctl_table *table, int write,
+int proc_do_large_bitmap(const struct ctl_table *table, int dir,
void *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
@@ -1424,7 +1091,7 @@ int proc_do_large_bitmap(const struct ctl_table *table, int write,
#endif /* CONFIG_PROC_SYSCTL */
#if defined(CONFIG_SYSCTL)
-int proc_do_static_key(const struct ctl_table *table, int write,
+int proc_do_static_key(const struct ctl_table *table, int dir,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct static_key *key = (struct static_key *)table->data;
@@ -1438,13 +1105,13 @@ int proc_do_static_key(const struct ctl_table *table, int write,
.extra2 = SYSCTL_ONE,
};
- if (write && !capable(CAP_SYS_ADMIN))
+ if (SYSCTL_USER_TO_KERN(dir) && !capable(CAP_SYS_ADMIN))
return -EPERM;
mutex_lock(&static_key_mutex);
val = static_key_enabled(key);
- ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
- if (write && !ret) {
+ ret = proc_dointvec_minmax(&tmp, dir, buffer, lenp, ppos);
+ if (SYSCTL_USER_TO_KERN(dir) && !ret) {
if (val)
static_key_enable(key);
else
@@ -1514,12 +1181,8 @@ int __init sysctl_init_bases(void)
EXPORT_SYMBOL(proc_dobool);
EXPORT_SYMBOL(proc_dointvec);
EXPORT_SYMBOL(proc_douintvec);
-EXPORT_SYMBOL(proc_dointvec_jiffies);
EXPORT_SYMBOL(proc_dointvec_minmax);
EXPORT_SYMBOL_GPL(proc_douintvec_minmax);
-EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
-EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
EXPORT_SYMBOL(proc_dostring);
EXPORT_SYMBOL(proc_doulongvec_minmax);
-EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
EXPORT_SYMBOL(proc_do_large_bitmap);
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 34eeacac2253..d31a6d40d38d 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -99,3 +99,128 @@ void __init register_refined_jiffies(long cycles_per_second)
__clocksource_register(&refined_jiffies);
}
+
+#define SYSCTL_CONV_MULT_HZ(val) ((val) * HZ)
+#define SYSCTL_CONV_DIV_HZ(val) ((val) / HZ)
+
+static SYSCTL_USER_TO_KERN_INT_CONV(_hz, SYSCTL_CONV_MULT_HZ)
+static SYSCTL_KERN_TO_USER_INT_CONV(_hz, SYSCTL_CONV_DIV_HZ)
+static SYSCTL_USER_TO_KERN_INT_CONV(_userhz, clock_t_to_jiffies)
+static SYSCTL_KERN_TO_USER_INT_CONV(_userhz, jiffies_to_clock_t)
+static SYSCTL_USER_TO_KERN_INT_CONV(_ms, msecs_to_jiffies)
+static SYSCTL_KERN_TO_USER_INT_CONV(_ms, jiffies_to_msecs)
+
+static SYSCTL_INT_CONV_CUSTOM(_jiffies, sysctl_user_to_kern_int_conv_hz,
+ sysctl_kern_to_user_int_conv_hz, false)
+static SYSCTL_INT_CONV_CUSTOM(_userhz_jiffies,
+ sysctl_user_to_kern_int_conv_userhz,
+ sysctl_kern_to_user_int_conv_userhz, false)
+static SYSCTL_INT_CONV_CUSTOM(_ms_jiffies, sysctl_user_to_kern_int_conv_ms,
+ sysctl_kern_to_user_int_conv_ms, false)
+static SYSCTL_INT_CONV_CUSTOM(_ms_jiffies_minmax,
+ sysctl_user_to_kern_int_conv_ms,
+ sysctl_kern_to_user_int_conv_ms, true)
+
+/**
+ * proc_dointvec_jiffies - read a vector of integers as seconds
+ * @table: the sysctl table
+ * @dir: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in seconds, and are converted into
+ * jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_jiffies(const struct ctl_table *table, int dir,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return proc_dointvec_conv(table, dir, buffer, lenp, ppos,
+ do_proc_int_conv_jiffies);
+}
+EXPORT_SYMBOL(proc_dointvec_jiffies);
+
+/**
+ * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
+ * @table: the sysctl table
+ * @dir: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: pointer to the file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in 1/USER_HZ seconds, and
+ * are converted into jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int dir,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ if (SYSCTL_USER_TO_KERN(dir) && USER_HZ < HZ)
+ return -EINVAL;
+ return proc_dointvec_conv(table, dir, buffer, lenp, ppos,
+ do_proc_int_conv_userhz_jiffies);
+}
+EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
+
+/**
+ * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds
+ * @table: the sysctl table
+ * @dir: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: the current position in the file
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in 1/1000 seconds, and
+ * are converted into jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_ms_jiffies(const struct ctl_table *table, int dir, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ return proc_dointvec_conv(table, dir, buffer, lenp, ppos,
+ do_proc_int_conv_ms_jiffies);
+}
+EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
+
+int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int dir,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return proc_dointvec_conv(table, dir, buffer, lenp, ppos,
+ do_proc_int_conv_ms_jiffies_minmax);
+}
+
+/**
+ * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values
+ * @table: the sysctl table
+ * @dir: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
+ * values from/to the user buffer, treated as an ASCII string. The values
+ * are treated as milliseconds, and converted to jiffies when they are stored.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max).
+ *
+ * Returns 0 on success.
+ */
+int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int dir,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return proc_doulongvec_minmax_conv(table, dir, buffer, lenp, ppos,
+ HZ, 1000l);
+}
+EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
+
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index cc1afec306b3..f39111830ca3 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -296,6 +296,11 @@ int sched_clock_suspend(void)
return 0;
}
+static int sched_clock_syscore_suspend(void *data)
+{
+ return sched_clock_suspend();
+}
+
void sched_clock_resume(void)
{
struct clock_read_data *rd = &cd.read_data[0];
@@ -305,14 +310,23 @@ void sched_clock_resume(void)
rd->read_sched_clock = cd.actual_read_sched_clock;
}
-static struct syscore_ops sched_clock_ops = {
- .suspend = sched_clock_suspend,
- .resume = sched_clock_resume,
+static void sched_clock_syscore_resume(void *data)
+{
+ sched_clock_resume();
+}
+
+static const struct syscore_ops sched_clock_syscore_ops = {
+ .suspend = sched_clock_syscore_suspend,
+ .resume = sched_clock_syscore_resume,
+};
+
+static struct syscore sched_clock_syscore = {
+ .ops = &sched_clock_syscore_ops,
};
static int __init sched_clock_syscore_init(void)
{
- register_syscore_ops(&sched_clock_ops);
+ register_syscore(&sched_clock_syscore);
return 0;
}
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 4790da895203..3ec3daa4acab 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1994,6 +1994,11 @@ void timekeeping_resume(void)
timerfd_resume();
}
+static void timekeeping_syscore_resume(void *data)
+{
+ timekeeping_resume();
+}
+
int timekeeping_suspend(void)
{
struct timekeeper *tks = &tk_core.shadow_timekeeper;
@@ -2061,15 +2066,24 @@ int timekeeping_suspend(void)
return 0;
}
+static int timekeeping_syscore_suspend(void *data)
+{
+ return timekeeping_suspend();
+}
+
/* sysfs resume/suspend bits for timekeeping */
-static struct syscore_ops timekeeping_syscore_ops = {
- .resume = timekeeping_resume,
- .suspend = timekeeping_suspend,
+static const struct syscore_ops timekeeping_syscore_ops = {
+ .resume = timekeeping_syscore_resume,
+ .suspend = timekeeping_syscore_suspend,
+};
+
+static struct syscore timekeeping_syscore = {
+ .ops = &timekeeping_syscore_ops,
};
static int __init timekeeping_init_ops(void)
{
- register_syscore_ops(&timekeeping_syscore_ops);
+ register_syscore(&timekeeping_syscore);
return 0;
}
device_initcall(timekeeping_init_ops);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d2c79da81e4f..bfa2ec46e075 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -80,6 +80,12 @@ config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE
If the architecture generates __patchable_function_entries sections
but does not want them included in the ftrace locations.
+config HAVE_DYNAMIC_FTRACE_WITH_JMP
+ bool
+ help
+ If the architecture supports to replace the __fentry__ with a
+ "jmp" instruction.
+
config HAVE_SYSCALL_TRACEPOINTS
bool
help
@@ -330,6 +336,26 @@ config DYNAMIC_FTRACE_WITH_ARGS
depends on DYNAMIC_FTRACE
depends on HAVE_DYNAMIC_FTRACE_WITH_ARGS
+config DYNAMIC_FTRACE_WITH_JMP
+ def_bool y
+ depends on DYNAMIC_FTRACE
+ depends on DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+ depends on HAVE_DYNAMIC_FTRACE_WITH_JMP
+
+config FUNCTION_SELF_TRACING
+ bool "Function trace tracing code"
+ depends on FUNCTION_TRACER
+ help
+ Normally all the tracing code is set to notrace, where the function
+ tracer will ignore all the tracing functions. Sometimes it is useful
+ for debugging to trace some of the tracing infratructure itself.
+ Enable this to allow some of the tracing infrastructure to be traced
+ by the function tracer. Note, this will likely add noise to function
+ tracing if events and other tracing features are enabled along with
+ function tracing.
+
+ If unsure, say N.
+
config FPROBE
bool "Kernel Function Probe (fprobe)"
depends on HAVE_FUNCTION_GRAPH_FREGS && HAVE_FTRACE_GRAPH_FUNC
@@ -575,6 +601,20 @@ config FTRACE_SYSCALLS
help
Basic tracer to catch the syscall entry and exit events.
+config TRACE_SYSCALL_BUF_SIZE_DEFAULT
+ int "System call user read max size"
+ range 0 165
+ default 63
+ depends on FTRACE_SYSCALLS
+ help
+ Some system call trace events will record the data from a user
+ space address that one of the parameters point to. The amount of
+ data per event is limited. That limit is set by this config and
+ this config also affects how much user space data perf can read.
+
+ For a tracing instance, this size may be changed by writing into
+ its syscall_user_buf_size file.
+
config TRACER_SNAPSHOT
bool "Create a snapshot trace buffer"
select TRACER_MAX_TRACE
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index dcb4e02afc5f..fc5dcc888e13 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -16,6 +16,23 @@ obj-y += trace_selftest_dynamic.o
endif
endif
+# Allow some files to be function traced
+ifdef CONFIG_FUNCTION_SELF_TRACING
+CFLAGS_trace_output.o = $(CC_FLAGS_FTRACE)
+CFLAGS_trace_seq.o = $(CC_FLAGS_FTRACE)
+CFLAGS_trace_stat.o = $(CC_FLAGS_FTRACE)
+CFLAGS_tracing_map.o = $(CC_FLAGS_FTRACE)
+CFLAGS_synth_event_gen_test.o = $(CC_FLAGS_FTRACE)
+CFLAGS_trace_events.o = $(CC_FLAGS_FTRACE)
+CFLAGS_trace_syscalls.o = $(CC_FLAGS_FTRACE)
+CFLAGS_trace_events_filter.o = $(CC_FLAGS_FTRACE)
+CFLAGS_trace_events_trigger.o = $(CC_FLAGS_FTRACE)
+CFLAGS_trace_events_synth.o = $(CC_FLAGS_FTRACE)
+CFLAGS_trace_events_hist.o = $(CC_FLAGS_FTRACE)
+CFLAGS_trace_events_user.o = $(CC_FLAGS_FTRACE)
+CFLAGS_trace_dynevent.o = $(CC_FLAGS_FTRACE)
+endif
+
ifdef CONFIG_FTRACE_STARTUP_TEST
CFLAGS_trace_kprobe_selftest.o = $(CC_FLAGS_FTRACE)
obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe_selftest.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 6941145b5058..d031c8d80be4 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -63,13 +63,116 @@ static int blk_probes_ref;
static void blk_register_tracepoints(void);
static void blk_unregister_tracepoints(void);
+static void record_blktrace_event(struct blk_io_trace *t, pid_t pid, int cpu,
+ sector_t sector, int bytes, u64 what,
+ dev_t dev, int error, u64 cgid,
+ ssize_t cgid_len, void *pdu_data, int pdu_len)
+
+{
+ /*
+ * These two are not needed in ftrace as they are in the
+ * generic trace_entry, filled by tracing_generic_entry_update,
+ * but for the trace_event->bin() synthesizer benefit we do it
+ * here too.
+ */
+ t->cpu = cpu;
+ t->pid = pid;
+
+ t->sector = sector;
+ t->bytes = bytes;
+ t->action = lower_32_bits(what);
+ t->device = dev;
+ t->error = error;
+ t->pdu_len = pdu_len + cgid_len;
+
+ if (cgid_len)
+ memcpy((void *)t + sizeof(*t), &cgid, cgid_len);
+ if (pdu_len)
+ memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len);
+}
+
+static void record_blktrace_event2(struct blk_io_trace2 *t2, pid_t pid, int cpu,
+ sector_t sector, int bytes, u64 what,
+ dev_t dev, int error, u64 cgid,
+ ssize_t cgid_len, void *pdu_data,
+ int pdu_len)
+{
+ t2->pid = pid;
+ t2->cpu = cpu;
+
+ t2->sector = sector;
+ t2->bytes = bytes;
+ t2->action = what;
+ t2->device = dev;
+ t2->error = error;
+ t2->pdu_len = pdu_len + cgid_len;
+
+ if (cgid_len)
+ memcpy((void *)t2 + sizeof(*t2), &cgid, cgid_len);
+ if (pdu_len)
+ memcpy((void *)t2 + sizeof(*t2) + cgid_len, pdu_data, pdu_len);
+}
+
+static void relay_blktrace_event1(struct blk_trace *bt, unsigned long sequence,
+ pid_t pid, int cpu, sector_t sector, int bytes,
+ u64 what, int error, u64 cgid,
+ ssize_t cgid_len, void *pdu_data, int pdu_len)
+{
+ struct blk_io_trace *t;
+ size_t trace_len = sizeof(*t) + pdu_len + cgid_len;
+
+ t = relay_reserve(bt->rchan, trace_len);
+ if (!t)
+ return;
+
+ t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
+ t->sequence = sequence;
+ t->time = ktime_to_ns(ktime_get());
+
+ record_blktrace_event(t, pid, cpu, sector, bytes, what, bt->dev, error,
+ cgid, cgid_len, pdu_data, pdu_len);
+}
+
+static void relay_blktrace_event2(struct blk_trace *bt, unsigned long sequence,
+ pid_t pid, int cpu, sector_t sector,
+ int bytes, u64 what, int error, u64 cgid,
+ ssize_t cgid_len, void *pdu_data, int pdu_len)
+{
+ struct blk_io_trace2 *t;
+ size_t trace_len = sizeof(struct blk_io_trace2) + pdu_len + cgid_len;
+
+ t = relay_reserve(bt->rchan, trace_len);
+ if (!t)
+ return;
+
+ t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE2_VERSION;
+ t->sequence = sequence;
+ t->time = ktime_to_ns(ktime_get());
+
+ record_blktrace_event2(t, pid, cpu, sector, bytes, what, bt->dev, error,
+ cgid, cgid_len, pdu_data, pdu_len);
+}
+
+static void relay_blktrace_event(struct blk_trace *bt, unsigned long sequence,
+ pid_t pid, int cpu, sector_t sector, int bytes,
+ u64 what, int error, u64 cgid,
+ ssize_t cgid_len, void *pdu_data, int pdu_len)
+{
+ if (bt->version == 2)
+ return relay_blktrace_event2(bt, sequence, pid, cpu, sector,
+ bytes, what, error, cgid, cgid_len,
+ pdu_data, pdu_len);
+ return relay_blktrace_event1(bt, sequence, pid, cpu, sector, bytes,
+ what, error, cgid, cgid_len, pdu_data,
+ pdu_len);
+}
+
/*
* Send out a notify message.
*/
-static void trace_note(struct blk_trace *bt, pid_t pid, int action,
+static void trace_note(struct blk_trace *bt, pid_t pid, u64 action,
const void *data, size_t len, u64 cgid)
{
- struct blk_io_trace *t;
struct ring_buffer_event *event = NULL;
struct trace_buffer *buffer = NULL;
unsigned int trace_ctx = 0;
@@ -77,38 +180,30 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
bool blk_tracer = blk_tracer_enabled;
ssize_t cgid_len = cgid ? sizeof(cgid) : 0;
+ action = lower_32_bits(action | (cgid ? __BLK_TN_CGROUP : 0));
if (blk_tracer) {
+ struct blk_io_trace2 *t;
+ size_t trace_len = sizeof(*t) + cgid_len + len;
+
buffer = blk_tr->array_buffer.buffer;
trace_ctx = tracing_gen_ctx_flags(0);
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
- sizeof(*t) + len + cgid_len,
- trace_ctx);
+ trace_len, trace_ctx);
if (!event)
return;
t = ring_buffer_event_data(event);
- goto record_it;
+ record_blktrace_event2(t, pid, cpu, 0, 0,
+ action, bt->dev, 0, cgid, cgid_len,
+ (void *)data, len);
+ trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx);
+ return;
}
if (!bt->rchan)
return;
- t = relay_reserve(bt->rchan, sizeof(*t) + len + cgid_len);
- if (t) {
- t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
- t->time = ktime_to_ns(ktime_get());
-record_it:
- t->device = bt->dev;
- t->action = action | (cgid ? __BLK_TN_CGROUP : 0);
- t->pid = pid;
- t->cpu = cpu;
- t->pdu_len = len + cgid_len;
- if (cgid_len)
- memcpy((void *)t + sizeof(*t), &cgid, cgid_len);
- memcpy((void *) t + sizeof(*t) + cgid_len, data, len);
-
- if (blk_tracer)
- trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx);
- }
+ relay_blktrace_event(bt, 0, pid, cpu, 0, 0, action, 0, cgid,
+ cgid_len, (void *)data, len);
}
/*
@@ -182,7 +277,7 @@ void __blk_trace_note_message(struct blk_trace *bt,
}
EXPORT_SYMBOL_GPL(__blk_trace_note_message);
-static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
+static int act_log_check(struct blk_trace *bt, u64 what, sector_t sector,
pid_t pid)
{
if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
@@ -213,13 +308,12 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
* blk_io_trace structure and places it in a per-cpu subbuffer.
*/
static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
- const blk_opf_t opf, u32 what, int error,
+ const blk_opf_t opf, u64 what, int error,
int pdu_len, void *pdu_data, u64 cgid)
{
struct task_struct *tsk = current;
struct ring_buffer_event *event = NULL;
struct trace_buffer *buffer = NULL;
- struct blk_io_trace *t;
unsigned long flags = 0;
unsigned long *sequence;
unsigned int trace_ctx = 0;
@@ -228,6 +322,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
bool blk_tracer = blk_tracer_enabled;
ssize_t cgid_len = cgid ? sizeof(cgid) : 0;
const enum req_op op = opf & REQ_OP_MASK;
+ size_t trace_len;
if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
return;
@@ -238,10 +333,47 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
what |= MASK_TC_BIT(opf, META);
what |= MASK_TC_BIT(opf, PREFLUSH);
what |= MASK_TC_BIT(opf, FUA);
- if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)
+
+ switch (op) {
+ case REQ_OP_DISCARD:
+ case REQ_OP_SECURE_ERASE:
what |= BLK_TC_ACT(BLK_TC_DISCARD);
- if (op == REQ_OP_FLUSH)
+ break;
+ case REQ_OP_FLUSH:
what |= BLK_TC_ACT(BLK_TC_FLUSH);
+ break;
+ case REQ_OP_ZONE_APPEND:
+ what |= BLK_TC_ACT(BLK_TC_ZONE_APPEND);
+ break;
+ case REQ_OP_ZONE_RESET:
+ what |= BLK_TC_ACT(BLK_TC_ZONE_RESET);
+ break;
+ case REQ_OP_ZONE_RESET_ALL:
+ what |= BLK_TC_ACT(BLK_TC_ZONE_RESET_ALL);
+ break;
+ case REQ_OP_ZONE_FINISH:
+ what |= BLK_TC_ACT(BLK_TC_ZONE_FINISH);
+ break;
+ case REQ_OP_ZONE_OPEN:
+ what |= BLK_TC_ACT(BLK_TC_ZONE_OPEN);
+ break;
+ case REQ_OP_ZONE_CLOSE:
+ what |= BLK_TC_ACT(BLK_TC_ZONE_CLOSE);
+ break;
+ case REQ_OP_WRITE_ZEROES:
+ what |= BLK_TC_ACT(BLK_TC_WRITE_ZEROES);
+ break;
+ default:
+ break;
+ }
+
+ /* Drop trace events for zone operations with blktrace v1 */
+ if (bt->version == 1 && (what >> BLK_TC_SHIFT) > BLK_TC_END_V1) {
+ pr_debug_ratelimited("blktrace v1 cannot trace zone operation 0x%llx\n",
+ (unsigned long long)what);
+ return;
+ }
+
if (cgid)
what |= __BLK_TA_CGROUP;
@@ -255,13 +387,68 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
buffer = blk_tr->array_buffer.buffer;
trace_ctx = tracing_gen_ctx_flags(0);
+ switch (bt->version) {
+ case 1:
+ trace_len = sizeof(struct blk_io_trace);
+ break;
+ case 2:
+ default:
+ /*
+ * ftrace always uses v2 (blk_io_trace2) format.
+ *
+ * For sysfs-enabled tracing path (enabled via
+ * /sys/block/DEV/trace/enable), blk_trace_setup_queue()
+ * never initializes bt->version, leaving it 0 from
+ * kzalloc(). We must handle version==0 safely here.
+ *
+ * Fall through to default to ensure we never hit the
+ * old bug where default set trace_len=0, causing
+ * buffer underflow and memory corruption.
+ *
+ * Always use v2 format for ftrace and normalize
+ * bt->version to 2 when uninitialized.
+ */
+ trace_len = sizeof(struct blk_io_trace2);
+ if (bt->version == 0)
+ bt->version = 2;
+ break;
+ }
+ trace_len += pdu_len + cgid_len;
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
- sizeof(*t) + pdu_len + cgid_len,
- trace_ctx);
+ trace_len, trace_ctx);
if (!event)
return;
- t = ring_buffer_event_data(event);
- goto record_it;
+
+ switch (bt->version) {
+ case 1:
+ record_blktrace_event(ring_buffer_event_data(event),
+ pid, cpu, sector, bytes,
+ what, bt->dev, error, cgid, cgid_len,
+ pdu_data, pdu_len);
+ break;
+ case 2:
+ default:
+ /*
+ * Use v2 recording function (record_blktrace_event2)
+ * which writes blk_io_trace2 structure with correct
+ * field layout:
+ * - 32-bit pid at offset 28
+ * - 64-bit action at offset 32
+ *
+ * Fall through to default handles version==0 case
+ * (from sysfs path), ensuring we always use correct
+ * v2 recording function to match the v2 buffer
+ * allocated above.
+ */
+ record_blktrace_event2(ring_buffer_event_data(event),
+ pid, cpu, sector, bytes,
+ what, bt->dev, error, cgid, cgid_len,
+ pdu_data, pdu_len);
+ break;
+ }
+
+ trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx);
+ return;
}
if (unlikely(tsk->btrace_seq != blktrace_seq))
@@ -273,41 +460,10 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
* from coming in and stepping on our toes.
*/
local_irq_save(flags);
- t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len + cgid_len);
- if (t) {
- sequence = per_cpu_ptr(bt->sequence, cpu);
-
- t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
- t->sequence = ++(*sequence);
- t->time = ktime_to_ns(ktime_get());
-record_it:
- /*
- * These two are not needed in ftrace as they are in the
- * generic trace_entry, filled by tracing_generic_entry_update,
- * but for the trace_event->bin() synthesizer benefit we do it
- * here too.
- */
- t->cpu = cpu;
- t->pid = pid;
-
- t->sector = sector;
- t->bytes = bytes;
- t->action = what;
- t->device = bt->dev;
- t->error = error;
- t->pdu_len = pdu_len + cgid_len;
-
- if (cgid_len)
- memcpy((void *)t + sizeof(*t), &cgid, cgid_len);
- if (pdu_len)
- memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len);
-
- if (blk_tracer) {
- trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx);
- return;
- }
- }
-
+ sequence = per_cpu_ptr(bt->sequence, cpu);
+ (*sequence)++;
+ relay_blktrace_event(bt, *sequence, pid, cpu, sector, bytes,
+ what, error, cgid, cgid_len, pdu_data, pdu_len);
local_irq_restore(flags);
}
@@ -494,9 +650,10 @@ static void blk_trace_setup_lba(struct blk_trace *bt,
/*
* Setup everything required to start tracing
*/
-static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
- struct block_device *bdev,
- struct blk_user_trace_setup *buts)
+static struct blk_trace *blk_trace_setup_prepare(struct request_queue *q,
+ char *name, dev_t dev,
+ u32 buf_size, u32 buf_nr,
+ struct block_device *bdev)
{
struct blk_trace *bt = NULL;
struct dentry *dir = NULL;
@@ -504,31 +661,19 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
lockdep_assert_held(&q->debugfs_mutex);
- if (!buts->buf_size || !buts->buf_nr)
- return -EINVAL;
-
- strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE);
-
- /*
- * some device names have larger paths - convert the slashes
- * to underscores for this to work as expected
- */
- strreplace(buts->name, '/', '_');
-
/*
* bdev can be NULL, as with scsi-generic, this is a helpful as
* we can be.
*/
if (rcu_dereference_protected(q->blk_trace,
lockdep_is_held(&q->debugfs_mutex))) {
- pr_warn("Concurrent blktraces are not allowed on %s\n",
- buts->name);
- return -EBUSY;
+ pr_warn("Concurrent blktraces are not allowed on %s\n", name);
+ return ERR_PTR(-EBUSY);
}
bt = kzalloc(sizeof(*bt), GFP_KERNEL);
if (!bt)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
ret = -ENOMEM;
bt->sequence = alloc_percpu(unsigned long);
@@ -548,7 +693,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
if (bdev && !bdev_is_partition(bdev))
dir = q->debugfs_dir;
else
- bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root);
+ bt->dir = dir = debugfs_create_dir(name, blk_debugfs_root);
/*
* As blktrace relies on debugfs for its interface the debugfs directory
@@ -556,8 +701,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
* files or directories.
*/
if (IS_ERR_OR_NULL(dir)) {
- pr_warn("debugfs_dir not present for %s so skipping\n",
- buts->name);
+ pr_warn("debugfs_dir not present for %s so skipping\n", name);
ret = -ENOENT;
goto err;
}
@@ -569,17 +713,40 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops);
debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
- bt->rchan = relay_open("trace", dir, buts->buf_size,
- buts->buf_nr, &blk_relay_callbacks, bt);
+ bt->rchan = relay_open("trace", dir, buf_size, buf_nr,
+ &blk_relay_callbacks, bt);
if (!bt->rchan)
goto err;
+ blk_trace_setup_lba(bt, bdev);
+
+ return bt;
+
+err:
+ blk_trace_free(q, bt);
+
+ return ERR_PTR(ret);
+}
+
+static void blk_trace_setup_finalize(struct request_queue *q,
+ char *name, int version,
+ struct blk_trace *bt,
+ struct blk_user_trace_setup2 *buts)
+
+{
+ strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE2);
+
+ /*
+ * some device names have larger paths - convert the slashes
+ * to underscores for this to work as expected
+ */
+ strreplace(buts->name, '/', '_');
+
+ bt->version = version;
bt->act_mask = buts->act_mask;
if (!bt->act_mask)
bt->act_mask = (u16) -1;
- blk_trace_setup_lba(bt, bdev);
-
/* overwrite with user settings */
if (buts->start_lba)
bt->start_lba = buts->start_lba;
@@ -591,30 +758,43 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
rcu_assign_pointer(q->blk_trace, bt);
get_probe_ref();
-
- ret = 0;
-err:
- if (ret)
- blk_trace_free(q, bt);
- return ret;
}
int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
struct block_device *bdev,
char __user *arg)
{
+ struct blk_user_trace_setup2 buts2;
struct blk_user_trace_setup buts;
+ struct blk_trace *bt;
int ret;
ret = copy_from_user(&buts, arg, sizeof(buts));
if (ret)
return -EFAULT;
+ if (!buts.buf_size || !buts.buf_nr)
+ return -EINVAL;
+
+ buts2 = (struct blk_user_trace_setup2) {
+ .act_mask = buts.act_mask,
+ .buf_size = buts.buf_size,
+ .buf_nr = buts.buf_nr,
+ .start_lba = buts.start_lba,
+ .end_lba = buts.end_lba,
+ .pid = buts.pid,
+ };
+
mutex_lock(&q->debugfs_mutex);
- ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
+ bt = blk_trace_setup_prepare(q, name, dev, buts.buf_size, buts.buf_nr,
+ bdev);
+ if (IS_ERR(bt)) {
+ mutex_unlock(&q->debugfs_mutex);
+ return PTR_ERR(bt);
+ }
+ blk_trace_setup_finalize(q, name, 1, bt, &buts2);
+ strcpy(buts.name, buts2.name);
mutex_unlock(&q->debugfs_mutex);
- if (ret)
- return ret;
if (copy_to_user(arg, &buts, sizeof(buts))) {
blk_trace_remove(q);
@@ -624,19 +804,54 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
}
EXPORT_SYMBOL_GPL(blk_trace_setup);
+static int blk_trace_setup2(struct request_queue *q, char *name, dev_t dev,
+ struct block_device *bdev, char __user *arg)
+{
+ struct blk_user_trace_setup2 buts2;
+ struct blk_trace *bt;
+
+ if (copy_from_user(&buts2, arg, sizeof(buts2)))
+ return -EFAULT;
+
+ if (!buts2.buf_size || !buts2.buf_nr)
+ return -EINVAL;
+
+ if (buts2.flags != 0)
+ return -EINVAL;
+
+ mutex_lock(&q->debugfs_mutex);
+ bt = blk_trace_setup_prepare(q, name, dev, buts2.buf_size, buts2.buf_nr,
+ bdev);
+ if (IS_ERR(bt)) {
+ mutex_unlock(&q->debugfs_mutex);
+ return PTR_ERR(bt);
+ }
+ blk_trace_setup_finalize(q, name, 2, bt, &buts2);
+ mutex_unlock(&q->debugfs_mutex);
+
+ if (copy_to_user(arg, &buts2, sizeof(buts2))) {
+ blk_trace_remove(q);
+ return -EFAULT;
+ }
+ return 0;
+}
+
#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
static int compat_blk_trace_setup(struct request_queue *q, char *name,
dev_t dev, struct block_device *bdev,
char __user *arg)
{
- struct blk_user_trace_setup buts;
+ struct blk_user_trace_setup2 buts2;
struct compat_blk_user_trace_setup cbuts;
- int ret;
+ struct blk_trace *bt;
if (copy_from_user(&cbuts, arg, sizeof(cbuts)))
return -EFAULT;
- buts = (struct blk_user_trace_setup) {
+ if (!cbuts.buf_size || !cbuts.buf_nr)
+ return -EINVAL;
+
+ buts2 = (struct blk_user_trace_setup2) {
.act_mask = cbuts.act_mask,
.buf_size = cbuts.buf_size,
.buf_nr = cbuts.buf_nr,
@@ -646,12 +861,16 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
};
mutex_lock(&q->debugfs_mutex);
- ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
+ bt = blk_trace_setup_prepare(q, name, dev, buts2.buf_size, buts2.buf_nr,
+ bdev);
+ if (IS_ERR(bt)) {
+ mutex_unlock(&q->debugfs_mutex);
+ return PTR_ERR(bt);
+ }
+ blk_trace_setup_finalize(q, name, 1, bt, &buts2);
mutex_unlock(&q->debugfs_mutex);
- if (ret)
- return ret;
- if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) {
+ if (copy_to_user(arg, &buts2.name, ARRAY_SIZE(buts2.name))) {
blk_trace_remove(q);
return -EFAULT;
}
@@ -707,6 +926,10 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
char b[BDEVNAME_SIZE];
switch (cmd) {
+ case BLKTRACESETUP2:
+ snprintf(b, sizeof(b), "%pg", bdev);
+ ret = blk_trace_setup2(q, b, bdev->bd_dev, bdev, arg);
+ break;
case BLKTRACESETUP:
snprintf(b, sizeof(b), "%pg", bdev);
ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
@@ -794,7 +1017,7 @@ blk_trace_request_get_cgid(struct request *rq)
*
**/
static void blk_add_trace_rq(struct request *rq, blk_status_t error,
- unsigned int nr_bytes, u32 what, u64 cgid)
+ unsigned int nr_bytes, u64 what, u64 cgid)
{
struct blk_trace *bt;
@@ -846,6 +1069,22 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
blk_trace_request_get_cgid(rq));
}
+static void blk_add_trace_zone_update_request(void *ignore, struct request *rq)
+{
+ struct blk_trace *bt;
+
+ rcu_read_lock();
+ bt = rcu_dereference(rq->q->blk_trace);
+ if (likely(!bt) || bt->version < 2) {
+ rcu_read_unlock();
+ return;
+ }
+ rcu_read_unlock();
+
+ blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ZONE_APPEND,
+ blk_trace_request_get_cgid(rq));
+}
+
/**
* blk_add_trace_bio - Add a trace for a bio oriented action
* @q: queue the io is for
@@ -858,7 +1097,7 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
*
**/
static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
- u32 what, int error)
+ u64 what, int error)
{
struct blk_trace *bt;
@@ -924,7 +1163,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
bt = rcu_dereference(q->blk_trace);
if (bt) {
__be64 rpdu = cpu_to_be64(depth);
- u32 what;
+ u64 what;
if (explicit)
what = BLK_TA_UNPLUG_IO;
@@ -936,6 +1175,37 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
rcu_read_unlock();
}
+static void blk_add_trace_zone_plug(void *ignore, struct request_queue *q,
+ unsigned int zno, sector_t sector,
+ unsigned int sectors)
+{
+ struct blk_trace *bt;
+
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
+ if (bt && bt->version >= 2)
+ __blk_add_trace(bt, sector, sectors << SECTOR_SHIFT, 0,
+ BLK_TA_ZONE_PLUG, 0, 0, NULL, 0);
+ rcu_read_unlock();
+
+ return;
+}
+
+static void blk_add_trace_zone_unplug(void *ignore, struct request_queue *q,
+ unsigned int zno, sector_t sector,
+ unsigned int sectors)
+{
+ struct blk_trace *bt;
+
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
+ if (bt && bt->version >= 2)
+ __blk_add_trace(bt, sector, sectors << SECTOR_SHIFT, 0,
+ BLK_TA_ZONE_UNPLUG, 0, 0, NULL, 0);
+ rcu_read_unlock();
+ return;
+}
+
static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu)
{
struct request_queue *q = bio->bi_bdev->bd_disk->queue;
@@ -1076,6 +1346,15 @@ static void blk_register_tracepoints(void)
WARN_ON(ret);
ret = register_trace_block_getrq(blk_add_trace_getrq, NULL);
WARN_ON(ret);
+ ret = register_trace_blk_zone_append_update_request_bio(
+ blk_add_trace_zone_update_request, NULL);
+ WARN_ON(ret);
+ ret = register_trace_disk_zone_wplug_add_bio(blk_add_trace_zone_plug,
+ NULL);
+ WARN_ON(ret);
+ ret = register_trace_blk_zone_wplug_bio(blk_add_trace_zone_unplug,
+ NULL);
+ WARN_ON(ret);
ret = register_trace_block_plug(blk_add_trace_plug, NULL);
WARN_ON(ret);
ret = register_trace_block_unplug(blk_add_trace_unplug, NULL);
@@ -1095,6 +1374,10 @@ static void blk_unregister_tracepoints(void)
unregister_trace_block_split(blk_add_trace_split, NULL);
unregister_trace_block_unplug(blk_add_trace_unplug, NULL);
unregister_trace_block_plug(blk_add_trace_plug, NULL);
+ unregister_trace_blk_zone_wplug_bio(blk_add_trace_zone_unplug, NULL);
+ unregister_trace_disk_zone_wplug_add_bio(blk_add_trace_zone_plug, NULL);
+ unregister_trace_blk_zone_append_update_request_bio(
+ blk_add_trace_zone_update_request, NULL);
unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
@@ -1113,7 +1396,7 @@ static void blk_unregister_tracepoints(void)
* struct blk_io_tracer formatting routines
*/
-static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
+static void fill_rwbs(char *rwbs, const struct blk_io_trace2 *t)
{
int i = 0;
int tc = t->action >> BLK_TC_SHIFT;
@@ -1128,7 +1411,10 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
if (tc & BLK_TC_DISCARD)
rwbs[i++] = 'D';
- else if (tc & BLK_TC_WRITE)
+ else if (tc & BLK_TC_WRITE_ZEROES) {
+ rwbs[i++] = 'W';
+ rwbs[i++] = 'Z';
+ } else if (tc & BLK_TC_WRITE)
rwbs[i++] = 'W';
else if (t->bytes)
rwbs[i++] = 'R';
@@ -1148,9 +1434,9 @@ out:
}
static inline
-const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent)
+const struct blk_io_trace2 *te_blk_io_trace(const struct trace_entry *ent)
{
- return (const struct blk_io_trace *)ent;
+ return (const struct blk_io_trace2 *)ent;
}
static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg)
@@ -1209,7 +1495,7 @@ static void blk_log_action_classic(struct trace_iterator *iter, const char *act,
unsigned long long ts = iter->ts;
unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC);
unsigned secs = (unsigned long)ts;
- const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
+ const struct blk_io_trace2 *t = te_blk_io_trace(iter->ent);
fill_rwbs(rwbs, t);
@@ -1223,7 +1509,7 @@ static void blk_log_action(struct trace_iterator *iter, const char *act,
bool has_cg)
{
char rwbs[RWBS_LEN];
- const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
+ const struct blk_io_trace2 *t = te_blk_io_trace(iter->ent);
fill_rwbs(rwbs, t);
if (has_cg) {
@@ -1444,7 +1730,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
{
struct trace_array *tr = iter->tr;
struct trace_seq *s = &iter->seq;
- const struct blk_io_trace *t;
+ const struct blk_io_trace2 *t;
u16 what;
bool long_act;
blk_log_action_t *log_action;
@@ -1452,7 +1738,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
t = te_blk_io_trace(iter->ent);
what = (t->action & ((1 << BLK_TC_SHIFT) - 1)) & ~__BLK_TA_CGROUP;
- long_act = !!(tr->trace_flags & TRACE_ITER_VERBOSE);
+ long_act = !!(tr->trace_flags & TRACE_ITER(VERBOSE));
log_action = classic ? &blk_log_action_classic : &blk_log_action;
has_cg = t->action & __BLK_TA_CGROUP;
@@ -1481,8 +1767,8 @@ static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
static void blk_trace_synthesize_old_trace(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
- struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
- const int offset = offsetof(struct blk_io_trace, sector);
+ struct blk_io_trace2 *t = (struct blk_io_trace2 *)iter->ent;
+ const int offset = offsetof(struct blk_io_trace2, sector);
struct blk_io_trace old = {
.magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION,
.time = iter->ts,
@@ -1517,9 +1803,9 @@ blk_tracer_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
/* don't output context-info for blk_classic output */
if (bit == TRACE_BLK_OPT_CLASSIC) {
if (set)
- tr->trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
+ tr->trace_flags &= ~TRACE_ITER(CONTEXT_INFO);
else
- tr->trace_flags |= TRACE_ITER_CONTEXT_INFO;
+ tr->trace_flags |= TRACE_ITER(CONTEXT_INFO);
}
return 0;
}
@@ -1559,6 +1845,10 @@ static int __init init_blk_tracer(void)
return 1;
}
+ BUILD_BUG_ON(__alignof__(struct blk_user_trace_setup2) %
+ __alignof__(long));
+ BUILD_BUG_ON(__alignof__(struct blk_io_trace2) % __alignof__(long));
+
return 0;
}
@@ -1667,6 +1957,7 @@ static const struct {
{ BLK_TC_DISCARD, "discard" },
{ BLK_TC_DRV_DATA, "drv_data" },
{ BLK_TC_FUA, "fua" },
+ { BLK_TC_WRITE_ZEROES, "write-zeroes" },
};
static int blk_trace_str2mask(const char *str)
@@ -1880,6 +2171,10 @@ void blk_fill_rwbs(char *rwbs, blk_opf_t opf)
rwbs[i++] = 'Z';
rwbs[i++] = 'C';
break;
+ case REQ_OP_WRITE_ZEROES:
+ rwbs[i++] = 'W';
+ rwbs[i++] = 'Z';
+ break;
default:
rwbs[i++] = 'N';
}
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 4f87c16d915a..d57727abaade 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2529,7 +2529,7 @@ static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
return run_ctx->entry_ip;
}
-static int
+static __always_inline int
kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
unsigned long entry_ip, struct ftrace_regs *fregs,
bool is_return, void *data)
@@ -3372,13 +3372,13 @@ typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struc
* direct calls into all the specific callback implementations
* (copy_user_data_sleepable, copy_user_data_nofault, and so on)
*/
-static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 doff, u32 size,
+static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u64 doff, u64 size,
const void *unsafe_src,
copy_fn_t str_copy_fn,
struct task_struct *tsk)
{
struct bpf_dynptr_kern *dst;
- u32 chunk_sz, off;
+ u64 chunk_sz, off;
void *dst_slice;
int cnt, err;
char buf[256];
@@ -3392,7 +3392,7 @@ static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 do
return -E2BIG;
for (off = 0; off < size; off += chunk_sz - 1) {
- chunk_sz = min_t(u32, sizeof(buf), size - off);
+ chunk_sz = min_t(u64, sizeof(buf), size - off);
/* Expect str_copy_fn to return count of copied bytes, including
* zero terminator. Next iteration increment off by chunk_sz - 1 to
* overwrite NUL.
@@ -3409,14 +3409,14 @@ static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 do
return off;
}
-static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u32 doff,
- u32 size, const void *unsafe_src,
+static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u64 doff,
+ u64 size, const void *unsafe_src,
copy_fn_t copy_fn, struct task_struct *tsk)
{
struct bpf_dynptr_kern *dst;
void *dst_slice;
char buf[256];
- u32 off, chunk_sz;
+ u64 off, chunk_sz;
int err;
dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size);
@@ -3428,7 +3428,7 @@ static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u32
return -E2BIG;
for (off = 0; off < size; off += chunk_sz) {
- chunk_sz = min_t(u32, sizeof(buf), size - off);
+ chunk_sz = min_t(u64, sizeof(buf), size - off);
err = copy_fn(buf, unsafe_src + off, chunk_sz, tsk);
if (err)
return err;
@@ -3514,58 +3514,58 @@ __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid
return bpf_send_signal_common(sig, type, task, value);
}
-__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u32 off,
- u32 size, const void __user *unsafe_ptr__ign)
+__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u64 off,
+ u64 size, const void __user *unsafe_ptr__ign)
{
return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign,
copy_user_data_nofault, NULL);
}
-__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u32 off,
- u32 size, const void *unsafe_ptr__ign)
+__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u64 off,
+ u64 size, const void *unsafe_ptr__ign)
{
return __bpf_dynptr_copy(dptr, off, size, unsafe_ptr__ign,
copy_kernel_data_nofault, NULL);
}
-__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u32 off,
- u32 size, const void __user *unsafe_ptr__ign)
+__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u64 off,
+ u64 size, const void __user *unsafe_ptr__ign)
{
return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign,
copy_user_str_nofault, NULL);
}
-__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u32 off,
- u32 size, const void *unsafe_ptr__ign)
+__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u64 off,
+ u64 size, const void *unsafe_ptr__ign)
{
return __bpf_dynptr_copy_str(dptr, off, size, unsafe_ptr__ign,
copy_kernel_str_nofault, NULL);
}
-__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u32 off,
- u32 size, const void __user *unsafe_ptr__ign)
+__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u64 off,
+ u64 size, const void __user *unsafe_ptr__ign)
{
return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign,
copy_user_data_sleepable, NULL);
}
-__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u32 off,
- u32 size, const void __user *unsafe_ptr__ign)
+__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u64 off,
+ u64 size, const void __user *unsafe_ptr__ign)
{
return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign,
copy_user_str_sleepable, NULL);
}
-__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u32 off,
- u32 size, const void __user *unsafe_ptr__ign,
+__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off,
+ u64 size, const void __user *unsafe_ptr__ign,
struct task_struct *tsk)
{
return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign,
copy_user_data_sleepable, tsk);
}
-__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u32 off,
- u32 size, const void __user *unsafe_ptr__ign,
+__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64 off,
+ u64 size, const void __user *unsafe_ptr__ign,
struct task_struct *tsk)
{
return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign,
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 484ad7a18463..cc48d16be43e 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -163,7 +163,7 @@ enum {
#define RET_STACK(t, offset) ((struct ftrace_ret_stack *)(&(t)->ret_stack[offset]))
/*
- * Each fgraph_ops has a reservered unsigned long at the end (top) of the
+ * Each fgraph_ops has a reserved unsigned long at the end (top) of the
* ret_stack to store task specific state.
*/
#define SHADOW_STACK_TASK_VARS(ret_stack) \
@@ -498,9 +498,6 @@ found:
return get_data_type_data(current, offset);
}
-/* Both enabled by default (can be cleared by function_graph tracer flags */
-bool fgraph_sleep_time = true;
-
#ifdef CONFIG_DYNAMIC_FTRACE
/*
* archs can override this function if they must do something
@@ -1019,15 +1016,11 @@ void fgraph_init_ops(struct ftrace_ops *dst_ops,
mutex_init(&dst_ops->local_hash.regex_lock);
INIT_LIST_HEAD(&dst_ops->subop_list);
dst_ops->flags |= FTRACE_OPS_FL_INITIALIZED;
+ dst_ops->private = src_ops->private;
}
#endif
}
-void ftrace_graph_sleep_time_control(bool enable)
-{
- fgraph_sleep_time = enable;
-}
-
/*
* Simply points to ftrace_stub, but with the proper protocol.
* Defined by the linker script in linux/vmlinux.lds.h
@@ -1098,7 +1091,7 @@ ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
* Does the user want to count the time a function was asleep.
* If so, do not update the time stamps.
*/
- if (fgraph_sleep_time)
+ if (!fgraph_no_sleep_time)
return;
timestamp = trace_clock_local();
@@ -1376,6 +1369,13 @@ int register_ftrace_graph(struct fgraph_ops *gops)
ftrace_graph_active++;
+ /* Always save the function, and reset at unregistering */
+ gops->saved_func = gops->entryfunc;
+#ifdef CONFIG_DYNAMIC_FTRACE
+ if (ftrace_pids_enabled(&gops->ops))
+ gops->entryfunc = fgraph_pid_func;
+#endif
+
if (ftrace_graph_active == 2)
ftrace_graph_disable_direct(true);
@@ -1395,8 +1395,6 @@ int register_ftrace_graph(struct fgraph_ops *gops)
} else {
init_task_vars(gops->idx);
}
- /* Always save the function, and reset at unregistering */
- gops->saved_func = gops->entryfunc;
gops->ops.flags |= FTRACE_OPS_FL_GRAPH;
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 5a807d62e76d..1188eefef07c 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -10,6 +10,7 @@
#include <linux/kprobes.h>
#include <linux/list.h>
#include <linux/mutex.h>
+#include <linux/rhashtable.h>
#include <linux/slab.h>
#include <linux/sort.h>
@@ -29,7 +30,7 @@
* fprobe_table: hold 'fprobe_hlist::hlist' for checking the fprobe still
* exists. The key is the address of fprobe instance.
* fprobe_ip_table: hold 'fprobe_hlist::array[*]' for searching the fprobe
- * instance related to the funciton address. The key is the ftrace IP
+ * instance related to the function address. The key is the ftrace IP
* address.
*
* When unregistering the fprobe, fprobe_hlist::fp and fprobe_hlist::array[*].fp
@@ -41,60 +42,68 @@
* - RCU hlist traversal under disabling preempt
*/
static struct hlist_head fprobe_table[FPROBE_TABLE_SIZE];
-static struct hlist_head fprobe_ip_table[FPROBE_IP_TABLE_SIZE];
+static struct rhltable fprobe_ip_table;
static DEFINE_MUTEX(fprobe_mutex);
+static struct fgraph_ops fprobe_graph_ops;
-/*
- * Find first fprobe in the hlist. It will be iterated twice in the entry
- * probe, once for correcting the total required size, the second time is
- * calling back the user handlers.
- * Thus the hlist in the fprobe_table must be sorted and new probe needs to
- * be added *before* the first fprobe.
- */
-static struct fprobe_hlist_node *find_first_fprobe_node(unsigned long ip)
+static u32 fprobe_node_hashfn(const void *data, u32 len, u32 seed)
{
- struct fprobe_hlist_node *node;
- struct hlist_head *head;
+ return hash_ptr(*(unsigned long **)data, 32);
+}
- head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)];
- hlist_for_each_entry_rcu(node, head, hlist,
- lockdep_is_held(&fprobe_mutex)) {
- if (node->addr == ip)
- return node;
- }
- return NULL;
+static int fprobe_node_cmp(struct rhashtable_compare_arg *arg,
+ const void *ptr)
+{
+ unsigned long key = *(unsigned long *)arg->key;
+ const struct fprobe_hlist_node *n = ptr;
+
+ return n->addr != key;
}
-NOKPROBE_SYMBOL(find_first_fprobe_node);
-/* Node insertion and deletion requires the fprobe_mutex */
-static void insert_fprobe_node(struct fprobe_hlist_node *node)
+static u32 fprobe_node_obj_hashfn(const void *data, u32 len, u32 seed)
{
- unsigned long ip = node->addr;
- struct fprobe_hlist_node *next;
- struct hlist_head *head;
+ const struct fprobe_hlist_node *n = data;
+
+ return hash_ptr((void *)n->addr, 32);
+}
+
+static const struct rhashtable_params fprobe_rht_params = {
+ .head_offset = offsetof(struct fprobe_hlist_node, hlist),
+ .key_offset = offsetof(struct fprobe_hlist_node, addr),
+ .key_len = sizeof_field(struct fprobe_hlist_node, addr),
+ .hashfn = fprobe_node_hashfn,
+ .obj_hashfn = fprobe_node_obj_hashfn,
+ .obj_cmpfn = fprobe_node_cmp,
+ .automatic_shrinking = true,
+};
+/* Node insertion and deletion requires the fprobe_mutex */
+static int insert_fprobe_node(struct fprobe_hlist_node *node)
+{
lockdep_assert_held(&fprobe_mutex);
- next = find_first_fprobe_node(ip);
- if (next) {
- hlist_add_before_rcu(&node->hlist, &next->hlist);
- return;
- }
- head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)];
- hlist_add_head_rcu(&node->hlist, head);
+ return rhltable_insert(&fprobe_ip_table, &node->hlist, fprobe_rht_params);
}
/* Return true if there are synonims */
static bool delete_fprobe_node(struct fprobe_hlist_node *node)
{
lockdep_assert_held(&fprobe_mutex);
+ bool ret;
/* Avoid double deleting */
if (READ_ONCE(node->fp) != NULL) {
WRITE_ONCE(node->fp, NULL);
- hlist_del_rcu(&node->hlist);
+ rhltable_remove(&fprobe_ip_table, &node->hlist,
+ fprobe_rht_params);
}
- return !!find_first_fprobe_node(node->addr);
+
+ rcu_read_lock();
+ ret = !!rhltable_lookup(&fprobe_ip_table, &node->addr,
+ fprobe_rht_params);
+ rcu_read_unlock();
+
+ return ret;
}
/* Check existence of the fprobe */
@@ -246,12 +255,128 @@ static inline int __fprobe_kprobe_handler(unsigned long ip, unsigned long parent
return ret;
}
-static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
- struct ftrace_regs *fregs)
+#if defined(CONFIG_DYNAMIC_FTRACE_WITH_ARGS) || defined(CONFIG_DYNAMIC_FTRACE_WITH_REGS)
+/* ftrace_ops callback, this processes fprobes which have only entry_handler. */
+static void fprobe_ftrace_entry(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
+{
+ struct fprobe_hlist_node *node;
+ struct rhlist_head *head, *pos;
+ struct fprobe *fp;
+ int bit;
+
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
+ if (bit < 0)
+ return;
+
+ /*
+ * ftrace_test_recursion_trylock() disables preemption, but
+ * rhltable_lookup() checks whether rcu_read_lcok is held.
+ * So we take rcu_read_lock() here.
+ */
+ rcu_read_lock();
+ head = rhltable_lookup(&fprobe_ip_table, &ip, fprobe_rht_params);
+
+ rhl_for_each_entry_rcu(node, pos, head, hlist) {
+ if (node->addr != ip)
+ break;
+ fp = READ_ONCE(node->fp);
+ if (unlikely(!fp || fprobe_disabled(fp) || fp->exit_handler))
+ continue;
+
+ if (fprobe_shared_with_kprobes(fp))
+ __fprobe_kprobe_handler(ip, parent_ip, fp, fregs, NULL);
+ else
+ __fprobe_handler(ip, parent_ip, fp, fregs, NULL);
+ }
+ rcu_read_unlock();
+ ftrace_test_recursion_unlock(bit);
+}
+NOKPROBE_SYMBOL(fprobe_ftrace_entry);
+
+static struct ftrace_ops fprobe_ftrace_ops = {
+ .func = fprobe_ftrace_entry,
+ .flags = FTRACE_OPS_FL_SAVE_ARGS,
+};
+static int fprobe_ftrace_active;
+
+static int fprobe_ftrace_add_ips(unsigned long *addrs, int num)
+{
+ int ret;
+
+ lockdep_assert_held(&fprobe_mutex);
+
+ ret = ftrace_set_filter_ips(&fprobe_ftrace_ops, addrs, num, 0, 0);
+ if (ret)
+ return ret;
+
+ if (!fprobe_ftrace_active) {
+ ret = register_ftrace_function(&fprobe_ftrace_ops);
+ if (ret) {
+ ftrace_free_filter(&fprobe_ftrace_ops);
+ return ret;
+ }
+ }
+ fprobe_ftrace_active++;
+ return 0;
+}
+
+static void fprobe_ftrace_remove_ips(unsigned long *addrs, int num)
+{
+ lockdep_assert_held(&fprobe_mutex);
+
+ fprobe_ftrace_active--;
+ if (!fprobe_ftrace_active)
+ unregister_ftrace_function(&fprobe_ftrace_ops);
+ if (num)
+ ftrace_set_filter_ips(&fprobe_ftrace_ops, addrs, num, 1, 0);
+}
+
+static bool fprobe_is_ftrace(struct fprobe *fp)
+{
+ return !fp->exit_handler;
+}
+
+#ifdef CONFIG_MODULES
+static void fprobe_set_ips(unsigned long *ips, unsigned int cnt, int remove,
+ int reset)
+{
+ ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, remove, reset);
+ ftrace_set_filter_ips(&fprobe_ftrace_ops, ips, cnt, remove, reset);
+}
+#endif
+#else
+static int fprobe_ftrace_add_ips(unsigned long *addrs, int num)
+{
+ return -ENOENT;
+}
+
+static void fprobe_ftrace_remove_ips(unsigned long *addrs, int num)
+{
+}
+
+static bool fprobe_is_ftrace(struct fprobe *fp)
+{
+ return false;
+}
+
+#ifdef CONFIG_MODULES
+static void fprobe_set_ips(unsigned long *ips, unsigned int cnt, int remove,
+ int reset)
+{
+ ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, remove, reset);
+}
+#endif
+#endif /* !CONFIG_DYNAMIC_FTRACE_WITH_ARGS && !CONFIG_DYNAMIC_FTRACE_WITH_REGS */
+
+/* fgraph_ops callback, this processes fprobes which have exit_handler. */
+static int fprobe_fgraph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
- struct fprobe_hlist_node *node, *first;
unsigned long *fgraph_data = NULL;
unsigned long func = trace->func;
+ struct fprobe_hlist_node *node;
+ struct rhlist_head *head, *pos;
unsigned long ret_ip;
int reserved_words;
struct fprobe *fp;
@@ -260,14 +385,12 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
if (WARN_ON_ONCE(!fregs))
return 0;
- first = node = find_first_fprobe_node(func);
- if (unlikely(!first))
- return 0;
-
+ guard(rcu)();
+ head = rhltable_lookup(&fprobe_ip_table, &func, fprobe_rht_params);
reserved_words = 0;
- hlist_for_each_entry_from_rcu(node, hlist) {
+ rhl_for_each_entry_rcu(node, pos, head, hlist) {
if (node->addr != func)
- break;
+ continue;
fp = READ_ONCE(node->fp);
if (!fp || !fp->exit_handler)
continue;
@@ -278,15 +401,14 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
reserved_words +=
FPROBE_HEADER_SIZE_IN_LONG + SIZE_IN_LONG(fp->entry_data_size);
}
- node = first;
if (reserved_words) {
fgraph_data = fgraph_reserve_data(gops->idx, reserved_words * sizeof(long));
if (unlikely(!fgraph_data)) {
- hlist_for_each_entry_from_rcu(node, hlist) {
+ rhl_for_each_entry_rcu(node, pos, head, hlist) {
if (node->addr != func)
- break;
+ continue;
fp = READ_ONCE(node->fp);
- if (fp && !fprobe_disabled(fp))
+ if (fp && !fprobe_disabled(fp) && !fprobe_is_ftrace(fp))
fp->nmissed++;
}
return 0;
@@ -299,14 +421,14 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
*/
ret_ip = ftrace_regs_get_return_address(fregs);
used = 0;
- hlist_for_each_entry_from_rcu(node, hlist) {
+ rhl_for_each_entry_rcu(node, pos, head, hlist) {
int data_size;
void *data;
if (node->addr != func)
- break;
+ continue;
fp = READ_ONCE(node->fp);
- if (!fp || fprobe_disabled(fp))
+ if (unlikely(!fp || fprobe_disabled(fp) || fprobe_is_ftrace(fp)))
continue;
data_size = fp->entry_data_size;
@@ -334,7 +456,7 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
/* If any exit_handler is set, data must be used. */
return used != 0;
}
-NOKPROBE_SYMBOL(fprobe_entry);
+NOKPROBE_SYMBOL(fprobe_fgraph_entry);
static void fprobe_return(struct ftrace_graph_ret *trace,
struct fgraph_ops *gops,
@@ -373,7 +495,7 @@ static void fprobe_return(struct ftrace_graph_ret *trace,
NOKPROBE_SYMBOL(fprobe_return);
static struct fgraph_ops fprobe_graph_ops = {
- .entryfunc = fprobe_entry,
+ .entryfunc = fprobe_fgraph_entry,
.retfunc = fprobe_return,
};
static int fprobe_graph_active;
@@ -449,25 +571,18 @@ static int fprobe_addr_list_add(struct fprobe_addr_list *alist, unsigned long ad
return 0;
}
-static void fprobe_remove_node_in_module(struct module *mod, struct hlist_head *head,
- struct fprobe_addr_list *alist)
+static void fprobe_remove_node_in_module(struct module *mod, struct fprobe_hlist_node *node,
+ struct fprobe_addr_list *alist)
{
- struct fprobe_hlist_node *node;
- int ret = 0;
-
- hlist_for_each_entry_rcu(node, head, hlist,
- lockdep_is_held(&fprobe_mutex)) {
- if (!within_module(node->addr, mod))
- continue;
- if (delete_fprobe_node(node))
- continue;
- /*
- * If failed to update alist, just continue to update hlist.
- * Therefore, at list user handler will not hit anymore.
- */
- if (!ret)
- ret = fprobe_addr_list_add(alist, node->addr);
- }
+ if (!within_module(node->addr, mod))
+ return;
+ if (delete_fprobe_node(node))
+ return;
+ /*
+ * If failed to update alist, just continue to update hlist.
+ * Therefore, at list user handler will not hit anymore.
+ */
+ fprobe_addr_list_add(alist, node->addr);
}
/* Handle module unloading to manage fprobe_ip_table. */
@@ -475,8 +590,9 @@ static int fprobe_module_callback(struct notifier_block *nb,
unsigned long val, void *data)
{
struct fprobe_addr_list alist = {.size = FPROBE_IPS_BATCH_INIT};
+ struct fprobe_hlist_node *node;
+ struct rhashtable_iter iter;
struct module *mod = data;
- int i;
if (val != MODULE_STATE_GOING)
return NOTIFY_DONE;
@@ -487,12 +603,19 @@ static int fprobe_module_callback(struct notifier_block *nb,
return NOTIFY_DONE;
mutex_lock(&fprobe_mutex);
- for (i = 0; i < FPROBE_IP_TABLE_SIZE; i++)
- fprobe_remove_node_in_module(mod, &fprobe_ip_table[i], &alist);
+ rhltable_walk_enter(&fprobe_ip_table, &iter);
+ do {
+ rhashtable_walk_start(&iter);
+
+ while ((node = rhashtable_walk_next(&iter)) && !IS_ERR(node))
+ fprobe_remove_node_in_module(mod, node, &alist);
+
+ rhashtable_walk_stop(&iter);
+ } while (node == ERR_PTR(-EAGAIN));
+ rhashtable_walk_exit(&iter);
if (alist.index > 0)
- ftrace_set_filter_ips(&fprobe_graph_ops.ops,
- alist.addrs, alist.index, 1, 0);
+ fprobe_set_ips(alist.addrs, alist.index, 1, 0);
mutex_unlock(&fprobe_mutex);
kfree(alist.addrs);
@@ -725,11 +848,23 @@ int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num)
mutex_lock(&fprobe_mutex);
hlist_array = fp->hlist_array;
- ret = fprobe_graph_add_ips(addrs, num);
+ if (fprobe_is_ftrace(fp))
+ ret = fprobe_ftrace_add_ips(addrs, num);
+ else
+ ret = fprobe_graph_add_ips(addrs, num);
+
if (!ret) {
add_fprobe_hash(fp);
- for (i = 0; i < hlist_array->size; i++)
- insert_fprobe_node(&hlist_array->array[i]);
+ for (i = 0; i < hlist_array->size; i++) {
+ ret = insert_fprobe_node(&hlist_array->array[i]);
+ if (ret)
+ break;
+ }
+ /* fallback on insert error */
+ if (ret) {
+ for (i--; i >= 0; i--)
+ delete_fprobe_node(&hlist_array->array[i]);
+ }
}
mutex_unlock(&fprobe_mutex);
@@ -813,7 +948,10 @@ int unregister_fprobe(struct fprobe *fp)
}
del_fprobe_hash(fp);
- fprobe_graph_remove_ips(addrs, count);
+ if (fprobe_is_ftrace(fp))
+ fprobe_ftrace_remove_ips(addrs, count);
+ else
+ fprobe_graph_remove_ips(addrs, count);
kfree_rcu(hlist_array, rcu);
fp->hlist_array = NULL;
@@ -825,3 +963,10 @@ out:
return ret;
}
EXPORT_SYMBOL_GPL(unregister_fprobe);
+
+static int __init fprobe_initcall(void)
+{
+ rhltable_init(&fprobe_ip_table, &fprobe_rht_params);
+ return 0;
+}
+core_initcall(fprobe_initcall);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 59cfacb8a5bb..3ec2033c0774 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -534,7 +534,9 @@ static int function_stat_headers(struct seq_file *m)
static int function_stat_show(struct seq_file *m, void *v)
{
+ struct trace_array *tr = trace_get_global_array();
struct ftrace_profile *rec = v;
+ const char *refsymbol = NULL;
char str[KSYM_SYMBOL_LEN];
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static struct trace_seq s;
@@ -554,7 +556,29 @@ static int function_stat_show(struct seq_file *m, void *v)
return 0;
#endif
- kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+ if (tr->trace_flags & TRACE_ITER(PROF_TEXT_OFFSET)) {
+ unsigned long offset;
+
+ if (core_kernel_text(rec->ip)) {
+ refsymbol = "_text";
+ offset = rec->ip - (unsigned long)_text;
+ } else {
+ struct module *mod;
+
+ guard(rcu)();
+ mod = __module_text_address(rec->ip);
+ if (mod) {
+ refsymbol = mod->name;
+ /* Calculate offset from module's text entry address. */
+ offset = rec->ip - (unsigned long)mod->mem[MOD_TEXT].base;
+ }
+ }
+ if (refsymbol)
+ snprintf(str, sizeof(str), " %s+%#lx", refsymbol, offset);
+ }
+ if (!refsymbol)
+ kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+
seq_printf(m, " %-30.30s %10lu", str, rec->counter);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -838,6 +862,8 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace,
return 1;
}
+bool fprofile_no_sleep_time;
+
static void profile_graph_return(struct ftrace_graph_ret *trace,
struct fgraph_ops *gops,
struct ftrace_regs *fregs)
@@ -863,7 +889,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace,
calltime = rettime - profile_data->calltime;
- if (!fgraph_sleep_time) {
+ if (fprofile_no_sleep_time) {
if (current->ftrace_sleeptime)
calltime -= current->ftrace_sleeptime - profile_data->sleeptime;
}
@@ -5951,7 +5977,8 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long
for (i = 0; i < size; i++) {
hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
del = __ftrace_lookup_ip(direct_functions, entry->ip);
- if (del && del->direct == addr) {
+ if (del && ftrace_jmp_get(del->direct) ==
+ ftrace_jmp_get(addr)) {
remove_hash_entry(direct_functions, del);
kfree(del);
}
@@ -6016,8 +6043,15 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
if (ftrace_hash_empty(hash))
return -EINVAL;
+ /* This is a "raw" address, and this should never happen. */
+ if (WARN_ON_ONCE(ftrace_is_jmp(addr)))
+ return -EINVAL;
+
mutex_lock(&direct_mutex);
+ if (ops->flags & FTRACE_OPS_FL_JMP)
+ addr = ftrace_jmp_set(addr);
+
/* Make sure requested entries are not already registered.. */
size = 1 << hash->size_bits;
for (i = 0; i < size; i++) {
@@ -6067,7 +6101,7 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
new_hash = NULL;
ops->func = call_direct_funcs;
- ops->flags = MULTI_FLAGS;
+ ops->flags |= MULTI_FLAGS;
ops->trampoline = FTRACE_REGS_ADDR;
ops->direct_call = addr;
@@ -6138,6 +6172,13 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
lockdep_assert_held_once(&direct_mutex);
+ /* This is a "raw" address, and this should never happen. */
+ if (WARN_ON_ONCE(ftrace_is_jmp(addr)))
+ return -EINVAL;
+
+ if (ops->flags & FTRACE_OPS_FL_JMP)
+ addr = ftrace_jmp_set(addr);
+
/* Enable the tmp_ops to have the same functions as the direct ops */
ftrace_ops_init(&tmp_ops);
tmp_ops.func_hash = ops->func_hash;
diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
index 090bb5ea4a19..dbee72d69d0a 100644
--- a/kernel/trace/pid_list.c
+++ b/kernel/trace/pid_list.c
@@ -3,6 +3,7 @@
* Copyright (C) 2021 VMware Inc, Steven Rostedt <rostedt@goodmis.org>
*/
#include <linux/spinlock.h>
+#include <linux/seqlock.h>
#include <linux/irq_work.h>
#include <linux/slab.h>
#include "trace.h"
@@ -126,7 +127,7 @@ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid)
{
union upper_chunk *upper_chunk;
union lower_chunk *lower_chunk;
- unsigned long flags;
+ unsigned int seq;
unsigned int upper1;
unsigned int upper2;
unsigned int lower;
@@ -138,14 +139,16 @@ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid)
if (pid_split(pid, &upper1, &upper2, &lower) < 0)
return false;
- raw_spin_lock_irqsave(&pid_list->lock, flags);
- upper_chunk = pid_list->upper[upper1];
- if (upper_chunk) {
- lower_chunk = upper_chunk->data[upper2];
- if (lower_chunk)
- ret = test_bit(lower, lower_chunk->data);
- }
- raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+ do {
+ seq = read_seqcount_begin(&pid_list->seqcount);
+ ret = false;
+ upper_chunk = pid_list->upper[upper1];
+ if (upper_chunk) {
+ lower_chunk = upper_chunk->data[upper2];
+ if (lower_chunk)
+ ret = test_bit(lower, lower_chunk->data);
+ }
+ } while (read_seqcount_retry(&pid_list->seqcount, seq));
return ret;
}
@@ -178,6 +181,7 @@ int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid)
return -EINVAL;
raw_spin_lock_irqsave(&pid_list->lock, flags);
+ write_seqcount_begin(&pid_list->seqcount);
upper_chunk = pid_list->upper[upper1];
if (!upper_chunk) {
upper_chunk = get_upper_chunk(pid_list);
@@ -199,6 +203,7 @@ int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid)
set_bit(lower, lower_chunk->data);
ret = 0;
out:
+ write_seqcount_end(&pid_list->seqcount);
raw_spin_unlock_irqrestore(&pid_list->lock, flags);
return ret;
}
@@ -230,6 +235,7 @@ int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid)
return -EINVAL;
raw_spin_lock_irqsave(&pid_list->lock, flags);
+ write_seqcount_begin(&pid_list->seqcount);
upper_chunk = pid_list->upper[upper1];
if (!upper_chunk)
goto out;
@@ -250,6 +256,7 @@ int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid)
}
}
out:
+ write_seqcount_end(&pid_list->seqcount);
raw_spin_unlock_irqrestore(&pid_list->lock, flags);
return 0;
}
@@ -340,8 +347,10 @@ static void pid_list_refill_irq(struct irq_work *iwork)
again:
raw_spin_lock(&pid_list->lock);
+ write_seqcount_begin(&pid_list->seqcount);
upper_count = CHUNK_ALLOC - pid_list->free_upper_chunks;
lower_count = CHUNK_ALLOC - pid_list->free_lower_chunks;
+ write_seqcount_end(&pid_list->seqcount);
raw_spin_unlock(&pid_list->lock);
if (upper_count <= 0 && lower_count <= 0)
@@ -370,6 +379,7 @@ static void pid_list_refill_irq(struct irq_work *iwork)
}
raw_spin_lock(&pid_list->lock);
+ write_seqcount_begin(&pid_list->seqcount);
if (upper) {
*upper_next = pid_list->upper_list;
pid_list->upper_list = upper;
@@ -380,6 +390,7 @@ static void pid_list_refill_irq(struct irq_work *iwork)
pid_list->lower_list = lower;
pid_list->free_lower_chunks += lcnt;
}
+ write_seqcount_end(&pid_list->seqcount);
raw_spin_unlock(&pid_list->lock);
/*
@@ -419,6 +430,7 @@ struct trace_pid_list *trace_pid_list_alloc(void)
init_irq_work(&pid_list->refill_irqwork, pid_list_refill_irq);
raw_spin_lock_init(&pid_list->lock);
+ seqcount_raw_spinlock_init(&pid_list->seqcount, &pid_list->lock);
for (i = 0; i < CHUNK_ALLOC; i++) {
union upper_chunk *chunk;
diff --git a/kernel/trace/pid_list.h b/kernel/trace/pid_list.h
index 62e73f1ac85f..0b45fb0eadb9 100644
--- a/kernel/trace/pid_list.h
+++ b/kernel/trace/pid_list.h
@@ -76,6 +76,7 @@ union upper_chunk {
};
struct trace_pid_list {
+ seqcount_raw_spinlock_t seqcount;
raw_spinlock_t lock;
struct irq_work refill_irqwork;
union upper_chunk *upper[UPPER1_SIZE]; // 1 or 2K in size
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index afcd3747264d..41c9f5d079be 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -402,6 +402,41 @@ static void free_buffer_page(struct buffer_page *bpage)
}
/*
+ * For best performance, allocate cpu buffer data cache line sized
+ * and per CPU.
+ */
+#define alloc_cpu_buffer(cpu) (struct ring_buffer_per_cpu *) \
+ kzalloc_node(ALIGN(sizeof(struct ring_buffer_per_cpu), \
+ cache_line_size()), GFP_KERNEL, cpu_to_node(cpu));
+
+#define alloc_cpu_page(cpu) (struct buffer_page *) \
+ kzalloc_node(ALIGN(sizeof(struct buffer_page), \
+ cache_line_size()), GFP_KERNEL, cpu_to_node(cpu));
+
+static struct buffer_data_page *alloc_cpu_data(int cpu, int order)
+{
+ struct buffer_data_page *dpage;
+ struct page *page;
+ gfp_t mflags;
+
+ /*
+ * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails
+ * gracefully without invoking oom-killer and the system is not
+ * destabilized.
+ */
+ mflags = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_COMP | __GFP_ZERO;
+
+ page = alloc_pages_node(cpu_to_node(cpu), mflags, order);
+ if (!page)
+ return NULL;
+
+ dpage = page_address(page);
+ rb_init_page(dpage);
+
+ return dpage;
+}
+
+/*
* We need to fit the time_stamp delta into 27 bits.
*/
static inline bool test_time_stamp(u64 delta)
@@ -1735,7 +1770,7 @@ static bool rb_meta_init(struct trace_buffer *buffer, int scratch_size)
bmeta->total_size = total_size;
bmeta->buffers_offset = (void *)ptr - (void *)bmeta;
- /* Zero out the scatch pad */
+ /* Zero out the scratch pad */
memset((void *)bmeta + sizeof(*bmeta), 0, bmeta->buffers_offset - sizeof(*bmeta));
return false;
@@ -2204,7 +2239,6 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_cpu_meta *meta = NULL;
struct buffer_page *bpage, *tmp;
bool user_thread = current->mm != NULL;
- gfp_t mflags;
long i;
/*
@@ -2219,13 +2253,6 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
return -ENOMEM;
/*
- * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails
- * gracefully without invoking oom-killer and the system is not
- * destabilized.
- */
- mflags = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
-
- /*
* If a user thread allocates too much, and si_mem_available()
* reports there's enough memory, even though there is not.
* Make sure the OOM killer kills this thread. This can happen
@@ -2241,10 +2268,8 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
meta = rb_range_meta(buffer, nr_pages, cpu_buffer->cpu);
for (i = 0; i < nr_pages; i++) {
- struct page *page;
- bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
- mflags, cpu_to_node(cpu_buffer->cpu));
+ bpage = alloc_cpu_page(cpu_buffer->cpu);
if (!bpage)
goto free_pages;
@@ -2267,13 +2292,10 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
bpage->range = 1;
bpage->id = i + 1;
} else {
- page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
- mflags | __GFP_COMP | __GFP_ZERO,
- cpu_buffer->buffer->subbuf_order);
- if (!page)
+ int order = cpu_buffer->buffer->subbuf_order;
+ bpage->page = alloc_cpu_data(cpu_buffer->cpu, order);
+ if (!bpage->page)
goto free_pages;
- bpage->page = page_address(page);
- rb_init_page(bpage->page);
}
bpage->order = cpu_buffer->buffer->subbuf_order;
@@ -2324,14 +2346,12 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
static struct ring_buffer_per_cpu *
rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
{
- struct ring_buffer_per_cpu *cpu_buffer __free(kfree) = NULL;
+ struct ring_buffer_per_cpu *cpu_buffer __free(kfree) =
+ alloc_cpu_buffer(cpu);
struct ring_buffer_cpu_meta *meta;
struct buffer_page *bpage;
- struct page *page;
int ret;
- cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
- GFP_KERNEL, cpu_to_node(cpu));
if (!cpu_buffer)
return NULL;
@@ -2347,8 +2367,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
init_waitqueue_head(&cpu_buffer->irq_work.full_waiters);
mutex_init(&cpu_buffer->mapping_lock);
- bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
- GFP_KERNEL, cpu_to_node(cpu));
+ bpage = alloc_cpu_page(cpu);
if (!bpage)
return NULL;
@@ -2370,13 +2389,10 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
rb_meta_buffer_update(cpu_buffer, bpage);
bpage->range = 1;
} else {
- page = alloc_pages_node(cpu_to_node(cpu),
- GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
- cpu_buffer->buffer->subbuf_order);
- if (!page)
+ int order = cpu_buffer->buffer->subbuf_order;
+ bpage->page = alloc_cpu_data(cpu, order);
+ if (!bpage->page)
goto fail_free_reader;
- bpage->page = page_address(page);
- rb_init_page(bpage->page);
}
INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
@@ -6073,7 +6089,7 @@ static void rb_clear_buffer_page(struct buffer_page *page)
* id field, and updated via this function.
*
* But for a fixed memory mapped buffer, the id is already assigned for
- * fixed memory ording in the memory layout and can not be used. Instead
+ * fixed memory ordering in the memory layout and can not be used. Instead
* the index of where the page lies in the memory layout is used.
*
* For the normal pages, set the buffer page id with the passed in @id
@@ -6464,7 +6480,6 @@ ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
struct ring_buffer_per_cpu *cpu_buffer;
struct buffer_data_read_page *bpage = NULL;
unsigned long flags;
- struct page *page;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return ERR_PTR(-ENODEV);
@@ -6486,22 +6501,16 @@ ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
arch_spin_unlock(&cpu_buffer->lock);
local_irq_restore(flags);
- if (bpage->data)
- goto out;
-
- page = alloc_pages_node(cpu_to_node(cpu),
- GFP_KERNEL | __GFP_NORETRY | __GFP_COMP | __GFP_ZERO,
- cpu_buffer->buffer->subbuf_order);
- if (!page) {
- kfree(bpage);
- return ERR_PTR(-ENOMEM);
+ if (bpage->data) {
+ rb_init_page(bpage->data);
+ } else {
+ bpage->data = alloc_cpu_data(cpu, cpu_buffer->buffer->subbuf_order);
+ if (!bpage->data) {
+ kfree(bpage);
+ return ERR_PTR(-ENOMEM);
+ }
}
- bpage->data = page_address(page);
-
- out:
- rb_init_page(bpage->data);
-
return bpage;
}
EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
@@ -7660,7 +7669,7 @@ static __init int test_ringbuffer(void)
/*
* Show buffer is enabled before setting rb_test_started.
* Yes there's a small race window where events could be
- * dropped and the thread wont catch it. But when a ring
+ * dropped and the thread won't catch it. But when a ring
* buffer gets enabled, there will always be some kind of
* delay before other CPUs see it. Thus, we don't care about
* those dropped events. We care about events dropped after
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index cdc3aea12c93..593e3b59e42e 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -433,7 +433,7 @@ static int __init ring_buffer_benchmark_init(void)
{
int ret;
- /* make a one meg buffer in overwite mode */
+ /* make a one meg buffer in overwrite mode */
buffer = ring_buffer_alloc(1000000, RB_FL_OVERWRITE);
if (!buffer)
return -ENOMEM;
diff --git a/kernel/trace/rv/reactor_panic.c b/kernel/trace/rv/reactor_panic.c
index 74c6bcc2c749..76537b8a4343 100644
--- a/kernel/trace/rv/reactor_panic.c
+++ b/kernel/trace/rv/reactor_panic.c
@@ -13,13 +13,9 @@
#include <linux/init.h>
#include <linux/rv.h>
-__printf(1, 2) static void rv_panic_reaction(const char *msg, ...)
+__printf(1, 0) static void rv_panic_reaction(const char *msg, va_list args)
{
- va_list args;
-
- va_start(args, msg);
vpanic(msg, args);
- va_end(args);
}
static struct rv_reactor rv_panic = {
diff --git a/kernel/trace/rv/reactor_printk.c b/kernel/trace/rv/reactor_printk.c
index 2dae2916c05f..48c934e315b3 100644
--- a/kernel/trace/rv/reactor_printk.c
+++ b/kernel/trace/rv/reactor_printk.c
@@ -12,13 +12,9 @@
#include <linux/init.h>
#include <linux/rv.h>
-__printf(1, 2) static void rv_printk_reaction(const char *msg, ...)
+__printf(1, 0) static void rv_printk_reaction(const char *msg, va_list args)
{
- va_list args;
-
- va_start(args, msg);
vprintk_deferred(msg, args);
- va_end(args);
}
static struct rv_reactor rv_printk = {
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
index 43e9ea473cda..ee4e68102f17 100644
--- a/kernel/trace/rv/rv.c
+++ b/kernel/trace/rv/rv.c
@@ -375,15 +375,13 @@ static ssize_t monitor_enable_write_data(struct file *filp, const char __user *u
if (retval)
return retval;
- mutex_lock(&rv_interface_lock);
+ guard(mutex)(&rv_interface_lock);
if (val)
retval = rv_enable_monitor(mon);
else
retval = rv_disable_monitor(mon);
- mutex_unlock(&rv_interface_lock);
-
return retval ? : count;
}
@@ -422,35 +420,27 @@ static const struct file_operations interface_desc_fops = {
static int create_monitor_dir(struct rv_monitor *mon, struct rv_monitor *parent)
{
struct dentry *root = parent ? parent->root_d : get_monitors_root();
- const char *name = mon->name;
+ struct dentry *dir __free(rv_remove) = rv_create_dir(mon->name, root);
struct dentry *tmp;
int retval;
- mon->root_d = rv_create_dir(name, root);
- if (!mon->root_d)
+ if (!dir)
return -ENOMEM;
- tmp = rv_create_file("enable", RV_MODE_WRITE, mon->root_d, mon, &interface_enable_fops);
- if (!tmp) {
- retval = -ENOMEM;
- goto out_remove_root;
- }
+ tmp = rv_create_file("enable", RV_MODE_WRITE, dir, mon, &interface_enable_fops);
+ if (!tmp)
+ return -ENOMEM;
- tmp = rv_create_file("desc", RV_MODE_READ, mon->root_d, mon, &interface_desc_fops);
- if (!tmp) {
- retval = -ENOMEM;
- goto out_remove_root;
- }
+ tmp = rv_create_file("desc", RV_MODE_READ, dir, mon, &interface_desc_fops);
+ if (!tmp)
+ return -ENOMEM;
- retval = reactor_populate_monitor(mon);
+ retval = reactor_populate_monitor(mon, dir);
if (retval)
- goto out_remove_root;
+ return retval;
+ mon->root_d = no_free_ptr(dir);
return 0;
-
-out_remove_root:
- rv_remove(mon->root_d);
- return retval;
}
/*
@@ -568,7 +558,7 @@ static void disable_all_monitors(void)
struct rv_monitor *mon;
int enabled = 0;
- mutex_lock(&rv_interface_lock);
+ guard(mutex)(&rv_interface_lock);
list_for_each_entry(mon, &rv_monitors_list, list)
enabled += __rv_disable_monitor(mon, false);
@@ -581,8 +571,6 @@ static void disable_all_monitors(void)
*/
tracepoint_synchronize_unregister();
}
-
- mutex_unlock(&rv_interface_lock);
}
static int enabled_monitors_open(struct inode *inode, struct file *file)
@@ -623,7 +611,7 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user
if (!len)
return count;
- mutex_lock(&rv_interface_lock);
+ guard(mutex)(&rv_interface_lock);
retval = -EINVAL;
@@ -644,13 +632,11 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user
else
retval = rv_disable_monitor(mon);
- if (!retval)
- retval = count;
-
- break;
+ if (retval)
+ return retval;
+ return count;
}
- mutex_unlock(&rv_interface_lock);
return retval;
}
@@ -737,7 +723,7 @@ static ssize_t monitoring_on_write_data(struct file *filp, const char __user *us
if (retval)
return retval;
- mutex_lock(&rv_interface_lock);
+ guard(mutex)(&rv_interface_lock);
if (val)
turn_monitoring_on_with_reset();
@@ -750,8 +736,6 @@ static ssize_t monitoring_on_write_data(struct file *filp, const char __user *us
*/
tracepoint_synchronize_unregister();
- mutex_unlock(&rv_interface_lock);
-
return count;
}
@@ -784,28 +768,26 @@ int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent)
return -EINVAL;
}
- mutex_lock(&rv_interface_lock);
+ guard(mutex)(&rv_interface_lock);
list_for_each_entry(r, &rv_monitors_list, list) {
if (strcmp(monitor->name, r->name) == 0) {
pr_info("Monitor %s is already registered\n", monitor->name);
- retval = -EEXIST;
- goto out_unlock;
+ return -EEXIST;
}
}
if (parent && rv_is_nested_monitor(parent)) {
pr_info("Parent monitor %s is already nested, cannot nest further\n",
parent->name);
- retval = -EINVAL;
- goto out_unlock;
+ return -EINVAL;
}
monitor->parent = parent;
retval = create_monitor_dir(monitor, parent);
if (retval)
- goto out_unlock;
+ return retval;
/* keep children close to the parent for easier visualisation */
if (parent)
@@ -813,9 +795,7 @@ int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent)
else
list_add_tail(&monitor->list, &rv_monitors_list);
-out_unlock:
- mutex_unlock(&rv_interface_lock);
- return retval;
+ return 0;
}
/**
@@ -826,13 +806,12 @@ out_unlock:
*/
int rv_unregister_monitor(struct rv_monitor *monitor)
{
- mutex_lock(&rv_interface_lock);
+ guard(mutex)(&rv_interface_lock);
rv_disable_monitor(monitor);
list_del(&monitor->list);
destroy_monitor_dir(monitor);
- mutex_unlock(&rv_interface_lock);
return 0;
}
@@ -840,39 +819,36 @@ int __init rv_init_interface(void)
{
struct dentry *tmp;
int retval;
+ struct dentry *root_dir __free(rv_remove) = rv_create_dir("rv", NULL);
- rv_root.root_dir = rv_create_dir("rv", NULL);
- if (!rv_root.root_dir)
- goto out_err;
+ if (!root_dir)
+ return 1;
- rv_root.monitors_dir = rv_create_dir("monitors", rv_root.root_dir);
+ rv_root.monitors_dir = rv_create_dir("monitors", root_dir);
if (!rv_root.monitors_dir)
- goto out_err;
+ return 1;
- tmp = rv_create_file("available_monitors", RV_MODE_READ, rv_root.root_dir, NULL,
+ tmp = rv_create_file("available_monitors", RV_MODE_READ, root_dir, NULL,
&available_monitors_ops);
if (!tmp)
- goto out_err;
+ return 1;
- tmp = rv_create_file("enabled_monitors", RV_MODE_WRITE, rv_root.root_dir, NULL,
+ tmp = rv_create_file("enabled_monitors", RV_MODE_WRITE, root_dir, NULL,
&enabled_monitors_ops);
if (!tmp)
- goto out_err;
+ return 1;
- tmp = rv_create_file("monitoring_on", RV_MODE_WRITE, rv_root.root_dir, NULL,
+ tmp = rv_create_file("monitoring_on", RV_MODE_WRITE, root_dir, NULL,
&monitoring_on_fops);
if (!tmp)
- goto out_err;
- retval = init_rv_reactors(rv_root.root_dir);
+ return 1;
+ retval = init_rv_reactors(root_dir);
if (retval)
- goto out_err;
+ return 1;
turn_monitoring_on();
- return 0;
+ rv_root.root_dir = no_free_ptr(root_dir);
-out_err:
- rv_remove(rv_root.root_dir);
- printk(KERN_ERR "RV: Error while creating the RV interface\n");
- return 1;
+ return 0;
}
diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h
index 1485a70c1bf4..2c0f51ff9d5c 100644
--- a/kernel/trace/rv/rv.h
+++ b/kernel/trace/rv/rv.h
@@ -17,6 +17,8 @@ struct rv_interface {
#define rv_create_file tracefs_create_file
#define rv_remove tracefs_remove
+DEFINE_FREE(rv_remove, struct dentry *, if (_T) rv_remove(_T));
+
#define MAX_RV_MONITOR_NAME_SIZE 32
#define MAX_RV_REACTOR_NAME_SIZE 32
@@ -30,10 +32,10 @@ bool rv_is_container_monitor(struct rv_monitor *mon);
bool rv_is_nested_monitor(struct rv_monitor *mon);
#ifdef CONFIG_RV_REACTORS
-int reactor_populate_monitor(struct rv_monitor *mon);
+int reactor_populate_monitor(struct rv_monitor *mon, struct dentry *root);
int init_rv_reactors(struct dentry *root_dir);
#else
-static inline int reactor_populate_monitor(struct rv_monitor *mon)
+static inline int reactor_populate_monitor(struct rv_monitor *mon, struct dentry *root)
{
return 0;
}
diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c
index d32859fec238..460af07f7aba 100644
--- a/kernel/trace/rv/rv_reactors.c
+++ b/kernel/trace/rv/rv_reactors.c
@@ -61,6 +61,7 @@
* printk
*/
+#include <linux/lockdep.h>
#include <linux/slab.h>
#include "rv.h"
@@ -232,9 +233,7 @@ monitor_reactors_write(struct file *file, const char __user *user_buf,
seq_f = file->private_data;
mon = seq_f->private;
- mutex_lock(&rv_interface_lock);
-
- retval = -EINVAL;
+ guard(mutex)(&rv_interface_lock);
list_for_each_entry(reactor, &rv_reactors_list, list) {
if (strcmp(ptr, reactor->name) != 0)
@@ -242,13 +241,10 @@ monitor_reactors_write(struct file *file, const char __user *user_buf,
monitor_swap_reactors(mon, reactor);
- retval = count;
- break;
+ return count;
}
- mutex_unlock(&rv_interface_lock);
-
- return retval;
+ return -EINVAL;
}
/*
@@ -309,18 +305,14 @@ static int __rv_register_reactor(struct rv_reactor *reactor)
*/
int rv_register_reactor(struct rv_reactor *reactor)
{
- int retval = 0;
-
if (strlen(reactor->name) >= MAX_RV_REACTOR_NAME_SIZE) {
pr_info("Reactor %s has a name longer than %d\n",
reactor->name, MAX_RV_MONITOR_NAME_SIZE);
return -EINVAL;
}
- mutex_lock(&rv_interface_lock);
- retval = __rv_register_reactor(reactor);
- mutex_unlock(&rv_interface_lock);
- return retval;
+ guard(mutex)(&rv_interface_lock);
+ return __rv_register_reactor(reactor);
}
/**
@@ -331,9 +323,8 @@ int rv_register_reactor(struct rv_reactor *reactor)
*/
int rv_unregister_reactor(struct rv_reactor *reactor)
{
- mutex_lock(&rv_interface_lock);
+ guard(mutex)(&rv_interface_lock);
list_del(&reactor->list);
- mutex_unlock(&rv_interface_lock);
return 0;
}
@@ -347,7 +338,7 @@ static bool __read_mostly reacting_on;
*
* Returns 1 if on, 0 otherwise.
*/
-bool rv_reacting_on(void)
+static bool rv_reacting_on(void)
{
/* Ensures that concurrent monitors read consistent reacting_on */
smp_rmb();
@@ -389,7 +380,7 @@ static ssize_t reacting_on_write_data(struct file *filp, const char __user *user
if (retval)
return retval;
- mutex_lock(&rv_interface_lock);
+ guard(mutex)(&rv_interface_lock);
if (val)
turn_reacting_on();
@@ -402,8 +393,6 @@ static ssize_t reacting_on_write_data(struct file *filp, const char __user *user
*/
tracepoint_synchronize_unregister();
- mutex_unlock(&rv_interface_lock);
-
return count;
}
@@ -416,14 +405,15 @@ static const struct file_operations reacting_on_fops = {
/**
* reactor_populate_monitor - creates per monitor reactors file
* @mon: The monitor.
+ * @root: The directory of the monitor.
*
* Returns 0 if successful, error otherwise.
*/
-int reactor_populate_monitor(struct rv_monitor *mon)
+int reactor_populate_monitor(struct rv_monitor *mon, struct dentry *root)
{
struct dentry *tmp;
- tmp = rv_create_file("reactors", RV_MODE_WRITE, mon->root_d, mon, &monitor_reactors_ops);
+ tmp = rv_create_file("reactors", RV_MODE_WRITE, root, mon, &monitor_reactors_ops);
if (!tmp)
return -ENOMEM;
@@ -438,7 +428,7 @@ int reactor_populate_monitor(struct rv_monitor *mon)
/*
* Nop reactor register
*/
-__printf(1, 2) static void rv_nop_reaction(const char *msg, ...)
+__printf(1, 0) static void rv_nop_reaction(const char *msg, va_list args)
{
}
@@ -450,30 +440,42 @@ static struct rv_reactor rv_nop = {
int init_rv_reactors(struct dentry *root_dir)
{
- struct dentry *available, *reacting;
int retval;
- available = rv_create_file("available_reactors", RV_MODE_READ, root_dir, NULL,
- &available_reactors_ops);
- if (!available)
- goto out_err;
+ struct dentry *available __free(rv_remove) =
+ rv_create_file("available_reactors", RV_MODE_READ, root_dir,
+ NULL, &available_reactors_ops);
+
+ struct dentry *reacting __free(rv_remove) =
+ rv_create_file("reacting_on", RV_MODE_WRITE, root_dir, NULL, &reacting_on_fops);
- reacting = rv_create_file("reacting_on", RV_MODE_WRITE, root_dir, NULL, &reacting_on_fops);
- if (!reacting)
- goto rm_available;
+ if (!reacting || !available)
+ return -ENOMEM;
retval = __rv_register_reactor(&rv_nop);
if (retval)
- goto rm_reacting;
+ return retval;
turn_reacting_on();
+ retain_and_null_ptr(available);
+ retain_and_null_ptr(reacting);
return 0;
+}
+
+void rv_react(struct rv_monitor *monitor, const char *msg, ...)
+{
+ static DEFINE_WAIT_OVERRIDE_MAP(rv_react_map, LD_WAIT_FREE);
+ va_list args;
+
+ if (!rv_reacting_on() || !monitor->react)
+ return;
+
+ va_start(args, msg);
+
+ lock_map_acquire_try(&rv_react_map);
+ monitor->react(msg, args);
+ lock_map_release(&rv_react_map);
-rm_reacting:
- rv_remove(reacting);
-rm_available:
- rv_remove(available);
-out_err:
- return -ENOMEM;
+ va_end(args);
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 304e93597126..e575956ef9b5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -20,6 +20,7 @@
#include <linux/security.h>
#include <linux/seq_file.h>
#include <linux/irqflags.h>
+#include <linux/syscalls.h>
#include <linux/debugfs.h>
#include <linux/tracefs.h>
#include <linux/pagemap.h>
@@ -93,17 +94,13 @@ static bool tracepoint_printk_stop_on_boot __initdata;
static bool traceoff_after_boot __initdata;
static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key);
-/* For tracers that don't implement custom flags */
-static struct tracer_opt dummy_tracer_opt[] = {
- { }
+/* Store tracers and their flags per instance */
+struct tracers {
+ struct list_head list;
+ struct tracer *tracer;
+ struct tracer_flags *flags;
};
-static int
-dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
-{
- return 0;
-}
-
/*
* To prevent the comm cache from being overwritten when no
* tracing is active, only save the comm when a trace event
@@ -128,7 +125,7 @@ cpumask_var_t __read_mostly tracing_buffer_mask;
* If there is an oops (or kernel panic) and the ftrace_dump_on_oops
* is set, then ftrace_dump is called. This will output the contents
* of the ftrace buffers to the console. This is very useful for
- * capturing traces that lead to crashes and outputing it to a
+ * capturing traces that lead to crashes and outputting it to a
* serial console.
*
* It is default off, but you can enable it with either specifying
@@ -137,7 +134,7 @@ cpumask_var_t __read_mostly tracing_buffer_mask;
* Set 1 if you want to dump buffers of all CPUs
* Set 2 if you want to dump the buffer of the CPU that triggered oops
* Set instance name if you want to dump the specific trace instance
- * Multiple instance dump is also supported, and instances are seperated
+ * Multiple instance dump is also supported, and instances are separated
* by commas.
*/
/* Set to string format zero to disable by default */
@@ -512,22 +509,23 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_export);
/* trace_flags holds trace_options default values */
#define TRACE_DEFAULT_FLAGS \
- (FUNCTION_DEFAULT_FLAGS | \
- TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | \
- TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | \
- TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | \
- TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | \
- TRACE_ITER_HASH_PTR | TRACE_ITER_TRACE_PRINTK | \
- TRACE_ITER_COPY_MARKER)
+ (FUNCTION_DEFAULT_FLAGS | FPROFILE_DEFAULT_FLAGS | \
+ TRACE_ITER(PRINT_PARENT) | TRACE_ITER(PRINTK) | \
+ TRACE_ITER(ANNOTATE) | TRACE_ITER(CONTEXT_INFO) | \
+ TRACE_ITER(RECORD_CMD) | TRACE_ITER(OVERWRITE) | \
+ TRACE_ITER(IRQ_INFO) | TRACE_ITER(MARKERS) | \
+ TRACE_ITER(HASH_PTR) | TRACE_ITER(TRACE_PRINTK) | \
+ TRACE_ITER(COPY_MARKER))
/* trace_options that are only supported by global_trace */
-#define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \
- TRACE_ITER_PRINTK_MSGONLY | TRACE_ITER_RECORD_CMD)
+#define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER(PRINTK) | \
+ TRACE_ITER(PRINTK_MSGONLY) | TRACE_ITER(RECORD_CMD) | \
+ TRACE_ITER(PROF_TEXT_OFFSET) | FPROFILE_DEFAULT_FLAGS)
/* trace_flags that are default zero for instances */
#define ZEROED_TRACE_FLAGS \
- (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK | TRACE_ITER_TRACE_PRINTK | \
- TRACE_ITER_COPY_MARKER)
+ (TRACE_ITER(EVENT_FORK) | TRACE_ITER(FUNC_FORK) | TRACE_ITER(TRACE_PRINTK) | \
+ TRACE_ITER(COPY_MARKER))
/*
* The global_trace is the descriptor that holds the top-level tracing
@@ -558,9 +556,9 @@ static void update_printk_trace(struct trace_array *tr)
if (printk_trace == tr)
return;
- printk_trace->trace_flags &= ~TRACE_ITER_TRACE_PRINTK;
+ printk_trace->trace_flags &= ~TRACE_ITER(TRACE_PRINTK);
printk_trace = tr;
- tr->trace_flags |= TRACE_ITER_TRACE_PRINTK;
+ tr->trace_flags |= TRACE_ITER(TRACE_PRINTK);
}
/* Returns true if the status of tr changed */
@@ -573,7 +571,7 @@ static bool update_marker_trace(struct trace_array *tr, int enabled)
return false;
list_add_rcu(&tr->marker_list, &marker_copies);
- tr->trace_flags |= TRACE_ITER_COPY_MARKER;
+ tr->trace_flags |= TRACE_ITER(COPY_MARKER);
return true;
}
@@ -581,7 +579,7 @@ static bool update_marker_trace(struct trace_array *tr, int enabled)
return false;
list_del_init(&tr->marker_list);
- tr->trace_flags &= ~TRACE_ITER_COPY_MARKER;
+ tr->trace_flags &= ~TRACE_ITER(COPY_MARKER);
return true;
}
@@ -1139,7 +1137,7 @@ int __trace_array_puts(struct trace_array *tr, unsigned long ip,
unsigned int trace_ctx;
int alloc;
- if (!(tr->trace_flags & TRACE_ITER_PRINTK))
+ if (!(tr->trace_flags & TRACE_ITER(PRINTK)))
return 0;
if (unlikely(tracing_selftest_running && tr == &global_trace))
@@ -1205,7 +1203,7 @@ int __trace_bputs(unsigned long ip, const char *str)
if (!printk_binsafe(tr))
return __trace_puts(ip, str, strlen(str));
- if (!(tr->trace_flags & TRACE_ITER_PRINTK))
+ if (!(tr->trace_flags & TRACE_ITER(PRINTK)))
return 0;
if (unlikely(tracing_selftest_running || tracing_disabled))
@@ -2173,6 +2171,7 @@ static int save_selftest(struct tracer *type)
static int run_tracer_selftest(struct tracer *type)
{
struct trace_array *tr = &global_trace;
+ struct tracer_flags *saved_flags = tr->current_trace_flags;
struct tracer *saved_tracer = tr->current_trace;
int ret;
@@ -2203,6 +2202,7 @@ static int run_tracer_selftest(struct tracer *type)
tracing_reset_online_cpus(&tr->array_buffer);
tr->current_trace = type;
+ tr->current_trace_flags = type->flags ? : type->default_flags;
#ifdef CONFIG_TRACER_MAX_TRACE
if (type->use_max_tr) {
@@ -2219,6 +2219,7 @@ static int run_tracer_selftest(struct tracer *type)
ret = type->selftest(type, tr);
/* the test is responsible for resetting too */
tr->current_trace = saved_tracer;
+ tr->current_trace_flags = saved_flags;
if (ret) {
printk(KERN_CONT "FAILED!\n");
/* Add the warning after printing 'FAILED' */
@@ -2311,10 +2312,23 @@ static inline int do_run_tracer_selftest(struct tracer *type)
}
#endif /* CONFIG_FTRACE_STARTUP_TEST */
-static void add_tracer_options(struct trace_array *tr, struct tracer *t);
+static int add_tracer(struct trace_array *tr, struct tracer *t);
static void __init apply_trace_boot_options(void);
+static void free_tracers(struct trace_array *tr)
+{
+ struct tracers *t, *n;
+
+ lockdep_assert_held(&trace_types_lock);
+
+ list_for_each_entry_safe(t, n, &tr->tracers, list) {
+ list_del(&t->list);
+ kfree(t->flags);
+ kfree(t);
+ }
+}
+
/**
* register_tracer - register a tracer with the ftrace system.
* @type: the plugin for the tracer
@@ -2323,6 +2337,7 @@ static void __init apply_trace_boot_options(void);
*/
int __init register_tracer(struct tracer *type)
{
+ struct trace_array *tr;
struct tracer *t;
int ret = 0;
@@ -2354,31 +2369,25 @@ int __init register_tracer(struct tracer *type)
}
}
- if (!type->set_flag)
- type->set_flag = &dummy_set_flag;
- if (!type->flags) {
- /*allocate a dummy tracer_flags*/
- type->flags = kmalloc(sizeof(*type->flags), GFP_KERNEL);
- if (!type->flags) {
- ret = -ENOMEM;
- goto out;
- }
- type->flags->val = 0;
- type->flags->opts = dummy_tracer_opt;
- } else
- if (!type->flags->opts)
- type->flags->opts = dummy_tracer_opt;
-
/* store the tracer for __set_tracer_option */
- type->flags->trace = type;
+ if (type->flags)
+ type->flags->trace = type;
ret = do_run_tracer_selftest(type);
if (ret < 0)
goto out;
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ ret = add_tracer(tr, type);
+ if (ret < 0) {
+ /* The tracer will still exist but without options */
+ pr_warn("Failed to create tracer options for %s\n", type->name);
+ break;
+ }
+ }
+
type->next = trace_types;
trace_types = type;
- add_tracer_options(&global_trace, type);
out:
mutex_unlock(&trace_types_lock);
@@ -2391,7 +2400,7 @@ int __init register_tracer(struct tracer *type)
printk(KERN_INFO "Starting tracer '%s'\n", type->name);
/* Do we want this tracer to start on bootup? */
- tracing_set_tracer(&global_trace, type->name);
+ WARN_ON(tracing_set_tracer(&global_trace, type->name) < 0);
default_bootup_tracer = NULL;
apply_trace_boot_options();
@@ -3078,7 +3087,7 @@ static inline void ftrace_trace_stack(struct trace_array *tr,
unsigned int trace_ctx,
int skip, struct pt_regs *regs)
{
- if (!(tr->trace_flags & TRACE_ITER_STACKTRACE))
+ if (!(tr->trace_flags & TRACE_ITER(STACKTRACE)))
return;
__ftrace_trace_stack(tr, buffer, trace_ctx, skip, regs);
@@ -3139,7 +3148,7 @@ ftrace_trace_userstack(struct trace_array *tr,
struct ring_buffer_event *event;
struct userstack_entry *entry;
- if (!(tr->trace_flags & TRACE_ITER_USERSTACKTRACE))
+ if (!(tr->trace_flags & TRACE_ITER(USERSTACKTRACE)))
return;
/*
@@ -3484,7 +3493,7 @@ int trace_array_printk(struct trace_array *tr,
if (tr == &global_trace)
return 0;
- if (!(tr->trace_flags & TRACE_ITER_PRINTK))
+ if (!(tr->trace_flags & TRACE_ITER(PRINTK)))
return 0;
va_start(ap, fmt);
@@ -3521,7 +3530,7 @@ int trace_array_printk_buf(struct trace_buffer *buffer,
int ret;
va_list ap;
- if (!(printk_trace->trace_flags & TRACE_ITER_PRINTK))
+ if (!(printk_trace->trace_flags & TRACE_ITER(PRINTK)))
return 0;
va_start(ap, fmt);
@@ -3791,7 +3800,7 @@ const char *trace_event_format(struct trace_iterator *iter, const char *fmt)
if (WARN_ON_ONCE(!fmt))
return fmt;
- if (!iter->tr || iter->tr->trace_flags & TRACE_ITER_HASH_PTR)
+ if (!iter->tr || iter->tr->trace_flags & TRACE_ITER(HASH_PTR))
return fmt;
p = fmt;
@@ -4113,7 +4122,7 @@ static void print_event_info(struct array_buffer *buf, struct seq_file *m)
static void print_func_help_header(struct array_buffer *buf, struct seq_file *m,
unsigned int flags)
{
- bool tgid = flags & TRACE_ITER_RECORD_TGID;
+ bool tgid = flags & TRACE_ITER(RECORD_TGID);
print_event_info(buf, m);
@@ -4124,7 +4133,7 @@ static void print_func_help_header(struct array_buffer *buf, struct seq_file *m,
static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file *m,
unsigned int flags)
{
- bool tgid = flags & TRACE_ITER_RECORD_TGID;
+ bool tgid = flags & TRACE_ITER(RECORD_TGID);
static const char space[] = " ";
int prec = tgid ? 12 : 2;
@@ -4197,7 +4206,7 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
struct trace_seq *s = &iter->seq;
struct trace_array *tr = iter->tr;
- if (!(tr->trace_flags & TRACE_ITER_ANNOTATE))
+ if (!(tr->trace_flags & TRACE_ITER(ANNOTATE)))
return;
if (!(iter->iter_flags & TRACE_FILE_ANNOTATE))
@@ -4219,6 +4228,22 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
iter->cpu);
}
+#ifdef CONFIG_FTRACE_SYSCALLS
+static bool is_syscall_event(struct trace_event *event)
+{
+ return (event->funcs == &enter_syscall_print_funcs) ||
+ (event->funcs == &exit_syscall_print_funcs);
+
+}
+#define syscall_buf_size CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT
+#else
+static inline bool is_syscall_event(struct trace_event *event)
+{
+ return false;
+}
+#define syscall_buf_size 0
+#endif /* CONFIG_FTRACE_SYSCALLS */
+
static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
{
struct trace_array *tr = iter->tr;
@@ -4233,7 +4258,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
event = ftrace_find_event(entry->type);
- if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) {
+ if (tr->trace_flags & TRACE_ITER(CONTEXT_INFO)) {
if (iter->iter_flags & TRACE_FILE_LAT_FMT)
trace_print_lat_context(iter);
else
@@ -4244,17 +4269,19 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
return TRACE_TYPE_PARTIAL_LINE;
if (event) {
- if (tr->trace_flags & TRACE_ITER_FIELDS)
+ if (tr->trace_flags & TRACE_ITER(FIELDS))
return print_event_fields(iter, event);
/*
* For TRACE_EVENT() events, the print_fmt is not
* safe to use if the array has delta offsets
* Force printing via the fields.
*/
- if ((tr->text_delta) &&
- event->type > __TRACE_LAST_TYPE)
+ if ((tr->text_delta)) {
+ /* ftrace and system call events are still OK */
+ if ((event->type > __TRACE_LAST_TYPE) &&
+ !is_syscall_event(event))
return print_event_fields(iter, event);
-
+ }
return event->funcs->trace(iter, sym_flags, event);
}
@@ -4272,7 +4299,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
entry = iter->ent;
- if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO)
+ if (tr->trace_flags & TRACE_ITER(CONTEXT_INFO))
trace_seq_printf(s, "%d %d %llu ",
entry->pid, iter->cpu, iter->ts);
@@ -4298,7 +4325,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
entry = iter->ent;
- if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) {
+ if (tr->trace_flags & TRACE_ITER(CONTEXT_INFO)) {
SEQ_PUT_HEX_FIELD(s, entry->pid);
SEQ_PUT_HEX_FIELD(s, iter->cpu);
SEQ_PUT_HEX_FIELD(s, iter->ts);
@@ -4327,7 +4354,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
entry = iter->ent;
- if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) {
+ if (tr->trace_flags & TRACE_ITER(CONTEXT_INFO)) {
SEQ_PUT_FIELD(s, entry->pid);
SEQ_PUT_FIELD(s, iter->cpu);
SEQ_PUT_FIELD(s, iter->ts);
@@ -4398,27 +4425,27 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
}
if (iter->ent->type == TRACE_BPUTS &&
- trace_flags & TRACE_ITER_PRINTK &&
- trace_flags & TRACE_ITER_PRINTK_MSGONLY)
+ trace_flags & TRACE_ITER(PRINTK) &&
+ trace_flags & TRACE_ITER(PRINTK_MSGONLY))
return trace_print_bputs_msg_only(iter);
if (iter->ent->type == TRACE_BPRINT &&
- trace_flags & TRACE_ITER_PRINTK &&
- trace_flags & TRACE_ITER_PRINTK_MSGONLY)
+ trace_flags & TRACE_ITER(PRINTK) &&
+ trace_flags & TRACE_ITER(PRINTK_MSGONLY))
return trace_print_bprintk_msg_only(iter);
if (iter->ent->type == TRACE_PRINT &&
- trace_flags & TRACE_ITER_PRINTK &&
- trace_flags & TRACE_ITER_PRINTK_MSGONLY)
+ trace_flags & TRACE_ITER(PRINTK) &&
+ trace_flags & TRACE_ITER(PRINTK_MSGONLY))
return trace_print_printk_msg_only(iter);
- if (trace_flags & TRACE_ITER_BIN)
+ if (trace_flags & TRACE_ITER(BIN))
return print_bin_fmt(iter);
- if (trace_flags & TRACE_ITER_HEX)
+ if (trace_flags & TRACE_ITER(HEX))
return print_hex_fmt(iter);
- if (trace_flags & TRACE_ITER_RAW)
+ if (trace_flags & TRACE_ITER(RAW))
return print_raw_fmt(iter);
return print_trace_fmt(iter);
@@ -4436,7 +4463,7 @@ void trace_latency_header(struct seq_file *m)
if (iter->iter_flags & TRACE_FILE_LAT_FMT)
print_trace_header(m, iter);
- if (!(tr->trace_flags & TRACE_ITER_VERBOSE))
+ if (!(tr->trace_flags & TRACE_ITER(VERBOSE)))
print_lat_help_header(m);
}
@@ -4446,7 +4473,7 @@ void trace_default_header(struct seq_file *m)
struct trace_array *tr = iter->tr;
unsigned long trace_flags = tr->trace_flags;
- if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
+ if (!(trace_flags & TRACE_ITER(CONTEXT_INFO)))
return;
if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
@@ -4454,11 +4481,11 @@ void trace_default_header(struct seq_file *m)
if (trace_empty(iter))
return;
print_trace_header(m, iter);
- if (!(trace_flags & TRACE_ITER_VERBOSE))
+ if (!(trace_flags & TRACE_ITER(VERBOSE)))
print_lat_help_header(m);
} else {
- if (!(trace_flags & TRACE_ITER_VERBOSE)) {
- if (trace_flags & TRACE_ITER_IRQ_INFO)
+ if (!(trace_flags & TRACE_ITER(VERBOSE))) {
+ if (trace_flags & TRACE_ITER(IRQ_INFO))
print_func_help_header_irq(iter->array_buffer,
m, trace_flags);
else
@@ -4682,8 +4709,10 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
* If pause-on-trace is enabled, then stop the trace while
* dumping, unless this is the "snapshot" file
*/
- if (!iter->snapshot && (tr->trace_flags & TRACE_ITER_PAUSE_ON_TRACE))
+ if (!iter->snapshot && (tr->trace_flags & TRACE_ITER(PAUSE_ON_TRACE))) {
+ iter->iter_flags |= TRACE_FILE_PAUSE;
tracing_stop_tr(tr);
+ }
if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
for_each_tracing_cpu(cpu) {
@@ -4815,7 +4844,7 @@ static int tracing_release(struct inode *inode, struct file *file)
if (iter->trace && iter->trace->close)
iter->trace->close(iter);
- if (!iter->snapshot && tr->stop_count)
+ if (iter->iter_flags & TRACE_FILE_PAUSE)
/* reenable tracing if it was previously enabled */
tracing_start_tr(tr);
@@ -4876,7 +4905,7 @@ static int tracing_open(struct inode *inode, struct file *file)
iter = __tracing_open(inode, file, false);
if (IS_ERR(iter))
ret = PTR_ERR(iter);
- else if (tr->trace_flags & TRACE_ITER_LATENCY_FMT)
+ else if (tr->trace_flags & TRACE_ITER(LATENCY_FMT))
iter->iter_flags |= TRACE_FILE_LAT_FMT;
}
@@ -5139,21 +5168,26 @@ static int tracing_trace_options_show(struct seq_file *m, void *v)
{
struct tracer_opt *trace_opts;
struct trace_array *tr = m->private;
+ struct tracer_flags *flags;
u32 tracer_flags;
int i;
guard(mutex)(&trace_types_lock);
- tracer_flags = tr->current_trace->flags->val;
- trace_opts = tr->current_trace->flags->opts;
-
for (i = 0; trace_options[i]; i++) {
- if (tr->trace_flags & (1 << i))
+ if (tr->trace_flags & (1ULL << i))
seq_printf(m, "%s\n", trace_options[i]);
else
seq_printf(m, "no%s\n", trace_options[i]);
}
+ flags = tr->current_trace_flags;
+ if (!flags || !flags->opts)
+ return 0;
+
+ tracer_flags = flags->val;
+ trace_opts = flags->opts;
+
for (i = 0; trace_opts[i].name; i++) {
if (tracer_flags & trace_opts[i].bit)
seq_printf(m, "%s\n", trace_opts[i].name);
@@ -5169,9 +5203,10 @@ static int __set_tracer_option(struct trace_array *tr,
struct tracer_opt *opts, int neg)
{
struct tracer *trace = tracer_flags->trace;
- int ret;
+ int ret = 0;
- ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg);
+ if (trace->set_flag)
+ ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg);
if (ret)
return ret;
@@ -5185,37 +5220,41 @@ static int __set_tracer_option(struct trace_array *tr,
/* Try to assign a tracer specific option */
static int set_tracer_option(struct trace_array *tr, char *cmp, int neg)
{
- struct tracer *trace = tr->current_trace;
- struct tracer_flags *tracer_flags = trace->flags;
+ struct tracer_flags *tracer_flags = tr->current_trace_flags;
struct tracer_opt *opts = NULL;
int i;
+ if (!tracer_flags || !tracer_flags->opts)
+ return 0;
+
for (i = 0; tracer_flags->opts[i].name; i++) {
opts = &tracer_flags->opts[i];
if (strcmp(cmp, opts->name) == 0)
- return __set_tracer_option(tr, trace->flags, opts, neg);
+ return __set_tracer_option(tr, tracer_flags, opts, neg);
}
return -EINVAL;
}
/* Some tracers require overwrite to stay enabled */
-int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
+int trace_keep_overwrite(struct tracer *tracer, u64 mask, int set)
{
- if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set)
+ if (tracer->enabled && (mask & TRACE_ITER(OVERWRITE)) && !set)
return -1;
return 0;
}
-int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
+int set_tracer_flag(struct trace_array *tr, u64 mask, int enabled)
{
- if ((mask == TRACE_ITER_RECORD_TGID) ||
- (mask == TRACE_ITER_RECORD_CMD) ||
- (mask == TRACE_ITER_TRACE_PRINTK) ||
- (mask == TRACE_ITER_COPY_MARKER))
+ switch (mask) {
+ case TRACE_ITER(RECORD_TGID):
+ case TRACE_ITER(RECORD_CMD):
+ case TRACE_ITER(TRACE_PRINTK):
+ case TRACE_ITER(COPY_MARKER):
lockdep_assert_held(&event_mutex);
+ }
/* do nothing if flag is already set */
if (!!(tr->trace_flags & mask) == !!enabled)
@@ -5226,7 +5265,8 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
if (tr->current_trace->flag_changed(tr, mask, !!enabled))
return -EINVAL;
- if (mask == TRACE_ITER_TRACE_PRINTK) {
+ switch (mask) {
+ case TRACE_ITER(TRACE_PRINTK):
if (enabled) {
update_printk_trace(tr);
} else {
@@ -5238,50 +5278,64 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
return -EINVAL;
/*
* An instance must always have it set.
- * by default, that's the global_trace instane.
+ * by default, that's the global_trace instance.
*/
if (printk_trace == tr)
update_printk_trace(&global_trace);
}
- }
+ break;
- if (mask == TRACE_ITER_COPY_MARKER)
+ case TRACE_ITER(COPY_MARKER):
update_marker_trace(tr, enabled);
+ /* update_marker_trace updates the tr->trace_flags */
+ return 0;
+ }
if (enabled)
tr->trace_flags |= mask;
else
tr->trace_flags &= ~mask;
- if (mask == TRACE_ITER_RECORD_CMD)
+ switch (mask) {
+ case TRACE_ITER(RECORD_CMD):
trace_event_enable_cmd_record(enabled);
+ break;
- if (mask == TRACE_ITER_RECORD_TGID) {
+ case TRACE_ITER(RECORD_TGID):
if (trace_alloc_tgid_map() < 0) {
- tr->trace_flags &= ~TRACE_ITER_RECORD_TGID;
+ tr->trace_flags &= ~TRACE_ITER(RECORD_TGID);
return -ENOMEM;
}
trace_event_enable_tgid_record(enabled);
- }
+ break;
- if (mask == TRACE_ITER_EVENT_FORK)
+ case TRACE_ITER(EVENT_FORK):
trace_event_follow_fork(tr, enabled);
+ break;
- if (mask == TRACE_ITER_FUNC_FORK)
+ case TRACE_ITER(FUNC_FORK):
ftrace_pid_follow_fork(tr, enabled);
+ break;
- if (mask == TRACE_ITER_OVERWRITE) {
+ case TRACE_ITER(OVERWRITE):
ring_buffer_change_overwrite(tr->array_buffer.buffer, enabled);
#ifdef CONFIG_TRACER_MAX_TRACE
ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled);
#endif
- }
+ break;
- if (mask == TRACE_ITER_PRINTK) {
+ case TRACE_ITER(PRINTK):
trace_printk_start_stop_comm(enabled);
trace_printk_control(enabled);
+ break;
+
+#if defined(CONFIG_FUNCTION_PROFILER) && defined(CONFIG_FUNCTION_GRAPH_TRACER)
+ case TRACE_GRAPH_GRAPH_TIME:
+ ftrace_graph_graph_time_control(enabled);
+ break;
+#endif
}
return 0;
@@ -5311,7 +5365,7 @@ int trace_set_options(struct trace_array *tr, char *option)
if (ret < 0)
ret = set_tracer_option(tr, cmp, neg);
else
- ret = set_tracer_flag(tr, 1 << ret, !neg);
+ ret = set_tracer_flag(tr, 1ULL << ret, !neg);
mutex_unlock(&trace_types_lock);
mutex_unlock(&event_mutex);
@@ -6215,11 +6269,6 @@ int tracing_update_buffers(struct trace_array *tr)
return ret;
}
-struct trace_option_dentry;
-
-static void
-create_trace_option_files(struct trace_array *tr, struct tracer *tracer);
-
/*
* Used to clear out the tracer before deletion of an instance.
* Must have trace_types_lock held.
@@ -6235,26 +6284,15 @@ static void tracing_set_nop(struct trace_array *tr)
tr->current_trace->reset(tr);
tr->current_trace = &nop_trace;
+ tr->current_trace_flags = nop_trace.flags;
}
static bool tracer_options_updated;
-static void add_tracer_options(struct trace_array *tr, struct tracer *t)
-{
- /* Only enable if the directory has been created already. */
- if (!tr->dir && !(tr->flags & TRACE_ARRAY_FL_GLOBAL))
- return;
-
- /* Only create trace option files after update_tracer_options finish */
- if (!tracer_options_updated)
- return;
-
- create_trace_option_files(tr, t);
-}
-
int tracing_set_tracer(struct trace_array *tr, const char *buf)
{
- struct tracer *t;
+ struct tracer *trace = NULL;
+ struct tracers *t;
#ifdef CONFIG_TRACER_MAX_TRACE
bool had_max_tr;
#endif
@@ -6272,18 +6310,20 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
ret = 0;
}
- for (t = trace_types; t; t = t->next) {
- if (strcmp(t->name, buf) == 0)
+ list_for_each_entry(t, &tr->tracers, list) {
+ if (strcmp(t->tracer->name, buf) == 0) {
+ trace = t->tracer;
break;
+ }
}
- if (!t)
+ if (!trace)
return -EINVAL;
- if (t == tr->current_trace)
+ if (trace == tr->current_trace)
return 0;
#ifdef CONFIG_TRACER_SNAPSHOT
- if (t->use_max_tr) {
+ if (trace->use_max_tr) {
local_irq_disable();
arch_spin_lock(&tr->max_lock);
ret = tr->cond_snapshot ? -EBUSY : 0;
@@ -6294,14 +6334,14 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
}
#endif
/* Some tracers won't work on kernel command line */
- if (system_state < SYSTEM_RUNNING && t->noboot) {
+ if (system_state < SYSTEM_RUNNING && trace->noboot) {
pr_warn("Tracer '%s' is not allowed on command line, ignored\n",
- t->name);
+ trace->name);
return -EINVAL;
}
/* Some tracers are only allowed for the top level buffer */
- if (!trace_ok_for_array(t, tr))
+ if (!trace_ok_for_array(trace, tr))
return -EINVAL;
/* If trace pipe files are being read, we can't change the tracer */
@@ -6320,8 +6360,9 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
/* Current trace needs to be nop_trace before synchronize_rcu */
tr->current_trace = &nop_trace;
+ tr->current_trace_flags = nop_trace.flags;
- if (had_max_tr && !t->use_max_tr) {
+ if (had_max_tr && !trace->use_max_tr) {
/*
* We need to make sure that the update_max_tr sees that
* current_trace changed to nop_trace to keep it from
@@ -6334,7 +6375,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
tracing_disarm_snapshot(tr);
}
- if (!had_max_tr && t->use_max_tr) {
+ if (!had_max_tr && trace->use_max_tr) {
ret = tracing_arm_snapshot_locked(tr);
if (ret)
return ret;
@@ -6343,18 +6384,21 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
tr->current_trace = &nop_trace;
#endif
- if (t->init) {
- ret = tracer_init(t, tr);
+ tr->current_trace_flags = t->flags ? : t->tracer->flags;
+
+ if (trace->init) {
+ ret = tracer_init(trace, tr);
if (ret) {
#ifdef CONFIG_TRACER_MAX_TRACE
- if (t->use_max_tr)
+ if (trace->use_max_tr)
tracing_disarm_snapshot(tr);
#endif
+ tr->current_trace_flags = nop_trace.flags;
return ret;
}
}
- tr->current_trace = t;
+ tr->current_trace = trace;
tr->current_trace->enabled++;
trace_branch_enable(tr);
@@ -6532,7 +6576,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
/* trace pipe does not show start of buffer */
cpumask_setall(iter->started);
- if (tr->trace_flags & TRACE_ITER_LATENCY_FMT)
+ if (tr->trace_flags & TRACE_ITER(LATENCY_FMT))
iter->iter_flags |= TRACE_FILE_LAT_FMT;
/* Output in nanoseconds only if we are using a clock in nanoseconds. */
@@ -6593,7 +6637,7 @@ trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_tabl
if (trace_buffer_iter(iter, iter->cpu_file))
return EPOLLIN | EPOLLRDNORM;
- if (tr->trace_flags & TRACE_ITER_BLOCK)
+ if (tr->trace_flags & TRACE_ITER(BLOCK))
/*
* Always select as readable when in blocking mode
*/
@@ -6912,6 +6956,43 @@ out_err:
}
static ssize_t
+tracing_syscall_buf_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct inode *inode = file_inode(filp);
+ struct trace_array *tr = inode->i_private;
+ char buf[64];
+ int r;
+
+ r = snprintf(buf, 64, "%d\n", tr->syscall_buf_sz);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+tracing_syscall_buf_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct inode *inode = file_inode(filp);
+ struct trace_array *tr = inode->i_private;
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ if (val > SYSCALL_FAULT_USER_MAX)
+ val = SYSCALL_FAULT_USER_MAX;
+
+ tr->syscall_buf_sz = val;
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static ssize_t
tracing_entries_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
@@ -7145,7 +7226,7 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
struct trace_array *tr = inode->i_private;
/* disable tracing ? */
- if (tr->trace_flags & TRACE_ITER_STOP_ON_FREE)
+ if (tr->trace_flags & TRACE_ITER(STOP_ON_FREE))
tracer_tracing_off(tr);
/* resize the ring buffer to 0 */
tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS);
@@ -7223,52 +7304,43 @@ struct trace_user_buf {
char *buf;
};
-struct trace_user_buf_info {
- struct trace_user_buf __percpu *tbuf;
- int ref;
-};
-
-
static DEFINE_MUTEX(trace_user_buffer_mutex);
static struct trace_user_buf_info *trace_user_buffer;
-static void trace_user_fault_buffer_free(struct trace_user_buf_info *tinfo)
+/**
+ * trace_user_fault_destroy - free up allocated memory of a trace user buffer
+ * @tinfo: The descriptor to free up
+ *
+ * Frees any data allocated in the trace info dsecriptor.
+ */
+void trace_user_fault_destroy(struct trace_user_buf_info *tinfo)
{
char *buf;
int cpu;
+ if (!tinfo || !tinfo->tbuf)
+ return;
+
for_each_possible_cpu(cpu) {
buf = per_cpu_ptr(tinfo->tbuf, cpu)->buf;
kfree(buf);
}
free_percpu(tinfo->tbuf);
- kfree(tinfo);
}
-static int trace_user_fault_buffer_enable(void)
+static int user_fault_buffer_enable(struct trace_user_buf_info *tinfo, size_t size)
{
- struct trace_user_buf_info *tinfo;
char *buf;
int cpu;
- guard(mutex)(&trace_user_buffer_mutex);
-
- if (trace_user_buffer) {
- trace_user_buffer->ref++;
- return 0;
- }
-
- tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
- if (!tinfo)
- return -ENOMEM;
+ lockdep_assert_held(&trace_user_buffer_mutex);
tinfo->tbuf = alloc_percpu(struct trace_user_buf);
- if (!tinfo->tbuf) {
- kfree(tinfo);
+ if (!tinfo->tbuf)
return -ENOMEM;
- }
tinfo->ref = 1;
+ tinfo->size = size;
/* Clear each buffer in case of error */
for_each_possible_cpu(cpu) {
@@ -7276,42 +7348,165 @@ static int trace_user_fault_buffer_enable(void)
}
for_each_possible_cpu(cpu) {
- buf = kmalloc_node(TRACE_MARKER_MAX_SIZE, GFP_KERNEL,
+ buf = kmalloc_node(size, GFP_KERNEL,
cpu_to_node(cpu));
- if (!buf) {
- trace_user_fault_buffer_free(tinfo);
+ if (!buf)
return -ENOMEM;
- }
per_cpu_ptr(tinfo->tbuf, cpu)->buf = buf;
}
- trace_user_buffer = tinfo;
-
return 0;
}
-static void trace_user_fault_buffer_disable(void)
+/* For internal use. Free and reinitialize */
+static void user_buffer_free(struct trace_user_buf_info **tinfo)
{
- struct trace_user_buf_info *tinfo;
+ lockdep_assert_held(&trace_user_buffer_mutex);
- guard(mutex)(&trace_user_buffer_mutex);
+ trace_user_fault_destroy(*tinfo);
+ kfree(*tinfo);
+ *tinfo = NULL;
+}
+
+/* For internal use. Initialize and allocate */
+static int user_buffer_init(struct trace_user_buf_info **tinfo, size_t size)
+{
+ bool alloc = false;
+ int ret;
+
+ lockdep_assert_held(&trace_user_buffer_mutex);
- tinfo = trace_user_buffer;
+ if (!*tinfo) {
+ alloc = true;
+ *tinfo = kzalloc(sizeof(**tinfo), GFP_KERNEL);
+ if (!*tinfo)
+ return -ENOMEM;
+ }
+
+ ret = user_fault_buffer_enable(*tinfo, size);
+ if (ret < 0 && alloc)
+ user_buffer_free(tinfo);
+
+ return ret;
+}
- if (WARN_ON_ONCE(!tinfo))
+/* For internal use, derefrence and free if necessary */
+static void user_buffer_put(struct trace_user_buf_info **tinfo)
+{
+ guard(mutex)(&trace_user_buffer_mutex);
+
+ if (WARN_ON_ONCE(!*tinfo || !(*tinfo)->ref))
return;
- if (--tinfo->ref)
+ if (--(*tinfo)->ref)
return;
- trace_user_fault_buffer_free(tinfo);
- trace_user_buffer = NULL;
+ user_buffer_free(tinfo);
}
-/* Must be called with preemption disabled */
-static char *trace_user_fault_read(struct trace_user_buf_info *tinfo,
- const char __user *ptr, size_t size,
- size_t *read_size)
+/**
+ * trace_user_fault_init - Allocated or reference a per CPU buffer
+ * @tinfo: A pointer to the trace buffer descriptor
+ * @size: The size to allocate each per CPU buffer
+ *
+ * Create a per CPU buffer that can be used to copy from user space
+ * in a task context. When calling trace_user_fault_read(), preemption
+ * must be disabled, and it will enable preemption and copy user
+ * space data to the buffer. If any schedule switches occur, it will
+ * retry until it succeeds without a schedule switch knowing the buffer
+ * is still valid.
+ *
+ * Returns 0 on success, negative on failure.
+ */
+int trace_user_fault_init(struct trace_user_buf_info *tinfo, size_t size)
+{
+ int ret;
+
+ if (!tinfo)
+ return -EINVAL;
+
+ guard(mutex)(&trace_user_buffer_mutex);
+
+ ret = user_buffer_init(&tinfo, size);
+ if (ret < 0)
+ trace_user_fault_destroy(tinfo);
+
+ return ret;
+}
+
+/**
+ * trace_user_fault_get - up the ref count for the user buffer
+ * @tinfo: A pointer to a pointer to the trace buffer descriptor
+ *
+ * Ups the ref count of the trace buffer.
+ *
+ * Returns the new ref count.
+ */
+int trace_user_fault_get(struct trace_user_buf_info *tinfo)
+{
+ if (!tinfo)
+ return -1;
+
+ guard(mutex)(&trace_user_buffer_mutex);
+
+ tinfo->ref++;
+ return tinfo->ref;
+}
+
+/**
+ * trace_user_fault_put - dereference a per cpu trace buffer
+ * @tinfo: The @tinfo that was passed to trace_user_fault_get()
+ *
+ * Decrement the ref count of @tinfo.
+ *
+ * Returns the new refcount (negative on error).
+ */
+int trace_user_fault_put(struct trace_user_buf_info *tinfo)
+{
+ guard(mutex)(&trace_user_buffer_mutex);
+
+ if (WARN_ON_ONCE(!tinfo || !tinfo->ref))
+ return -1;
+
+ --tinfo->ref;
+ return tinfo->ref;
+}
+
+/**
+ * trace_user_fault_read - Read user space into a per CPU buffer
+ * @tinfo: The @tinfo allocated by trace_user_fault_get()
+ * @ptr: The user space pointer to read
+ * @size: The size of user space to read.
+ * @copy_func: Optional function to use to copy from user space
+ * @data: Data to pass to copy_func if it was supplied
+ *
+ * Preemption must be disabled when this is called, and must not
+ * be enabled while using the returned buffer.
+ * This does the copying from user space into a per CPU buffer.
+ *
+ * The @size must not be greater than the size passed in to
+ * trace_user_fault_init().
+ *
+ * If @copy_func is NULL, trace_user_fault_read() will use copy_from_user(),
+ * otherwise it will call @copy_func. It will call @copy_func with:
+ *
+ * buffer: the per CPU buffer of the @tinfo.
+ * ptr: The pointer @ptr to user space to read
+ * size: The @size of the ptr to read
+ * data: The @data parameter
+ *
+ * It is expected that @copy_func will return 0 on success and non zero
+ * if there was a fault.
+ *
+ * Returns a pointer to the buffer with the content read from @ptr.
+ * Preemption must remain disabled while the caller accesses the
+ * buffer returned by this function.
+ * Returns NULL if there was a fault, or the size passed in is
+ * greater than the size passed to trace_user_fault_init().
+ */
+char *trace_user_fault_read(struct trace_user_buf_info *tinfo,
+ const char __user *ptr, size_t size,
+ trace_user_buf_copy copy_func, void *data)
{
int cpu = smp_processor_id();
char *buffer = per_cpu_ptr(tinfo->tbuf, cpu)->buf;
@@ -7319,9 +7514,14 @@ static char *trace_user_fault_read(struct trace_user_buf_info *tinfo,
int trys = 0;
int ret;
- if (size > TRACE_MARKER_MAX_SIZE)
- size = TRACE_MARKER_MAX_SIZE;
- *read_size = 0;
+ lockdep_assert_preemption_disabled();
+
+ /*
+ * It's up to the caller to not try to copy more than it said
+ * it would.
+ */
+ if (size > tinfo->size)
+ return NULL;
/*
* This acts similar to a seqcount. The per CPU context switches are
@@ -7356,12 +7556,19 @@ static char *trace_user_fault_read(struct trace_user_buf_info *tinfo,
migrate_disable();
/*
- * Now preemption is being enabed and another task can come in
+ * Now preemption is being enabled and another task can come in
* and use the same buffer and corrupt our data.
*/
preempt_enable_notrace();
- ret = __copy_from_user(buffer, ptr, size);
+ /* Make sure preemption is enabled here */
+ lockdep_assert_preemption_enabled();
+
+ if (copy_func) {
+ ret = copy_func(buffer, ptr, size, data);
+ } else {
+ ret = __copy_from_user(buffer, ptr, size);
+ }
preempt_disable_notrace();
migrate_enable();
@@ -7378,7 +7585,6 @@ static char *trace_user_fault_read(struct trace_user_buf_info *tinfo,
*/
} while (nr_context_switches_cpu(cpu) != cnt);
- *read_size = size;
return buffer;
}
@@ -7389,13 +7595,12 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
struct trace_array *tr = filp->private_data;
ssize_t written = -ENODEV;
unsigned long ip;
- size_t size;
char *buf;
if (tracing_disabled)
return -EINVAL;
- if (!(tr->trace_flags & TRACE_ITER_MARKERS))
+ if (!(tr->trace_flags & TRACE_ITER(MARKERS)))
return -EINVAL;
if ((ssize_t)cnt < 0)
@@ -7407,13 +7612,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
/* Must have preemption disabled while having access to the buffer */
guard(preempt_notrace)();
- buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, &size);
+ buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, NULL, NULL);
if (!buf)
return -EFAULT;
- if (cnt > size)
- cnt = size;
-
/* The selftests expect this function to be the IP address */
ip = _THIS_IP_;
@@ -7442,7 +7644,7 @@ static ssize_t write_raw_marker_to_buffer(struct trace_array *tr,
size_t size;
/* cnt includes both the entry->id and the data behind it. */
- size = struct_size(entry, buf, cnt - sizeof(entry->id));
+ size = struct_offset(entry, id) + cnt;
buffer = tr->array_buffer.buffer;
@@ -7473,30 +7675,29 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
{
struct trace_array *tr = filp->private_data;
ssize_t written = -ENODEV;
- size_t size;
char *buf;
if (tracing_disabled)
return -EINVAL;
- if (!(tr->trace_flags & TRACE_ITER_MARKERS))
+ if (!(tr->trace_flags & TRACE_ITER(MARKERS)))
return -EINVAL;
/* The marker must at least have a tag id */
if (cnt < sizeof(unsigned int))
return -EINVAL;
+ /* raw write is all or nothing */
+ if (cnt > TRACE_MARKER_MAX_SIZE)
+ return -EINVAL;
+
/* Must have preemption disabled while having access to the buffer */
guard(preempt_notrace)();
- buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, &size);
+ buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, NULL, NULL);
if (!buf)
return -EFAULT;
- /* raw write is all or nothing */
- if (cnt > size)
- return -EINVAL;
-
/* The global trace_marker_raw can go to multiple instances */
if (tr == &global_trace) {
guard(rcu)();
@@ -7516,20 +7717,26 @@ static int tracing_mark_open(struct inode *inode, struct file *filp)
{
int ret;
- ret = trace_user_fault_buffer_enable();
- if (ret < 0)
- return ret;
+ scoped_guard(mutex, &trace_user_buffer_mutex) {
+ if (!trace_user_buffer) {
+ ret = user_buffer_init(&trace_user_buffer, TRACE_MARKER_MAX_SIZE);
+ if (ret < 0)
+ return ret;
+ } else {
+ trace_user_buffer->ref++;
+ }
+ }
stream_open(inode, filp);
ret = tracing_open_generic_tr(inode, filp);
if (ret < 0)
- trace_user_fault_buffer_disable();
+ user_buffer_put(&trace_user_buffer);
return ret;
}
static int tracing_mark_release(struct inode *inode, struct file *file)
{
- trace_user_fault_buffer_disable();
+ user_buffer_put(&trace_user_buffer);
return tracing_release_generic_tr(inode, file);
}
@@ -7917,6 +8124,14 @@ static const struct file_operations tracing_entries_fops = {
.release = tracing_release_generic_tr,
};
+static const struct file_operations tracing_syscall_buf_fops = {
+ .open = tracing_open_generic_tr,
+ .read = tracing_syscall_buf_read,
+ .write = tracing_syscall_buf_write,
+ .llseek = generic_file_llseek,
+ .release = tracing_release_generic_tr,
+};
+
static const struct file_operations tracing_buffer_meta_fops = {
.open = tracing_buffer_meta_open,
.read = seq_read,
@@ -8801,8 +9016,8 @@ static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma)
struct trace_iterator *iter = &info->iter;
int ret = 0;
- /* A memmap'ed buffer is not supported for user space mmap */
- if (iter->tr->flags & TRACE_ARRAY_FL_MEMMAP)
+ /* A memmap'ed and backup buffers are not supported for user space mmap */
+ if (iter->tr->flags & (TRACE_ARRAY_FL_MEMMAP | TRACE_ARRAY_FL_VMALLOC))
return -ENODEV;
ret = get_snapshot_map(iter->tr);
@@ -9315,7 +9530,7 @@ trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt,
get_tr_index(tr_index, &tr, &index);
- if (tr->trace_flags & (1 << index))
+ if (tr->trace_flags & (1ULL << index))
buf = "1\n";
else
buf = "0\n";
@@ -9344,7 +9559,7 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
mutex_lock(&event_mutex);
mutex_lock(&trace_types_lock);
- ret = set_tracer_flag(tr, 1 << index, val);
+ ret = set_tracer_flag(tr, 1ULL << index, val);
mutex_unlock(&trace_types_lock);
mutex_unlock(&event_mutex);
@@ -9417,39 +9632,19 @@ create_trace_option_file(struct trace_array *tr,
topt->entry = trace_create_file(opt->name, TRACE_MODE_WRITE,
t_options, topt, &trace_options_fops);
-
}
-static void
-create_trace_option_files(struct trace_array *tr, struct tracer *tracer)
+static int
+create_trace_option_files(struct trace_array *tr, struct tracer *tracer,
+ struct tracer_flags *flags)
{
struct trace_option_dentry *topts;
struct trace_options *tr_topts;
- struct tracer_flags *flags;
struct tracer_opt *opts;
int cnt;
- int i;
-
- if (!tracer)
- return;
-
- flags = tracer->flags;
if (!flags || !flags->opts)
- return;
-
- /*
- * If this is an instance, only create flags for tracers
- * the instance may have.
- */
- if (!trace_ok_for_array(tracer, tr))
- return;
-
- for (i = 0; i < tr->nr_topts; i++) {
- /* Make sure there's no duplicate flags. */
- if (WARN_ON_ONCE(tr->topts[i].tracer->flags == tracer->flags))
- return;
- }
+ return 0;
opts = flags->opts;
@@ -9458,13 +9653,13 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer)
topts = kcalloc(cnt + 1, sizeof(*topts), GFP_KERNEL);
if (!topts)
- return;
+ return 0;
tr_topts = krealloc(tr->topts, sizeof(*tr->topts) * (tr->nr_topts + 1),
GFP_KERNEL);
if (!tr_topts) {
kfree(topts);
- return;
+ return -ENOMEM;
}
tr->topts = tr_topts;
@@ -9479,6 +9674,97 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer)
"Failed to create trace option: %s",
opts[cnt].name);
}
+ return 0;
+}
+
+static int get_global_flags_val(struct tracer *tracer)
+{
+ struct tracers *t;
+
+ list_for_each_entry(t, &global_trace.tracers, list) {
+ if (t->tracer != tracer)
+ continue;
+ if (!t->flags)
+ return -1;
+ return t->flags->val;
+ }
+ return -1;
+}
+
+static int add_tracer_options(struct trace_array *tr, struct tracers *t)
+{
+ struct tracer *tracer = t->tracer;
+ struct tracer_flags *flags = t->flags ?: tracer->flags;
+
+ if (!flags)
+ return 0;
+
+ /* Only add tracer options after update_tracer_options finish */
+ if (!tracer_options_updated)
+ return 0;
+
+ return create_trace_option_files(tr, tracer, flags);
+}
+
+static int add_tracer(struct trace_array *tr, struct tracer *tracer)
+{
+ struct tracer_flags *flags;
+ struct tracers *t;
+ int ret;
+
+ /* Only enable if the directory has been created already. */
+ if (!tr->dir && !(tr->flags & TRACE_ARRAY_FL_GLOBAL))
+ return 0;
+
+ /*
+ * If this is an instance, only create flags for tracers
+ * the instance may have.
+ */
+ if (!trace_ok_for_array(tracer, tr))
+ return 0;
+
+ t = kmalloc(sizeof(*t), GFP_KERNEL);
+ if (!t)
+ return -ENOMEM;
+
+ t->tracer = tracer;
+ t->flags = NULL;
+ list_add(&t->list, &tr->tracers);
+
+ flags = tracer->flags;
+ if (!flags) {
+ if (!tracer->default_flags)
+ return 0;
+
+ /*
+ * If the tracer defines default flags, it means the flags are
+ * per trace instance.
+ */
+ flags = kmalloc(sizeof(*flags), GFP_KERNEL);
+ if (!flags)
+ return -ENOMEM;
+
+ *flags = *tracer->default_flags;
+ flags->trace = tracer;
+
+ t->flags = flags;
+
+ /* If this is an instance, inherit the global_trace flags */
+ if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) {
+ int val = get_global_flags_val(tracer);
+ if (!WARN_ON_ONCE(val < 0))
+ flags->val = val;
+ }
+ }
+
+ ret = add_tracer_options(tr, t);
+ if (ret < 0) {
+ list_del(&t->list);
+ kfree(t->flags);
+ kfree(t);
+ }
+
+ return ret;
}
static struct dentry *
@@ -9508,8 +9794,9 @@ static void create_trace_options_dir(struct trace_array *tr)
for (i = 0; trace_options[i]; i++) {
if (top_level ||
- !((1 << i) & TOP_LEVEL_TRACE_FLAGS))
+ !((1ULL << i) & TOP_LEVEL_TRACE_FLAGS)) {
create_trace_option_core_file(tr, trace_options[i], i);
+ }
}
}
@@ -9830,7 +10117,7 @@ allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size
struct trace_scratch *tscratch;
unsigned int scratch_size = 0;
- rb_flags = tr->trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
+ rb_flags = tr->trace_flags & TRACE_ITER(OVERWRITE) ? RB_FL_OVERWRITE : 0;
buf->tr = tr;
@@ -9928,19 +10215,39 @@ static void init_trace_flags_index(struct trace_array *tr)
tr->trace_flags_index[i] = i;
}
-static void __update_tracer_options(struct trace_array *tr)
+static int __update_tracer(struct trace_array *tr)
{
struct tracer *t;
+ int ret = 0;
+
+ for (t = trace_types; t && !ret; t = t->next)
+ ret = add_tracer(tr, t);
- for (t = trace_types; t; t = t->next)
- add_tracer_options(tr, t);
+ return ret;
+}
+
+static __init int __update_tracer_options(struct trace_array *tr)
+{
+ struct tracers *t;
+ int ret = 0;
+
+ list_for_each_entry(t, &tr->tracers, list) {
+ ret = add_tracer_options(tr, t);
+ if (ret < 0)
+ break;
+ }
+
+ return ret;
}
-static void update_tracer_options(struct trace_array *tr)
+static __init void update_tracer_options(void)
{
+ struct trace_array *tr;
+
guard(mutex)(&trace_types_lock);
tracer_options_updated = true;
- __update_tracer_options(tr);
+ list_for_each_entry(tr, &ftrace_trace_arrays, list)
+ __update_tracer_options(tr);
}
/* Must have trace_types_lock held */
@@ -9985,9 +10292,13 @@ static int trace_array_create_dir(struct trace_array *tr)
}
init_tracer_tracefs(tr, tr->dir);
- __update_tracer_options(tr);
-
- return ret;
+ ret = __update_tracer(tr);
+ if (ret) {
+ event_trace_del_tracer(tr);
+ tracefs_remove(tr->dir);
+ return ret;
+ }
+ return 0;
}
static struct trace_array *
@@ -10029,16 +10340,20 @@ trace_array_create_systems(const char *name, const char *systems,
raw_spin_lock_init(&tr->start_lock);
+ tr->syscall_buf_sz = global_trace.syscall_buf_sz;
+
tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
#ifdef CONFIG_TRACER_MAX_TRACE
spin_lock_init(&tr->snapshot_trigger_lock);
#endif
tr->current_trace = &nop_trace;
+ tr->current_trace_flags = nop_trace.flags;
INIT_LIST_HEAD(&tr->systems);
INIT_LIST_HEAD(&tr->events);
INIT_LIST_HEAD(&tr->hist_vars);
INIT_LIST_HEAD(&tr->err_log);
+ INIT_LIST_HEAD(&tr->tracers);
INIT_LIST_HEAD(&tr->marker_list);
#ifdef CONFIG_MODULES
@@ -10193,7 +10508,7 @@ static int __remove_instance(struct trace_array *tr)
/* Disable all the flags that were enabled coming in */
for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) {
if ((1 << i) & ZEROED_TRACE_FLAGS)
- set_tracer_flag(tr, 1 << i, 0);
+ set_tracer_flag(tr, 1ULL << i, 0);
}
if (printk_trace == tr)
@@ -10211,11 +10526,14 @@ static int __remove_instance(struct trace_array *tr)
free_percpu(tr->last_func_repeats);
free_trace_buffers(tr);
clear_tracing_err_log(tr);
+ free_tracers(tr);
if (tr->range_name) {
reserve_mem_release_by_name(tr->range_name);
kfree(tr->range_name);
}
+ if (tr->flags & TRACE_ARRAY_FL_VMALLOC)
+ vfree((void *)tr->range_addr_start);
for (i = 0; i < tr->nr_topts; i++) {
kfree(tr->topts[i].topts);
@@ -10345,6 +10663,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
trace_create_file("buffer_subbuf_size_kb", TRACE_MODE_WRITE, d_tracer,
tr, &buffer_subbuf_size_fops);
+ trace_create_file("syscall_user_buf_size", TRACE_MODE_WRITE, d_tracer,
+ tr, &tracing_syscall_buf_fops);
+
create_trace_options_dir(tr);
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -10630,7 +10951,7 @@ static __init void tracer_init_tracefs_work_func(struct work_struct *work)
create_trace_instances(NULL);
- update_tracer_options(&global_trace);
+ update_tracer_options();
}
static __init int tracer_init_tracefs(void)
@@ -10650,7 +10971,8 @@ static __init int tracer_init_tracefs(void)
tracer_init_tracefs_work_func(NULL);
}
- rv_init_interface();
+ if (rv_init_interface())
+ pr_err("RV: Error while creating the RV interface\n");
return 0;
}
@@ -10783,10 +11105,10 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m
/* While dumping, do not allow the buffer to be enable */
tracer_tracing_disable(tr);
- old_userobj = tr->trace_flags & TRACE_ITER_SYM_USEROBJ;
+ old_userobj = tr->trace_flags & TRACE_ITER(SYM_USEROBJ);
/* don't look at user memory in panic mode */
- tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
+ tr->trace_flags &= ~TRACE_ITER(SYM_USEROBJ);
if (dump_mode == DUMP_ORIG)
iter.cpu_file = raw_smp_processor_id();
@@ -11009,7 +11331,7 @@ __init static void do_allocate_snapshot(const char *name)
/*
* When allocate_snapshot is set, the next call to
* allocate_trace_buffers() (called by trace_array_get_by_name())
- * will allocate the snapshot buffer. That will alse clear
+ * will allocate the snapshot buffer. That will also clear
* this flag.
*/
allocate_snapshot = true;
@@ -11018,6 +11340,42 @@ __init static void do_allocate_snapshot(const char *name)
static inline void do_allocate_snapshot(const char *name) { }
#endif
+__init static int backup_instance_area(const char *backup,
+ unsigned long *addr, phys_addr_t *size)
+{
+ struct trace_array *backup_tr;
+ void *allocated_vaddr = NULL;
+
+ backup_tr = trace_array_get_by_name(backup, NULL);
+ if (!backup_tr) {
+ pr_warn("Tracing: Instance %s is not found.\n", backup);
+ return -ENOENT;
+ }
+
+ if (!(backup_tr->flags & TRACE_ARRAY_FL_BOOT)) {
+ pr_warn("Tracing: Instance %s is not boot mapped.\n", backup);
+ trace_array_put(backup_tr);
+ return -EINVAL;
+ }
+
+ *size = backup_tr->range_addr_size;
+
+ allocated_vaddr = vzalloc(*size);
+ if (!allocated_vaddr) {
+ pr_warn("Tracing: Failed to allocate memory for copying instance %s (size 0x%lx)\n",
+ backup, (unsigned long)*size);
+ trace_array_put(backup_tr);
+ return -ENOMEM;
+ }
+
+ memcpy(allocated_vaddr,
+ (void *)backup_tr->range_addr_start, (size_t)*size);
+ *addr = (unsigned long)allocated_vaddr;
+
+ trace_array_put(backup_tr);
+ return 0;
+}
+
__init static void enable_instances(void)
{
struct trace_array *tr;
@@ -11040,11 +11398,15 @@ __init static void enable_instances(void)
char *flag_delim;
char *addr_delim;
char *rname __free(kfree) = NULL;
+ char *backup;
tok = strsep(&curr_str, ",");
- flag_delim = strchr(tok, '^');
- addr_delim = strchr(tok, '@');
+ name = strsep(&tok, "=");
+ backup = tok;
+
+ flag_delim = strchr(name, '^');
+ addr_delim = strchr(name, '@');
if (addr_delim)
*addr_delim++ = '\0';
@@ -11052,7 +11414,10 @@ __init static void enable_instances(void)
if (flag_delim)
*flag_delim++ = '\0';
- name = tok;
+ if (backup) {
+ if (backup_instance_area(backup, &addr, &size) < 0)
+ continue;
+ }
if (flag_delim) {
char *flag;
@@ -11148,7 +11513,13 @@ __init static void enable_instances(void)
tr->ref++;
}
- if (start) {
+ /*
+ * Backup buffers can be freed but need vfree().
+ */
+ if (backup)
+ tr->flags |= TRACE_ARRAY_FL_VMALLOC;
+
+ if (start || backup) {
tr->flags |= TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT;
tr->range_name = no_free_ptr(rname);
}
@@ -11242,6 +11613,7 @@ __init static int tracer_alloc_buffers(void)
* just a bootstrap of current_trace anyway.
*/
global_trace.current_trace = &nop_trace;
+ global_trace.current_trace_flags = nop_trace.flags;
global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -11255,10 +11627,7 @@ __init static int tracer_alloc_buffers(void)
init_trace_flags_index(&global_trace);
- register_tracer(&nop_trace);
-
- /* Function tracing may start here (via kernel command line) */
- init_function_trace();
+ INIT_LIST_HEAD(&global_trace.tracers);
/* All seems OK, enable tracing */
tracing_disabled = 0;
@@ -11270,6 +11639,8 @@ __init static int tracer_alloc_buffers(void)
global_trace.flags = TRACE_ARRAY_FL_GLOBAL;
+ global_trace.syscall_buf_sz = syscall_buf_size;
+
INIT_LIST_HEAD(&global_trace.systems);
INIT_LIST_HEAD(&global_trace.events);
INIT_LIST_HEAD(&global_trace.hist_vars);
@@ -11277,6 +11648,11 @@ __init static int tracer_alloc_buffers(void)
list_add(&global_trace.marker_list, &marker_copies);
list_add(&global_trace.list, &ftrace_trace_arrays);
+ register_tracer(&nop_trace);
+
+ /* Function tracing may start here (via kernel command line) */
+ init_function_trace();
+
apply_trace_boot_options();
register_snapshot_cmd();
@@ -11300,7 +11676,7 @@ out_free_buffer_mask:
#ifdef CONFIG_FUNCTION_TRACER
/* Used to set module cached ftrace filtering at boot up */
-__init struct trace_array *trace_get_global_array(void)
+struct trace_array *trace_get_global_array(void)
{
return &global_trace;
}
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 85eabb454bee..b6d42fe06115 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -22,6 +22,7 @@
#include <linux/ctype.h>
#include <linux/once_lite.h>
#include <linux/ftrace_regs.h>
+#include <linux/llist.h>
#include "pid_list.h"
@@ -131,6 +132,8 @@ enum trace_type {
#define HIST_STACKTRACE_SIZE (HIST_STACKTRACE_DEPTH * sizeof(unsigned long))
#define HIST_STACKTRACE_SKIP 5
+#define SYSCALL_FAULT_USER_MAX 165
+
/*
* syscalls are special, and need special handling, this is why
* they are not included in trace_entries.h
@@ -216,7 +219,7 @@ struct array_buffer {
int cpu;
};
-#define TRACE_FLAGS_MAX_SIZE 32
+#define TRACE_FLAGS_MAX_SIZE 64
struct trace_options {
struct tracer *tracer;
@@ -390,7 +393,8 @@ struct trace_array {
int buffer_percent;
unsigned int n_err_log_entries;
struct tracer *current_trace;
- unsigned int trace_flags;
+ struct tracer_flags *current_trace_flags;
+ u64 trace_flags;
unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE];
unsigned int flags;
raw_spinlock_t start_lock;
@@ -404,6 +408,7 @@ struct trace_array {
struct list_head systems;
struct list_head events;
struct list_head marker_list;
+ struct list_head tracers;
struct trace_event_file *trace_marker_file;
cpumask_var_t tracing_cpumask; /* only trace on set CPUs */
/* one per_cpu trace_pipe can be opened by only one user */
@@ -430,6 +435,7 @@ struct trace_array {
int function_enabled;
#endif
int no_filter_buffering_ref;
+ unsigned int syscall_buf_sz;
struct list_head hist_vars;
#ifdef CONFIG_TRACER_SNAPSHOT
struct cond_snapshot *cond_snapshot;
@@ -448,6 +454,7 @@ enum {
TRACE_ARRAY_FL_LAST_BOOT = BIT(2),
TRACE_ARRAY_FL_MOD_INIT = BIT(3),
TRACE_ARRAY_FL_MEMMAP = BIT(4),
+ TRACE_ARRAY_FL_VMALLOC = BIT(5),
};
#ifdef CONFIG_MODULES
@@ -631,9 +638,10 @@ struct tracer {
u32 old_flags, u32 bit, int set);
/* Return 0 if OK with change, else return non-zero */
int (*flag_changed)(struct trace_array *tr,
- u32 mask, int set);
+ u64 mask, int set);
struct tracer *next;
struct tracer_flags *flags;
+ struct tracer_flags *default_flags;
int enabled;
bool print_max;
bool allow_instances;
@@ -937,8 +945,6 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash)
#define TRACE_GRAPH_PRINT_FILL_SHIFT 28
#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT)
-extern void ftrace_graph_sleep_time_control(bool enable);
-
#ifdef CONFIG_FUNCTION_PROFILER
extern void ftrace_graph_graph_time_control(bool enable);
#else
@@ -958,7 +964,8 @@ extern int __trace_graph_entry(struct trace_array *tr,
extern int __trace_graph_retaddr_entry(struct trace_array *tr,
struct ftrace_graph_ent *trace,
unsigned int trace_ctx,
- unsigned long retaddr);
+ unsigned long retaddr,
+ struct ftrace_regs *fregs);
extern void __trace_graph_return(struct trace_array *tr,
struct ftrace_graph_ret *trace,
unsigned int trace_ctx,
@@ -1109,7 +1116,8 @@ static inline void ftrace_graph_addr_finish(struct fgraph_ops *gops, struct ftra
#endif /* CONFIG_DYNAMIC_FTRACE */
extern unsigned int fgraph_max_depth;
-extern bool fgraph_sleep_time;
+extern int fgraph_no_sleep_time;
+extern bool fprofile_no_sleep_time;
static inline bool
ftrace_graph_ignore_func(struct fgraph_ops *gops, struct ftrace_graph_ent *trace)
@@ -1154,11 +1162,6 @@ struct ftrace_func_command {
char *params, int enable);
};
extern bool ftrace_filter_param __initdata;
-static inline int ftrace_trace_task(struct trace_array *tr)
-{
- return this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid) !=
- FTRACE_PID_IGNORE;
-}
extern int ftrace_is_dead(void);
int ftrace_create_function_files(struct trace_array *tr,
struct dentry *parent);
@@ -1176,10 +1179,6 @@ void ftrace_clear_pids(struct trace_array *tr);
int init_function_trace(void);
void ftrace_pid_follow_fork(struct trace_array *tr, bool enable);
#else
-static inline int ftrace_trace_task(struct trace_array *tr)
-{
- return 1;
-}
static inline int ftrace_is_dead(void) { return 0; }
static inline int
ftrace_create_function_files(struct trace_array *tr,
@@ -1345,11 +1344,11 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
# define FUNCTION_FLAGS \
C(FUNCTION, "function-trace"), \
C(FUNC_FORK, "function-fork"),
-# define FUNCTION_DEFAULT_FLAGS TRACE_ITER_FUNCTION
+# define FUNCTION_DEFAULT_FLAGS TRACE_ITER(FUNCTION)
#else
# define FUNCTION_FLAGS
# define FUNCTION_DEFAULT_FLAGS 0UL
-# define TRACE_ITER_FUNC_FORK 0UL
+# define TRACE_ITER_FUNC_FORK_BIT -1
#endif
#ifdef CONFIG_STACKTRACE
@@ -1359,6 +1358,24 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
# define STACK_FLAGS
#endif
+#ifdef CONFIG_FUNCTION_PROFILER
+# define PROFILER_FLAGS \
+ C(PROF_TEXT_OFFSET, "prof-text-offset"),
+# ifdef CONFIG_FUNCTION_GRAPH_TRACER
+# define FPROFILE_FLAGS \
+ C(GRAPH_TIME, "graph-time"),
+# define FPROFILE_DEFAULT_FLAGS TRACE_ITER(GRAPH_TIME)
+# else
+# define FPROFILE_FLAGS
+# define FPROFILE_DEFAULT_FLAGS 0UL
+# endif
+#else
+# define PROFILER_FLAGS
+# define FPROFILE_FLAGS
+# define FPROFILE_DEFAULT_FLAGS 0UL
+# define TRACE_ITER_PROF_TEXT_OFFSET_BIT -1
+#endif
+
/*
* trace_iterator_flags is an enumeration that defines bit
* positions into trace_flags that controls the output.
@@ -1391,13 +1408,15 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
C(MARKERS, "markers"), \
C(EVENT_FORK, "event-fork"), \
C(TRACE_PRINTK, "trace_printk_dest"), \
- C(COPY_MARKER, "copy_trace_marker"),\
+ C(COPY_MARKER, "copy_trace_marker"), \
C(PAUSE_ON_TRACE, "pause-on-trace"), \
C(HASH_PTR, "hash-ptr"), /* Print hashed pointer */ \
FUNCTION_FLAGS \
FGRAPH_FLAGS \
STACK_FLAGS \
- BRANCH_FLAGS
+ BRANCH_FLAGS \
+ PROFILER_FLAGS \
+ FPROFILE_FLAGS
/*
* By defining C, we can make TRACE_FLAGS a list of bit names
@@ -1413,20 +1432,17 @@ enum trace_iterator_bits {
};
/*
- * By redefining C, we can make TRACE_FLAGS a list of masks that
- * use the bits as defined above.
+ * And use TRACE_ITER(flag) to define the bit masks.
*/
-#undef C
-#define C(a, b) TRACE_ITER_##a = (1 << TRACE_ITER_##a##_BIT)
-
-enum trace_iterator_flags { TRACE_FLAGS };
+#define TRACE_ITER(flag) \
+ (TRACE_ITER_##flag##_BIT < 0 ? 0 : 1ULL << (TRACE_ITER_##flag##_BIT))
/*
* TRACE_ITER_SYM_MASK masks the options in trace_flags that
* control the output of kernel symbols.
*/
#define TRACE_ITER_SYM_MASK \
- (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
+ (TRACE_ITER(PRINT_PARENT)|TRACE_ITER(SYM_OFFSET)|TRACE_ITER(SYM_ADDR))
extern struct tracer nop_trace;
@@ -1435,7 +1451,7 @@ extern int enable_branch_tracing(struct trace_array *tr);
extern void disable_branch_tracing(void);
static inline int trace_branch_enable(struct trace_array *tr)
{
- if (tr->trace_flags & TRACE_ITER_BRANCH)
+ if (tr->trace_flags & TRACE_ITER(BRANCH))
return enable_branch_tracing(tr);
return 0;
}
@@ -1531,6 +1547,23 @@ void trace_buffered_event_enable(void);
void early_enable_events(struct trace_array *tr, char *buf, bool disable_first);
+struct trace_user_buf;
+struct trace_user_buf_info {
+ struct trace_user_buf __percpu *tbuf;
+ size_t size;
+ int ref;
+};
+
+typedef int (*trace_user_buf_copy)(char *dst, const char __user *src,
+ size_t size, void *data);
+int trace_user_fault_init(struct trace_user_buf_info *tinfo, size_t size);
+int trace_user_fault_get(struct trace_user_buf_info *tinfo);
+int trace_user_fault_put(struct trace_user_buf_info *tinfo);
+void trace_user_fault_destroy(struct trace_user_buf_info *tinfo);
+char *trace_user_fault_read(struct trace_user_buf_info *tinfo,
+ const char __user *ptr, size_t size,
+ trace_user_buf_copy copy_func, void *data);
+
static inline void
__trace_event_discard_commit(struct trace_buffer *buffer,
struct ring_buffer_event *event)
@@ -1752,13 +1785,13 @@ extern void clear_event_triggers(struct trace_array *tr);
enum {
EVENT_TRIGGER_FL_PROBE = BIT(0),
+ EVENT_TRIGGER_FL_COUNT = BIT(1),
};
struct event_trigger_data {
unsigned long count;
int ref;
int flags;
- const struct event_trigger_ops *ops;
struct event_command *cmd_ops;
struct event_filter __rcu *filter;
char *filter_str;
@@ -1769,6 +1802,7 @@ struct event_trigger_data {
char *name;
struct list_head named_list;
struct event_trigger_data *named_data;
+ struct llist_node llist;
};
/* Avoid typos */
@@ -1783,6 +1817,10 @@ struct enable_trigger_data {
bool hist;
};
+bool event_trigger_count(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
+ struct ring_buffer_event *event);
+
extern int event_enable_trigger_print(struct seq_file *m,
struct event_trigger_data *data);
extern void event_enable_trigger_free(struct event_trigger_data *data);
@@ -1846,64 +1884,6 @@ extern void event_file_get(struct trace_event_file *file);
extern void event_file_put(struct trace_event_file *file);
/**
- * struct event_trigger_ops - callbacks for trace event triggers
- *
- * The methods in this structure provide per-event trigger hooks for
- * various trigger operations.
- *
- * The @init and @free methods are used during trigger setup and
- * teardown, typically called from an event_command's @parse()
- * function implementation.
- *
- * The @print method is used to print the trigger spec.
- *
- * The @trigger method is the function that actually implements the
- * trigger and is called in the context of the triggering event
- * whenever that event occurs.
- *
- * All the methods below, except for @init() and @free(), must be
- * implemented.
- *
- * @trigger: The trigger 'probe' function called when the triggering
- * event occurs. The data passed into this callback is the data
- * that was supplied to the event_command @reg() function that
- * registered the trigger (see struct event_command) along with
- * the trace record, rec.
- *
- * @init: An optional initialization function called for the trigger
- * when the trigger is registered (via the event_command reg()
- * function). This can be used to perform per-trigger
- * initialization such as incrementing a per-trigger reference
- * count, for instance. This is usually implemented by the
- * generic utility function @event_trigger_init() (see
- * trace_event_triggers.c).
- *
- * @free: An optional de-initialization function called for the
- * trigger when the trigger is unregistered (via the
- * event_command @reg() function). This can be used to perform
- * per-trigger de-initialization such as decrementing a
- * per-trigger reference count and freeing corresponding trigger
- * data, for instance. This is usually implemented by the
- * generic utility function @event_trigger_free() (see
- * trace_event_triggers.c).
- *
- * @print: The callback function invoked to have the trigger print
- * itself. This is usually implemented by a wrapper function
- * that calls the generic utility function @event_trigger_print()
- * (see trace_event_triggers.c).
- */
-struct event_trigger_ops {
- void (*trigger)(struct event_trigger_data *data,
- struct trace_buffer *buffer,
- void *rec,
- struct ring_buffer_event *rbe);
- int (*init)(struct event_trigger_data *data);
- void (*free)(struct event_trigger_data *data);
- int (*print)(struct seq_file *m,
- struct event_trigger_data *data);
-};
-
-/**
* struct event_command - callbacks and data members for event commands
*
* Event commands are invoked by users by writing the command name
@@ -1952,7 +1932,7 @@ struct event_trigger_ops {
*
* @reg: Adds the trigger to the list of triggers associated with the
* event, and enables the event trigger itself, after
- * initializing it (via the event_trigger_ops @init() function).
+ * initializing it (via the event_command @init() function).
* This is also where commands can use the @trigger_type value to
* make the decision as to whether or not multiple instances of
* the trigger should be allowed. This is usually implemented by
@@ -1961,7 +1941,7 @@ struct event_trigger_ops {
*
* @unreg: Removes the trigger from the list of triggers associated
* with the event, and disables the event trigger itself, after
- * initializing it (via the event_trigger_ops @free() function).
+ * initializing it (via the event_command @free() function).
* This is usually implemented by the generic utility function
* @unregister_trigger() (see trace_event_triggers.c).
*
@@ -1975,12 +1955,41 @@ struct event_trigger_ops {
* ignored. This is usually implemented by the generic utility
* function @set_trigger_filter() (see trace_event_triggers.c).
*
- * @get_trigger_ops: The callback function invoked to retrieve the
- * event_trigger_ops implementation associated with the command.
- * This callback function allows a single event_command to
- * support multiple trigger implementations via different sets of
- * event_trigger_ops, depending on the value of the @param
- * string.
+ * All the methods below, except for @init() and @free(), must be
+ * implemented.
+ *
+ * @trigger: The trigger 'probe' function called when the triggering
+ * event occurs. The data passed into this callback is the data
+ * that was supplied to the event_command @reg() function that
+ * registered the trigger (see struct event_command) along with
+ * the trace record, rec.
+ *
+ * @count_func: If defined and a numeric parameter is passed to the
+ * trigger, then this function will be called before @trigger
+ * is called. If this function returns false, then @trigger is not
+ * executed.
+ *
+ * @init: An optional initialization function called for the trigger
+ * when the trigger is registered (via the event_command reg()
+ * function). This can be used to perform per-trigger
+ * initialization such as incrementing a per-trigger reference
+ * count, for instance. This is usually implemented by the
+ * generic utility function @event_trigger_init() (see
+ * trace_event_triggers.c).
+ *
+ * @free: An optional de-initialization function called for the
+ * trigger when the trigger is unregistered (via the
+ * event_command @reg() function). This can be used to perform
+ * per-trigger de-initialization such as decrementing a
+ * per-trigger reference count and freeing corresponding trigger
+ * data, for instance. This is usually implemented by the
+ * generic utility function @event_trigger_free() (see
+ * trace_event_triggers.c).
+ *
+ * @print: The callback function invoked to have the trigger print
+ * itself. This is usually implemented by a wrapper function
+ * that calls the generic utility function @event_trigger_print()
+ * (see trace_event_triggers.c).
*/
struct event_command {
struct list_head list;
@@ -2001,7 +2010,18 @@ struct event_command {
int (*set_filter)(char *filter_str,
struct event_trigger_data *data,
struct trace_event_file *file);
- const struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param);
+ void (*trigger)(struct event_trigger_data *data,
+ struct trace_buffer *buffer,
+ void *rec,
+ struct ring_buffer_event *rbe);
+ bool (*count_func)(struct event_trigger_data *data,
+ struct trace_buffer *buffer,
+ void *rec,
+ struct ring_buffer_event *rbe);
+ int (*init)(struct event_trigger_data *data);
+ void (*free)(struct event_trigger_data *data);
+ int (*print)(struct seq_file *m,
+ struct event_trigger_data *data);
};
/**
@@ -2022,7 +2042,7 @@ struct event_command {
* either committed or discarded. At that point, if any commands
* have deferred their triggers, those commands are finally
* invoked following the close of the current event. In other
- * words, if the event_trigger_ops @func() probe implementation
+ * words, if the event_command @func() probe implementation
* itself logs to the trace buffer, this flag should be set,
* otherwise it can be left unspecified.
*
@@ -2064,8 +2084,8 @@ extern const char *__stop___tracepoint_str[];
void trace_printk_control(bool enabled);
void trace_printk_start_comm(void);
-int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
-int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
+int trace_keep_overwrite(struct tracer *tracer, u64 mask, int set);
+int set_tracer_flag(struct trace_array *tr, u64 mask, int enabled);
/* Used from boot time tracer */
extern int trace_set_options(struct trace_array *tr, char *option);
@@ -2248,4 +2268,25 @@ static inline int rv_init_interface(void)
*/
#define FTRACE_TRAMPOLINE_MARKER ((unsigned long) INT_MAX)
+/*
+ * This is used to get the address of the args array based on
+ * the type of the entry.
+ */
+#define FGRAPH_ENTRY_ARGS(e) \
+ ({ \
+ unsigned long *_args; \
+ struct ftrace_graph_ent_entry *_e = e; \
+ \
+ if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) && \
+ e->ent.type == TRACE_GRAPH_RETADDR_ENT) { \
+ struct fgraph_retaddr_ent_entry *_re; \
+ \
+ _re = (typeof(_re))_e; \
+ _args = _re->args; \
+ } else { \
+ _args = _e->args; \
+ } \
+ _args; \
+ })
+
#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index d06854bd32b3..c4dfbc293bae 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -144,9 +144,16 @@ static int create_dyn_event(const char *raw_command)
if (!ret || ret != -ECANCELED)
break;
}
- mutex_unlock(&dyn_event_ops_mutex);
- if (ret == -ECANCELED)
+ if (ret == -ECANCELED) {
+ static const char *err_msg[] = {"No matching dynamic event type"};
+
+ /* Wrong dynamic event. Leave an error message. */
+ tracing_log_err(NULL, "dynevent", raw_command, err_msg,
+ 0, 0);
ret = -EINVAL;
+ }
+
+ mutex_unlock(&dyn_event_ops_mutex);
return ret;
}
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index de294ae2c5c5..f6a8d29c0d76 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -80,11 +80,11 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
F_STRUCT(
__field_struct( struct ftrace_graph_ent, graph_ent )
__field_packed( unsigned long, graph_ent, func )
- __field_packed( unsigned int, graph_ent, depth )
+ __field_packed( unsigned long, graph_ent, depth )
__dynamic_array(unsigned long, args )
),
- F_printk("--> %ps (%u)", (void *)__entry->func, __entry->depth)
+ F_printk("--> %ps (%lu)", (void *)__entry->func, __entry->depth)
);
#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
@@ -95,13 +95,14 @@ FTRACE_ENTRY_PACKED(fgraph_retaddr_entry, fgraph_retaddr_ent_entry,
TRACE_GRAPH_RETADDR_ENT,
F_STRUCT(
- __field_struct( struct fgraph_retaddr_ent, graph_ent )
- __field_packed( unsigned long, graph_ent, func )
- __field_packed( unsigned int, graph_ent, depth )
- __field_packed( unsigned long, graph_ent, retaddr )
+ __field_struct( struct fgraph_retaddr_ent, graph_rent )
+ __field_packed( unsigned long, graph_rent.ent, func )
+ __field_packed( unsigned long, graph_rent.ent, depth )
+ __field_packed( unsigned long, graph_rent, retaddr )
+ __dynamic_array(unsigned long, args )
),
- F_printk("--> %ps (%u) <- %ps", (void *)__entry->func, __entry->depth,
+ F_printk("--> %ps (%lu) <- %ps", (void *)__entry->func, __entry->depth,
(void *)__entry->retaddr)
);
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index a1d402124836..3ee39715d5e4 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -61,6 +61,9 @@ static void trace_event_probe_cleanup(struct trace_eprobe *ep)
kfree(ep);
}
+DEFINE_FREE(trace_event_probe_cleanup, struct trace_eprobe *,
+ if (!IS_ERR_OR_NULL(_T)) trace_event_probe_cleanup(_T))
+
static struct trace_eprobe *to_trace_eprobe(struct dyn_event *ev)
{
return container_of(ev, struct trace_eprobe, devent);
@@ -197,10 +200,10 @@ static struct trace_eprobe *alloc_event_probe(const char *group,
struct trace_event_call *event,
int nargs)
{
- struct trace_eprobe *ep;
+ struct trace_eprobe *ep __free(trace_event_probe_cleanup) = NULL;
const char *event_name;
const char *sys_name;
- int ret = -ENOMEM;
+ int ret;
if (!event)
return ERR_PTR(-ENODEV);
@@ -211,25 +214,22 @@ static struct trace_eprobe *alloc_event_probe(const char *group,
ep = kzalloc(struct_size(ep, tp.args, nargs), GFP_KERNEL);
if (!ep) {
trace_event_put_ref(event);
- goto error;
+ return ERR_PTR(-ENOMEM);
}
ep->event = event;
ep->event_name = kstrdup(event_name, GFP_KERNEL);
if (!ep->event_name)
- goto error;
+ return ERR_PTR(-ENOMEM);
ep->event_system = kstrdup(sys_name, GFP_KERNEL);
if (!ep->event_system)
- goto error;
+ return ERR_PTR(-ENOMEM);
ret = trace_probe_init(&ep->tp, this_event, group, false, nargs);
if (ret < 0)
- goto error;
+ return ERR_PTR(ret);
dyn_event_init(&ep->devent, &eprobe_dyn_event_ops);
- return ep;
-error:
- trace_event_probe_cleanup(ep);
- return ERR_PTR(ret);
+ return_ptr(ep);
}
static int eprobe_event_define_fields(struct trace_event_call *event_call)
@@ -484,13 +484,6 @@ static void eprobe_trigger_func(struct event_trigger_data *data,
__eprobe_trace_func(edata, rec);
}
-static const struct event_trigger_ops eprobe_trigger_ops = {
- .trigger = eprobe_trigger_func,
- .print = eprobe_trigger_print,
- .init = eprobe_trigger_init,
- .free = eprobe_trigger_free,
-};
-
static int eprobe_trigger_cmd_parse(struct event_command *cmd_ops,
struct trace_event_file *file,
char *glob, char *cmd,
@@ -513,12 +506,6 @@ static void eprobe_trigger_unreg_func(char *glob,
}
-static const struct event_trigger_ops *eprobe_trigger_get_ops(char *cmd,
- char *param)
-{
- return &eprobe_trigger_ops;
-}
-
static struct event_command event_trigger_cmd = {
.name = "eprobe",
.trigger_type = ETT_EVENT_EPROBE,
@@ -527,8 +514,11 @@ static struct event_command event_trigger_cmd = {
.reg = eprobe_trigger_reg_func,
.unreg = eprobe_trigger_unreg_func,
.unreg_all = NULL,
- .get_trigger_ops = eprobe_trigger_get_ops,
.set_filter = NULL,
+ .trigger = eprobe_trigger_func,
+ .print = eprobe_trigger_print,
+ .init = eprobe_trigger_init,
+ .free = eprobe_trigger_free,
};
static struct event_trigger_data *
@@ -548,7 +538,6 @@ new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file)
trigger->flags = EVENT_TRIGGER_FL_PROBE;
trigger->count = -1;
- trigger->ops = &eprobe_trigger_ops;
/*
* EVENT PROBE triggers are not registered as commands with
@@ -801,25 +790,6 @@ find_and_get_event(const char *system, const char *event_name)
return NULL;
}
-static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[], int i)
-{
- struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL;
- int ret;
-
- ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
- if (!ctx)
- return -ENOMEM;
- ctx->event = ep->event;
- ctx->flags = TPARG_FL_KERNEL | TPARG_FL_TEVENT;
-
- ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], ctx);
- /* Handle symbols "@" */
- if (!ret)
- ret = traceprobe_update_arg(&ep->tp.args[i]);
-
- return ret;
-}
-
static int trace_eprobe_parse_filter(struct trace_eprobe *ep, int argc, const char *argv[])
{
struct event_filter *dummy = NULL;
@@ -856,13 +826,10 @@ static int trace_eprobe_parse_filter(struct trace_eprobe *ep, int argc, const ch
ret = create_event_filter(top_trace_array(), ep->event, ep->filter_str,
true, &dummy);
free_event_filter(dummy);
- if (ret)
- goto error;
-
- return 0;
-error:
- kfree(ep->filter_str);
- ep->filter_str = NULL;
+ if (ret) {
+ kfree(ep->filter_str);
+ ep->filter_str = NULL;
+ }
return ret;
}
@@ -874,31 +841,33 @@ static int __trace_eprobe_create(int argc, const char *argv[])
* Fetch args (no space):
* <name>=$<field>[:TYPE]
*/
+ struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL;
+ struct trace_eprobe *ep __free(trace_event_probe_cleanup) = NULL;
+ const char *trlog __free(trace_probe_log_clear) = NULL;
const char *event = NULL, *group = EPROBE_EVENT_SYSTEM;
const char *sys_event = NULL, *sys_name = NULL;
struct trace_event_call *event_call;
char *buf1 __free(kfree) = NULL;
char *buf2 __free(kfree) = NULL;
char *gbuf __free(kfree) = NULL;
- struct trace_eprobe *ep = NULL;
int ret = 0, filter_idx = 0;
int i, filter_cnt;
if (argc < 2 || argv[0][0] != 'e')
return -ECANCELED;
- trace_probe_log_init("event_probe", argc, argv);
+ trlog = trace_probe_log_init("event_probe", argc, argv);
event = strchr(&argv[0][1], ':');
if (event) {
gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL);
if (!gbuf)
- goto mem_error;
+ return -ENOMEM;
event++;
ret = traceprobe_parse_event_name(&event, &group, gbuf,
event - argv[0]);
if (ret)
- goto parse_error;
+ return -EINVAL;
}
trace_probe_log_set_index(1);
@@ -906,18 +875,18 @@ static int __trace_eprobe_create(int argc, const char *argv[])
buf2 = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL);
if (!buf2)
- goto mem_error;
+ return -ENOMEM;
ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, 0);
if (ret || !sys_event || !sys_name) {
trace_probe_log_err(0, NO_EVENT_INFO);
- goto parse_error;
+ return -EINVAL;
}
if (!event) {
buf1 = kstrdup(sys_event, GFP_KERNEL);
if (!buf1)
- goto mem_error;
+ return -ENOMEM;
event = buf1;
}
@@ -933,8 +902,7 @@ static int __trace_eprobe_create(int argc, const char *argv[])
if (argc - 2 > MAX_TRACE_ARGS) {
trace_probe_log_set_index(2);
trace_probe_log_err(0, TOO_MANY_ARGS);
- ret = -E2BIG;
- goto error;
+ return -E2BIG;
}
scoped_guard(mutex, &event_mutex) {
@@ -948,29 +916,39 @@ static int __trace_eprobe_create(int argc, const char *argv[])
trace_probe_log_err(0, BAD_ATTACH_EVENT);
/* This must return -ENOMEM or missing event, else there is a bug */
WARN_ON_ONCE(ret != -ENOMEM && ret != -ENODEV);
- ep = NULL;
- goto error;
+ return ret;
}
if (filter_idx) {
trace_probe_log_set_index(filter_idx);
ret = trace_eprobe_parse_filter(ep, filter_cnt, argv + filter_idx);
if (ret)
- goto parse_error;
+ return -EINVAL;
} else
ep->filter_str = NULL;
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+ ctx->event = ep->event;
+ ctx->flags = TPARG_FL_KERNEL | TPARG_FL_TEVENT;
+
argc -= 2; argv += 2;
/* parse arguments */
for (i = 0; i < argc; i++) {
trace_probe_log_set_index(i + 2);
- ret = trace_eprobe_tp_update_arg(ep, argv, i);
+
+ ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], ctx);
+ /* Handle symbols "@" */
+ if (!ret)
+ ret = traceprobe_update_arg(&ep->tp.args[i]);
if (ret)
- goto error;
+ return ret;
}
ret = traceprobe_set_print_fmt(&ep->tp, PROBE_PRINT_EVENT);
if (ret < 0)
- goto error;
+ return ret;
+
init_trace_eprobe_call(ep);
scoped_guard(mutex, &event_mutex) {
ret = trace_probe_register_event_call(&ep->tp);
@@ -979,25 +957,16 @@ static int __trace_eprobe_create(int argc, const char *argv[])
trace_probe_log_set_index(0);
trace_probe_log_err(0, EVENT_EXIST);
}
- goto error;
+ return ret;
}
ret = dyn_event_add(&ep->devent, &ep->tp.event->call);
if (ret < 0) {
trace_probe_unregister_event_call(&ep->tp);
- goto error;
+ return ret;
}
+ /* To avoid freeing registered eprobe event, clear ep. */
+ ep = NULL;
}
- trace_probe_log_clear();
- return ret;
-
-mem_error:
- ret = -ENOMEM;
- goto error;
-parse_error:
- ret = -EINVAL;
-error:
- trace_probe_log_clear();
- trace_event_probe_cleanup(ep);
return ret;
}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e00da4182deb..b16a5a158040 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -360,7 +360,7 @@ static bool process_string(const char *fmt, int len, struct trace_event_call *ca
/* Anything else, this isn't a function */
break;
}
- /* A function could be wrapped in parethesis, try the next one */
+ /* A function could be wrapped in parenthesis, try the next one */
s = r + 1;
} while (s < e);
@@ -567,7 +567,7 @@ static void test_event_printk(struct trace_event_call *call)
* If start_arg is zero, then this is the start of the
* first argument. The processing of the argument happens
* when the end of the argument is found, as it needs to
- * handle paranthesis and such.
+ * handle parenthesis and such.
*/
if (!start_arg) {
start_arg = i;
@@ -785,7 +785,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
*
* When soft_disable is not set but the soft_mode is,
* we do nothing. Do not disable the tracepoint, otherwise
- * "soft enable"s (clearing the SOFT_DISABLED bit) wont work.
+ * "soft enable"s (clearing the SOFT_DISABLED bit) won't work.
*/
if (soft_disable) {
if (atomic_dec_return(&file->sm_ref) > 0)
@@ -845,13 +845,13 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
if (soft_disable)
set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
- if (tr->trace_flags & TRACE_ITER_RECORD_CMD) {
+ if (tr->trace_flags & TRACE_ITER(RECORD_CMD)) {
cmd = true;
tracing_start_cmdline_record();
set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
}
- if (tr->trace_flags & TRACE_ITER_RECORD_TGID) {
+ if (tr->trace_flags & TRACE_ITER(RECORD_TGID)) {
tgid = true;
tracing_start_tgid_record();
set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags);
@@ -1394,7 +1394,7 @@ int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
if (!tr)
return -ENOENT;
- /* Modules events can be appened with :mod:<module> */
+ /* Modules events can be appended with :mod:<module> */
mod = strstr(buf, ":mod:");
if (mod) {
*mod = '\0';
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 54226b48b2d1..385af8405392 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -142,7 +142,7 @@ static bool is_not(const char *str)
}
/**
- * struct prog_entry - a singe entry in the filter program
+ * struct prog_entry - a single entry in the filter program
* @target: Index to jump to on a branch (actually one minus the index)
* @when_to_branch: The value of the result of the predicate to do a branch
* @pred: The predicate to execute.
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 6bfaf1210dd2..5e6e70540eef 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -5283,7 +5283,7 @@ hist_trigger_actions(struct hist_trigger_data *hist_data,
* on the stack, so when the histogram trigger is initialized
* a percpu array of 4 hist_pad structures is allocated.
* This will cover every context from normal, softirq, irq and NMI
- * in the very unlikely event that a tigger happens at each of
+ * in the very unlikely event that a trigger happens at each of
* these contexts and interrupts a currently active trigger.
*/
struct hist_pad {
@@ -5696,7 +5696,7 @@ static void hist_trigger_show(struct seq_file *m,
seq_puts(m, "\n\n");
seq_puts(m, "# event histogram\n#\n# trigger info: ");
- data->ops->print(m, data);
+ data->cmd_ops->print(m, data);
seq_puts(m, "#\n\n");
hist_data = data->private_data;
@@ -6018,7 +6018,7 @@ static void hist_trigger_debug_show(struct seq_file *m,
seq_puts(m, "\n\n");
seq_puts(m, "# event histogram\n#\n# trigger info: ");
- data->ops->print(m, data);
+ data->cmd_ops->print(m, data);
seq_puts(m, "#\n\n");
hist_data = data->private_data;
@@ -6328,20 +6328,21 @@ static void event_hist_trigger_free(struct event_trigger_data *data)
free_hist_pad();
}
-static const struct event_trigger_ops event_hist_trigger_ops = {
- .trigger = event_hist_trigger,
- .print = event_hist_trigger_print,
- .init = event_hist_trigger_init,
- .free = event_hist_trigger_free,
-};
-
static int event_hist_trigger_named_init(struct event_trigger_data *data)
{
+ int ret;
+
data->ref++;
save_named_trigger(data->named_data->name, data);
- return event_hist_trigger_init(data->named_data);
+ ret = event_hist_trigger_init(data->named_data);
+ if (ret < 0) {
+ kfree(data->cmd_ops);
+ data->cmd_ops = &trigger_hist_cmd;
+ }
+
+ return ret;
}
static void event_hist_trigger_named_free(struct event_trigger_data *data)
@@ -6353,24 +6354,14 @@ static void event_hist_trigger_named_free(struct event_trigger_data *data)
data->ref--;
if (!data->ref) {
+ struct event_command *cmd_ops = data->cmd_ops;
+
del_named_trigger(data);
trigger_data_free(data);
+ kfree(cmd_ops);
}
}
-static const struct event_trigger_ops event_hist_trigger_named_ops = {
- .trigger = event_hist_trigger,
- .print = event_hist_trigger_print,
- .init = event_hist_trigger_named_init,
- .free = event_hist_trigger_named_free,
-};
-
-static const struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd,
- char *param)
-{
- return &event_hist_trigger_ops;
-}
-
static void hist_clear(struct event_trigger_data *data)
{
struct hist_trigger_data *hist_data = data->private_data;
@@ -6564,13 +6555,24 @@ static int hist_register_trigger(char *glob,
data->paused = true;
if (named_data) {
+ struct event_command *cmd_ops;
+
data->private_data = named_data->private_data;
set_named_trigger_data(data, named_data);
- data->ops = &event_hist_trigger_named_ops;
+ /* Copy the command ops and update some of the functions */
+ cmd_ops = kmalloc(sizeof(*cmd_ops), GFP_KERNEL);
+ if (!cmd_ops) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ *cmd_ops = *data->cmd_ops;
+ cmd_ops->init = event_hist_trigger_named_init;
+ cmd_ops->free = event_hist_trigger_named_free;
+ data->cmd_ops = cmd_ops;
}
- if (data->ops->init) {
- ret = data->ops->init(data);
+ if (data->cmd_ops->init) {
+ ret = data->cmd_ops->init(data);
if (ret < 0)
goto out;
}
@@ -6684,8 +6686,8 @@ static void hist_unregister_trigger(char *glob,
}
}
- if (test && test->ops->free)
- test->ops->free(test);
+ if (test && test->cmd_ops->free)
+ test->cmd_ops->free(test);
if (hist_data->enable_timestamps) {
if (!hist_data->remove || test)
@@ -6737,8 +6739,8 @@ static void hist_unreg_all(struct trace_event_file *file)
update_cond_flag(file);
if (hist_data->enable_timestamps)
tracing_set_filter_buffering(file->tr, false);
- if (test->ops->free)
- test->ops->free(test);
+ if (test->cmd_ops->free)
+ test->cmd_ops->free(test);
}
}
}
@@ -6914,8 +6916,11 @@ static struct event_command trigger_hist_cmd = {
.reg = hist_register_trigger,
.unreg = hist_unregister_trigger,
.unreg_all = hist_unreg_all,
- .get_trigger_ops = event_hist_get_trigger_ops,
.set_filter = set_trigger_filter,
+ .trigger = event_hist_trigger,
+ .print = event_hist_trigger_print,
+ .init = event_hist_trigger_init,
+ .free = event_hist_trigger_free,
};
__init int register_trigger_hist_cmd(void)
@@ -6947,66 +6952,6 @@ hist_enable_trigger(struct event_trigger_data *data,
}
}
-static void
-hist_enable_count_trigger(struct event_trigger_data *data,
- struct trace_buffer *buffer, void *rec,
- struct ring_buffer_event *event)
-{
- if (!data->count)
- return;
-
- if (data->count != -1)
- (data->count)--;
-
- hist_enable_trigger(data, buffer, rec, event);
-}
-
-static const struct event_trigger_ops hist_enable_trigger_ops = {
- .trigger = hist_enable_trigger,
- .print = event_enable_trigger_print,
- .init = event_trigger_init,
- .free = event_enable_trigger_free,
-};
-
-static const struct event_trigger_ops hist_enable_count_trigger_ops = {
- .trigger = hist_enable_count_trigger,
- .print = event_enable_trigger_print,
- .init = event_trigger_init,
- .free = event_enable_trigger_free,
-};
-
-static const struct event_trigger_ops hist_disable_trigger_ops = {
- .trigger = hist_enable_trigger,
- .print = event_enable_trigger_print,
- .init = event_trigger_init,
- .free = event_enable_trigger_free,
-};
-
-static const struct event_trigger_ops hist_disable_count_trigger_ops = {
- .trigger = hist_enable_count_trigger,
- .print = event_enable_trigger_print,
- .init = event_trigger_init,
- .free = event_enable_trigger_free,
-};
-
-static const struct event_trigger_ops *
-hist_enable_get_trigger_ops(char *cmd, char *param)
-{
- const struct event_trigger_ops *ops;
- bool enable;
-
- enable = (strcmp(cmd, ENABLE_HIST_STR) == 0);
-
- if (enable)
- ops = param ? &hist_enable_count_trigger_ops :
- &hist_enable_trigger_ops;
- else
- ops = param ? &hist_disable_count_trigger_ops :
- &hist_disable_trigger_ops;
-
- return ops;
-}
-
static void hist_enable_unreg_all(struct trace_event_file *file)
{
struct event_trigger_data *test, *n;
@@ -7016,8 +6961,8 @@ static void hist_enable_unreg_all(struct trace_event_file *file)
list_del_rcu(&test->list);
update_cond_flag(file);
trace_event_trigger_enable_disable(file, 0);
- if (test->ops->free)
- test->ops->free(test);
+ if (test->cmd_ops->free)
+ test->cmd_ops->free(test);
}
}
}
@@ -7029,8 +6974,12 @@ static struct event_command trigger_hist_enable_cmd = {
.reg = event_enable_register_trigger,
.unreg = event_enable_unregister_trigger,
.unreg_all = hist_enable_unreg_all,
- .get_trigger_ops = hist_enable_get_trigger_ops,
.set_filter = set_trigger_filter,
+ .trigger = hist_enable_trigger,
+ .count_func = event_trigger_count,
+ .print = event_enable_trigger_print,
+ .init = event_trigger_init,
+ .free = event_enable_trigger_free,
};
static struct event_command trigger_hist_disable_cmd = {
@@ -7040,8 +6989,12 @@ static struct event_command trigger_hist_disable_cmd = {
.reg = event_enable_register_trigger,
.unreg = event_enable_unregister_trigger,
.unreg_all = hist_enable_unreg_all,
- .get_trigger_ops = hist_enable_get_trigger_ops,
.set_filter = set_trigger_filter,
+ .trigger = hist_enable_trigger,
+ .count_func = event_trigger_count,
+ .print = event_enable_trigger_print,
+ .init = event_trigger_init,
+ .free = event_enable_trigger_free,
};
static __init void unregister_trigger_hist_enable_disable_cmds(void)
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index f24ee61f8884..4554c458b78c 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -359,7 +359,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
fmt = synth_field_fmt(se->fields[i]->type);
/* parameter types */
- if (tr && tr->trace_flags & TRACE_ITER_VERBOSE)
+ if (tr && tr->trace_flags & TRACE_ITER(VERBOSE))
trace_seq_printf(s, "%s ", fmt);
snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt);
@@ -375,7 +375,6 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
n_u64++;
} else {
trace_seq_printf(s, print_fmt, se->fields[i]->name,
- STR_VAR_LEN_MAX,
(char *)&entry->fields[n_u64].as_u64,
i == se->n_fields - 1 ? "" : " ");
n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index cbfc306c0159..06b75bcfc7b8 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -6,6 +6,7 @@
*/
#include <linux/security.h>
+#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/mutex.h>
@@ -17,15 +18,77 @@
static LIST_HEAD(trigger_commands);
static DEFINE_MUTEX(trigger_cmd_mutex);
+static struct task_struct *trigger_kthread;
+static struct llist_head trigger_data_free_list;
+static DEFINE_MUTEX(trigger_data_kthread_mutex);
+
+/* Bulk garbage collection of event_trigger_data elements */
+static int trigger_kthread_fn(void *ignore)
+{
+ struct event_trigger_data *data, *tmp;
+ struct llist_node *llnodes;
+
+ /* Once this task starts, it lives forever */
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (llist_empty(&trigger_data_free_list))
+ schedule();
+
+ __set_current_state(TASK_RUNNING);
+
+ llnodes = llist_del_all(&trigger_data_free_list);
+
+ /* make sure current triggers exit before free */
+ tracepoint_synchronize_unregister();
+
+ llist_for_each_entry_safe(data, tmp, llnodes, llist)
+ kfree(data);
+ }
+
+ return 0;
+}
+
void trigger_data_free(struct event_trigger_data *data)
{
if (data->cmd_ops->set_filter)
data->cmd_ops->set_filter(NULL, data, NULL);
- /* make sure current triggers exit before free */
- tracepoint_synchronize_unregister();
+ if (unlikely(!trigger_kthread)) {
+ guard(mutex)(&trigger_data_kthread_mutex);
+ /* Check again after taking mutex */
+ if (!trigger_kthread) {
+ struct task_struct *kthread;
+
+ kthread = kthread_create(trigger_kthread_fn, NULL,
+ "trigger_data_free");
+ if (!IS_ERR(kthread))
+ WRITE_ONCE(trigger_kthread, kthread);
+ }
+ }
- kfree(data);
+ if (!trigger_kthread) {
+ /* Do it the slow way */
+ tracepoint_synchronize_unregister();
+ kfree(data);
+ return;
+ }
+
+ llist_add(&data->llist, &trigger_data_free_list);
+ wake_up_process(trigger_kthread);
+}
+
+static inline void data_ops_trigger(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
+ struct ring_buffer_event *event)
+{
+ const struct event_command *cmd_ops = data->cmd_ops;
+
+ if (data->flags & EVENT_TRIGGER_FL_COUNT) {
+ if (!cmd_ops->count_func(data, buffer, rec, event))
+ return;
+ }
+
+ cmd_ops->trigger(data, buffer, rec, event);
}
/**
@@ -70,7 +133,7 @@ event_triggers_call(struct trace_event_file *file,
if (data->paused)
continue;
if (!rec) {
- data->ops->trigger(data, buffer, rec, event);
+ data_ops_trigger(data, buffer, rec, event);
continue;
}
filter = rcu_dereference_sched(data->filter);
@@ -80,7 +143,7 @@ event_triggers_call(struct trace_event_file *file,
tt |= data->cmd_ops->trigger_type;
continue;
}
- data->ops->trigger(data, buffer, rec, event);
+ data_ops_trigger(data, buffer, rec, event);
}
return tt;
}
@@ -122,7 +185,7 @@ event_triggers_post_call(struct trace_event_file *file,
if (data->paused)
continue;
if (data->cmd_ops->trigger_type & tt)
- data->ops->trigger(data, NULL, NULL, NULL);
+ data_ops_trigger(data, NULL, NULL, NULL);
}
}
EXPORT_SYMBOL_GPL(event_triggers_post_call);
@@ -191,7 +254,7 @@ static int trigger_show(struct seq_file *m, void *v)
}
data = list_entry(v, struct event_trigger_data, list);
- data->ops->print(m, data);
+ data->cmd_ops->print(m, data);
return 0;
}
@@ -245,7 +308,8 @@ int trigger_process_regex(struct trace_event_file *file, char *buff)
char *command, *next;
struct event_command *p;
- next = buff = skip_spaces(buff);
+ next = buff = strim(buff);
+
command = strsep(&next, ": \t");
if (next) {
next = skip_spaces(next);
@@ -282,8 +346,6 @@ static ssize_t event_trigger_regex_write(struct file *file,
if (IS_ERR(buf))
return PTR_ERR(buf);
- strim(buf);
-
guard(mutex)(&event_mutex);
event_file = event_file_file(file);
@@ -300,13 +362,9 @@ static ssize_t event_trigger_regex_write(struct file *file,
static int event_trigger_regex_release(struct inode *inode, struct file *file)
{
- mutex_lock(&event_mutex);
-
if (file->f_mode & FMODE_READ)
seq_release(inode, file);
- mutex_unlock(&event_mutex);
-
return 0;
}
@@ -378,7 +436,37 @@ __init int unregister_event_command(struct event_command *cmd)
}
/**
- * event_trigger_print - Generic event_trigger_ops @print implementation
+ * event_trigger_count - Optional count function for event triggers
+ * @data: Trigger-specific data
+ * @buffer: The ring buffer that the event is being written to
+ * @rec: The trace entry for the event, NULL for unconditional invocation
+ * @event: The event meta data in the ring buffer
+ *
+ * For triggers that can take a count parameter that doesn't do anything
+ * special, they can use this function to assign to their .count_func
+ * field.
+ *
+ * This simply does a count down of the @data->count field.
+ *
+ * If the @data->count is greater than zero, it will decrement it.
+ *
+ * Returns false if @data->count is zero, otherwise true.
+ */
+bool event_trigger_count(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
+ struct ring_buffer_event *event)
+{
+ if (!data->count)
+ return false;
+
+ if (data->count != -1)
+ (data->count)--;
+
+ return true;
+}
+
+/**
+ * event_trigger_print - Generic event_command @print implementation
* @name: The name of the event trigger
* @m: The seq_file being printed to
* @data: Trigger-specific data
@@ -413,7 +501,7 @@ event_trigger_print(const char *name, struct seq_file *m,
}
/**
- * event_trigger_init - Generic event_trigger_ops @init implementation
+ * event_trigger_init - Generic event_command @init implementation
* @data: Trigger-specific data
*
* Common implementation of event trigger initialization.
@@ -430,7 +518,7 @@ int event_trigger_init(struct event_trigger_data *data)
}
/**
- * event_trigger_free - Generic event_trigger_ops @free implementation
+ * event_trigger_free - Generic event_command @free implementation
* @data: Trigger-specific data
*
* Common implementation of event trigger de-initialization.
@@ -492,8 +580,8 @@ clear_event_triggers(struct trace_array *tr)
list_for_each_entry_safe(data, n, &file->triggers, list) {
trace_event_trigger_enable_disable(file, 0);
list_del_rcu(&data->list);
- if (data->ops->free)
- data->ops->free(data);
+ if (data->cmd_ops->free)
+ data->cmd_ops->free(data);
}
}
}
@@ -556,8 +644,8 @@ static int register_trigger(char *glob,
return -EEXIST;
}
- if (data->ops->init) {
- ret = data->ops->init(data);
+ if (data->cmd_ops->init) {
+ ret = data->cmd_ops->init(data);
if (ret < 0)
return ret;
}
@@ -595,8 +683,8 @@ static bool try_unregister_trigger(char *glob,
}
if (data) {
- if (data->ops->free)
- data->ops->free(data);
+ if (data->cmd_ops->free)
+ data->cmd_ops->free(data);
return true;
}
@@ -644,7 +732,7 @@ static void unregister_trigger(char *glob,
* param - text following cmd and ':' and stripped of filter
* filter - the optional filter text following (and including) 'if'
*
- * To illustrate the use of these componenents, here are some concrete
+ * To illustrate the use of these components, here are some concrete
* examples. For the following triggers:
*
* echo 'traceon:5 if pid == 0' > trigger
@@ -807,9 +895,13 @@ int event_trigger_separate_filter(char *param_and_filter, char **param,
* @private_data: User data to associate with the event trigger
*
* Allocate an event_trigger_data instance and initialize it. The
- * @cmd_ops are used along with the @cmd and @param to get the
- * trigger_ops to assign to the event_trigger_data. @private_data can
- * also be passed in and associated with the event_trigger_data.
+ * @cmd_ops defines how the trigger will operate. If @param is set,
+ * and @cmd_ops->trigger_ops->count_func is non NULL, then the
+ * data->count is set to @param and before the trigger is executed, the
+ * @cmd_ops->trigger_ops->count_func() is called. If that function returns
+ * false, the @cmd_ops->trigger_ops->trigger() function will not be called.
+ * @private_data can also be passed in and associated with the
+ * event_trigger_data.
*
* Use trigger_data_free() to free an event_trigger_data object.
*
@@ -821,18 +913,16 @@ struct event_trigger_data *trigger_data_alloc(struct event_command *cmd_ops,
void *private_data)
{
struct event_trigger_data *trigger_data;
- const struct event_trigger_ops *trigger_ops;
-
- trigger_ops = cmd_ops->get_trigger_ops(cmd, param);
trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
if (!trigger_data)
return NULL;
trigger_data->count = -1;
- trigger_data->ops = trigger_ops;
trigger_data->cmd_ops = cmd_ops;
trigger_data->private_data = private_data;
+ if (param && cmd_ops->count_func)
+ trigger_data->flags |= EVENT_TRIGGER_FL_COUNT;
INIT_LIST_HEAD(&trigger_data->list);
INIT_LIST_HEAD(&trigger_data->named_list);
@@ -1271,31 +1361,28 @@ traceon_trigger(struct event_trigger_data *data,
tracing_on();
}
-static void
-traceon_count_trigger(struct event_trigger_data *data,
- struct trace_buffer *buffer, void *rec,
- struct ring_buffer_event *event)
+static bool
+traceon_count_func(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
+ struct ring_buffer_event *event)
{
struct trace_event_file *file = data->private_data;
if (file) {
if (tracer_tracing_is_on(file->tr))
- return;
+ return false;
} else {
if (tracing_is_on())
- return;
+ return false;
}
if (!data->count)
- return;
+ return false;
if (data->count != -1)
(data->count)--;
- if (file)
- tracer_tracing_on(file->tr);
- else
- tracing_on();
+ return true;
}
static void
@@ -1319,31 +1406,28 @@ traceoff_trigger(struct event_trigger_data *data,
tracing_off();
}
-static void
-traceoff_count_trigger(struct event_trigger_data *data,
- struct trace_buffer *buffer, void *rec,
- struct ring_buffer_event *event)
+static bool
+traceoff_count_func(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
+ struct ring_buffer_event *event)
{
struct trace_event_file *file = data->private_data;
if (file) {
if (!tracer_tracing_is_on(file->tr))
- return;
+ return false;
} else {
if (!tracing_is_on())
- return;
+ return false;
}
if (!data->count)
- return;
+ return false;
if (data->count != -1)
(data->count)--;
- if (file)
- tracer_tracing_off(file->tr);
- else
- tracing_off();
+ return true;
}
static int
@@ -1360,58 +1444,18 @@ traceoff_trigger_print(struct seq_file *m, struct event_trigger_data *data)
data->filter_str);
}
-static const struct event_trigger_ops traceon_trigger_ops = {
- .trigger = traceon_trigger,
- .print = traceon_trigger_print,
- .init = event_trigger_init,
- .free = event_trigger_free,
-};
-
-static const struct event_trigger_ops traceon_count_trigger_ops = {
- .trigger = traceon_count_trigger,
- .print = traceon_trigger_print,
- .init = event_trigger_init,
- .free = event_trigger_free,
-};
-
-static const struct event_trigger_ops traceoff_trigger_ops = {
- .trigger = traceoff_trigger,
- .print = traceoff_trigger_print,
- .init = event_trigger_init,
- .free = event_trigger_free,
-};
-
-static const struct event_trigger_ops traceoff_count_trigger_ops = {
- .trigger = traceoff_count_trigger,
- .print = traceoff_trigger_print,
- .init = event_trigger_init,
- .free = event_trigger_free,
-};
-
-static const struct event_trigger_ops *
-onoff_get_trigger_ops(char *cmd, char *param)
-{
- const struct event_trigger_ops *ops;
-
- /* we register both traceon and traceoff to this callback */
- if (strcmp(cmd, "traceon") == 0)
- ops = param ? &traceon_count_trigger_ops :
- &traceon_trigger_ops;
- else
- ops = param ? &traceoff_count_trigger_ops :
- &traceoff_trigger_ops;
-
- return ops;
-}
-
static struct event_command trigger_traceon_cmd = {
.name = "traceon",
.trigger_type = ETT_TRACE_ONOFF,
.parse = event_trigger_parse,
.reg = register_trigger,
.unreg = unregister_trigger,
- .get_trigger_ops = onoff_get_trigger_ops,
.set_filter = set_trigger_filter,
+ .trigger = traceon_trigger,
+ .count_func = traceon_count_func,
+ .print = traceon_trigger_print,
+ .init = event_trigger_init,
+ .free = event_trigger_free,
};
static struct event_command trigger_traceoff_cmd = {
@@ -1421,8 +1465,12 @@ static struct event_command trigger_traceoff_cmd = {
.parse = event_trigger_parse,
.reg = register_trigger,
.unreg = unregister_trigger,
- .get_trigger_ops = onoff_get_trigger_ops,
.set_filter = set_trigger_filter,
+ .trigger = traceoff_trigger,
+ .count_func = traceoff_count_func,
+ .print = traceoff_trigger_print,
+ .init = event_trigger_init,
+ .free = event_trigger_free,
};
#ifdef CONFIG_TRACER_SNAPSHOT
@@ -1439,20 +1487,6 @@ snapshot_trigger(struct event_trigger_data *data,
tracing_snapshot();
}
-static void
-snapshot_count_trigger(struct event_trigger_data *data,
- struct trace_buffer *buffer, void *rec,
- struct ring_buffer_event *event)
-{
- if (!data->count)
- return;
-
- if (data->count != -1)
- (data->count)--;
-
- snapshot_trigger(data, buffer, rec, event);
-}
-
static int
register_snapshot_trigger(char *glob,
struct event_trigger_data *data,
@@ -1484,34 +1518,18 @@ snapshot_trigger_print(struct seq_file *m, struct event_trigger_data *data)
data->filter_str);
}
-static const struct event_trigger_ops snapshot_trigger_ops = {
- .trigger = snapshot_trigger,
- .print = snapshot_trigger_print,
- .init = event_trigger_init,
- .free = event_trigger_free,
-};
-
-static const struct event_trigger_ops snapshot_count_trigger_ops = {
- .trigger = snapshot_count_trigger,
- .print = snapshot_trigger_print,
- .init = event_trigger_init,
- .free = event_trigger_free,
-};
-
-static const struct event_trigger_ops *
-snapshot_get_trigger_ops(char *cmd, char *param)
-{
- return param ? &snapshot_count_trigger_ops : &snapshot_trigger_ops;
-}
-
static struct event_command trigger_snapshot_cmd = {
.name = "snapshot",
.trigger_type = ETT_SNAPSHOT,
.parse = event_trigger_parse,
.reg = register_snapshot_trigger,
.unreg = unregister_snapshot_trigger,
- .get_trigger_ops = snapshot_get_trigger_ops,
.set_filter = set_trigger_filter,
+ .trigger = snapshot_trigger,
+ .count_func = event_trigger_count,
+ .print = snapshot_trigger_print,
+ .init = event_trigger_init,
+ .free = event_trigger_free,
};
static __init int register_trigger_snapshot_cmd(void)
@@ -1558,20 +1576,6 @@ stacktrace_trigger(struct event_trigger_data *data,
trace_dump_stack(STACK_SKIP);
}
-static void
-stacktrace_count_trigger(struct event_trigger_data *data,
- struct trace_buffer *buffer, void *rec,
- struct ring_buffer_event *event)
-{
- if (!data->count)
- return;
-
- if (data->count != -1)
- (data->count)--;
-
- stacktrace_trigger(data, buffer, rec, event);
-}
-
static int
stacktrace_trigger_print(struct seq_file *m, struct event_trigger_data *data)
{
@@ -1579,26 +1583,6 @@ stacktrace_trigger_print(struct seq_file *m, struct event_trigger_data *data)
data->filter_str);
}
-static const struct event_trigger_ops stacktrace_trigger_ops = {
- .trigger = stacktrace_trigger,
- .print = stacktrace_trigger_print,
- .init = event_trigger_init,
- .free = event_trigger_free,
-};
-
-static const struct event_trigger_ops stacktrace_count_trigger_ops = {
- .trigger = stacktrace_count_trigger,
- .print = stacktrace_trigger_print,
- .init = event_trigger_init,
- .free = event_trigger_free,
-};
-
-static const struct event_trigger_ops *
-stacktrace_get_trigger_ops(char *cmd, char *param)
-{
- return param ? &stacktrace_count_trigger_ops : &stacktrace_trigger_ops;
-}
-
static struct event_command trigger_stacktrace_cmd = {
.name = "stacktrace",
.trigger_type = ETT_STACKTRACE,
@@ -1606,8 +1590,12 @@ static struct event_command trigger_stacktrace_cmd = {
.parse = event_trigger_parse,
.reg = register_trigger,
.unreg = unregister_trigger,
- .get_trigger_ops = stacktrace_get_trigger_ops,
.set_filter = set_trigger_filter,
+ .trigger = stacktrace_trigger,
+ .count_func = event_trigger_count,
+ .print = stacktrace_trigger_print,
+ .init = event_trigger_init,
+ .free = event_trigger_free,
};
static __init int register_trigger_stacktrace_cmd(void)
@@ -1642,24 +1630,24 @@ event_enable_trigger(struct event_trigger_data *data,
set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &enable_data->file->flags);
}
-static void
-event_enable_count_trigger(struct event_trigger_data *data,
- struct trace_buffer *buffer, void *rec,
- struct ring_buffer_event *event)
+static bool
+event_enable_count_func(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
+ struct ring_buffer_event *event)
{
struct enable_trigger_data *enable_data = data->private_data;
if (!data->count)
- return;
+ return false;
/* Skip if the event is in a state we want to switch to */
if (enable_data->enable == !(enable_data->file->flags & EVENT_FILE_FL_SOFT_DISABLED))
- return;
+ return false;
if (data->count != -1)
(data->count)--;
- event_enable_trigger(data, buffer, rec, event);
+ return true;
}
int event_enable_trigger_print(struct seq_file *m,
@@ -1704,34 +1692,6 @@ void event_enable_trigger_free(struct event_trigger_data *data)
}
}
-static const struct event_trigger_ops event_enable_trigger_ops = {
- .trigger = event_enable_trigger,
- .print = event_enable_trigger_print,
- .init = event_trigger_init,
- .free = event_enable_trigger_free,
-};
-
-static const struct event_trigger_ops event_enable_count_trigger_ops = {
- .trigger = event_enable_count_trigger,
- .print = event_enable_trigger_print,
- .init = event_trigger_init,
- .free = event_enable_trigger_free,
-};
-
-static const struct event_trigger_ops event_disable_trigger_ops = {
- .trigger = event_enable_trigger,
- .print = event_enable_trigger_print,
- .init = event_trigger_init,
- .free = event_enable_trigger_free,
-};
-
-static const struct event_trigger_ops event_disable_count_trigger_ops = {
- .trigger = event_enable_count_trigger,
- .print = event_enable_trigger_print,
- .init = event_trigger_init,
- .free = event_enable_trigger_free,
-};
-
int event_enable_trigger_parse(struct event_command *cmd_ops,
struct trace_event_file *file,
char *glob, char *cmd, char *param_and_filter)
@@ -1861,8 +1821,8 @@ int event_enable_register_trigger(char *glob,
}
}
- if (data->ops->init) {
- ret = data->ops->init(data);
+ if (data->cmd_ops->init) {
+ ret = data->cmd_ops->init(data);
if (ret < 0)
return ret;
}
@@ -1902,30 +1862,8 @@ void event_enable_unregister_trigger(char *glob,
}
}
- if (data && data->ops->free)
- data->ops->free(data);
-}
-
-static const struct event_trigger_ops *
-event_enable_get_trigger_ops(char *cmd, char *param)
-{
- const struct event_trigger_ops *ops;
- bool enable;
-
-#ifdef CONFIG_HIST_TRIGGERS
- enable = ((strcmp(cmd, ENABLE_EVENT_STR) == 0) ||
- (strcmp(cmd, ENABLE_HIST_STR) == 0));
-#else
- enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
-#endif
- if (enable)
- ops = param ? &event_enable_count_trigger_ops :
- &event_enable_trigger_ops;
- else
- ops = param ? &event_disable_count_trigger_ops :
- &event_disable_trigger_ops;
-
- return ops;
+ if (data && data->cmd_ops->free)
+ data->cmd_ops->free(data);
}
static struct event_command trigger_enable_cmd = {
@@ -1934,8 +1872,12 @@ static struct event_command trigger_enable_cmd = {
.parse = event_enable_trigger_parse,
.reg = event_enable_register_trigger,
.unreg = event_enable_unregister_trigger,
- .get_trigger_ops = event_enable_get_trigger_ops,
.set_filter = set_trigger_filter,
+ .trigger = event_enable_trigger,
+ .count_func = event_enable_count_func,
+ .print = event_enable_trigger_print,
+ .init = event_trigger_init,
+ .free = event_enable_trigger_free,
};
static struct event_command trigger_disable_cmd = {
@@ -1944,8 +1886,12 @@ static struct event_command trigger_disable_cmd = {
.parse = event_enable_trigger_parse,
.reg = event_enable_register_trigger,
.unreg = event_enable_unregister_trigger,
- .get_trigger_ops = event_enable_get_trigger_ops,
.set_filter = set_trigger_filter,
+ .trigger = event_enable_trigger,
+ .count_func = event_enable_count_func,
+ .print = event_enable_trigger_print,
+ .init = event_trigger_init,
+ .free = event_enable_trigger_free,
};
static __init void unregister_trigger_enable_disable_cmds(void)
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index b15854c75d4f..dca6e50b3b21 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -1041,7 +1041,7 @@ static int user_field_array_size(const char *type)
static int user_field_size(const char *type)
{
- /* long is not allowed from a user, since it's ambigious in size */
+ /* long is not allowed from a user, since it's ambiguous in size */
if (strcmp(type, "s64") == 0)
return sizeof(s64);
if (strcmp(type, "u64") == 0)
@@ -1079,7 +1079,7 @@ static int user_field_size(const char *type)
if (str_has_prefix(type, "__rel_loc "))
return sizeof(u32);
- /* Uknown basic type, error */
+ /* Unknown basic type, error */
return -EINVAL;
}
@@ -2465,7 +2465,7 @@ static long user_events_ioctl_reg(struct user_event_file_info *info,
/*
* Prevent users from using the same address and bit multiple times
* within the same mm address space. This can cause unexpected behavior
- * for user processes that is far easier to debug if this is explictly
+ * for user processes that is far easier to debug if this is explicitly
* an error upon registering.
*/
if (current_user_event_enabler_exists((unsigned long)reg.enable_addr,
diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c
index 8001dbf16891..262c0556e4af 100644
--- a/kernel/trace/trace_fprobe.c
+++ b/kernel/trace/trace_fprobe.c
@@ -632,7 +632,7 @@ print_fentry_event(struct trace_iterator *iter, int flags,
trace_seq_printf(s, "%s: (", trace_probe_name(tp));
- if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
+ if (!seq_print_ip_sym_offset(s, field->ip, flags))
goto out;
trace_seq_putc(s, ')');
@@ -662,12 +662,12 @@ print_fexit_event(struct trace_iterator *iter, int flags,
trace_seq_printf(s, "%s: (", trace_probe_name(tp));
- if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
+ if (!seq_print_ip_sym_offset(s, field->ret_ip, flags))
goto out;
trace_seq_puts(s, " <- ");
- if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET))
+ if (!seq_print_ip_sym_no_offset(s, field->func, flags))
goto out;
trace_seq_putc(s, ')');
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index d17c18934445..c12795c2fb39 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -154,11 +154,11 @@ static int function_trace_init(struct trace_array *tr)
if (!tr->ops)
return -ENOMEM;
- func = select_trace_function(func_flags.val);
+ func = select_trace_function(tr->current_trace_flags->val);
if (!func)
return -EINVAL;
- if (!handle_func_repeats(tr, func_flags.val))
+ if (!handle_func_repeats(tr, tr->current_trace_flags->val))
return -ENOMEM;
ftrace_init_array_ops(tr, func);
@@ -459,14 +459,14 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
u32 new_flags;
/* Do nothing if already set. */
- if (!!set == !!(func_flags.val & bit))
+ if (!!set == !!(tr->current_trace_flags->val & bit))
return 0;
/* We can change this flag only when not running. */
if (tr->current_trace != &function_trace)
return 0;
- new_flags = (func_flags.val & ~bit) | (set ? bit : 0);
+ new_flags = (tr->current_trace_flags->val & ~bit) | (set ? bit : 0);
func = select_trace_function(new_flags);
if (!func)
return -EINVAL;
@@ -491,7 +491,7 @@ static struct tracer function_trace __tracer_data =
.init = function_trace_init,
.reset = function_trace_reset,
.start = function_trace_start,
- .flags = &func_flags,
+ .default_flags = &func_flags,
.set_flag = func_set_flag,
.allow_instances = true,
#ifdef CONFIG_FTRACE_SELFTEST
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index a7f4b9a47a71..b1e9c9913309 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -16,9 +16,12 @@
#include "trace.h"
#include "trace_output.h"
-/* When set, irq functions will be ignored */
+/* When set, irq functions might be ignored */
static int ftrace_graph_skip_irqs;
+/* Do not record function time when task is sleeping */
+int fgraph_no_sleep_time;
+
struct fgraph_cpu_data {
pid_t last_pid;
int depth;
@@ -33,14 +36,19 @@ struct fgraph_ent_args {
unsigned long args[FTRACE_REGS_MAX_ARGS];
};
+struct fgraph_retaddr_ent_args {
+ struct fgraph_retaddr_ent_entry ent;
+ /* Force the sizeof of args[] to have FTRACE_REGS_MAX_ARGS entries */
+ unsigned long args[FTRACE_REGS_MAX_ARGS];
+};
+
struct fgraph_data {
struct fgraph_cpu_data __percpu *cpu_data;
/* Place to preserve last processed entry. */
union {
struct fgraph_ent_args ent;
- /* TODO allow retaddr to have args */
- struct fgraph_retaddr_ent_entry rent;
+ struct fgraph_retaddr_ent_args rent;
};
struct ftrace_graph_ret_entry ret;
int failed;
@@ -85,11 +93,6 @@ static struct tracer_opt trace_opts[] = {
/* Include sleep time (scheduled out) between entry and return */
{ TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) },
-#ifdef CONFIG_FUNCTION_PROFILER
- /* Include time within nested functions */
- { TRACER_OPT(graph-time, TRACE_GRAPH_GRAPH_TIME) },
-#endif
-
{ } /* Empty entry */
};
@@ -97,13 +100,13 @@ static struct tracer_flags tracer_flags = {
/* Don't display overruns, proc, or tail by default */
.val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS |
- TRACE_GRAPH_SLEEP_TIME | TRACE_GRAPH_GRAPH_TIME,
+ TRACE_GRAPH_SLEEP_TIME,
.opts = trace_opts
};
-static bool tracer_flags_is_set(u32 flags)
+static bool tracer_flags_is_set(struct trace_array *tr, u32 flags)
{
- return (tracer_flags.val & flags) == flags;
+ return (tr->current_trace_flags->val & flags) == flags;
}
/*
@@ -162,20 +165,32 @@ int __trace_graph_entry(struct trace_array *tr,
int __trace_graph_retaddr_entry(struct trace_array *tr,
struct ftrace_graph_ent *trace,
unsigned int trace_ctx,
- unsigned long retaddr)
+ unsigned long retaddr,
+ struct ftrace_regs *fregs)
{
struct ring_buffer_event *event;
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct fgraph_retaddr_ent_entry *entry;
+ int size;
+
+ /* If fregs is defined, add FTRACE_REGS_MAX_ARGS long size words */
+ size = sizeof(*entry) + (FTRACE_REGS_MAX_ARGS * !!fregs * sizeof(long));
event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RETADDR_ENT,
- sizeof(*entry), trace_ctx);
+ size, trace_ctx);
if (!event)
return 0;
entry = ring_buffer_event_data(event);
- entry->graph_ent.func = trace->func;
- entry->graph_ent.depth = trace->depth;
- entry->graph_ent.retaddr = retaddr;
+ entry->graph_rent.ent = *trace;
+ entry->graph_rent.retaddr = retaddr;
+
+#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
+ if (fregs) {
+ for (int i = 0; i < FTRACE_REGS_MAX_ARGS; i++)
+ entry->args[i] = ftrace_regs_get_argument(fregs, i);
+ }
+#endif
+
trace_buffer_unlock_commit_nostack(buffer, event);
return 1;
@@ -184,17 +199,21 @@ int __trace_graph_retaddr_entry(struct trace_array *tr,
int __trace_graph_retaddr_entry(struct trace_array *tr,
struct ftrace_graph_ent *trace,
unsigned int trace_ctx,
- unsigned long retaddr)
+ unsigned long retaddr,
+ struct ftrace_regs *fregs)
{
return 1;
}
#endif
-static inline int ftrace_graph_ignore_irqs(void)
+static inline int ftrace_graph_ignore_irqs(struct trace_array *tr)
{
if (!ftrace_graph_skip_irqs || trace_recursion_test(TRACE_IRQ_BIT))
return 0;
+ if (tracer_flags_is_set(tr, TRACE_GRAPH_PRINT_IRQS))
+ return 0;
+
return in_hardirq();
}
@@ -232,22 +251,20 @@ static int graph_entry(struct ftrace_graph_ent *trace,
return 1;
}
- if (!ftrace_trace_task(tr))
- return 0;
-
if (ftrace_graph_ignore_func(gops, trace))
return 0;
- if (ftrace_graph_ignore_irqs())
+ if (ftrace_graph_ignore_irqs(tr))
return 0;
- if (fgraph_sleep_time) {
- /* Only need to record the calltime */
- ftimes = fgraph_reserve_data(gops->idx, sizeof(ftimes->calltime));
- } else {
+ if (fgraph_no_sleep_time &&
+ !tracer_flags_is_set(tr, TRACE_GRAPH_SLEEP_TIME)) {
ftimes = fgraph_reserve_data(gops->idx, sizeof(*ftimes));
if (ftimes)
ftimes->sleeptime = current->ftrace_sleeptime;
+ } else {
+ /* Only need to record the calltime */
+ ftimes = fgraph_reserve_data(gops->idx, sizeof(ftimes->calltime));
}
if (!ftimes)
return 0;
@@ -263,9 +280,10 @@ static int graph_entry(struct ftrace_graph_ent *trace,
trace_ctx = tracing_gen_ctx();
if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) &&
- tracer_flags_is_set(TRACE_GRAPH_PRINT_RETADDR)) {
+ tracer_flags_is_set(tr, TRACE_GRAPH_PRINT_RETADDR)) {
unsigned long retaddr = ftrace_graph_top_ret_addr(current);
- ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, retaddr);
+ ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx,
+ retaddr, fregs);
} else {
ret = __graph_entry(tr, trace, trace_ctx, fregs);
}
@@ -333,11 +351,15 @@ void __trace_graph_return(struct trace_array *tr,
trace_buffer_unlock_commit_nostack(buffer, event);
}
-static void handle_nosleeptime(struct ftrace_graph_ret *trace,
+static void handle_nosleeptime(struct trace_array *tr,
+ struct ftrace_graph_ret *trace,
struct fgraph_times *ftimes,
int size)
{
- if (fgraph_sleep_time || size < sizeof(*ftimes))
+ if (size < sizeof(*ftimes))
+ return;
+
+ if (!fgraph_no_sleep_time || tracer_flags_is_set(tr, TRACE_GRAPH_SLEEP_TIME))
return;
ftimes->calltime += current->ftrace_sleeptime - ftimes->sleeptime;
@@ -366,7 +388,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace,
if (!ftimes)
return;
- handle_nosleeptime(trace, ftimes, size);
+ handle_nosleeptime(tr, trace, ftimes, size);
calltime = ftimes->calltime;
@@ -379,6 +401,7 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace,
struct ftrace_regs *fregs)
{
struct fgraph_times *ftimes;
+ struct trace_array *tr;
int size;
ftrace_graph_addr_finish(gops, trace);
@@ -392,7 +415,8 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace,
if (!ftimes)
return;
- handle_nosleeptime(trace, ftimes, size);
+ tr = gops->private;
+ handle_nosleeptime(tr, trace, ftimes, size);
if (tracing_thresh &&
(trace_clock_local() - ftimes->calltime < tracing_thresh))
@@ -441,7 +465,7 @@ static int graph_trace_init(struct trace_array *tr)
{
int ret;
- if (tracer_flags_is_set(TRACE_GRAPH_ARGS))
+ if (tracer_flags_is_set(tr, TRACE_GRAPH_ARGS))
tr->gops->entryfunc = trace_graph_entry_args;
else
tr->gops->entryfunc = trace_graph_entry;
@@ -451,6 +475,12 @@ static int graph_trace_init(struct trace_array *tr)
else
tr->gops->retfunc = trace_graph_return;
+ if (!tracer_flags_is_set(tr, TRACE_GRAPH_PRINT_IRQS))
+ ftrace_graph_skip_irqs++;
+
+ if (!tracer_flags_is_set(tr, TRACE_GRAPH_SLEEP_TIME))
+ fgraph_no_sleep_time++;
+
/* Make gops functions visible before we start tracing */
smp_mb();
@@ -468,10 +498,6 @@ static int ftrace_graph_trace_args(struct trace_array *tr, int set)
{
trace_func_graph_ent_t entry;
- /* Do nothing if the current tracer is not this tracer */
- if (tr->current_trace != &graph_trace)
- return 0;
-
if (set)
entry = trace_graph_entry_args;
else
@@ -492,6 +518,16 @@ static int ftrace_graph_trace_args(struct trace_array *tr, int set)
static void graph_trace_reset(struct trace_array *tr)
{
+ if (!tracer_flags_is_set(tr, TRACE_GRAPH_PRINT_IRQS))
+ ftrace_graph_skip_irqs--;
+ if (WARN_ON_ONCE(ftrace_graph_skip_irqs < 0))
+ ftrace_graph_skip_irqs = 0;
+
+ if (!tracer_flags_is_set(tr, TRACE_GRAPH_SLEEP_TIME))
+ fgraph_no_sleep_time--;
+ if (WARN_ON_ONCE(fgraph_no_sleep_time < 0))
+ fgraph_no_sleep_time = 0;
+
tracing_stop_cmdline_record();
unregister_ftrace_graph(tr->gops);
}
@@ -634,13 +670,9 @@ get_return_for_leaf(struct trace_iterator *iter,
* Save current and next entries for later reference
* if the output fails.
*/
- if (unlikely(curr->ent.type == TRACE_GRAPH_RETADDR_ENT)) {
- data->rent = *(struct fgraph_retaddr_ent_entry *)curr;
- } else {
- int size = min((int)sizeof(data->ent), (int)iter->ent_size);
+ int size = min_t(int, sizeof(data->rent), iter->ent_size);
- memcpy(&data->ent, curr, size);
- }
+ memcpy(&data->rent, curr, size);
/*
* If the next event is not a return type, then
* we only care about what type it is. Otherwise we can
@@ -703,7 +735,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
addr >= (unsigned long)__irqentry_text_end)
return;
- if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) {
+ if (tr->trace_flags & TRACE_ITER(CONTEXT_INFO)) {
/* Absolute time */
if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
print_graph_abs_time(iter->ts, s);
@@ -723,7 +755,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
}
/* Latency format */
- if (tr->trace_flags & TRACE_ITER_LATENCY_FMT)
+ if (tr->trace_flags & TRACE_ITER(LATENCY_FMT))
print_graph_lat_fmt(s, ent);
}
@@ -777,7 +809,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration,
struct trace_seq *s, u32 flags)
{
if (!(flags & TRACE_GRAPH_PRINT_DURATION) ||
- !(tr->trace_flags & TRACE_ITER_CONTEXT_INFO))
+ !(tr->trace_flags & TRACE_ITER(CONTEXT_INFO)))
return;
/* No real adata, just filling the column with spaces */
@@ -818,7 +850,7 @@ static void print_graph_retaddr(struct trace_seq *s, struct fgraph_retaddr_ent_e
trace_seq_puts(s, " /*");
trace_seq_puts(s, " <-");
- seq_print_ip_sym(s, entry->graph_ent.retaddr, trace_flags | TRACE_ITER_SYM_OFFSET);
+ seq_print_ip_sym_offset(s, entry->graph_rent.retaddr, trace_flags);
if (comment)
trace_seq_puts(s, " */");
@@ -964,7 +996,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
trace_seq_printf(s, "%ps", (void *)ret_func);
if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) {
- print_function_args(s, entry->args, ret_func);
+ print_function_args(s, FGRAPH_ENTRY_ARGS(entry), ret_func);
trace_seq_putc(s, ';');
} else
trace_seq_puts(s, "();");
@@ -1016,7 +1048,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
args_size = iter->ent_size - offsetof(struct ftrace_graph_ent_entry, args);
if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long))
- print_function_args(s, entry->args, func);
+ print_function_args(s, FGRAPH_ENTRY_ARGS(entry), func);
else
trace_seq_puts(s, "()");
@@ -1054,7 +1086,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
/* Interrupt */
print_graph_irq(iter, addr, type, cpu, ent->pid, flags);
- if (!(tr->trace_flags & TRACE_ITER_CONTEXT_INFO))
+ if (!(tr->trace_flags & TRACE_ITER(CONTEXT_INFO)))
return;
/* Absolute time */
@@ -1076,7 +1108,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
}
/* Latency format */
- if (tr->trace_flags & TRACE_ITER_LATENCY_FMT)
+ if (tr->trace_flags & TRACE_ITER(LATENCY_FMT))
print_graph_lat_fmt(s, ent);
return;
@@ -1198,11 +1230,14 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
/*
* print_graph_entry() may consume the current event,
* thus @field may become invalid, so we need to save it.
- * sizeof(struct ftrace_graph_ent_entry) is very small,
- * it can be safely saved at the stack.
+ * This function is shared by ftrace_graph_ent_entry and
+ * fgraph_retaddr_ent_entry, the size of the latter one
+ * is larger, but it is very small and can be safely saved
+ * at the stack.
*/
struct ftrace_graph_ent_entry *entry;
- u8 save_buf[sizeof(*entry) + FTRACE_REGS_MAX_ARGS * sizeof(long)];
+ struct fgraph_retaddr_ent_entry *rentry;
+ u8 save_buf[sizeof(*rentry) + FTRACE_REGS_MAX_ARGS * sizeof(long)];
/* The ent_size is expected to be as big as the entry */
if (iter->ent_size > sizeof(save_buf))
@@ -1431,12 +1466,17 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
}
#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
case TRACE_GRAPH_RETADDR_ENT: {
- struct fgraph_retaddr_ent_entry saved;
+ /*
+ * ftrace_graph_ent_entry and fgraph_retaddr_ent_entry have
+ * similar functions and memory layouts. The only difference
+ * is that the latter one has an extra retaddr member, so
+ * they can share most of the logic.
+ */
struct fgraph_retaddr_ent_entry *rfield;
trace_assign_type(rfield, entry);
- saved = *rfield;
- return print_graph_entry((struct ftrace_graph_ent_entry *)&saved, s, iter, flags);
+ return print_graph_entry((struct ftrace_graph_ent_entry *)rfield,
+ s, iter, flags);
}
#endif
case TRACE_GRAPH_RET: {
@@ -1459,7 +1499,8 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
static enum print_line_t
print_graph_function(struct trace_iterator *iter)
{
- return print_graph_function_flags(iter, tracer_flags.val);
+ struct trace_array *tr = iter->tr;
+ return print_graph_function_flags(iter, tr->current_trace_flags->val);
}
static enum print_line_t
@@ -1495,7 +1536,7 @@ static void print_lat_header(struct seq_file *s, u32 flags)
static void __print_graph_headers_flags(struct trace_array *tr,
struct seq_file *s, u32 flags)
{
- int lat = tr->trace_flags & TRACE_ITER_LATENCY_FMT;
+ int lat = tr->trace_flags & TRACE_ITER(LATENCY_FMT);
if (lat)
print_lat_header(s, flags);
@@ -1535,7 +1576,10 @@ static void __print_graph_headers_flags(struct trace_array *tr,
static void print_graph_headers(struct seq_file *s)
{
- print_graph_headers_flags(s, tracer_flags.val);
+ struct trace_iterator *iter = s->private;
+ struct trace_array *tr = iter->tr;
+
+ print_graph_headers_flags(s, tr->current_trace_flags->val);
}
void print_graph_headers_flags(struct seq_file *s, u32 flags)
@@ -1543,10 +1587,10 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags)
struct trace_iterator *iter = s->private;
struct trace_array *tr = iter->tr;
- if (!(tr->trace_flags & TRACE_ITER_CONTEXT_INFO))
+ if (!(tr->trace_flags & TRACE_ITER(CONTEXT_INFO)))
return;
- if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) {
+ if (tr->trace_flags & TRACE_ITER(LATENCY_FMT)) {
/* print nothing if the buffers are empty */
if (trace_empty(iter))
return;
@@ -1613,17 +1657,56 @@ void graph_trace_close(struct trace_iterator *iter)
static int
func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
{
- if (bit == TRACE_GRAPH_PRINT_IRQS)
- ftrace_graph_skip_irqs = !set;
+/*
+ * The function profiler gets updated even if function graph
+ * isn't the current tracer. Handle it separately.
+ */
+#ifdef CONFIG_FUNCTION_PROFILER
+ if (bit == TRACE_GRAPH_SLEEP_TIME && (tr->flags & TRACE_ARRAY_FL_GLOBAL) &&
+ !!set == fprofile_no_sleep_time) {
+ if (set) {
+ fgraph_no_sleep_time--;
+ if (WARN_ON_ONCE(fgraph_no_sleep_time < 0))
+ fgraph_no_sleep_time = 0;
+ fprofile_no_sleep_time = false;
+ } else {
+ fgraph_no_sleep_time++;
+ fprofile_no_sleep_time = true;
+ }
+ }
+#endif
+
+ /* Do nothing if the current tracer is not this tracer */
+ if (tr->current_trace != &graph_trace)
+ return 0;
- if (bit == TRACE_GRAPH_SLEEP_TIME)
- ftrace_graph_sleep_time_control(set);
+ /* Do nothing if already set. */
+ if (!!set == !!(tr->current_trace_flags->val & bit))
+ return 0;
+
+ switch (bit) {
+ case TRACE_GRAPH_SLEEP_TIME:
+ if (set) {
+ fgraph_no_sleep_time--;
+ if (WARN_ON_ONCE(fgraph_no_sleep_time < 0))
+ fgraph_no_sleep_time = 0;
+ } else {
+ fgraph_no_sleep_time++;
+ }
+ break;
- if (bit == TRACE_GRAPH_GRAPH_TIME)
- ftrace_graph_graph_time_control(set);
+ case TRACE_GRAPH_PRINT_IRQS:
+ if (set)
+ ftrace_graph_skip_irqs--;
+ else
+ ftrace_graph_skip_irqs++;
+ if (WARN_ON_ONCE(ftrace_graph_skip_irqs < 0))
+ ftrace_graph_skip_irqs = 0;
+ break;
- if (bit == TRACE_GRAPH_ARGS)
+ case TRACE_GRAPH_ARGS:
return ftrace_graph_trace_args(tr, set);
+ }
return 0;
}
@@ -1660,7 +1743,7 @@ static struct tracer graph_trace __tracer_data = {
.reset = graph_trace_reset,
.print_line = print_graph_function,
.print_header = print_graph_headers,
- .flags = &tracer_flags,
+ .default_flags = &tracer_flags,
.set_flag = func_graph_set_flag,
.allow_instances = true,
#ifdef CONFIG_FTRACE_SELFTEST
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 4c45c49b06c8..17673905907c 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -63,7 +63,7 @@ irq_trace(void)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static int irqsoff_display_graph(struct trace_array *tr, int set);
-# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER_DISPLAY_GRAPH)
+# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER(DISPLAY_GRAPH))
#else
static inline int irqsoff_display_graph(struct trace_array *tr, int set)
{
@@ -485,8 +485,8 @@ static int register_irqsoff_function(struct trace_array *tr, int graph, int set)
{
int ret;
- /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */
- if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER_FUNCTION)))
+ /* 'set' is set if TRACE_ITER(FUNCTION) is about to be set */
+ if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER(FUNCTION))))
return 0;
if (graph)
@@ -515,7 +515,7 @@ static void unregister_irqsoff_function(struct trace_array *tr, int graph)
static int irqsoff_function_set(struct trace_array *tr, u32 mask, int set)
{
- if (!(mask & TRACE_ITER_FUNCTION))
+ if (!(mask & TRACE_ITER(FUNCTION)))
return 0;
if (set)
@@ -536,7 +536,7 @@ static inline int irqsoff_function_set(struct trace_array *tr, u32 mask, int set
}
#endif /* CONFIG_FUNCTION_TRACER */
-static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set)
+static int irqsoff_flag_changed(struct trace_array *tr, u64 mask, int set)
{
struct tracer *tracer = tr->current_trace;
@@ -544,7 +544,7 @@ static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set)
return 0;
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- if (mask & TRACE_ITER_DISPLAY_GRAPH)
+ if (mask & TRACE_ITER(DISPLAY_GRAPH))
return irqsoff_display_graph(tr, set);
#endif
@@ -582,10 +582,10 @@ static int __irqsoff_tracer_init(struct trace_array *tr)
save_flags = tr->trace_flags;
/* non overwrite screws up the latency tracers */
- set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
- set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
+ set_tracer_flag(tr, TRACE_ITER(OVERWRITE), 1);
+ set_tracer_flag(tr, TRACE_ITER(LATENCY_FMT), 1);
/* without pause, we will produce garbage if another latency occurs */
- set_tracer_flag(tr, TRACE_ITER_PAUSE_ON_TRACE, 1);
+ set_tracer_flag(tr, TRACE_ITER(PAUSE_ON_TRACE), 1);
tr->max_latency = 0;
irqsoff_trace = tr;
@@ -605,15 +605,15 @@ static int __irqsoff_tracer_init(struct trace_array *tr)
static void __irqsoff_tracer_reset(struct trace_array *tr)
{
- int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
- int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE;
- int pause_flag = save_flags & TRACE_ITER_PAUSE_ON_TRACE;
+ int lat_flag = save_flags & TRACE_ITER(LATENCY_FMT);
+ int overwrite_flag = save_flags & TRACE_ITER(OVERWRITE);
+ int pause_flag = save_flags & TRACE_ITER(PAUSE_ON_TRACE);
stop_irqsoff_tracer(tr, is_graph(tr));
- set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
- set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
- set_tracer_flag(tr, TRACE_ITER_PAUSE_ON_TRACE, pause_flag);
+ set_tracer_flag(tr, TRACE_ITER(LATENCY_FMT), lat_flag);
+ set_tracer_flag(tr, TRACE_ITER(OVERWRITE), overwrite_flag);
+ set_tracer_flag(tr, TRACE_ITER(PAUSE_ON_TRACE), pause_flag);
ftrace_reset_array_ops(tr);
irqsoff_busy = false;
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 896ff78b8349..b30795f34079 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -31,7 +31,7 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file)
old_userobj = tr->trace_flags;
/* don't look at user memory in panic mode */
- tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
+ tr->trace_flags &= ~TRACE_ITER(SYM_USEROBJ);
kdb_printf("Dumping ftrace buffer:\n");
if (skip_entries)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index ee8171b19bee..9953506370a5 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1584,7 +1584,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags,
trace_seq_printf(s, "%s: (", trace_probe_name(tp));
- if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
+ if (!seq_print_ip_sym_offset(s, field->ip, flags))
goto out;
trace_seq_putc(s, ')');
@@ -1614,12 +1614,12 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
trace_seq_printf(s, "%s: (", trace_probe_name(tp));
- if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
+ if (!seq_print_ip_sym_offset(s, field->ret_ip, flags))
goto out;
trace_seq_puts(s, " <- ");
- if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET))
+ if (!seq_print_ip_sym_no_offset(s, field->func, flags))
goto out;
trace_seq_putc(s, ')');
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index a9962d4497e8..827104d00bc0 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -329,7 +329,7 @@ static struct osnoise_data {
u64 print_stack; /* print IRQ stack if total > */
int timerlat_tracer; /* timerlat tracer */
#endif
- bool tainted; /* infor users and developers about a problem */
+ bool tainted; /* info users and developers about a problem */
} osnoise_data = {
.sample_period = DEFAULT_SAMPLE_PERIOD,
.sample_runtime = DEFAULT_SAMPLE_RUNTIME,
@@ -738,7 +738,7 @@ cond_move_thread_delta_start(struct osnoise_variables *osn_var, u64 duration)
/*
* get_int_safe_duration - Get the duration of a window
*
- * The irq, softirq and thread varaibles need to have its duration without
+ * The irq, softirq and thread variables need to have its duration without
* the interference from higher priority interrupts. Instead of keeping a
* variable to discount the interrupt interference from these variables, the
* starting time of these variables are pushed forward with the interrupt's
@@ -1460,7 +1460,7 @@ static int run_osnoise(void)
stop_in = osnoise_data.stop_tracing * NSEC_PER_USEC;
/*
- * Start timestemp
+ * Start timestamp
*/
start = time_get();
@@ -1881,7 +1881,7 @@ static int timerlat_main(void *data)
tlat->kthread = current;
osn_var->pid = current->pid;
/*
- * Anotate the arrival time.
+ * Annotate the arrival time.
*/
tlat->abs_period = hrtimer_cb_get_time(&tlat->timer);
@@ -1978,7 +1978,7 @@ static void stop_per_cpu_kthreads(void)
}
/*
- * start_kthread - Start a workload tread
+ * start_kthread - Start a workload thread
*/
static int start_kthread(unsigned int cpu)
{
@@ -2705,7 +2705,7 @@ static int osnoise_create_cpu_timerlat_fd(struct dentry *top_dir)
* Why not using tracing instance per_cpu/ dir?
*
* Because osnoise/timerlat have a single workload, having
- * multiple files like these are wast of memory.
+ * multiple files like these are waste of memory.
*/
per_cpu = tracefs_create_dir("per_cpu", top_dir);
if (!per_cpu)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 97db0b0ccf3e..cc2d3306bb60 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -420,7 +420,7 @@ static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
}
mmap_read_unlock(mm);
}
- if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
+ if (ret && ((sym_flags & TRACE_ITER(SYM_ADDR)) || !file))
trace_seq_printf(s, " <" IP_FMT ">", ip);
return !trace_seq_has_overflowed(s);
}
@@ -433,9 +433,9 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
goto out;
}
- trace_seq_print_sym(s, ip, sym_flags & TRACE_ITER_SYM_OFFSET);
+ trace_seq_print_sym(s, ip, sym_flags & TRACE_ITER(SYM_OFFSET));
- if (sym_flags & TRACE_ITER_SYM_ADDR)
+ if (sym_flags & TRACE_ITER(SYM_ADDR))
trace_seq_printf(s, " <" IP_FMT ">", ip);
out:
@@ -569,7 +569,7 @@ static int
lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
{
struct trace_array *tr = iter->tr;
- unsigned long verbose = tr->trace_flags & TRACE_ITER_VERBOSE;
+ unsigned long verbose = tr->trace_flags & TRACE_ITER(VERBOSE);
unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS;
unsigned long long abs_ts = iter->ts - iter->array_buffer->time_start;
unsigned long long rel_ts = next_ts - iter->ts;
@@ -636,7 +636,7 @@ int trace_print_context(struct trace_iterator *iter)
trace_seq_printf(s, "%16s-%-7d ", comm, entry->pid);
- if (tr->trace_flags & TRACE_ITER_RECORD_TGID) {
+ if (tr->trace_flags & TRACE_ITER(RECORD_TGID)) {
unsigned int tgid = trace_find_tgid(entry->pid);
if (!tgid)
@@ -647,7 +647,7 @@ int trace_print_context(struct trace_iterator *iter)
trace_seq_printf(s, "[%03d] ", iter->cpu);
- if (tr->trace_flags & TRACE_ITER_IRQ_INFO)
+ if (tr->trace_flags & TRACE_ITER(IRQ_INFO))
trace_print_lat_fmt(s, entry);
trace_print_time(s, iter, iter->ts);
@@ -661,7 +661,7 @@ int trace_print_lat_context(struct trace_iterator *iter)
struct trace_entry *entry, *next_entry;
struct trace_array *tr = iter->tr;
struct trace_seq *s = &iter->seq;
- unsigned long verbose = (tr->trace_flags & TRACE_ITER_VERBOSE);
+ unsigned long verbose = (tr->trace_flags & TRACE_ITER(VERBOSE));
u64 next_ts;
next_entry = trace_find_next_entry(iter, NULL, &next_ts);
@@ -950,7 +950,9 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c
int offset;
int len;
int ret;
+ int i;
void *pos;
+ char *str;
list_for_each_entry_reverse(field, head, link) {
trace_seq_printf(&iter->seq, " %s=", field->name);
@@ -977,8 +979,29 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c
trace_seq_puts(&iter->seq, "<OVERFLOW>");
break;
}
- pos = (void *)iter->ent + offset;
- trace_seq_printf(&iter->seq, "%.*s", len, (char *)pos);
+ str = (char *)iter->ent + offset;
+ /* Check if there's any non printable strings */
+ for (i = 0; i < len; i++) {
+ if (str[i] && !(isascii(str[i]) && isprint(str[i])))
+ break;
+ }
+ if (i < len) {
+ for (i = 0; i < len; i++) {
+ if (isascii(str[i]) && isprint(str[i]))
+ trace_seq_putc(&iter->seq, str[i]);
+ else
+ trace_seq_putc(&iter->seq, '.');
+ }
+ trace_seq_puts(&iter->seq, " (");
+ for (i = 0; i < len; i++) {
+ if (i)
+ trace_seq_putc(&iter->seq, ':');
+ trace_seq_printf(&iter->seq, "%02x", str[i]);
+ }
+ trace_seq_putc(&iter->seq, ')');
+ } else {
+ trace_seq_printf(&iter->seq, "%.*s", len, str);
+ }
break;
case FILTER_PTR_STRING:
if (!iter->fmt_size)
@@ -1127,7 +1150,7 @@ static void print_fn_trace(struct trace_seq *s, unsigned long ip,
if (args)
print_function_args(s, args, ip);
- if ((flags & TRACE_ITER_PRINT_PARENT) && parent_ip) {
+ if ((flags & TRACE_ITER(PRINT_PARENT)) && parent_ip) {
trace_seq_puts(s, " <-");
seq_print_ip_sym(s, parent_ip, flags);
}
@@ -1417,7 +1440,7 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
trace_seq_puts(s, "<user stack trace>\n");
- if (tr->trace_flags & TRACE_ITER_SYM_USEROBJ) {
+ if (tr->trace_flags & TRACE_ITER(SYM_USEROBJ)) {
struct task_struct *task;
/*
* we do the lookup on the thread group leader,
@@ -1467,12 +1490,12 @@ trace_hwlat_print(struct trace_iterator *iter, int flags,
trace_assign_type(field, entry);
- trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%lld.%09ld count:%d",
+ trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%ptSp count:%d",
field->seqnum,
field->duration,
field->outer_duration,
- (long long)field->timestamp.tv_sec,
- field->timestamp.tv_nsec, field->count);
+ &field->timestamp,
+ field->count);
if (field->nmi_count) {
/*
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 2e305364f2a9..99b676733d46 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -16,6 +16,17 @@ extern int
seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
unsigned long sym_flags);
+static inline int seq_print_ip_sym_offset(struct trace_seq *s, unsigned long ip,
+ unsigned long sym_flags)
+{
+ return seq_print_ip_sym(s, ip, sym_flags | TRACE_ITER(SYM_OFFSET));
+}
+static inline int seq_print_ip_sym_no_offset(struct trace_seq *s, unsigned long ip,
+ unsigned long sym_flags)
+{
+ return seq_print_ip_sym(s, ip, sym_flags & ~TRACE_ITER(SYM_OFFSET));
+}
+
extern void trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset);
extern int trace_print_context(struct trace_iterator *iter);
extern int trace_print_lat_context(struct trace_iterator *iter);
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 5cbdc423afeb..2f571083ce9e 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -156,7 +156,7 @@ fail:
static struct trace_probe_log trace_probe_log;
extern struct mutex dyn_event_ops_mutex;
-void trace_probe_log_init(const char *subsystem, int argc, const char **argv)
+const char *trace_probe_log_init(const char *subsystem, int argc, const char **argv)
{
lockdep_assert_held(&dyn_event_ops_mutex);
@@ -164,6 +164,7 @@ void trace_probe_log_init(const char *subsystem, int argc, const char **argv)
trace_probe_log.argc = argc;
trace_probe_log.argv = argv;
trace_probe_log.index = 0;
+ return subsystem;
}
void trace_probe_log_clear(void)
@@ -214,7 +215,7 @@ void __trace_probe_log_err(int offset, int err_type)
p = command;
for (i = 0; i < trace_probe_log.argc; i++) {
len = strlen(trace_probe_log.argv[i]);
- strcpy(p, trace_probe_log.argv[i]);
+ memcpy(p, trace_probe_log.argv[i], len);
p[len] = ' ';
p += len + 1;
}
@@ -516,7 +517,7 @@ static void clear_btf_context(struct traceprobe_parse_context *ctx)
}
}
-/* Return 1 if the field separater is arrow operator ('->') */
+/* Return 1 if the field separator is arrow operator ('->') */
static int split_next_field(char *varname, char **next_field,
struct traceprobe_parse_context *ctx)
{
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 08b5bda24da2..9fc56c937130 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -578,11 +578,13 @@ struct trace_probe_log {
int index;
};
-void trace_probe_log_init(const char *subsystem, int argc, const char **argv);
+const char *trace_probe_log_init(const char *subsystem, int argc, const char **argv);
void trace_probe_log_set_index(int index);
void trace_probe_log_clear(void);
void __trace_probe_log_err(int offset, int err);
+DEFINE_FREE(trace_probe_log_clear, const char *, if (_T) trace_probe_log_clear())
+
#define trace_probe_log_err(offs, err) \
__trace_probe_log_err(offs, TP_ERR_##err)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index e3f2e4f56faa..8faa73d3bba1 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -41,7 +41,7 @@ static void stop_func_tracer(struct trace_array *tr, int graph);
static int save_flags;
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER_DISPLAY_GRAPH)
+# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER(DISPLAY_GRAPH))
#else
# define is_graph(tr) false
#endif
@@ -247,8 +247,8 @@ static int register_wakeup_function(struct trace_array *tr, int graph, int set)
{
int ret;
- /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */
- if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER_FUNCTION)))
+ /* 'set' is set if TRACE_ITER(FUNCTION) is about to be set */
+ if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER(FUNCTION))))
return 0;
if (graph)
@@ -277,7 +277,7 @@ static void unregister_wakeup_function(struct trace_array *tr, int graph)
static int wakeup_function_set(struct trace_array *tr, u32 mask, int set)
{
- if (!(mask & TRACE_ITER_FUNCTION))
+ if (!(mask & TRACE_ITER(FUNCTION)))
return 0;
if (set)
@@ -324,7 +324,7 @@ __trace_function(struct trace_array *tr,
trace_function(tr, ip, parent_ip, trace_ctx, NULL);
}
-static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set)
+static int wakeup_flag_changed(struct trace_array *tr, u64 mask, int set)
{
struct tracer *tracer = tr->current_trace;
@@ -332,7 +332,7 @@ static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set)
return 0;
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- if (mask & TRACE_ITER_DISPLAY_GRAPH)
+ if (mask & TRACE_ITER(DISPLAY_GRAPH))
return wakeup_display_graph(tr, set);
#endif
@@ -681,8 +681,8 @@ static int __wakeup_tracer_init(struct trace_array *tr)
save_flags = tr->trace_flags;
/* non overwrite screws up the latency tracers */
- set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
- set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
+ set_tracer_flag(tr, TRACE_ITER(OVERWRITE), 1);
+ set_tracer_flag(tr, TRACE_ITER(LATENCY_FMT), 1);
tr->max_latency = 0;
wakeup_trace = tr;
@@ -725,15 +725,15 @@ static int wakeup_dl_tracer_init(struct trace_array *tr)
static void wakeup_tracer_reset(struct trace_array *tr)
{
- int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
- int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE;
+ int lat_flag = save_flags & TRACE_ITER(LATENCY_FMT);
+ int overwrite_flag = save_flags & TRACE_ITER(OVERWRITE);
stop_wakeup_tracer(tr);
/* make sure we put back any tasks we are tracing */
wakeup_reset(tr);
- set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
- set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
+ set_tracer_flag(tr, TRACE_ITER(LATENCY_FMT), lat_flag);
+ set_tracer_flag(tr, TRACE_ITER(OVERWRITE), overwrite_flag);
ftrace_reset_array_ops(tr);
wakeup_busy = false;
}
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index c158d65a8a88..32684ef4fb9d 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -15,7 +15,7 @@
*
* A write to the buffer will either succeed or fail. That is, unlike
* sprintf() there will not be a partial write (well it may write into
- * the buffer but it wont update the pointers). This allows users to
+ * the buffer but it won't update the pointers). This allows users to
* try to write something into the trace_seq buffer and if it fails
* they can flush it and try again.
*
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 0f932b22f9ec..e96d0063cbcf 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <trace/syscall.h>
#include <trace/events/syscalls.h>
+#include <linux/kernel_stat.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/kernel.h>
@@ -123,6 +124,119 @@ const char *get_syscall_name(int syscall)
return entry->name;
}
+/* Added to user strings or arrays when max limit is reached */
+#define EXTRA "..."
+
+static void get_dynamic_len_ptr(struct syscall_trace_enter *trace,
+ struct syscall_metadata *entry,
+ int *offset_p, int *len_p, unsigned char **ptr_p)
+{
+ unsigned char *ptr;
+ int offset = *offset_p;
+ int val;
+
+ /* This arg points to a user space string */
+ ptr = (void *)trace->args + sizeof(long) * entry->nb_args + offset;
+ val = *(int *)ptr;
+
+ /* The value is a dynamic string (len << 16 | offset) */
+ ptr = (void *)trace + (val & 0xffff);
+ *len_p = val >> 16;
+ offset += 4;
+
+ *ptr_p = ptr;
+ *offset_p = offset;
+}
+
+static enum print_line_t
+sys_enter_openat_print(struct syscall_trace_enter *trace, struct syscall_metadata *entry,
+ struct trace_seq *s, struct trace_event *event)
+{
+ unsigned char *ptr;
+ int offset = 0;
+ int bits, len;
+ bool done = false;
+ static const struct trace_print_flags __flags[] =
+ {
+ { O_TMPFILE, "O_TMPFILE" },
+ { O_WRONLY, "O_WRONLY" },
+ { O_RDWR, "O_RDWR" },
+ { O_CREAT, "O_CREAT" },
+ { O_EXCL, "O_EXCL" },
+ { O_NOCTTY, "O_NOCTTY" },
+ { O_TRUNC, "O_TRUNC" },
+ { O_APPEND, "O_APPEND" },
+ { O_NONBLOCK, "O_NONBLOCK" },
+ { O_DSYNC, "O_DSYNC" },
+ { O_DIRECT, "O_DIRECT" },
+ { O_LARGEFILE, "O_LARGEFILE" },
+ { O_DIRECTORY, "O_DIRECTORY" },
+ { O_NOFOLLOW, "O_NOFOLLOW" },
+ { O_NOATIME, "O_NOATIME" },
+ { O_CLOEXEC, "O_CLOEXEC" },
+ { -1, NULL }
+ };
+
+ trace_seq_printf(s, "%s(", entry->name);
+
+ for (int i = 0; !done && i < entry->nb_args; i++) {
+
+ if (trace_seq_has_overflowed(s))
+ goto end;
+
+ if (i)
+ trace_seq_puts(s, ", ");
+
+ switch (i) {
+ case 2:
+ bits = trace->args[2];
+
+ trace_seq_puts(s, "flags: ");
+
+ /* No need to show mode when not creating the file */
+ if (!(bits & (O_CREAT|O_TMPFILE)))
+ done = true;
+
+ if (!(bits & O_ACCMODE)) {
+ if (!bits) {
+ trace_seq_puts(s, "O_RDONLY");
+ continue;
+ }
+ trace_seq_puts(s, "O_RDONLY|");
+ }
+
+ trace_print_flags_seq(s, "|", bits, __flags);
+ /*
+ * trace_print_flags_seq() adds a '\0' to the
+ * buffer, but this needs to append more to the seq.
+ */
+ if (!trace_seq_has_overflowed(s))
+ trace_seq_pop(s);
+
+ continue;
+ case 3:
+ trace_seq_printf(s, "%s: 0%03o", entry->args[i],
+ (unsigned int)trace->args[i]);
+ continue;
+ }
+
+ trace_seq_printf(s, "%s: %lu", entry->args[i],
+ trace->args[i]);
+
+ if (!(BIT(i) & entry->user_mask))
+ continue;
+
+ get_dynamic_len_ptr(trace, entry, &offset, &len, &ptr);
+ trace_seq_printf(s, " \"%.*s\"", len, ptr);
+ }
+
+ trace_seq_putc(s, ')');
+end:
+ trace_seq_putc(s, '\n');
+
+ return trace_handle_return(s);
+}
+
static enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags,
struct trace_event *event)
@@ -132,7 +246,9 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
struct trace_entry *ent = iter->ent;
struct syscall_trace_enter *trace;
struct syscall_metadata *entry;
- int i, syscall;
+ int i, syscall, val, len;
+ unsigned char *ptr;
+ int offset = 0;
trace = (typeof(trace))ent;
syscall = trace->nr;
@@ -146,9 +262,20 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
goto end;
}
+ switch (entry->syscall_nr) {
+ case __NR_openat:
+ if (!tr || !(tr->trace_flags & TRACE_ITER(VERBOSE)))
+ return sys_enter_openat_print(trace, entry, s, event);
+ break;
+ default:
+ break;
+ }
+
trace_seq_printf(s, "%s(", entry->name);
for (i = 0; i < entry->nb_args; i++) {
+ bool printable = false;
+ char *str;
if (trace_seq_has_overflowed(s))
goto end;
@@ -157,7 +284,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
trace_seq_puts(s, ", ");
/* parameter types */
- if (tr && tr->trace_flags & TRACE_ITER_VERBOSE)
+ if (tr && tr->trace_flags & TRACE_ITER(VERBOSE))
trace_seq_printf(s, "%s ", entry->types[i]);
/* parameter values */
@@ -167,6 +294,48 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
else
trace_seq_printf(s, "%s: 0x%lx", entry->args[i],
trace->args[i]);
+
+ if (!(BIT(i) & entry->user_mask))
+ continue;
+
+ get_dynamic_len_ptr(trace, entry, &offset, &len, &ptr);
+
+ if (entry->user_arg_size < 0 || entry->user_arg_is_str) {
+ trace_seq_printf(s, " \"%.*s\"", len, ptr);
+ continue;
+ }
+
+ val = trace->args[entry->user_arg_size];
+
+ str = ptr;
+ trace_seq_puts(s, " (");
+ for (int x = 0; x < len; x++, ptr++) {
+ if (isascii(*ptr) && isprint(*ptr))
+ printable = true;
+ if (x)
+ trace_seq_putc(s, ':');
+ trace_seq_printf(s, "%02x", *ptr);
+ }
+ if (len < val)
+ trace_seq_printf(s, ", %s", EXTRA);
+
+ trace_seq_putc(s, ')');
+
+ /* If nothing is printable, don't bother printing anything */
+ if (!printable)
+ continue;
+
+ trace_seq_puts(s, " \"");
+ for (int x = 0; x < len; x++) {
+ if (isascii(str[x]) && isprint(str[x]))
+ trace_seq_putc(s, str[x]);
+ else
+ trace_seq_putc(s, '.');
+ }
+ if (len < val)
+ trace_seq_printf(s, "\"%s", EXTRA);
+ else
+ trace_seq_putc(s, '"');
}
trace_seq_putc(s, ')');
@@ -212,26 +381,107 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
.size = sizeof(_type), .align = __alignof__(_type), \
.is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER }
+/* When len=0, we just calculate the needed length */
+#define LEN_OR_ZERO (len ? len - pos : 0)
+
+static int __init
+sys_enter_openat_print_fmt(struct syscall_metadata *entry, char *buf, int len)
+{
+ int pos = 0;
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ "\"dfd: 0x%%08lx, filename: 0x%%08lx \\\"%%s\\\", flags: %%s%%s, mode: 0%%03o\",");
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ " ((unsigned long)(REC->dfd)),");
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ " ((unsigned long)(REC->filename)),");
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ " __get_str(__filename_val),");
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ " (REC->flags & ~3) && !(REC->flags & 3) ? \"O_RDONLY|\" : \"\", ");
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ " REC->flags ? __print_flags(REC->flags, \"|\", ");
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ "{ 0x%x, \"O_WRONLY\" }, ", O_WRONLY);
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ "{ 0x%x, \"O_RDWR\" }, ", O_RDWR);
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ "{ 0x%x, \"O_CREAT\" }, ", O_CREAT);
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ "{ 0x%x, \"O_EXCL\" }, ", O_EXCL);
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ "{ 0x%x, \"O_NOCTTY\" }, ", O_NOCTTY);
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ "{ 0x%x, \"O_TRUNC\" }, ", O_TRUNC);
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ "{ 0x%x, \"O_APPEND\" }, ", O_APPEND);
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ "{ 0x%x, \"O_NONBLOCK\" }, ", O_NONBLOCK);
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ "{ 0x%x, \"O_DSYNC\" }, ", O_DSYNC);
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ "{ 0x%x, \"O_DIRECT\" }, ", O_DIRECT);
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ "{ 0x%x, \"O_LARGEFILE\" }, ", O_LARGEFILE);
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ "{ 0x%x, \"O_DIRECTORY\" }, ", O_DIRECTORY);
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ "{ 0x%x, \"O_NOFOLLOW\" }, ", O_NOFOLLOW);
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ "{ 0x%x, \"O_NOATIME\" }, ", O_NOATIME);
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ "{ 0x%x, \"O_CLOEXEC\" }) : \"O_RDONLY\", ", O_CLOEXEC);
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ " ((unsigned long)(REC->mode))");
+ return pos;
+}
+
static int __init
__set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
{
+ bool is_string = entry->user_arg_is_str;
int i;
int pos = 0;
- /* When len=0, we just calculate the needed length */
-#define LEN_OR_ZERO (len ? len - pos : 0)
+ switch (entry->syscall_nr) {
+ case __NR_openat:
+ return sys_enter_openat_print_fmt(entry, buf, len);
+ default:
+ break;
+ }
pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
for (i = 0; i < entry->nb_args; i++) {
- pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
- entry->args[i], sizeof(unsigned long),
- i == entry->nb_args - 1 ? "" : ", ");
+ if (i)
+ pos += snprintf(buf + pos, LEN_OR_ZERO, ", ");
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx",
+ entry->args[i], sizeof(unsigned long));
+
+ if (!(BIT(i) & entry->user_mask))
+ continue;
+
+ /* Add the format for the user space string or array */
+ if (entry->user_arg_size < 0 || is_string)
+ pos += snprintf(buf + pos, LEN_OR_ZERO, " \\\"%%s\\\"");
+ else
+ pos += snprintf(buf + pos, LEN_OR_ZERO, " (%%s)");
}
pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
for (i = 0; i < entry->nb_args; i++) {
pos += snprintf(buf + pos, LEN_OR_ZERO,
", ((unsigned long)(REC->%s))", entry->args[i]);
+ if (!(BIT(i) & entry->user_mask))
+ continue;
+ /* The user space data for arg has name __<arg>_val */
+ if (entry->user_arg_size < 0 || is_string) {
+ pos += snprintf(buf + pos, LEN_OR_ZERO, ", __get_str(__%s_val)",
+ entry->args[i]);
+ } else {
+ pos += snprintf(buf + pos, LEN_OR_ZERO, ", __print_dynamic_array(__%s_val, 1)",
+ entry->args[i]);
+ }
}
#undef LEN_OR_ZERO
@@ -277,8 +527,11 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call)
{
struct syscall_trace_enter trace;
struct syscall_metadata *meta = call->data;
+ unsigned long mask;
+ char *arg;
int offset = offsetof(typeof(trace), args);
int ret = 0;
+ int len;
int i;
for (i = 0; i < meta->nb_args; i++) {
@@ -291,9 +544,320 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call)
offset += sizeof(unsigned long);
}
+ if (ret || !meta->user_mask)
+ return ret;
+
+ mask = meta->user_mask;
+
+ while (mask) {
+ int idx = ffs(mask) - 1;
+ mask &= ~BIT(idx);
+
+ /*
+ * User space data is faulted into a temporary buffer and then
+ * added as a dynamic string or array to the end of the event.
+ * The user space data name for the arg pointer is
+ * "__<arg>_val".
+ */
+ len = strlen(meta->args[idx]) + sizeof("___val");
+ arg = kmalloc(len, GFP_KERNEL);
+ if (WARN_ON_ONCE(!arg)) {
+ meta->user_mask = 0;
+ return -ENOMEM;
+ }
+
+ snprintf(arg, len, "__%s_val", meta->args[idx]);
+
+ ret = trace_define_field(call, "__data_loc char[]",
+ arg, offset, sizeof(int), 0,
+ FILTER_OTHER);
+ if (ret) {
+ kfree(arg);
+ break;
+ }
+ offset += 4;
+ }
return ret;
}
+/*
+ * Create a per CPU temporary buffer to copy user space pointers into.
+ *
+ * SYSCALL_FAULT_USER_MAX is the amount to copy from user space.
+ * (defined in kernel/trace/trace.h)
+
+ * SYSCALL_FAULT_ARG_SZ is the amount to copy from user space plus the
+ * nul terminating byte and possibly appended EXTRA (4 bytes).
+ *
+ * SYSCALL_FAULT_BUF_SZ holds the size of the per CPU buffer to use
+ * to copy memory from user space addresses into that will hold
+ * 3 args as only 3 args are allowed to be copied from system calls.
+ */
+#define SYSCALL_FAULT_ARG_SZ (SYSCALL_FAULT_USER_MAX + 1 + 4)
+#define SYSCALL_FAULT_MAX_CNT 3
+#define SYSCALL_FAULT_BUF_SZ (SYSCALL_FAULT_ARG_SZ * SYSCALL_FAULT_MAX_CNT)
+
+/* Use the tracing per CPU buffer infrastructure to copy from user space */
+struct syscall_user_buffer {
+ struct trace_user_buf_info buf;
+ struct rcu_head rcu;
+};
+
+static struct syscall_user_buffer *syscall_buffer;
+
+static int syscall_fault_buffer_enable(void)
+{
+ struct syscall_user_buffer *sbuf;
+ int ret;
+
+ lockdep_assert_held(&syscall_trace_lock);
+
+ if (syscall_buffer) {
+ trace_user_fault_get(&syscall_buffer->buf);
+ return 0;
+ }
+
+ sbuf = kmalloc(sizeof(*sbuf), GFP_KERNEL);
+ if (!sbuf)
+ return -ENOMEM;
+
+ ret = trace_user_fault_init(&sbuf->buf, SYSCALL_FAULT_BUF_SZ);
+ if (ret < 0) {
+ kfree(sbuf);
+ return ret;
+ }
+
+ WRITE_ONCE(syscall_buffer, sbuf);
+
+ return 0;
+}
+
+static void rcu_free_syscall_buffer(struct rcu_head *rcu)
+{
+ struct syscall_user_buffer *sbuf =
+ container_of(rcu, struct syscall_user_buffer, rcu);
+
+ trace_user_fault_destroy(&sbuf->buf);
+ kfree(sbuf);
+}
+
+
+static void syscall_fault_buffer_disable(void)
+{
+ struct syscall_user_buffer *sbuf = syscall_buffer;
+
+ lockdep_assert_held(&syscall_trace_lock);
+
+ if (trace_user_fault_put(&sbuf->buf))
+ return;
+
+ WRITE_ONCE(syscall_buffer, NULL);
+ call_rcu_tasks_trace(&sbuf->rcu, rcu_free_syscall_buffer);
+}
+
+struct syscall_args {
+ char *ptr_array[SYSCALL_FAULT_MAX_CNT];
+ int read[SYSCALL_FAULT_MAX_CNT];
+ int uargs;
+};
+
+static int syscall_copy_user(char *buf, const char __user *ptr,
+ size_t size, void *data)
+{
+ struct syscall_args *args = data;
+ int ret;
+
+ for (int i = 0; i < args->uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
+ ptr = (char __user *)args->ptr_array[i];
+ ret = strncpy_from_user(buf, ptr, size);
+ args->read[i] = ret;
+ }
+ return 0;
+}
+
+static int syscall_copy_user_array(char *buf, const char __user *ptr,
+ size_t size, void *data)
+{
+ struct syscall_args *args = data;
+ int ret;
+
+ for (int i = 0; i < args->uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
+ ptr = (char __user *)args->ptr_array[i];
+ ret = __copy_from_user(buf, ptr, size);
+ args->read[i] = ret ? -1 : size;
+ }
+ return 0;
+}
+
+static char *sys_fault_user(unsigned int buf_size,
+ struct syscall_metadata *sys_data,
+ struct syscall_user_buffer *sbuf,
+ unsigned long *args,
+ unsigned int data_size[SYSCALL_FAULT_MAX_CNT])
+{
+ trace_user_buf_copy syscall_copy = syscall_copy_user;
+ unsigned long mask = sys_data->user_mask;
+ unsigned long size = SYSCALL_FAULT_ARG_SZ - 1;
+ struct syscall_args sargs;
+ bool array = false;
+ char *buffer;
+ char *buf;
+ int ret;
+ int i = 0;
+
+ /* The extra is appended to the user data in the buffer */
+ BUILD_BUG_ON(SYSCALL_FAULT_USER_MAX + sizeof(EXTRA) >=
+ SYSCALL_FAULT_ARG_SZ);
+
+ /*
+ * If this system call event has a size argument, use
+ * it to define how much of user space memory to read,
+ * and read it as an array and not a string.
+ */
+ if (sys_data->user_arg_size >= 0) {
+ array = true;
+ size = args[sys_data->user_arg_size];
+ if (size > SYSCALL_FAULT_ARG_SZ - 1)
+ size = SYSCALL_FAULT_ARG_SZ - 1;
+ syscall_copy = syscall_copy_user_array;
+ }
+
+ while (mask) {
+ int idx = ffs(mask) - 1;
+ mask &= ~BIT(idx);
+
+ if (WARN_ON_ONCE(i == SYSCALL_FAULT_MAX_CNT))
+ break;
+
+ /* Get the pointer to user space memory to read */
+ sargs.ptr_array[i++] = (char *)args[idx];
+ }
+
+ sargs.uargs = i;
+
+ /* Clear the values that are not used */
+ for (; i < SYSCALL_FAULT_MAX_CNT; i++) {
+ data_size[i] = -1; /* Denotes no pointer */
+ }
+
+ /* A zero size means do not even try */
+ if (!buf_size)
+ return NULL;
+
+ buffer = trace_user_fault_read(&sbuf->buf, NULL, size,
+ syscall_copy, &sargs);
+ if (!buffer)
+ return NULL;
+
+ buf = buffer;
+ for (i = 0; i < sargs.uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
+
+ ret = sargs.read[i];
+ if (ret < 0)
+ continue;
+ buf[ret] = '\0';
+
+ /* For strings, replace any non-printable characters with '.' */
+ if (!array) {
+ for (int x = 0; x < ret; x++) {
+ if (!isprint(buf[x]))
+ buf[x] = '.';
+ }
+
+ size = min(buf_size, SYSCALL_FAULT_USER_MAX);
+
+ /*
+ * If the text was truncated due to our max limit,
+ * add "..." to the string.
+ */
+ if (ret > size) {
+ strscpy(buf + size, EXTRA, sizeof(EXTRA));
+ ret = size + sizeof(EXTRA);
+ } else {
+ buf[ret++] = '\0';
+ }
+ } else {
+ ret = min((unsigned int)ret, buf_size);
+ }
+ data_size[i] = ret;
+ }
+
+ return buffer;
+}
+
+static int
+syscall_get_data(struct syscall_metadata *sys_data, unsigned long *args,
+ char **buffer, int *size, int *user_sizes, int *uargs,
+ int buf_size)
+{
+ struct syscall_user_buffer *sbuf;
+ int i;
+
+ /* If the syscall_buffer is NULL, tracing is being shutdown */
+ sbuf = READ_ONCE(syscall_buffer);
+ if (!sbuf)
+ return -1;
+
+ *buffer = sys_fault_user(buf_size, sys_data, sbuf, args, user_sizes);
+ /*
+ * user_size is the amount of data to append.
+ * Need to add 4 for the meta field that points to
+ * the user memory at the end of the event and also
+ * stores its size.
+ */
+ for (i = 0; i < SYSCALL_FAULT_MAX_CNT; i++) {
+ if (user_sizes[i] < 0)
+ break;
+ *size += user_sizes[i] + 4;
+ }
+ /* Save the number of user read arguments of this syscall */
+ *uargs = i;
+ return 0;
+}
+
+static void syscall_put_data(struct syscall_metadata *sys_data,
+ struct syscall_trace_enter *entry,
+ char *buffer, int size, int *user_sizes, int uargs)
+{
+ char *buf = buffer;
+ void *ptr;
+ int val;
+
+ /*
+ * Set the pointer to point to the meta data of the event
+ * that has information about the stored user space memory.
+ */
+ ptr = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args;
+
+ /*
+ * The meta data will store the offset of the user data from
+ * the beginning of the event. That is after the static arguments
+ * and the meta data fields.
+ */
+ val = (ptr - (void *)entry) + 4 * uargs;
+
+ for (int i = 0; i < uargs; i++) {
+
+ if (i)
+ val += user_sizes[i - 1];
+
+ /* Store the offset and the size into the meta data */
+ *(int *)ptr = val | (user_sizes[i] << 16);
+
+ /* Skip the meta data */
+ ptr += 4;
+ }
+
+ for (int i = 0; i < uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
+ /* Nothing to do if the user space was empty or faulted */
+ if (!user_sizes[i])
+ continue;
+
+ memcpy(ptr, buf, user_sizes[i]);
+ ptr += user_sizes[i];
+ }
+}
+
static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
{
struct trace_array *tr = data;
@@ -302,15 +866,18 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
struct syscall_metadata *sys_data;
struct trace_event_buffer fbuffer;
unsigned long args[6];
+ char *user_ptr;
+ int user_sizes[SYSCALL_FAULT_MAX_CNT] = {};
int syscall_nr;
- int size;
+ int size = 0;
+ int uargs = 0;
+ bool mayfault;
/*
* Syscall probe called with preemption enabled, but the ring
* buffer and per-cpu data require preemption to be disabled.
*/
might_fault();
- guard(preempt_notrace)();
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
@@ -327,7 +894,20 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
if (!sys_data)
return;
- size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
+ /* Check if this syscall event faults in user space memory */
+ mayfault = sys_data->user_mask != 0;
+
+ guard(preempt_notrace)();
+
+ syscall_get_arguments(current, regs, args);
+
+ if (mayfault) {
+ if (syscall_get_data(sys_data, args, &user_ptr,
+ &size, user_sizes, &uargs, tr->syscall_buf_sz) < 0)
+ return;
+ }
+
+ size += sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
entry = trace_event_buffer_reserve(&fbuffer, trace_file, size);
if (!entry)
@@ -335,9 +915,12 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
entry = ring_buffer_event_data(fbuffer.event);
entry->nr = syscall_nr;
- syscall_get_arguments(current, regs, args);
+
memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args);
+ if (mayfault)
+ syscall_put_data(sys_data, entry, user_ptr, size, user_sizes, uargs);
+
trace_event_buffer_commit(&fbuffer);
}
@@ -386,39 +969,50 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
static int reg_event_syscall_enter(struct trace_event_file *file,
struct trace_event_call *call)
{
+ struct syscall_metadata *sys_data = call->data;
struct trace_array *tr = file->tr;
int ret = 0;
int num;
- num = ((struct syscall_metadata *)call->data)->syscall_nr;
+ num = sys_data->syscall_nr;
if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
return -ENOSYS;
- mutex_lock(&syscall_trace_lock);
- if (!tr->sys_refcount_enter)
+ guard(mutex)(&syscall_trace_lock);
+ if (sys_data->user_mask) {
+ ret = syscall_fault_buffer_enable();
+ if (ret < 0)
+ return ret;
+ }
+ if (!tr->sys_refcount_enter) {
ret = register_trace_sys_enter(ftrace_syscall_enter, tr);
- if (!ret) {
- WRITE_ONCE(tr->enter_syscall_files[num], file);
- tr->sys_refcount_enter++;
+ if (ret < 0) {
+ if (sys_data->user_mask)
+ syscall_fault_buffer_disable();
+ return ret;
+ }
}
- mutex_unlock(&syscall_trace_lock);
- return ret;
+ WRITE_ONCE(tr->enter_syscall_files[num], file);
+ tr->sys_refcount_enter++;
+ return 0;
}
static void unreg_event_syscall_enter(struct trace_event_file *file,
struct trace_event_call *call)
{
+ struct syscall_metadata *sys_data = call->data;
struct trace_array *tr = file->tr;
int num;
- num = ((struct syscall_metadata *)call->data)->syscall_nr;
+ num = sys_data->syscall_nr;
if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
return;
- mutex_lock(&syscall_trace_lock);
+ guard(mutex)(&syscall_trace_lock);
tr->sys_refcount_enter--;
WRITE_ONCE(tr->enter_syscall_files[num], NULL);
if (!tr->sys_refcount_enter)
unregister_trace_sys_enter(ftrace_syscall_enter, tr);
- mutex_unlock(&syscall_trace_lock);
+ if (sys_data->user_mask)
+ syscall_fault_buffer_disable();
}
static int reg_event_syscall_exit(struct trace_event_file *file,
@@ -459,6 +1053,215 @@ static void unreg_event_syscall_exit(struct trace_event_file *file,
mutex_unlock(&syscall_trace_lock);
}
+/*
+ * For system calls that reference user space memory that can
+ * be recorded into the event, set the system call meta data's user_mask
+ * to the "args" index that points to the user space memory to retrieve.
+ */
+static void check_faultable_syscall(struct trace_event_call *call, int nr)
+{
+ struct syscall_metadata *sys_data = call->data;
+ unsigned long mask;
+
+ /* Only work on entry */
+ if (sys_data->enter_event != call)
+ return;
+
+ sys_data->user_arg_size = -1;
+
+ switch (nr) {
+ /* user arg 1 with size arg at 2 */
+ case __NR_write:
+#ifdef __NR_mq_timedsend
+ case __NR_mq_timedsend:
+#endif
+ case __NR_pwrite64:
+ sys_data->user_mask = BIT(1);
+ sys_data->user_arg_size = 2;
+ break;
+ /* user arg 0 with size arg at 1 as string */
+ case __NR_setdomainname:
+ case __NR_sethostname:
+ sys_data->user_mask = BIT(0);
+ sys_data->user_arg_size = 1;
+ sys_data->user_arg_is_str = 1;
+ break;
+#ifdef __NR_kexec_file_load
+ /* user arg 4 with size arg at 3 as string */
+ case __NR_kexec_file_load:
+ sys_data->user_mask = BIT(4);
+ sys_data->user_arg_size = 3;
+ sys_data->user_arg_is_str = 1;
+ break;
+#endif
+ /* user arg at position 0 */
+#ifdef __NR_access
+ case __NR_access:
+#endif
+ case __NR_acct:
+ case __NR_chdir:
+#ifdef __NR_chown
+ case __NR_chown:
+#endif
+#ifdef __NR_chmod
+ case __NR_chmod:
+#endif
+ case __NR_chroot:
+#ifdef __NR_creat
+ case __NR_creat:
+#endif
+ case __NR_delete_module:
+ case __NR_execve:
+ case __NR_fsopen:
+#ifdef __NR_lchown
+ case __NR_lchown:
+#endif
+#ifdef __NR_open
+ case __NR_open:
+#endif
+ case __NR_memfd_create:
+#ifdef __NR_mkdir
+ case __NR_mkdir:
+#endif
+#ifdef __NR_mknod
+ case __NR_mknod:
+#endif
+ case __NR_mq_open:
+ case __NR_mq_unlink:
+#ifdef __NR_readlink
+ case __NR_readlink:
+#endif
+#ifdef __NR_rmdir
+ case __NR_rmdir:
+#endif
+ case __NR_shmdt:
+#ifdef __NR_statfs
+ case __NR_statfs:
+#endif
+ case __NR_swapon:
+ case __NR_swapoff:
+#ifdef __NR_truncate
+ case __NR_truncate:
+#endif
+#ifdef __NR_unlink
+ case __NR_unlink:
+#endif
+ case __NR_umount2:
+#ifdef __NR_utime
+ case __NR_utime:
+#endif
+#ifdef __NR_utimes
+ case __NR_utimes:
+#endif
+ sys_data->user_mask = BIT(0);
+ break;
+ /* user arg at position 1 */
+ case __NR_execveat:
+ case __NR_faccessat:
+ case __NR_faccessat2:
+ case __NR_finit_module:
+ case __NR_fchmodat:
+ case __NR_fchmodat2:
+ case __NR_fchownat:
+ case __NR_fgetxattr:
+ case __NR_flistxattr:
+ case __NR_fsetxattr:
+ case __NR_fspick:
+ case __NR_fremovexattr:
+#ifdef __NR_futimesat
+ case __NR_futimesat:
+#endif
+ case __NR_inotify_add_watch:
+ case __NR_mkdirat:
+ case __NR_mknodat:
+ case __NR_mount_setattr:
+ case __NR_name_to_handle_at:
+#ifdef __NR_newfstatat
+ case __NR_newfstatat:
+#endif
+ case __NR_openat:
+ case __NR_openat2:
+ case __NR_open_tree:
+ case __NR_open_tree_attr:
+ case __NR_readlinkat:
+ case __NR_quotactl:
+ case __NR_syslog:
+ case __NR_statx:
+ case __NR_unlinkat:
+#ifdef __NR_utimensat
+ case __NR_utimensat:
+#endif
+ sys_data->user_mask = BIT(1);
+ break;
+ /* user arg at position 2 */
+ case __NR_init_module:
+ case __NR_fsconfig:
+ sys_data->user_mask = BIT(2);
+ break;
+ /* user arg at position 4 */
+ case __NR_fanotify_mark:
+ sys_data->user_mask = BIT(4);
+ break;
+ /* 2 user args, 0 and 1 */
+ case __NR_add_key:
+ case __NR_getxattr:
+ case __NR_lgetxattr:
+ case __NR_lremovexattr:
+#ifdef __NR_link
+ case __NR_link:
+#endif
+ case __NR_listxattr:
+ case __NR_llistxattr:
+ case __NR_lsetxattr:
+ case __NR_pivot_root:
+ case __NR_removexattr:
+#ifdef __NR_rename
+ case __NR_rename:
+#endif
+ case __NR_request_key:
+ case __NR_setxattr:
+#ifdef __NR_symlink
+ case __NR_symlink:
+#endif
+ sys_data->user_mask = BIT(0) | BIT(1);
+ break;
+ /* 2 user args, 0 and 2 */
+ case __NR_symlinkat:
+ sys_data->user_mask = BIT(0) | BIT(2);
+ break;
+ /* 2 user args, 1 and 3 */
+ case __NR_getxattrat:
+ case __NR_linkat:
+ case __NR_listxattrat:
+ case __NR_move_mount:
+#ifdef __NR_renameat
+ case __NR_renameat:
+#endif
+ case __NR_renameat2:
+ case __NR_removexattrat:
+ case __NR_setxattrat:
+ sys_data->user_mask = BIT(1) | BIT(3);
+ break;
+ case __NR_mount: /* Just dev_name and dir_name, TODO add type */
+ sys_data->user_mask = BIT(0) | BIT(1) | BIT(2);
+ break;
+ default:
+ sys_data->user_mask = 0;
+ return;
+ }
+
+ if (sys_data->user_arg_size < 0)
+ return;
+
+ /*
+ * The user_arg_size can only be used when the system call
+ * is reading only a single address from user space.
+ */
+ mask = sys_data->user_mask;
+ if (WARN_ON(mask & (mask - 1)))
+ sys_data->user_arg_size = -1;
+}
+
static int __init init_syscall_trace(struct trace_event_call *call)
{
int id;
@@ -471,6 +1274,8 @@ static int __init init_syscall_trace(struct trace_event_call *call)
return -ENOSYS;
}
+ check_faultable_syscall(call, num);
+
if (set_syscall_print_fmt(call) < 0)
return -ENOMEM;
@@ -598,9 +1403,14 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
struct hlist_head *head;
unsigned long args[6];
bool valid_prog_array;
+ bool mayfault;
+ char *user_ptr;
+ int user_sizes[SYSCALL_FAULT_MAX_CNT] = {};
+ int buf_size = CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT;
int syscall_nr;
int rctx;
- int size;
+ int size = 0;
+ int uargs = 0;
/*
* Syscall probe called with preemption enabled, but the ring
@@ -619,13 +1429,24 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
if (!sys_data)
return;
+ syscall_get_arguments(current, regs, args);
+
+ /* Check if this syscall event faults in user space memory */
+ mayfault = sys_data->user_mask != 0;
+
+ if (mayfault) {
+ if (syscall_get_data(sys_data, args, &user_ptr,
+ &size, user_sizes, &uargs, buf_size) < 0)
+ return;
+ }
+
head = this_cpu_ptr(sys_data->enter_event->perf_events);
valid_prog_array = bpf_prog_array_valid(sys_data->enter_event);
if (!valid_prog_array && hlist_empty(head))
return;
/* get the size after alignment with the u32 buffer size field */
- size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
+ size += sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
size = ALIGN(size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
@@ -634,9 +1455,11 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
return;
rec->nr = syscall_nr;
- syscall_get_arguments(current, regs, args);
memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args);
+ if (mayfault)
+ syscall_put_data(sys_data, rec, user_ptr, size, user_sizes, uargs);
+
if ((valid_prog_array &&
!perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) ||
hlist_empty(head)) {
@@ -651,36 +1474,46 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
static int perf_sysenter_enable(struct trace_event_call *call)
{
- int ret = 0;
+ struct syscall_metadata *sys_data = call->data;
int num;
+ int ret;
- num = ((struct syscall_metadata *)call->data)->syscall_nr;
+ num = sys_data->syscall_nr;
- mutex_lock(&syscall_trace_lock);
- if (!sys_perf_refcount_enter)
+ guard(mutex)(&syscall_trace_lock);
+ if (sys_data->user_mask) {
+ ret = syscall_fault_buffer_enable();
+ if (ret < 0)
+ return ret;
+ }
+ if (!sys_perf_refcount_enter) {
ret = register_trace_sys_enter(perf_syscall_enter, NULL);
- if (ret) {
- pr_info("event trace: Could not activate syscall entry trace point");
- } else {
- set_bit(num, enabled_perf_enter_syscalls);
- sys_perf_refcount_enter++;
+ if (ret) {
+ pr_info("event trace: Could not activate syscall entry trace point");
+ if (sys_data->user_mask)
+ syscall_fault_buffer_disable();
+ return ret;
+ }
}
- mutex_unlock(&syscall_trace_lock);
- return ret;
+ set_bit(num, enabled_perf_enter_syscalls);
+ sys_perf_refcount_enter++;
+ return 0;
}
static void perf_sysenter_disable(struct trace_event_call *call)
{
+ struct syscall_metadata *sys_data = call->data;
int num;
- num = ((struct syscall_metadata *)call->data)->syscall_nr;
+ num = sys_data->syscall_nr;
- mutex_lock(&syscall_trace_lock);
+ guard(mutex)(&syscall_trace_lock);
sys_perf_refcount_enter--;
clear_bit(num, enabled_perf_enter_syscalls);
if (!sys_perf_refcount_enter)
unregister_trace_sys_enter(perf_syscall_enter, NULL);
- mutex_unlock(&syscall_trace_lock);
+ if (sys_data->user_mask)
+ syscall_fault_buffer_disable();
}
static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs,
@@ -757,22 +1590,21 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
static int perf_sysexit_enable(struct trace_event_call *call)
{
- int ret = 0;
int num;
num = ((struct syscall_metadata *)call->data)->syscall_nr;
- mutex_lock(&syscall_trace_lock);
- if (!sys_perf_refcount_exit)
- ret = register_trace_sys_exit(perf_syscall_exit, NULL);
- if (ret) {
- pr_info("event trace: Could not activate syscall exit trace point");
- } else {
- set_bit(num, enabled_perf_exit_syscalls);
- sys_perf_refcount_exit++;
+ guard(mutex)(&syscall_trace_lock);
+ if (!sys_perf_refcount_exit) {
+ int ret = register_trace_sys_exit(perf_syscall_exit, NULL);
+ if (ret) {
+ pr_info("event trace: Could not activate syscall exit trace point");
+ return ret;
+ }
}
- mutex_unlock(&syscall_trace_lock);
- return ret;
+ set_bit(num, enabled_perf_exit_syscalls);
+ sys_perf_refcount_exit++;
+ return 0;
}
static void perf_sysexit_disable(struct trace_event_call *call)
@@ -781,12 +1613,11 @@ static void perf_sysexit_disable(struct trace_event_call *call)
num = ((struct syscall_metadata *)call->data)->syscall_nr;
- mutex_lock(&syscall_trace_lock);
+ guard(mutex)(&syscall_trace_lock);
sys_perf_refcount_exit--;
clear_bit(num, enabled_perf_exit_syscalls);
if (!sys_perf_refcount_exit)
unregister_trace_sys_exit(perf_syscall_exit, NULL);
- mutex_unlock(&syscall_trace_lock);
}
#endif /* CONFIG_PERF_EVENTS */
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 430d09c49462..1b4f32e2b9bd 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -533,21 +533,26 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
return ret;
}
+DEFINE_FREE(free_trace_uprobe, struct trace_uprobe *, if (_T) free_trace_uprobe(_T))
+
/*
* Argument syntax:
* - Add uprobe: p|r[:[GRP/][EVENT]] PATH:OFFSET[%return][(REF)] [FETCHARGS]
*/
static int __trace_uprobe_create(int argc, const char **argv)
{
+ struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL;
+ struct trace_uprobe *tu __free(free_trace_uprobe) = NULL;
+ const char *trlog __free(trace_probe_log_clear) = NULL;
const char *event = NULL, *group = UPROBE_EVENT_SYSTEM;
- char *arg, *filename, *rctr, *rctr_end, *tmp;
+ struct path path __free(path_put) = {};
unsigned long offset, ref_ctr_offset;
+ char *filename __free(kfree) = NULL;
+ char *arg, *rctr, *rctr_end, *tmp;
char *gbuf __free(kfree) = NULL;
char *buf __free(kfree) = NULL;
enum probe_print_type ptype;
- struct trace_uprobe *tu;
bool is_return = false;
- struct path path;
int i, ret;
ref_ctr_offset = 0;
@@ -565,7 +570,7 @@ static int __trace_uprobe_create(int argc, const char **argv)
if (argc < 2)
return -ECANCELED;
- trace_probe_log_init("trace_uprobe", argc, argv);
+ trlog = trace_probe_log_init("trace_uprobe", argc, argv);
if (argc - 2 > MAX_TRACE_ARGS) {
trace_probe_log_set_index(2);
@@ -585,10 +590,8 @@ static int __trace_uprobe_create(int argc, const char **argv)
/* Find the last occurrence, in case the path contains ':' too. */
arg = strrchr(filename, ':');
- if (!arg || !isdigit(arg[1])) {
- kfree(filename);
+ if (!arg || !isdigit(arg[1]))
return -ECANCELED;
- }
trace_probe_log_set_index(1); /* filename is the 2nd argument */
@@ -596,14 +599,11 @@ static int __trace_uprobe_create(int argc, const char **argv)
ret = kern_path(filename, LOOKUP_FOLLOW, &path);
if (ret) {
trace_probe_log_err(0, FILE_NOT_FOUND);
- kfree(filename);
- trace_probe_log_clear();
return ret;
}
if (!d_is_reg(path.dentry)) {
trace_probe_log_err(0, NO_REGULAR_FILE);
- ret = -EINVAL;
- goto fail_address_parse;
+ return -EINVAL;
}
/* Parse reference counter offset if specified. */
@@ -611,16 +611,14 @@ static int __trace_uprobe_create(int argc, const char **argv)
if (rctr) {
rctr_end = strchr(rctr, ')');
if (!rctr_end) {
- ret = -EINVAL;
rctr_end = rctr + strlen(rctr);
trace_probe_log_err(rctr_end - filename,
REFCNT_OPEN_BRACE);
- goto fail_address_parse;
+ return -EINVAL;
} else if (rctr_end[1] != '\0') {
- ret = -EINVAL;
trace_probe_log_err(rctr_end + 1 - filename,
BAD_REFCNT_SUFFIX);
- goto fail_address_parse;
+ return -EINVAL;
}
*rctr++ = '\0';
@@ -628,7 +626,7 @@ static int __trace_uprobe_create(int argc, const char **argv)
ret = kstrtoul(rctr, 0, &ref_ctr_offset);
if (ret) {
trace_probe_log_err(rctr - filename, BAD_REFCNT);
- goto fail_address_parse;
+ return ret;
}
}
@@ -640,8 +638,7 @@ static int __trace_uprobe_create(int argc, const char **argv)
is_return = true;
} else {
trace_probe_log_err(tmp - filename, BAD_ADDR_SUFFIX);
- ret = -EINVAL;
- goto fail_address_parse;
+ return -EINVAL;
}
}
@@ -649,7 +646,7 @@ static int __trace_uprobe_create(int argc, const char **argv)
ret = kstrtoul(arg, 0, &offset);
if (ret) {
trace_probe_log_err(arg - filename, BAD_UPROBE_OFFS);
- goto fail_address_parse;
+ return ret;
}
/* setup a probe */
@@ -657,12 +654,12 @@ static int __trace_uprobe_create(int argc, const char **argv)
if (event) {
gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL);
if (!gbuf)
- goto fail_mem;
+ return -ENOMEM;
ret = traceprobe_parse_event_name(&event, &group, gbuf,
event - argv[0]);
if (ret)
- goto fail_address_parse;
+ return ret;
}
if (!event) {
@@ -671,7 +668,7 @@ static int __trace_uprobe_create(int argc, const char **argv)
tail = kstrdup(kbasename(filename), GFP_KERNEL);
if (!tail)
- goto fail_mem;
+ return -ENOMEM;
ptr = strpbrk(tail, ".-_");
if (ptr)
@@ -679,7 +676,7 @@ static int __trace_uprobe_create(int argc, const char **argv)
buf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL);
if (!buf)
- goto fail_mem;
+ return -ENOMEM;
snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_0x%lx", 'p', tail, offset);
event = buf;
kfree(tail);
@@ -693,51 +690,36 @@ static int __trace_uprobe_create(int argc, const char **argv)
ret = PTR_ERR(tu);
/* This must return -ENOMEM otherwise there is a bug */
WARN_ON_ONCE(ret != -ENOMEM);
- goto fail_address_parse;
+ return ret;
}
tu->offset = offset;
tu->ref_ctr_offset = ref_ctr_offset;
tu->path = path;
- tu->filename = filename;
+ /* Clear @path so that it will not freed by path_put() */
+ memset(&path, 0, sizeof(path));
+ tu->filename = no_free_ptr(filename);
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+ ctx->flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER;
/* parse arguments */
for (i = 0; i < argc; i++) {
- struct traceprobe_parse_context *ctx __free(traceprobe_parse_context)
- = kzalloc(sizeof(*ctx), GFP_KERNEL);
-
- if (!ctx) {
- ret = -ENOMEM;
- goto error;
- }
- ctx->flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER;
trace_probe_log_set_index(i + 2);
ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], ctx);
if (ret)
- goto error;
+ return ret;
}
ptype = is_ret_probe(tu) ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL;
ret = traceprobe_set_print_fmt(&tu->tp, ptype);
if (ret < 0)
- goto error;
+ return ret;
ret = register_trace_uprobe(tu);
if (!ret)
- goto out;
-
-error:
- free_trace_uprobe(tu);
-out:
- trace_probe_log_clear();
- return ret;
-
-fail_mem:
- ret = -ENOMEM;
-
-fail_address_parse:
- trace_probe_log_clear();
- path_put(&path);
- kfree(filename);
+ tu = NULL;
return ret;
}
diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c
index e066d31d08f8..fe9bf8db1922 100644
--- a/kernel/vmcore_info.c
+++ b/kernel/vmcore_info.c
@@ -31,6 +31,13 @@ u32 *vmcoreinfo_note;
/* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */
static unsigned char *vmcoreinfo_data_safecopy;
+struct hwerr_info {
+ atomic_t count;
+ time64_t timestamp;
+};
+
+static struct hwerr_info hwerr_data[HWERR_RECOV_MAX];
+
Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
void *data, size_t data_len)
{
@@ -118,6 +125,16 @@ phys_addr_t __weak paddr_vmcoreinfo_note(void)
}
EXPORT_SYMBOL(paddr_vmcoreinfo_note);
+void hwerr_log_error_type(enum hwerr_error_type src)
+{
+ if (src < 0 || src >= HWERR_RECOV_MAX)
+ return;
+
+ atomic_inc(&hwerr_data[src].count);
+ WRITE_ONCE(hwerr_data[src].timestamp, ktime_get_real_seconds());
+}
+EXPORT_SYMBOL_GPL(hwerr_log_error_type);
+
static int __init crash_save_vmcoreinfo_init(void)
{
vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 5b62d1002783..0685e3a8aa0a 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -25,6 +25,7 @@
#include <linux/stop_machine.h>
#include <linux/sysctl.h>
#include <linux/tick.h>
+#include <linux/sys_info.h>
#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
@@ -65,6 +66,13 @@ int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
unsigned int __read_mostly hardlockup_panic =
IS_ENABLED(CONFIG_BOOTPARAM_HARDLOCKUP_PANIC);
+/*
+ * bitmasks to control what kinds of system info to be printed when
+ * hard lockup is detected, it could be task, memory, lock etc.
+ * Refer include/linux/sys_info.h for detailed bit definition.
+ */
+static unsigned long hardlockup_si_mask;
+
#ifdef CONFIG_SYSFS
static unsigned int hardlockup_count;
@@ -178,11 +186,15 @@ static void watchdog_hardlockup_kick(void)
void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
{
+ int hardlockup_all_cpu_backtrace;
+
if (per_cpu(watchdog_hardlockup_touched, cpu)) {
per_cpu(watchdog_hardlockup_touched, cpu) = false;
return;
}
+ hardlockup_all_cpu_backtrace = (hardlockup_si_mask & SYS_INFO_ALL_BT) ?
+ 1 : sysctl_hardlockup_all_cpu_backtrace;
/*
* Check for a hardlockup by making sure the CPU's timer
* interrupt is incrementing. The timer interrupt should have
@@ -196,6 +208,15 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
#ifdef CONFIG_SYSFS
++hardlockup_count;
#endif
+ /*
+ * A poorly behaving BPF scheduler can trigger hard lockup by
+ * e.g. putting numerous affinitized tasks in a single queue and
+ * directing all CPUs at it. The following call can return true
+ * only once when sched_ext is enabled and will immediately
+ * abort the BPF scheduler and print out a warning message.
+ */
+ if (scx_hardlockup(cpu))
+ return;
/* Only print hardlockups once. */
if (per_cpu(watchdog_hardlockup_warned, cpu))
@@ -205,7 +226,7 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
* Prevent multiple hard-lockup reports if one cpu is already
* engaged in dumping all cpu back traces.
*/
- if (sysctl_hardlockup_all_cpu_backtrace) {
+ if (hardlockup_all_cpu_backtrace) {
if (test_and_set_bit_lock(0, &hard_lockup_nmi_warn))
return;
}
@@ -234,12 +255,13 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
trigger_single_cpu_backtrace(cpu);
}
- if (sysctl_hardlockup_all_cpu_backtrace) {
+ if (hardlockup_all_cpu_backtrace) {
trigger_allbutcpu_cpu_backtrace(cpu);
if (!hardlockup_panic)
clear_bit_unlock(0, &hard_lockup_nmi_warn);
}
+ sys_info(hardlockup_si_mask & ~SYS_INFO_ALL_BT);
if (hardlockup_panic)
nmi_panic(regs, "Hard LOCKUP");
@@ -330,6 +352,13 @@ static void lockup_detector_update_enable(void)
int __read_mostly sysctl_softlockup_all_cpu_backtrace;
#endif
+/*
+ * bitmasks to control what kinds of system info to be printed when
+ * soft lockup is detected, it could be task, memory, lock etc.
+ * Refer include/linux/sys_info.h for detailed bit definition.
+ */
+static unsigned long softlockup_si_mask;
+
static struct cpumask watchdog_allowed_mask __read_mostly;
/* Global variables, exported for sysctl */
@@ -746,7 +775,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
unsigned long touch_ts, period_ts, now;
struct pt_regs *regs = get_irq_regs();
int duration;
- int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
+ int softlockup_all_cpu_backtrace;
unsigned long flags;
if (!watchdog_enabled)
@@ -758,6 +787,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
if (panic_in_progress())
return HRTIMER_NORESTART;
+ softlockup_all_cpu_backtrace = (softlockup_si_mask & SYS_INFO_ALL_BT) ?
+ 1 : sysctl_softlockup_all_cpu_backtrace;
+
watchdog_hardlockup_kick();
/* kick the softlockup detector */
@@ -846,6 +878,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
}
add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
+ sys_info(softlockup_si_mask & ~SYS_INFO_ALL_BT);
if (softlockup_panic)
panic("softlockup: hung tasks");
}
@@ -1197,6 +1230,13 @@ static const struct ctl_table watchdog_sysctls[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
+ {
+ .procname = "softlockup_sys_info",
+ .data = &softlockup_si_mask,
+ .maxlen = sizeof(softlockup_si_mask),
+ .mode = 0644,
+ .proc_handler = sysctl_sys_info_handler,
+ },
#ifdef CONFIG_SMP
{
.procname = "softlockup_all_cpu_backtrace",
@@ -1219,6 +1259,13 @@ static const struct ctl_table watchdog_sysctls[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
+ {
+ .procname = "hardlockup_sys_info",
+ .data = &hardlockup_si_mask,
+ .maxlen = sizeof(hardlockup_si_mask),
+ .mode = 0644,
+ .proc_handler = sysctl_sys_info_handler,
+ },
#ifdef CONFIG_SMP
{
.procname = "hardlockup_all_cpu_backtrace",
@@ -1231,14 +1278,11 @@ static const struct ctl_table watchdog_sysctls[] = {
},
#endif /* CONFIG_SMP */
#endif
-};
-
-static struct ctl_table watchdog_hardlockup_sysctl[] = {
{
.procname = "nmi_watchdog",
.data = &watchdog_hardlockup_user_enabled,
.maxlen = sizeof(int),
- .mode = 0444,
+ .mode = 0644,
.proc_handler = proc_nmi_watchdog,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
@@ -1248,10 +1292,6 @@ static struct ctl_table watchdog_hardlockup_sysctl[] = {
static void __init watchdog_sysctl_init(void)
{
register_sysctl_init("kernel", watchdog_sysctls);
-
- if (watchdog_hardlockup_available)
- watchdog_hardlockup_sysctl[0].mode = 0644;
- register_sysctl_init("kernel", watchdog_hardlockup_sysctl);
}
#else
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 45320e27a16c..253311af47c6 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -541,12 +541,6 @@ static void show_one_worker_pool(struct worker_pool *pool);
!lockdep_is_held(&wq_pool_mutex), \
"RCU or wq_pool_mutex should be held")
-#define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \
- RCU_LOCKDEP_WARN(!rcu_read_lock_any_held() && \
- !lockdep_is_held(&wq->mutex) && \
- !lockdep_is_held(&wq_pool_mutex), \
- "RCU, wq->mutex or wq_pool_mutex should be held")
-
#define for_each_bh_worker_pool(pool, cpu) \
for ((pool) = &per_cpu(bh_worker_pools, cpu)[0]; \
(pool) < &per_cpu(bh_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
@@ -3443,6 +3437,27 @@ sleep:
goto woke_up;
}
+static bool assign_rescuer_work(struct pool_workqueue *pwq, struct worker *rescuer)
+{
+ struct worker_pool *pool = pwq->pool;
+ struct work_struct *work, *n;
+
+ /* need rescue? */
+ if (!pwq->nr_active || !need_to_create_worker(pool))
+ return false;
+
+ /*
+ * Slurp in all works issued via this workqueue and
+ * process'em.
+ */
+ list_for_each_entry_safe(work, n, &pool->worklist, entry) {
+ if (get_work_pwq(work) == pwq && assign_work(work, rescuer, &n))
+ pwq->stats[PWQ_STAT_RESCUED]++;
+ }
+
+ return !list_empty(&rescuer->scheduled);
+}
+
/**
* rescuer_thread - the rescuer thread function
* @__rescuer: self
@@ -3497,7 +3512,6 @@ repeat:
struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
struct pool_workqueue, mayday_node);
struct worker_pool *pool = pwq->pool;
- struct work_struct *work, *n;
__set_current_state(TASK_RUNNING);
list_del_init(&pwq->mayday_node);
@@ -3508,18 +3522,9 @@ repeat:
raw_spin_lock_irq(&pool->lock);
- /*
- * Slurp in all works issued via this workqueue and
- * process'em.
- */
WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
- list_for_each_entry_safe(work, n, &pool->worklist, entry) {
- if (get_work_pwq(work) == pwq &&
- assign_work(work, rescuer, &n))
- pwq->stats[PWQ_STAT_RESCUED]++;
- }
- if (!list_empty(&rescuer->scheduled)) {
+ if (assign_rescuer_work(pwq, rescuer)) {
process_scheduled_works(rescuer);
/*
@@ -3534,10 +3539,9 @@ repeat:
if (pwq->nr_active && need_to_create_worker(pool)) {
raw_spin_lock(&wq_mayday_lock);
/*
- * Queue iff we aren't racing destruction
- * and somebody else hasn't queued it already.
+ * Queue iff somebody else hasn't queued it already.
*/
- if (wq->rescuer && list_empty(&pwq->mayday_node)) {
+ if (list_empty(&pwq->mayday_node)) {
get_pwq(pwq);
list_add_tail(&pwq->mayday_node, &wq->maydays);
}
@@ -5376,11 +5380,6 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
/* update node_nr_active->max */
wq_update_node_max_active(ctx->wq, -1);
- /* rescuer needs to respect wq cpumask changes */
- if (ctx->wq->rescuer)
- set_cpus_allowed_ptr(ctx->wq->rescuer->task,
- unbound_effective_cpumask(ctx->wq));
-
mutex_unlock(&ctx->wq->mutex);
}
@@ -5614,10 +5613,13 @@ static int init_rescuer(struct workqueue_struct *wq)
}
wq->rescuer = rescuer;
- if (wq->flags & WQ_UNBOUND)
- kthread_bind_mask(rescuer->task, unbound_effective_cpumask(wq));
+
+ /* initial cpumask is consistent with the detached rescuer and unbind_worker() */
+ if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask))
+ kthread_bind_mask(rescuer->task, wq_unbound_cpumask);
else
kthread_bind_mask(rescuer->task, cpu_possible_mask);
+
wake_up_process(rescuer->task);
return 0;
@@ -5902,16 +5904,10 @@ void destroy_workqueue(struct workqueue_struct *wq)
/* kill rescuer, if sanity checks fail, leave it w/o rescuer */
if (wq->rescuer) {
- struct worker *rescuer = wq->rescuer;
-
- /* this prevents new queueing */
- raw_spin_lock_irq(&wq_mayday_lock);
- wq->rescuer = NULL;
- raw_spin_unlock_irq(&wq_mayday_lock);
-
/* rescuer will empty maydays list before exiting */
- kthread_stop(rescuer->task);
- kfree(rescuer);
+ kthread_stop(wq->rescuer->task);
+ kfree(wq->rescuer);
+ wq->rescuer = NULL;
}
/*
@@ -6937,8 +6933,26 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
}
if (!ret) {
+ int cpu;
+ struct worker_pool *pool;
+ struct worker *worker;
+
mutex_lock(&wq_pool_attach_mutex);
cpumask_copy(wq_unbound_cpumask, unbound_cpumask);
+ /* rescuer needs to respect cpumask changes when it is not attached */
+ list_for_each_entry(wq, &workqueues, list) {
+ if (wq->rescuer && !wq->rescuer->pool)
+ unbind_worker(wq->rescuer);
+ }
+ /* DISASSOCIATED worker needs to respect wq_unbound_cpumask */
+ for_each_possible_cpu(cpu) {
+ for_each_cpu_worker_pool(pool, cpu) {
+ if (!(pool->flags & POOL_DISASSOCIATED))
+ continue;
+ for_each_pool_worker(worker, pool)
+ unbind_worker(worker);
+ }
+ }
mutex_unlock(&wq_pool_attach_mutex);
}
return ret;